Example #1
def run_predict(model_name, path_model, path_data, path_output, n_sample=-1):
    path_output = root + path_output
    path_data = root + path_data + "/features.zip"  #.zip
    path_model = root + path_model
    path_pipeline = path_model + "/pipeline/"
    path_test_X = path_data  # path to the testing features (path_data already ends with /features.zip)
    log(path_data, path_model, path_output)

    colid = load(f'{path_pipeline}/colid.pkl')

    df = load_dataset(path_data,
                      path_data_y=None,
                      colid=colid,
                      n_sample=n_sample)

    dfX, cols_family = preprocess(df, path_pipeline)

    ypred, yproba = predict(model_name, path_model, dfX, cols_family)

    log("Saving prediction", ypred.shape, path_output)
    os.makedirs(path_output, exist_ok=True)
    df[cols_family["coly"] + "_pred"] = ypred
    if yproba is not None:
        df[cols_family["coly"] + "_pred_proba"] = yproba
    df.to_csv(f"{path_output}/prediction.csv")
    log(df.head(8))

    #####  Export Specific
    df[cols_family["coly"]] = ypred
    df[[cols_family["coly"]]].to_csv(f"{path_output}/pred_only.csv")
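
A minimal invocation sketch for the example above; the model name and all paths are placeholders, and `root` plus the module-level helpers (`log`, `load`, `load_dataset`, `preprocess`, `predict`) are assumed to be defined by the surrounding module:

# Hypothetical call: paths are illustrative and are resolved relative to `root`
run_predict("lightgbm",
            path_model="/model_fit/model01",     # must contain a pipeline/ subfolder with colid.pkl
            path_data="/data/input/test",        # folder holding features.zip
            path_output="/data/output/pred01",
            n_sample=10000)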
Example #2
def run_predict(config_name,
                config_path,
                n_sample=-1,
                path_data=None,
                path_output=None,
                pars={},
                model_dict=None):

    model_dict = model_dict_load(model_dict,
                                 config_path,
                                 config_name,
                                 verbose=True)
    m = model_dict['global_pars']

    model_class = model_dict['model_pars']['model_class']
    path_data = m['path_pred_data'] if path_data is None else path_data
    path_pipeline = m['path_pred_pipeline']  #   path_output + "/pipeline/" )
    path_model = m['path_pred_model']

    path_output = m['path_pred_output'] if path_output is None else path_output
    log(path_data, path_model, path_output)

    pars = {
        'cols_group': model_dict['data_pars']['cols_input_type'],
        'pipe_list': model_dict['model_pars']['pre_process_pars']['pipe_list']
    }

    ##########################################################################################
    colid = load(f'{path_pipeline}/colid.pkl')
    df = load_dataset(path_data,
                      path_data_y=None,
                      colid=colid,
                      n_sample=n_sample)

    from run_preprocess import preprocess_inference as preprocess
    dfX, cols_family = preprocess(df, path_pipeline, preprocess_pars=pars)
    ypred, yproba = predict(model_class, path_model, dfX, cols_family)

    log("############ Saving prediction  ###################################################"
        )
    log(ypred.shape, path_output)
    os.makedirs(path_output, exist_ok=True)
    df[cols_family["coly"] + "_pred"] = ypred
    if yproba is not None:
        df[cols_family["coly"] + "_pred_proba"] = yproba
    df.to_csv(f"{path_output}/prediction.csv")
    log(df.head(8))

    log("###########  Export Specific ######################################################"
        )
    df[cols_family["coly"]] = ypred
    df[[cols_family["coly"]]].to_csv(f"{path_output}/pred_only.csv")
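
For this config-driven variant, a hedged usage sketch; the config file and config name are hypothetical, and `model_dict_load` is expected to resolve `path_pred_data`, `path_pred_pipeline`, `path_pred_model` and `path_pred_output` from the config's `global_pars`:

# Hypothetical call: all paths come from the named config's global_pars
run_predict(config_name="titanic_lightgbm",
            config_path="config_model.py",
            n_sample=5000)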
Example #3
def run_data_check(path_data,
                   path_data_ref,
                   path_model,
                   path_output,
                   sample_ratio=0.5):
    """
     Calculate dataset shift before prediction.
    """
    from run_preprocess import preprocess_inference as preprocess
    path_output = root + path_output
    path_data = root + path_data
    path_data_ref = root + path_data_ref
    path_pipeline = root + path_model + "/pipeline/"

    os.makedirs(path_output, exist_ok=True)
    colid = load(f'{path_pipeline}/colid.pkl')

    df1 = load_dataset(path_data_ref, colid=colid)
    dfX1, cols_family1 = preprocess(df1, path_pipeline)

    df2 = load_dataset(path_data, colid=colid)
    dfX2, cols_family2 = preprocess(df2, path_pipeline)

    colsX = cols_family1["colnum_bin"] + cols_family1["colcat_bin"]
    dfX1 = dfX1[colsX]
    dfX2 = dfX2[colsX]

    from util_feature import pd_stat_dataset_shift
    nsample = int(min(len(dfX1), len(dfX2)) * sample_ratio)
    metrics_psi = pd_stat_dataset_shift(dfX2,
                                        dfX1,
                                        colsX,
                                        nsample=nsample,
                                        buckets=7,
                                        axis=0)
    metrics_psi.to_csv(f"{path_output}/prediction_features_metrics.csv")
    log(metrics_psi)
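
A usage sketch for the drift check above, assuming the same `root`-relative layout as the other examples (all paths are placeholders); the resulting PSI metrics are written to prediction_features_metrics.csv:

# Compare a new scoring batch against the reference (training) data
run_data_check(path_data="/data/input/new_batch",
               path_data_ref="/data/input/train",
               path_model="/model_fit/model01",
               path_output="/data/output/drift_check",
               sample_ratio=0.5)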
Example #4
def preprocess(path_train_X="",
               path_train_y="",
               path_pipeline_export="",
               cols_group=None,
               n_sample=5000,
               preprocess_pars={},
               path_features_store=None):
    """
      Used for training only
      Save params on disk

    :param path_train_X:
    :param path_train_y:
    :param path_pipeline_export:
    :param cols_group:
    :param n_sample:
    :param preprocess_pars:
    :param path_features_store:
    :return:
    """
    ##### column names for feature generation #####################################################
    log(cols_group)
    coly = cols_group['coly']  # 'salary'
    colid = cols_group['colid']  # "jobId"
    colcat = cols_group['colcat']  # [ 'companyId', 'jobType', 'degree', 'major', 'industry' ]
    colnum = cols_group['colnum']  # ['yearsExperience', 'milesFromMetropolis']
    os.makedirs(path_pipeline_export, exist_ok=True)
    log(path_pipeline_export)
    save(colid, f'{path_pipeline_export}/colid.pkl')

    ### Pipeline Execution ##########################################
    pipe_default = [{
        'uri': 'source/prepro.py::pd_coly',
        'pars': {},
        'cols_family': 'coly',
        'type': 'coly'
    }, {
        'uri': 'source/prepro.py::pd_colnum_bin',
        'pars': {},
        'cols_family': 'colnum',
        'type': ''
    }, {
        'uri': 'source/prepro.py::pd_colnum_binto_onehot',
        'pars': {},
        'cols_family': 'colnum_bin',
        'type': ''
    }, {
        'uri': 'source/prepro.py::pd_colcat_bin',
        'pars': {},
        'cols_family': 'colcat',
        'type': ''
    }, {
        'uri': 'source/prepro.py::pd_colcat_to_onehot',
        'pars': {},
        'cols_family': 'colcat_bin',
        'type': ''
    }, {
        'uri': 'source/prepro.py::pd_colcross',
        'pars': {},
        'cols_family': 'colcross',
        'type': 'cross'
    }]

    pipe_list = preprocess_pars.get('pipe_list', pipe_default)
    pipe_list_X = [
        task for task in pipe_list
        if task.get('type', '') not in ['coly', 'filter']
    ]
    pipe_list_y = [
        task for task in pipe_list if task.get('type', '') in ['coly']
    ]
    pipe_filter = [
        task for task in pipe_list if task.get('type', '') in ['filter']
    ]
    ##### Load data #################################################################################
    df = load_dataset(path_train_X, path_train_y, colid, n_sample=n_sample)

    ##### Generate features ##########################################################################
    dfi_all = {}  ### Dict of all features
    cols_family_all = {'colid': colid, 'colnum': colnum, 'colcat': colcat}

    if len(pipe_filter) > 0:
        log("#####  Filter  #########################################################################"
            )
        pipe_i = pipe_filter[0]
        pipe_fun = load_function_uri(pipe_i['uri'])
        df, col_pars = pipe_fun(df,
                                list(df.columns),
                                pars=pipe_i.get('pars', {}))

    if len(pipe_list_y) > 0:
        log("#####  coly  ###########################################################################"
            )
        pipe_i = pipe_list_y[0]
        pipe_fun = load_function_uri(pipe_i['uri'])
        logs("----------df----------\n", df)
        pars = pipe_i.get('pars', {})
        pars['path_features_store'] = path_features_store
        pars['path_pipeline_export'] = path_pipeline_export
        df, col_pars = pipe_fun(df, cols_group['coly'],
                                pars=pars)  ### coly can remove rows

        logs("----------df----------\n", df)
        dfi_all['coly'] = df[cols_group['coly']]
        cols_family_all['coly'] = cols_group['coly']
        save_features(df[cols_group['coly']], "coly",
                      path_features_store)  ### already saved
        save(coly, f'{path_pipeline_export}/coly.pkl')

    #####  Processors  ###############################################################################
    dfi_all['coly'] = df[cols_group['coly']]
    #for colg, colg_list in cols_group.items() :
    #   if colg not in  ['colid']:
    #      dfi_all[colg]   = df[colg_list]   ## colnum colcat, coly

    for pipe_i in pipe_list_X:
        log("###################", pipe_i,
            "##########################################################")
        pipe_fun = load_function_uri(pipe_i['uri'])  ### Load the code definition into pipe_fun
        cols_name = pipe_i['cols_family']
        col_type = pipe_i['type']

        pars = pipe_i.get('pars', {})
        pars['path_features_store'] = path_features_store  ### intermediate dataframe
        pars['path_pipeline_export'] = path_pipeline_export  ### Store pipeline

        if col_type == 'cross':
            log("###################  Adding Cross ###################################################"
                )
            pars['dfnum_hot'] = dfi_all['colnum_onehot']  ### dfnum_hot --> dfcross
            pars['dfcat_hot'] = dfi_all['colcat_onehot']
            pars['colid'] = colid
            pars['colcross_single'] = cols_group.get('colcross', [])

        elif col_type == 'add_coly':
            log('add_coly generic', cols_group['coly'])
            pars['coly'] = cols_group['coly']
            pars['dfy'] = dfi_all['coly']  ### Transformed dfy

        ### Input columns or previously computed columns ( colnum_bin )
        cols_list = cols_group[cols_name] if cols_name in cols_group else list(dfi_all[cols_name].columns)
        df_ = df[cols_list] if cols_name in cols_group else dfi_all[cols_name]
        #cols_list  = list(dfi_all[cols_name].columns)
        #df_        = dfi_all[cols_name]

        dfi, col_pars = pipe_fun(df_, cols_list, pars=pars)

        ### Concatenate colnum, colnum_bin into cols_family_all , dfi_all  ###########################
        for colj, colist in col_pars['cols_new'].items():
            ### Merge sub-family
            cols_family_all[colj] = cols_family_all.get(colj, []) + colist
            dfi_all[colj] = pd.concat((dfi_all[colj], dfi), axis=1) if colj in dfi_all else dfi
            # save_features(dfi_all[colj], colj, path_features_store)

    ######  Merge All into dfXy  ################################################################
    dfXy = df[[coly] + colnum + colcat]
    #dfXy = df[ [coly]  ]

    for t in dfi_all.keys():
        if t not in ['coly', 'colnum', 'colcat']:
            dfXy = pd.concat((dfXy, dfi_all[t]), axis=1)
    save_features(dfXy, 'dfX', path_features_store)

    colXy = list(dfXy.columns)
    colXy.remove(coly)  ##### Only X columns
    if len(colid) > 0:
        cols_family_all['colid'] = colid
    cols_family_all['colX'] = colXy

    ####  Cols group for model input  ###########################################################

    save(colXy, f'{path_pipeline_export}/colsX.pkl')
    save(cols_family_all, f'{path_pipeline_export}/cols_family.pkl')

    ###### Return values  #######################################################################
    return dfXy, cols_family_all
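
A sketch of the `cols_group` dictionary this `preprocess` expects, reconstructed from the inline comments above; the column names mirror the salary/job example in those comments and are purely illustrative:

# Hypothetical column groups (names taken from the comments in the example)
cols_group = {
    "coly":     "salary",                                                  # target column
    "colid":    "jobId",                                                   # id column
    "colcat":   ["companyId", "jobType", "degree", "major", "industry"],   # categorical columns
    "colnum":   ["yearsExperience", "milesFromMetropolis"],                # numerical columns
    "colcross": ["jobType", "degree"],                                     # candidates for feature crosses
}
dfXy, cols_family_all = preprocess(path_train_X="/data/input/train/features.zip",
                                   path_train_y="/data/input/train/target.zip",
                                   path_pipeline_export="/model_fit/model01/pipeline/",
                                   cols_group=cols_group,
                                   n_sample=5000)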
Example #5
def text_preprocess(path_train_X="",
                    path_train_y="",
                    path_pipeline_export="",
                    cols_group=None,
                    n_sample=5000,
                    preprocess_pars={},
                    filter_pars={},
                    path_features_store=None):
    """

    :param path_train_X:
    :param path_train_y:
    :param path_pipeline_export:
    :param cols_group:
    :param n_sample:
    :param preprocess_pars:
    :param filter_pars:
    :param path_features_store:
    :return:
    """
    from util_feature import (pd_colnum_tocat, pd_col_to_onehot,
                              pd_colcat_mapping, pd_colcat_toint,
                              pd_feature_generate_cross)

    ##### column names for feature generation ###############################################
    log(cols_group)
    coly = cols_group['coly']  # 'salary'
    colid = cols_group['colid']  # "jobId"
    colcat = cols_group['colcat']  # [ 'companyId', 'jobType', 'degree', 'major', 'industry' ]
    colnum = cols_group['colnum']  # ['yearsExperience', 'milesFromMetropolis']

    colcross_single = cols_group.get('colcross', [])  ### List of single columns
    coltext = cols_group.get('coltext', [])
    coldate = cols_group.get('coldate', [])
    colall = colnum + colcat + coltext + coldate
    log(colall)

    ##### Load data ########################################################################
    df = load_dataset(path_train_X, path_train_y, colid, n_sample=n_sample)

    log("##### Coltext processing   ###############################################################"
        )
    from utils import util_text, util_model

    ### Remove common words  #############################################
    import json
    import string
    punctuations = string.punctuation
    stopwords = json.load(open("stopwords_en.json"))["word"]
    stopwords = [t for t in string.punctuation] + stopwords
    stopwords = ["", " ", ",", ".", "-", "*", '€', "+", "/"] + stopwords
    stopwords = list(set(stopwords))
    stopwords.sort()
    print(stopwords)
    stopwords = set(stopwords)

    def pipe_text(df, col, pars={}):
        ntoken = pars['n_token']
        df = df.fillna("")
        dftext = df
        log(dftext)
        log(col)
        list1 = []
        list1.append(col)

        # fromword = [ r"\b({w})\b".format(w=w)  for w in fromword    ]
        # print(fromword)
        for col_n in list1:
            dftext[col_n] = dftext[col_n].fillna("")
            dftext[col_n] = dftext[col_n].str.lower()
            dftext[col_n] = dftext[col_n].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))  # drop punctuation
            dftext[col_n] = dftext[col_n].apply(lambda x: x.translate(str.maketrans('', '', string.digits)))       # drop digits
            dftext[col_n] = dftext[col_n].apply(
                lambda x: re.sub("[!@,#$+%*:()'-]", " ", x))

            dftext[col_n] = dftext[col_n].apply(
                lambda x: coltext_stopwords(x, stopwords=stopwords))

        print(dftext.head(6))

        sep = " "
        """
        :param df:
        :param coltext:  text where word frequency should be extracted
        :param nb_to_show:
        :return:
        """
        coltext_freq = df[col].apply(
            lambda x: pd.value_counts(x.split(sep))).sum(axis=0).reset_index()
        coltext_freq.columns = ["word", "freq"]
        coltext_freq = coltext_freq.sort_values("freq", ascending=0)
        log(coltext_freq)

        word_tokeep = coltext_freq["word"].values[:ntoken]
        word_tokeep = [t for t in word_tokeep if t not in stopwords]

        dftext_tdidf_dict, word_tokeep_dict = util_text.pd_coltext_tdidf(
            dftext,
            coltext=col,
            word_minfreq=1,
            word_tokeep=word_tokeep,
            return_val="dataframe,param")

        log(word_tokeep_dict)
        ###  Dimension reduction for the sparse matrix
        dftext_svd_list, svd_list = util_model.pd_dim_reduction(
            dftext_tdidf_dict,
            colname=None,
            model_pretrain=None,
            colprefix=col + "_svd",
            method="svd",
            dimpca=2,
            return_val="dataframe,param")
        return dftext_svd_list

    pars = {'n_token': 100}
    dftext1 = None
    for coltext_i in coltext:
        dftext_i = pipe_text(df[[coltext_i]], coltext_i, pars)
        save_features(dftext_i, 'dftext_' + coltext_i, path_features_store)
        dftext1 = pd.concat((dftext1, dftext_i), axis=1) if dftext1 is not None else dftext_i
    print(dftext1.head(6))
    dftext1.to_csv(r"" + path_features_store + "\dftext.csv", index=False)

    ##################################################################################################
    ##### Save pre-processor meta-parameters
    os.makedirs(path_pipeline_export, exist_ok=True)
    log(path_pipeline_export)
    cols_family = {}

    for t in ['coltext']:
        tfile = f'{path_pipeline_export}/{t}.pkl'
        log(tfile)
        t_val = locals().get(t, None)
        if t_val is not None:
            save(t_val, tfile)
            cols_family[t] = t_val

    return dftext1, cols_family
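
The snippet above reads its stop words from stopwords_en.json via `json.load(...)["word"]`, so the file is assumed to be a JSON object holding a "word" list; a minimal sketch of that layout (the word list is illustrative):

import json
# Write a tiny stopwords_en.json with the assumed structure
with open("stopwords_en.json", "w") as f:
    json.dump({"word": ["the", "a", "an", "and", "of", "to", "in", "for"]}, f)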
Example #6
def preprocess(path_train_X="", path_train_y="", path_pipeline_export="", cols_group=None, n_sample=5000,
               preprocess_pars={}, filter_pars={}, path_features_store=None):
    """
    :param path_train_X:
    :param path_train_y:
    :param path_pipeline_export:
    :param cols_group:
    :param n_sample:
    :param preprocess_pars:
    :param filter_pars:
    :param path_features_store:
    :return:
    """
    from util_feature import (pd_colnum_tocat, pd_col_to_onehot, pd_colcat_mapping, pd_colcat_toint,
                              pd_feature_generate_cross)

    ##### column names for feature generation #####################################################
    log(cols_group)
    coly            = cols_group['coly']  # 'salary'
    colid           = cols_group['colid']  # "jobId"
    colcat          = cols_group['colcat']  # [ 'companyId', 'jobType', 'degree', 'major', 'industry' ]
    colnum          = cols_group['colnum']  # ['yearsExperience', 'milesFromMetropolis']
    
    colcross_single = cols_group.get('colcross', [])   ### List of single columns
    coltext         = cols_group.get('coltext', [])
    coldate         = cols_group.get('coldate', [])
    colall          = colnum + colcat + coltext + coldate
    log(colall)

    #### Pipeline Execution
    pipe_default    = [ 'filter', 'label', 'dfnum_bin', 'dfnum_hot',  'dfcat_bin', 'dfcat_hot', 'dfcross_hot', ]
    pipe_list       = preprocess_pars.get('pipe_list', pipe_default)
    pipe_list.append('dfdate')
    pipe_list_pars  = preprocess_pars.get('pipe_pars', [])



    ##### Load data ##############################################################################
    df = load_dataset(path_train_X, path_train_y, colid, n_sample= n_sample)

    ##### Filtering / cleaning rows :   #########################################################
    if "filter" in pipe_list :
        def isfloat(x):
            try:
                float(x)
                return 1
            except Exception:
                return 0
        ymin, ymax = filter_pars.get('ymin', -9999999999.0), filter_pars.get('ymax', 999999999.0)
        print(coly)
        df['_isfloat'] = df[ coly ].apply(lambda x : isfloat(x))
        print(df['_isfloat'])
        df = df[ df['_isfloat'] > 0 ]
        df = df[df[coly] > ymin]
        df = df[df[coly] < ymax]


    ##### Label processing   ####################################################################
    y_norm_fun = None
    if "label" in pipe_list :
        # Target coly processing, Normalization process  , customize by model
        log("y_norm_fun preprocess_pars")
        y_norm_fun = preprocess_pars.get('y_norm_fun', None)
        if y_norm_fun is not None:
            df[coly] = df[coly].apply(lambda x: y_norm_fun(x))
            save(y_norm_fun, f'{path_pipeline_export}/y_norm.pkl' )
            save_features(df[coly], 'dfy', path_features_store)


    ########### colnum procesing   #############################################################
    for x in colnum:
        print('bam',x)
        df[x] = df[x].astype("float")
    log(df[colall].dtypes)


    if "dfnum" in pipe_list :
        pass


    if "dfnum_norm" in pipe_list :
        log("### colnum normalize  ###############################################################")
        from util_feature import pd_colnum_normalize
        pars = { 'pipe_list': [ {'name': 'fillna', 'naval' : 0.0 }, {'name': 'minmax'} ]}
        dfnum_norm, colnum_norm = pd_colnum_normalize(df, colname=colnum,  pars=pars, suffix = "_norm",
                                                      return_val="dataframe,param")
        log(colnum_norm)
        save_features(dfnum_norm, 'dfnum_norm', path_features_store)


    if "dfnum_bin" in pipe_list :
        log("### colnum Map numerics to Category bin  ###########################################")
        dfnum_bin, colnum_binmap = pd_colnum_tocat(df, colname=colnum, colexclude=None, colbinmap=None,
                                                   bins=10, suffix="_bin", method="uniform",
                                                   return_val="dataframe,param")
        log(colnum_binmap)
        ### Renaming colnum_bin with suffix
        colnum_bin = [x + "_bin" for x in list(colnum_binmap.keys())]
        log(colnum_bin)
        save_features(dfnum_bin, 'dfnum_binmap', path_features_store)


    if "dfnum_hot" in pipe_list and "dfnum_bin" in pipe_list  :
        log("### colnum bin to One Hot")
        dfnum_hot, colnum_onehot = pd_col_to_onehot(dfnum_bin[colnum_bin], colname=colnum_bin,
                                                    colonehot=None, return_val="dataframe,param")
        log(colnum_onehot)
        save_features(dfnum_hot, 'dfnum_onehot', path_features_store)


    ##### Colcat processing   ################################################################
    colcat_map = pd_colcat_mapping(df, colcat)
    log(df[colcat].dtypes, colcat_map)

    if "dfcat_hot" in pipe_list :
        log("#### colcat to onehot")
        dfcat_hot, colcat_onehot = pd_col_to_onehot(df[colcat], colname=colcat,
                                                    colonehot=None, return_val="dataframe,param")
        log(dfcat_hot[colcat_onehot].head(5))
        save_features(dfcat_hot, 'dfcat_onehot', path_features_store)



    if "dfcat_bin" in pipe_list :
        log("#### Colcat to integer encoding ")
        dfcat_bin, colcat_bin_map = pd_colcat_toint(df[colcat], colname=colcat,
                                                    colcat_map=None, suffix="_int")
        colcat_bin = list(dfcat_bin.columns)
        save_features(dfcat_bin, 'dfcat_bin', path_features_store)

    if "dfcross_hot" in pipe_list :
        log("#####  Cross Features From OneHot Features   ######################################")
        try :
           df_onehot = dfcat_hot.join(dfnum_hot, on=colid, how='left')
        except :
           df_onehot = copy.deepcopy(dfcat_hot)

        colcross_single_onehot_select = []
        for t in list(df_onehot) :
            for c1 in colcross_single :
                if c1 in t :
                   colcross_single_onehot_select.append(t)

        df_onehot = df_onehot[colcross_single_onehot_select ]
        dfcross_hot, colcross_pair = pd_feature_generate_cross(df_onehot, colcross_single_onehot_select,
                                                               pct_threshold=0.02,  m_combination=2)
        log(dfcross_hot.head(2).T)
        colcross_pair_onehot = list(dfcross_hot.columns)
        save_features(dfcross_hot, 'dfcross_onehot', path_features_store)
        del df_onehot ,colcross_pair_onehot

    

    if "dftext" in pipe_list :
        log("##### Coltext processing   ###############################################################")
        stopwords = nlp_get_stopwords()
        pars      = {'n_token' : 100 , 'stopwords': stopwords}
        dftext    = None
        
        for coltext_i in coltext :
            
            ##### Run the text processor on each column text  #############################
            dftext_i = pipe_text( df[[coltext_i ]], coltext_i, pars )
            dftext   = pd.concat((dftext, dftext_i), axis=1)  if dftext is not None else dftext_i
            save_features(dftext_i, 'dftext_' + coltext_i, path_features_store)

        log(dftext.head(6))
        save_features(dftext, 'dftext', path_features_store)



    if "dfdate" in pipe_list :
        log("##### Coldate processing   #############################################################")
        from utils import util_date
        dfdate = None
        for coldate_i in coldate :
            dfdate_i =  util_date.pd_datestring_split( df[[coldate_i]] , coldate_i, fmt="auto", return_val= "split" )
            dfdate  = pd.concat((dfdate, dfdate_i), axis=1)  if dfdate is not None else dfdate_i
            save_features(dfdate_i, 'dfdate_' + coldate_i, path_features_store)
        save_features(dfdate, 'dfdate', path_features_store)
        print('spoo',dfdate)


    ###################################################################################
# ###############
    ##### Save pre-processor meta-parameters
    os.makedirs(path_pipeline_export, exist_ok=True)
    log(path_pipeline_export)
    cols_family = {}

    for t in ['colid',
              "colnum", "colnum_bin", "colnum_onehot", "colnum_binmap",  #### Colnum columns
              "colcat", "colcat_bin", "colcat_onehot", "colcat_bin_map",  #### colcat columns
              'colcross_single_onehot_select', "colcross_pair_onehot",  'colcross_pair',  #### colcross columns

              'coldate',
              'coltext',

              "coly", "y_norm_fun"
              ]:
        tfile = f'{path_pipeline_export}/{t}.pkl'
        log(tfile)
        t_val = locals().get(t, None)
        if t_val is not None :
           save(t_val, tfile)
           cols_family[t] = t_val


    ######  Merge All  #############################################################################
    dfXy = df[colnum + colcat + [coly] ]
    print('localTT',dfXy)
    for t in [ 'dfnum_bin', 'dfnum_hot', 'dfcat_bin', 'dfcat_hot', 'dfcross_hot',
               'dfdate',  'dftext'  ] :
        if t in locals() :
            print('localT', t, locals()[t])
            dfXy = pd.concat((dfXy, locals()[t] ), axis=1)

    save_features(dfXy, 'dfX', path_features_store)
    colXy = list(dfXy.columns)
    colXy.remove(coly)    ##### Only X columns
    cols_family['colX'] = colXy
    save(colXy, f'{path_pipeline_export}/colsX.pkl' )
    save(cols_family, f'{path_pipeline_export}/cols_family.pkl' )


    ###### Return values  #########################################################################
    return dfXy, cols_family
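
A hedged call sketch for this `preprocess` variant, reusing the `cols_group` layout shown after Example #4; `pipe_list` uses the step names from `pipe_default` above, `filter_pars` uses the `ymin`/`ymax` keys read in the filter step, and every path is a placeholder:

preprocess_pars = {"pipe_list": ["filter", "label", "dfnum_bin", "dfnum_hot",
                                 "dfcat_bin", "dfcat_hot", "dfcross_hot"]}
filter_pars     = {"ymin": 0.0, "ymax": 500000.0}   # keep only rows with ymin < coly < ymax
dfXy, cols_family = preprocess(path_train_X="/data/input/train/features.zip",
                               path_train_y="/data/input/train/target.zip",
                               path_pipeline_export="/model_fit/model01/pipeline/",
                               cols_group=cols_group,
                               n_sample=5000,
                               preprocess_pars=preprocess_pars,
                               filter_pars=filter_pars,
                               path_features_store="/data/output/features_store")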
Example #7
def run_predict(config_name,
                config_path,
                n_sample=-1,
                path_data=None,
                path_output=None,
                pars={},
                model_dict=None):

    log("#### Run predict  ###############################################################"
        )
    model_dict = model_dict_load(model_dict,
                                 config_path,
                                 config_name,
                                 verbose=True)
    model_class = model_dict['model_pars']['model_class']

    m = model_dict['global_pars']
    path_data = m['path_pred_data'] if path_data is None else path_data
    path_pipeline = m['path_pred_pipeline']  #   path_output + "/pipeline/" )
    path_model = m['path_pred_model']
    path_output = m['path_pred_output'] if path_output is None else path_output
    log(path_data, path_model, path_output)

    pars = {
        'cols_group': model_dict['data_pars']['cols_input_type'],
        'pipe_list': model_dict['model_pars']['pre_process_pars']['pipe_list']
    }

    log("#### Run preprocess  ###########################################################"
        )
    from run_preprocess import preprocess_inference as preprocess
    colid = load(f'{path_pipeline}/colid.pkl')
    df = load_dataset(path_data,
                      path_data_y=None,
                      colid=colid,
                      n_sample=n_sample)
    dfX, cols = preprocess(df, path_pipeline, preprocess_pars=pars)
    coly = cols["coly"]

    log("#### Extract column names  #########################################################"
        )
    ### Actual column names for Model Input :  label y and Input X (colnum , colcat), remove duplicate names
    ###  [  'colcat', 'colnum'
    model_dict['data_pars']['coly'] = cols['coly']
    model_dict['data_pars']['cols_model'] = list(
        set(
            sum([
                cols[colgroup]
                for colgroup in model_dict['data_pars']['cols_model_group']
            ], [])))

    #### Flatten Col Group by column type : Sparse, continuous, .... (ie Neural Network feed Input, remove duplicate names
    ## 'coldense' = [ 'colnum' ]     'colsparse' = ['colcat' ]
    model_dict['data_pars']['cols_model_type2'] = {}
    for colg, colg_list in model_dict['data_pars'].get('cols_model_type',
                                                       {}).items():
        model_dict['data_pars']['cols_model_type2'][colg] = list(
            set(sum([cols[colgroup] for colgroup in colg_list], [])))

    log("############ Prediction  ##########################################################"
        )
    ypred, yproba = predict(model_class, path_model, dfX, cols, model_dict)

    post_process_fun = model_dict['model_pars']['post_process_fun']
    df[coly + "_pred"] = ypred
    df[coly + "_pred"] = df[coly +
                            '_pred'].apply(lambda x: post_process_fun(x))
    if yproba is not None:
        df[coly + "_pred_proba"] = yproba

    log("############ Saving prediction  ###################################################"
        )
    log(ypred.shape, path_output)
    os.makedirs(path_output, exist_ok=True)
    df.to_csv(f"{path_output}/prediction.csv")
    log(df.head(8))

    log("###########  Export Specific ######################################################"
        )
    df[cols["coly"]] = ypred
    df[[cols["coly"]]].to_csv(f"{path_output}/pred_only.csv")
Example #8
def run_transform(config_name,
                  config_path,
                  n_sample=1,
                  path_data=None,
                  path_output=None,
                  pars={},
                  model_dict=None,
                  return_mode=""):

    log("##### Run transform ###############################################################"
        )
    model_dict = model_dict_load(model_dict,
                                 config_path,
                                 config_name,
                                 verbose=True)
    model_class = model_dict['model_pars']['model_class']

    m = model_dict['global_pars']
    path_data = m['path_pred_data'] if path_data is None else path_data
    path_pipeline = m['path_pred_pipeline']  #   path_output + "/pipeline/" )
    path_model = m['path_pred_model']

    model_file = m.get('model_file', "")  ### New

    path_output = m['path_pred_output'] if path_output is None else path_output
    log(path_data, path_model, path_output)

    pars = {
        'cols_group': model_dict['data_pars']['cols_input_type'],
        'pipe_list': model_dict['model_pars']['pre_process_pars']['pipe_list']
    }

    log("##### Load Preprocess ############################################################"
        )
    from run_preprocess import preprocess_inference as preprocess
    colid = load(f'{path_pipeline}/colid.pkl')
    if model_class in SUPERVISED_MODELS:
        path_pred_X = m.get('path_pred_X', path_data + "/features.zip")  #.zip
        path_pred_y = m.get('path_pred_y', path_data + "/target.zip")  #.zip
        df = load_dataset(path_pred_X, path_pred_y, colid, n_sample=n_sample)
    else:
        df = load_dataset(path_data, None, colid, n_sample=n_sample)

    dfX, cols = preprocess(df, path_pipeline, preprocess_pars=pars)
    coly = cols["coly"]

    log("#### Extract column names  #######################################################"
        )
    ### Actual column names for Model Input :  label y and Input X (colnum , colcat), remove duplicate names
    model_dict['data_pars']['coly'] = cols['coly']
    model_dict['data_pars']['cols_model'] = list(
        set(
            sum([
                cols[colgroup]
                for colgroup in model_dict['data_pars']['cols_model_group']
            ], [])))

    ####    Col Group by column type : Sparse, continuous, .... (ie Neural Network feed Input, remove duplicate names
    ####   'coldense' = [ 'colnum' ]     'colsparse' = ['colcat' ]
    model_dict['data_pars']['cols_model_type2'] = {}
    for colg, colg_list in model_dict['data_pars'].get('cols_model_type',
                                                       {}).items():
        model_dict['data_pars']['cols_model_type2'][colg] = list(
            set(sum([cols[colgroup] for colgroup in colg_list], [])))

    log("############ Task Inference   ###################################################"
        )
    task_type = model_dict['compute_pars'].get('task_inference', 'transform')
    if model_class in SUPERVISED_MODELS:
        dfXy = transform(
            model_file,
            path_model,
            (dfX[[c for c in dfX.columns if c not in coly]], df[coly]),
            model_dict,
            task_type=task_type)
    else:
        dfXy = transform(model_file,
                         path_model,
                         dfX,
                         model_dict,
                         task_type=task_type)

    post_process_fun = model_dict['model_pars']['post_process_fun']

    if return_mode == 'dict':
        return {'dfXy': dfXy}

    else:
        log("#### Export ##################################################################"
            )
        path_check_out = m.get('path_check_out', path_output + "/check/")
        os.makedirs(path_check_out, exist_ok=True)
        dfX.to_parquet(path_check_out + "/dfX.parquet")  # export the transformed input data as parquet
        log("######### Finish #############################################################")