# Each test below checks that the *_0 variant of a metric, evaluated on the
# continuous-label data, agrees with the plain metric evaluated on the
# binarized labels, and that the score matches a known reference value.
def test_evalAEF5():
    evaluation = definedEvaluation()
    e = evaluation.eval_function("AEF5_0")
    value0 = e(bin_pred, dtrain_cont)
    evaluation = definedEvaluation()
    e = evaluation.eval_function("AEF5")
    value1 = e(bin_pred, dtrain_bin)
    assert value0 == value1
    assert np.round(value0[1], 2) == 0.99


def test_evalefr015():
    evaluation = definedEvaluation()
    e = evaluation.eval_function("EFR015_0")
    value0 = e(bin_pred, dtrain_cont)
    evaluation = definedEvaluation()
    e = evaluation.eval_function("EFR015")
    value1 = e(bin_pred, dtrain_bin)
    assert value0 == value1
    assert np.round(value0[1], 2) == 1.34


def test_evalNEFauc25():
    evaluation = definedEvaluation()
    e = evaluation.eval_function("NEFAUC25_0")
    value0 = e(bin_pred, dtrain_cont)
    evaluation = definedEvaluation()
    e = evaluation.eval_function("NEFAUC25")
    value1 = e(bin_pred, dtrain_bin)
    assert value0 == value1
    assert np.round(value0[1], 2) == 0.55


def test_evalprauc():
    evaluation = definedEvaluation()
    e = evaluation.eval_function("PRAUC_0")
    value0 = e(bin_pred, dtrain_cont)
    evaluation = definedEvaluation()
    e = evaluation.eval_function("PRAUC")
    value1 = e(bin_pred, dtrain_bin)
    assert value0 == value1
    assert np.round(value0[1], 2) == 0.51


def test_evalReliabilityScore():
    evaluation = definedEvaluation()
    e = evaluation.eval_function("ReliabilityScore_0")
    value0 = e(bin_pred, dtrain_cont)
    evaluation = definedEvaluation()
    e = evaluation.eval_function("ReliabilityScore")
    value1 = e(bin_pred, dtrain_bin)
    assert value0 == value1
    assert np.round(value0[1], 2) == 0.25


def test_evalLogloss():
    evaluation = definedEvaluation()
    e = evaluation.eval_function("Logloss_0")
    value0 = e(bin_pred, dtrain_cont)
    evaluation = definedEvaluation()
    e = evaluation.eval_function("Logloss")
    value1 = e(bin_pred, dtrain_bin)
    assert value0 == value1
    assert np.round(value0[1], 2) == 1504.69
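
# The fixtures these tests rely on (bin_pred, dtrain_cont, dtrain_bin and the
# imported definedEvaluation) are defined outside the snippet above. A minimal
# sketch of what that setup might look like, assuming the dtrain_* objects are
# xgboost DMatrix instances and that defined_eval is importable as a module;
# the array sizes, the 0.5 threshold, and the seed are assumptions, and the
# reference values asserted above come from the project's real fixture data.
import numpy as np
import xgboost as xgb
from defined_eval import definedEvaluation

rng = np.random.RandomState(0)
X = rng.uniform(size=(100, 10))
y_cont = rng.uniform(size=100)             # continuous labels
y_bin = (y_cont > 0.5).astype(int)         # binarized labels (assumed threshold)
bin_pred = rng.uniform(size=100)           # predicted scores fed to each metric
dtrain_cont = xgb.DMatrix(X, label=y_cont)
dtrain_bin = xgb.DMatrix(X, label=y_bin)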
 def __init__(self,xgbData,eval_name,model_type,model_name):
     """
     Parameters:
     -----------
     xgbData: object
       Default data object that contains training data, testing data, cv-fold
       info, and label.
     eval_name: str
       Name of evaluation metric used to monitor training process. Must in
       pre-defined evaluation list.
     model_type: str
       Name of model type you want to use. Must in pre-defined model type list.
     model_name: str
       Unique name for this model.
     """
     self.name = model_name
     self.__preDefined_model = defined_model.definedModel()
     self.__DEFINED_MODEL_TYPE = self.__preDefined_model.model_type()
     self.__preDefined_eval = defined_eval.definedEvaluation()
     self.__DEFINED_EVAL = self.__preDefined_eval.eval_list()
     self.__xgbData = xgbData
     self.__preDefined_eval.validate_eval_name(eval_name)
     self.__eval_name = eval_name
     self.__preDefined_model.validate_model_type(model_type)
     self.__model_type_writeout = model_type
     self.__collect_model = None
     self.__track_best_ntree = pd.DataFrame(columns = ['model_name','best_ntree'])
     self.__best_score = list()
     self.__param = self.__preDefined_model.model_param(model_type)
     self.__eval_function = self.__preDefined_eval.eval_function(self.__eval_name)
     self.__MAXIMIZE = self.__preDefined_eval.is_maximize(self.__eval_name)
     self.__STOPPING_ROUND = self.__preDefined_eval.stopping_round(self.__eval_name)
     self.__holdout = None
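
# A minimal usage sketch of this constructor, mirroring the call sequence the
# pipeline's train() method (further below) applies to each first layer model.
# my_xgb_data, the seed value, and the model name are illustrative; 'ROCAUC'
# and 'GbtreeLogistic' are taken from the pre-defined lists used elsewhere here.
model = first_layer_model.firstLayerModel(my_xgb_data,
                                          'ROCAUC',
                                          'GbtreeLogistic',
                                          'layer1_morgan_GbtreeLogistic_ROCAUC')
default_param, maximize, stopping_round = model.get_param()
default_param['seed'] = 2018               # assumed seed value
model.update_param(default_param, maximize, stopping_round)
model.xgb_cv()                             # cross-validated training
model.generate_holdout_pred()              # out-of-fold predictions for stacking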
def test_defined_eval():
    evaluation = defined_eval.definedEvaluation()
    assert evaluation.is_maximize('ROCAUC') == True
    assert evaluation.stopping_round('ROCAUC') == 100
    # An unknown metric name must raise ValueError; if no exception is thrown,
    # the assert on mark fails the test.
    mark = 0
    try:
        evaluation.is_maximize('not_exist_eval_name')
        assert mark == 1
    except ValueError:
        mark = 1
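
# If pytest is the test runner (an assumption; the asserts above are compatible
# with it), the same negative check can be written more directly with
# pytest.raises:
import pytest

def test_defined_eval_unknown_name():
    evaluation = defined_eval.definedEvaluation()
    with pytest.raises(ValueError):
        evaluation.is_maximize('not_exist_eval_name')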
 def __init__(self,xgbData,list_firstLayerModel,eval_name,model_type,model_name):
     """Use holdout(out of fold) predictions from several firstLayerModels as
     training features to train a secondLayerModel.(So called stacking model)
     Parameters:
     -----------
     xgbData: object
      contains the label you want to use in second layer model.
     list_firstLayerModel: list
      list contains firstLayerModel.
     eval_name: str
       Name of evaluation metric used to monitor training process. Must in
       pre-defined evaluation list.
     model_type: str
       Name of model type you want to use. Must in pre-defined model type list.
     model_name: str
       Unique name for this model.
     """
     self.name = model_name
     self.__preDefined_model = defined_model.definedModel()
     self.__preDefined_eval = defined_eval.definedEvaluation()
     self.__DEFINED_EVAL = self.__preDefined_eval.eval_list()
     self.__xgbData = xgbData
     assert all([isinstance(item,first_layer_model.firstLayerModel) for item in list_firstLayerModel])
     self.__list_firstLayerModel = list_firstLayerModel
     self.__preDefined_eval.validate_eval_name(eval_name)
     self.__eval_name = eval_name
     self.__preDefined_model.validate_model_type(model_type)
     self.__model_type_writeout = model_type
     self.__collect_model = None
     self.__track_best_ntree = pd.DataFrame(columns = ['model_name','best_ntree'])
     self.__best_score = list()
     self.__firstLayerModel_prediction = None
     self.__param = self.__preDefined_model.model_param(model_type)
     self.__eval_function = self.__preDefined_eval.eval_function(self.__eval_name)
     self.__MAXIMIZE = self.__preDefined_eval.is_maximize(self.__eval_name)
     self.__STOPPING_ROUND = self.__preDefined_eval.stopping_round(self.__eval_name)
     self.__holdout = None
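
# A minimal sketch of the stacking workflow this constructor supports, mirroring
# the calls made by the pipeline's train() method. label_data and layer1_models
# are illustrative names for an xgbData object and a list of trained
# firstLayerModel objects.
l2model = second_layer_model.secondLayerModel(label_data,
                                              layer1_models,
                                              'ROCAUC',
                                              'GbtreeLogistic',
                                              'layer2_GbtreeLogistic_ROCAUC')
l2model.second_layer_data()        # assemble holdout predictions into training features
default_param, maximize, stopping_round = l2model.get_param()
l2model.update_param(default_param, maximize, stopping_round)
l2model.xgb_cv()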
    # Read the command line input (a JSON configuration file) and parse its fields.
    with open(sys.argv[1], 'r') as f:
        info = f.read()
    info = pd.read_json(info)
    target_name = str(info.loc['target_name'][0])
    dir_train = str(info.loc['full_directory_to_training_data'][0])
    dir_test = str(info.loc['full_directory_to_dataToPredict_if_exit'][0])
    smile_colname = str(info.loc['smile_column_name'][0])
    label_name_list = info.loc['label_name_list'][0]
    label_name_list = [str(item) for item in label_name_list]
    eval_name = str(info.loc['evaluation_name'][0])
    dir_to_store = str(info.loc['full_directory_to_store_prediction'][0])
    maccKeys_column_name = str(info.loc['maccKeys_column_name'][0])
    ecfp1024_column_name = str(info.loc['ecfp1024_column_name'][0])

    preDefined_eval = defined_eval.definedEvaluation()
    preDefined_eval.validate_eval_name(eval_name)
    df = pd.read_csv(dir_train)
    # Identify and drop rows whose first label is missing.
    missing_row = pd.isnull(df.loc[:, label_name_list[0]])
    df = df.loc[~missing_row]
    df = df.reset_index(drop=True)
    print('Preparing training data fingerprints')
    # Morgan (ECFP) fingerprint column.
    morgan_fp = df.copy()
    morgan_fp = morgan_fp.rename(columns={ecfp1024_column_name: 'fingerprint'})
    # MACCS keys fingerprint column.
    maccs_fp = df.copy()
    maccs_fp = maccs_fp.rename(columns={maccKeys_column_name: 'fingerprint'})
    comb1 = (morgan_fp, label_name_list)
    comb2 = (maccs_fp, label_name_list)
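
    # The JSON file passed as sys.argv[1] has to supply every field parsed above.
    # One hypothetical layout that pd.read_json turns into the expected shape
    # (field names as row labels, a single value column); every value below is
    # illustrative only:
    #
    # {"0": {"target_name": "activity",
    #        "full_directory_to_training_data": "/path/to/train.csv",
    #        "full_directory_to_dataToPredict_if_exit": "/path/to/predict.csv",
    #        "smile_column_name": "smiles",
    #        "label_name_list": ["activity"],
    #        "evaluation_name": "ROCAUC",
    #        "full_directory_to_store_prediction": "/path/to/predictions/",
    #        "maccKeys_column_name": "maccKeys",
    #        "ecfp1024_column_name": "ecfp1024"}}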
Example #11
    def train(self):
        """
        Train the model. Train and check potential first and second layer models.
        """
        evaluation_metric_name = self.__eval_name
        print('Building first layer models')
        #---------------------------------first layer models ----------
        for data_dict in self.__setting_list:
            for model_type in data_dict['model_type']:
                unique_name = 'layer1_' + data_dict['data_name'] + '_' + model_type + '_' + evaluation_metric_name
                model = first_layer_model.firstLayerModel(data_dict['data'],
                        evaluation_metric_name,model_type,unique_name)
                # Retrieve the default parameters and override the default seed.
                default_param,default_MAXIMIZE,default_STOPPING_ROUND = model.get_param()
                default_param['seed'] = self.seed
                if self.__verbose:
                    default_param['silent'] = 1
                else:
                    default_param['verbose_eval'] = False
                model.update_param(default_param,default_MAXIMIZE,default_STOPPING_ROUND)
                model.xgb_cv()
                model.generate_holdout_pred()
                self.__layer1_model_list.append(model)

        #------------------------------------second layer models
        layer2_label_data = self.__setting_list[0]['data'] # layer1 data object containing the label for layer2 model
        layer2_modeltype = ['GbtreeLogistic','GblinearLogistic']
        layer2_evaluation_metric_name = [self.__eval_name]
        print('Building second layer models')
        for evaluation_metric_name in layer2_evaluation_metric_name:
            for model_type in layer2_modeltype:
                unique_name = 'layer2' + '_' + model_type + '_' + evaluation_metric_name
                l2model = second_layer_model.secondLayerModel(layer2_label_data,self.__layer1_model_list,
                            evaluation_metric_name,model_type,unique_name)
                l2model.second_layer_data()
                # Retrieve the default parameters and override the default seed.
                default_param,default_MAXIMIZE,default_STOPPING_ROUND = l2model.get_param()
                default_param['seed'] = self.seed
                if self.__verbose:
                    default_param['silent'] = 0
                else:
                    default_param['verbose_eval'] = False
                l2model.update_param(default_param,default_MAXIMIZE,default_STOPPING_ROUND)
                l2model.xgb_cv()
                self.__layer2_model_list.append(l2model)


        #------------------------------------ evaluate model performance on test data
        # prepare test data, retrieved from the layer1 data objects
        list_TestData = []
        for data_dict in self.__setting_list:
            for model_type in data_dict['model_type']:
                list_TestData.append(data_dict['data'].get_dtest())
        test_label = layer2_label_data.get_testLabel()
        test_result_list = []
        i = 0
        for evaluation_metric_name in layer2_evaluation_metric_name:
            for model_type in layer2_modeltype:
                test_result = eval_testset.eval_testset(self.__layer2_model_list[i],
                                                        list_TestData,test_label,
                                                        evaluation_metric_name)
                test_result_list.append(test_result)
                i += 1

        # Merge the cv and test results together. Calculate the weighted average
        # of the cv and test result for each model (layer 1 and layer 2), then use
        # the best model to predict.
        all_model = self.__layer1_model_list + self.__layer2_model_list
        result = []
        for model in all_model:
            result = result + [item for item in np.array(model.cv_score_df())[0]]
        # Retrieve the model name corresponding to each cv result.
        result_index = []
        for model in all_model:
            result_index.append(model.name)
        # create a dataframe
        cv_result = pd.DataFrame({'cv_result' : result},index = result_index)

        test_result = pd.concat(test_result_list,axis = 0,ignore_index=False)
        test_result = test_result.rename(columns = {self.__eval_name:'test_result'})
        # Select distinct rows.
        test_result['temp_name'] = test_result.index
        test_result = test_result.drop_duplicates(['temp_name'])
        test_result = test_result.drop(columns='temp_name')
        cv_test = pd.merge(cv_result,test_result,how='left',left_index=True,right_index=True)
        self.__num_folds = np.float64(self.__num_folds)
        cv_test['weighted_score'] = cv_test.cv_result * (self.__num_folds-1)/self.__num_folds + cv_test.test_result * (1/self.__num_folds)

        # Determine whether the current evaluation metric should be maximized or minimized.
        eval_info = defined_eval.definedEvaluation()
        is_max = eval_info.is_maximize(self.__eval_name)
        if is_max:
            best_model_name = cv_test.weighted_score.idxmax()
        else:
            best_model_name = cv_test.weighted_score.idxmin()
        # Find the best model object by name.
        all_model_name = [model.name for model in all_model]
        model_position = all_model_name.index(best_model_name)
        self.__best_model = all_model[model_position]
        self.__best_model_result = pd.DataFrame(cv_test.loc[self.__best_model.name])
        self.__all_model_result = cv_test
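
# A toy version of the selection rule above, assuming 5 CV folds and made-up
# scores and model names: the weighted score is cv_result * (k-1)/k +
# test_result * 1/k, and for a maximized metric the best model is the row with
# the largest weighted score.
import pandas as pd

k = 5.0
toy = pd.DataFrame({'cv_result': [0.81, 0.79, 0.84],
                    'test_result': [0.77, 0.80, 0.78]},
                   index=['layer1_morgan_GbtreeLogistic_ROCAUC',
                          'layer1_maccs_GbtreeLogistic_ROCAUC',
                          'layer2_GbtreeLogistic_ROCAUC'])
toy['weighted_score'] = toy.cv_result * (k - 1) / k + toy.test_result * (1 / k)
print(toy.weighted_score.idxmax())   # 'layer2_GbtreeLogistic_ROCAUC' (0.828 vs 0.802 and 0.792)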
Example #12
    def __prepare_result(self):
        # Merge the cv and test results together. Calculate the weighted average
        # of the cv and test result for each model (layer 1 and layer 2), then use
        # the best model to predict.
        all_model = self.__layer1_model_list + self.__layer2_model_list
        result = []
        for model in all_model:
            result = result + [item for item in np.array(model.cv_score_df())[0]]
        # Retrieve the model name corresponding to each cv result.
        result_index = []
        for model in all_model:
            result_index.append(model.name)
        # create a dataframe
        cv_result = pd.DataFrame({'cv_result' : result},index = result_index)

        #------------------------------------ evaluate model performance on test data
            # prepare test data, retrieved from the layer1 data objects
        if self.__createTestset:
            list_TestData = []
            layer2_modeltype = self.__layer2_modeltype
            layer2_evaluation_metric_name = [self.__eval_name]
            layer2_label_data = self.__setting_list[0]['data'] # layer1 data object carrying the label for the layer2 models
            for data_dict in self.__setting_list:
                for model_type in data_dict['model_type']:
                    list_TestData.append(data_dict['data'].get_dtest())
            test_label = layer2_label_data.get_testLabel()
            test_result_list = []
            i = 0
            for evaluation_metric_name in layer2_evaluation_metric_name:
                for model_type in layer2_modeltype:
                    test_result = eval_testset.eval_testset(self.__layer2_model_list[i],
                                                            list_TestData,test_label,
                                                            evaluation_metric_name)
                    test_result_list.append(test_result)
                    i += 1
            test_result = pd.concat(test_result_list,axis = 0,ignore_index=False)
            test_result = test_result.rename(columns = {self.__eval_name:'test_result'})
            # Select distinct rows.
            test_result['temp_name'] = test_result.index
            test_result = test_result.drop_duplicates(['temp_name'])
            test_result = test_result.drop(columns='temp_name')

            cv_test = pd.merge(cv_result,test_result,how='left',left_index=True,right_index=True)
            self.__num_folds = np.float64(self.__num_folds)
            cv_test['weighted_score'] = cv_test.cv_result * (self.__num_folds-1)/self.__num_folds + cv_test.test_result * (1/self.__num_folds)
        else:
            cv_test = cv_result
            cv_test['weighted_score'] = cv_result.cv_result

        # Restrict the candidates to the user-specified final model, if any.
        if self.__finalModel is None:
            final_cv_test = cv_test
        else:
            finalModel_names = [item for item in list(cv_test.index) if self.__finalModel in item]
            final_cv_test = cv_test.loc[finalModel_names]

        # Determine whether the current evaluation metric should be maximized or minimized.
        eval_info = defined_eval.definedEvaluation()
        is_max = eval_info.is_maximize(self.__eval_name)
        if is_max:
            best_model_name = final_cv_test.weighted_score.idxmax()
        else:
            best_model_name = final_cv_test.weighted_score.idxmin()
        # Find the best model object by name.
        all_model_name = [model.name for model in all_model]
        model_position = all_model_name.index(best_model_name)
        self.__best_model = all_model[model_position]
        self.__best_model_result = pd.DataFrame(cv_test.loc[self.__best_model.name])
        self.__all_model_result = cv_test
        # Find a model whose label matches the final label type.
        if self.__final_labelType == 'binary':
            model_has_finalLabel = [item for item in list(cv_test.index) if 'Logistic' in item]
            model_position = all_model_name.index(model_has_finalLabel[0])
            self.__model_has_finalLabel = all_model[model_position]
        elif self.__final_labelType == 'continuous':
            model_has_finalLabel = [item for item in list(cv_test.index) if 'Regression' in item]
            model_position = all_model_name.index(model_has_finalLabel[0])
            self.__model_has_finalLabel = all_model[model_position]
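
# A small illustration of the two substring filters above, using hypothetical
# model names in the naming pattern produced by train(): __finalModel keeps the
# names containing the requested model type, and the final-label lookup keeps a
# 'Logistic' model for a binary label or a 'Regression' model for a continuous one.
candidate_names = ['layer1_morgan_GbtreeLogistic_ROCAUC',
                   'layer2_GbtreeLogistic_ROCAUC',
                   'layer2_GblinearLogistic_ROCAUC']
print([name for name in candidate_names if 'GblinearLogistic' in name])
# ['layer2_GblinearLogistic_ROCAUC']
print([name for name in candidate_names if 'Logistic' in name][0])
# 'layer1_morgan_GbtreeLogistic_ROCAUC'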