Exemplo n.º 1
0
def train(max_depth, n_rounds):
    """Train XGBoost multiclass predictors and report the validation AMS.

    The data sets are extracted with ``tokenizer.extract_data``, predictors
    are trained on the train set with ``xgBoost.train_classifier``, the best
    global selection ratio is searched on the train2 set, and the AMS is
    computed on the validation set both with the fixed ratio 0.15 and with
    the ratio found on train2. The intermediate scores are printed.

    Args:
        max_depth: forwarded as the ``bst:max_depth`` booster parameter.
        n_rounds: forwarded as ``n_rounds`` in the classifier kwargs.

    Returns:
        The validation AMS (``hbc.AMS``) obtained with the fixed ratio 0.15.
    """
    ###############
    ### IMPORT ####
    ###############
    # Importation parameters:
    split = True
    normalize = True
    noise_var = 0.
    train_size = 200000
    train_size2 = 25000
    valid_size = 25000
    remove_999 = False

    # Import the training data:
    print("Extracting the data sets...")
    train_s, train2_s, valid_s, test_s = tokenizer.extract_data(
        split=split,
        normalize=normalize,
        remove_999=remove_999,
        noise_variance=noise_var,
        n_classes="multiclass",
        train_size=train_size,
        train_size2=train_size2,
        valid_size=valid_size)

    print("Training on the train set ...")

    # XGBOOST
    kwargs_xgb = {
        'bst_parameters': {
            'booster_type': 0,
            'objective': 'multi:softprob', 'num_class': 5,
            'bst:eta': 0.1,  # the bigger the more conservative
            'bst:subsample': 1,  # prevent over fitting if <1
            'bst:max_depth': max_depth, 'eval_metric': 'auc', 'silent': 1,
            'nthread': 8},
        'n_rounds': n_rounds}

    # NOTE(review): 550000 is passed as-is to the project helper; presumably
    # it is the size of the test set used to rescale the weights -- confirm
    # against xgBoost.train_classifier.
    predictor_s = xgBoost.train_classifier(train_s[1], train_s[2], train_s[3],
                                           550000, kwargs_xgb)

    # TEST / SUBMISSION (disabled; kept for reference, `test_s` is otherwise
    # unused):
    #
    # yProbaTest_s = []
    # yProbaTestBinary_s = []
    #
    # print "Classifying the test set..."
    # for i in range(8):
    #     yProbaTest = xgBoost.predict_proba(predictor_s[i], test_s[1][i])
    #     yProbaTest_s.append(yProbaTest)
    # print "Making the binary proba vector..."
    # for i in range(8):
    #     yProbaTestBinary_s.append(np.zeros(yProbaTest_s[i].shape[0]))
    # for i in range(8):
    #     for j in range(yProbaTest_s[i].shape[0]):
    #         yProbaTestBinary_s[i][j] = 1 - yProbaTest_s[i][j][0]
    #
    # print "Concatenating the vectors..."
    # yProbaTestBinary = preTreatment.concatenate_vectors(yProbaTestBinary_s)
    # IDs = preTreatment.concatenate_vectors(test_s[0])
    #
    # yProbaTestBinaryRanked = submission.rank_signals(yProbaTestBinary)
    # yPredictedTest = tresholding.get_yPredicted_ratio(yProbaTestBinary, 0.15)
    # s = submission.print_submission(IDs, yProbaTestBinaryRanked,
    #                                 yPredictedTest, "newAMSmesure")

    # TRAIN AND VALID
    # One predictor per data group: derive the group count from the
    # predictors instead of hard-coding 8.
    n_groups = len(predictor_s)

    print("Classifying the train2 set...")
    yProbaTrain2_s = [xgBoost.predict_proba(predictor_s[i], train2_s[1][i])
                      for i in range(n_groups)]
    print("Classifying the valid set...")
    yProbaValid_s = [xgBoost.predict_proba(predictor_s[i], valid_s[1][i])
                     for i in range(n_groups)]

    print("Making the binary proba vector...")
    # Binary score = 1 - probability of class 0 (column 0), computed on the
    # whole column at once; float64 matches the former np.zeros buffers.
    yProbaTrain2Binary_s = [1.0 - np.asarray(proba, dtype=np.float64)[:, 0]
                            for proba in yProbaTrain2_s]
    yProbaValidBinary_s = [1.0 - np.asarray(proba, dtype=np.float64)[:, 0]
                           for proba in yProbaValid_s]

    print("Concatenating the vectors...")
    yProbaTrain2Binary = preTreatment.concatenate_vectors(yProbaTrain2Binary_s)
    yProbaValidBinary = preTreatment.concatenate_vectors(yProbaValidBinary_s)
    yTrain2 = preTreatment.concatenate_vectors(train2_s[2])
    yValid = preTreatment.concatenate_vectors(valid_s[2])
    weightsTrain2 = preTreatment.concatenate_vectors(train2_s[3])
    weightsValid = preTreatment.concatenate_vectors(valid_s[3])

    print("Putting all the real labels to 1")
    yTrain2 = preTreatment.multiclass2binary(yTrain2)
    yValid = preTreatment.multiclass2binary(yValid)

    print("Getting the best ratios...")
    # A per-group search (tresholding.best_ratio_combinaison_global) used to
    # be tried here as well; only the global ratio is evaluated now.
    best_ams_train2_global, best_ratio_global = tresholding.best_ratio(
        yProbaTrain2Binary, yTrain2, weightsTrain2)

    yPredictedValid = tresholding.get_yPredicted_ratio(yProbaValidBinary, 0.15)
    yPredictedValid_best_ratio_global = tresholding.get_yPredicted_ratio(
        yProbaValidBinary, best_ratio_global)

    # Let's compute the predicted AMS
    s, b = submission.get_s_b(yPredictedValid, yValid, weightsValid)
    AMS = hbc.AMS(s, b)
    s_best_ratio_global, b_best_ratio_global = submission.get_s_b(
        yPredictedValid_best_ratio_global, yValid, weightsValid)
    AMS_best_ratio_global = hbc.AMS(s_best_ratio_global, b_best_ratio_global)

    print("AMS 0.15 = %f" % AMS)
    print(" ")
    print(" ")
    print("best AMS valid ratio global= %f" % AMS_best_ratio_global)
    print("best AMS train2 ratio global= %f" % best_ams_train2_global)
    print("best ratio global train2 = %f" % best_ratio_global)

    return AMS
Exemplo n.º 2
0
                                                                yTrain2,
                                                                weightsTrain2)


        print "Train2 - best ratio : %s - best ams : %f" \
                %(', '.join(map(str,best_ratio)), best_ams_train2)
        print(" ")


        print "Making predictions on the validation set..."
        # Prediction of the validation set 2:
        predProba_Valid2_s = xgBoost.predict_proba(predictor_s, valid_RM_s_2[1])

        # Thresholding the predictions:
        predProba_Valid2 = preTreatment.concatenate_vectors(predProba_Valid2_s)
        predLabel5_Valid2 = tresholding.get_yPredicted_ratio(predProba_Valid2,
                                                             best_ratio)

        # Binarize the prediction:
        predLabel_Valid2 = preTreatment.multiclass2binary(predLabel5_Valid2)

        # Concatenate data:
        yValid2 = preTreatment.concatenate_vectors(valid_RM_s_2[2])
        weightsValidation = preTreatment.concatenate_vectors(valid_RM_s_2[3])

        # Estimation the AMS:
        s, b = submission.get_s_b(predLabel_Valid2, yValid2, weightsValidation)
        s *= 250000/predLabel_Valid2.shape[0]
        b *= 250000/predLabel_Valid2.shape[0]
        ams = hbc.AMS(s,b)

        print "Valid_RM_2 - ratio : %f - best ams : %f" %(best_ratio, ams)
Exemplo n.º 3
0
def _binarize_labels_inplace(label_s):
    """Set every label >= 1 to 1, in place, when *label_s* is a list of arrays."""
    if isinstance(label_s, list):
        for labels in label_s:
            labels[labels >= 1] = 1


def _signal_proba(proba_s):
    """Return 1 - P(class 0) for one probability matrix or a list of them."""
    if isinstance(proba_s, list):
        return [1.0 - np.asarray(proba, dtype=np.float64)[:, 0]
                for proba in proba_s]
    return 1.0 - np.asarray(proba_s, dtype=np.float64)[:, 0]


def _concat_if_list(vector_s):
    """Concatenate *vector_s* when it is a list, otherwise return it as is."""
    if isinstance(vector_s, list):
        return preTreatment.concatenate_vectors(vector_s)
    return vector_s


def analyse(train_s, train2_s, valid_s, method_name, kwargs=None):
    """Train a classifier and evaluate three thresholding strategies.

    The classifier is trained on ``train_s``; a global threshold, a global
    ratio and (when the data is split into a list of groups) a per-group
    combination of ratios are tuned on ``train2_s``; the AMS and the
    classification error of each strategy are then computed on ``valid_s``.

    Args:
        train_s: training set for the classifier(s). Index 1 holds the
            features, index 2 the labels and index 3 the weights.
        train2_s: training set for the meta parameters (e.g. the best
            threshold), same layout as ``train_s``.
        valid_s: validation set, same layout as ``train_s``.
        method_name: string, name of the method module (e.g. "naiveBayes").
        kwargs: dictionary of the parameters of the method (default: empty).

    None of the sets may be empty.

    Side effects:
        When they are lists, the labels of ``train2_s`` and ``valid_s`` are
        binarized in place (every label >= 1 becomes 1) and their per-group
        weights are rescaled in place. When the weights are plain arrays they
        are rescaled in place as well.

    Returns:
        A dictionary with the classifier, the probability vectors, the
        predictions, the best threshold/ratio, and the AMS and classification
        scores. The ``*_combinaison`` entries are only present when
        ``train_s[2]`` is a list.
    """
    if kwargs is None:
        kwargs = {}

    print("------------------- Analyse: %s -----------------------"
          % method_name)

    # NOTE(review): eval() resolves the method module from its name; never
    # pass an untrusted string here. It is evaluated once and reused.
    method = eval(method_name)

    classifier_s = method.train_classifier(train_s[1], train_s[2], kwargs)

    # Prediction on the train2 and validation sets:
    yProbaTrain2_s = method.predict_proba(classifier_s, train2_s[1])
    yProbaValid_s = method.predict_proba(classifier_s, valid_s[1])

    # Convert the four 's' classes of the validation and train2 label
    # vectors into one single 's' class.
    _binarize_labels_inplace(valid_s[2])
    _binarize_labels_inplace(train2_s[2])

    # Vectors of probabilities of being 's' (1 - probability of class 0).
    yProbaTrain2Binary_s = _signal_proba(yProbaTrain2_s)
    yProbaValidBinary_s = _signal_proba(yProbaValid_s)

    # If we work with lists, let's get the concatenated vectors:
    # TRAIN SET
    weightsTrain_conca = _concat_if_list(train_s[3])
    # VALID SET
    yValid_conca = _concat_if_list(valid_s[2])
    weightsValid_conca = _concat_if_list(valid_s[3])
    yProbaValidBinary_conca = _concat_if_list(yProbaValidBinary_s)
    yProbaValid_conca = _concat_if_list(yProbaValid_s)
    # TRAIN2 SET
    yTrain2_conca = _concat_if_list(train2_s[2])
    weightsTrain2_conca = _concat_if_list(train2_s[3])
    yProbaTrain2Binary_conca = _concat_if_list(yProbaTrain2Binary_s)
    yProbaTrain2_conca = _concat_if_list(yProbaTrain2_s)

    # Let's rebalance the weights so their sum is equal to the total sum of
    # the three sets. The factors must be computed BEFORE any rescaling:
    # computing them afterwards yields 1 and leaves the groups untouched.
    sumWeightsTotal = (sum(weightsTrain_conca) + sum(weightsTrain2_conca)
                       + sum(weightsValid_conca))
    train2_factor = sumWeightsTotal / sum(weightsTrain2_conca)
    valid_factor = sumWeightsTotal / sum(weightsValid_conca)
    weightsTrain2_conca *= train2_factor
    weightsValid_conca *= valid_factor
    # The per-group weights only exist (and are only distinct from the
    # concatenated vector) when the sets are split into lists.
    # NOTE(review): assumes preTreatment.concatenate_vectors returns a copy,
    # not a view of the group arrays -- confirm.
    if isinstance(train2_s[3], list):
        for i in range(len(train2_s[3])):
            train2_s[3][i] *= train2_factor
    if isinstance(valid_s[3], list):
        for i in range(len(valid_s[3])):
            valid_s[3][i] *= valid_factor

    # Let's get the best global threshold on the train2 set
    AMS_treshold_train2, best_treshold_global = tresholding.best_treshold(
        yProbaTrain2Binary_conca, yTrain2_conca, weightsTrain2_conca)
    yPredictedValid_conca_treshold = tresholding.get_yPredicted_treshold(
        yProbaValidBinary_conca, best_treshold_global)

    # Let's get the best global ratio on the train2 set
    AMS_ratio_global_train2, best_ratio_global = tresholding.best_ratio(
        yProbaTrain2Binary_conca, yTrain2_conca, weightsTrain2_conca)
    yPredictedValid_conca_ratio_global = tresholding.get_yPredicted_ratio(
        yProbaValidBinary_conca, best_ratio_global)

    # Let's get the best combination of ratios (split data only)
    by_group = isinstance(train_s[2], list)
    if by_group:
        AMS_ratio_combinaison_train2, best_ratio_combinaison = \
            tresholding.best_ratio_combinaison_global(yProbaTrain2Binary_s,
                                                      train2_s[2],
                                                      train2_s[3],
                                                      30)
        _, yPredictedValid_conca_ratio_combinaison = \
            tresholding.get_yPredicted_ratio_8(yProbaValidBinary_s,
                                               best_ratio_combinaison)

    # Let's compute the final s and b for each method
    s_treshold, b_treshold = submission.get_s_b(
        yPredictedValid_conca_treshold, yValid_conca, weightsValid_conca)
    s_ratio_global, b_ratio_global = submission.get_s_b(
        yPredictedValid_conca_ratio_global, yValid_conca, weightsValid_conca)
    if by_group:
        s_ratio_combinaison, b_ratio_combinaison = submission.get_s_b(
            yPredictedValid_conca_ratio_combinaison,
            yValid_conca,
            weightsValid_conca)

    # AMS final:
    AMS_treshold_valid = hbc.AMS(s_treshold, b_treshold)
    AMS_ratio_global_valid = hbc.AMS(s_ratio_global, b_ratio_global)
    if by_group:
        AMS_ratio_combinaison_valid = hbc.AMS(s_ratio_combinaison,
                                              b_ratio_combinaison)

    # Classification error:
    classif_succ_treshold = method.get_classification_error(
        yPredictedValid_conca_treshold, yValid_conca, normalize=True)
    classif_succ_ratio_global = method.get_classification_error(
        yPredictedValid_conca_ratio_global, yValid_conca, normalize=True)
    # The combined-ratio prediction only exists for split data; evaluating it
    # unconditionally raised a NameError otherwise.
    if by_group:
        classif_succ_ratio_combinaison = method.get_classification_error(
            yPredictedValid_conca_ratio_combinaison,
            yValid_conca,
            normalize=True)

    d = {'classifier_s': classifier_s,
         'yPredictedValid_conca_treshold': yPredictedValid_conca_treshold,
         'yPredictedValid_conca_ratio_global':
             yPredictedValid_conca_ratio_global,
         'yProbaTrain2_s': yProbaTrain2_s,
         'yProbaTrain2Binary_s': yProbaTrain2Binary_s,
         'yProbaTrain2_conca': yProbaTrain2_conca,
         'yProbaTrain2Binary_conca': yProbaTrain2Binary_conca,
         'yProbaValid_s': yProbaValid_s,
         'yProbaValidBinary_s': yProbaValidBinary_s,
         'yProbaValid_conca': yProbaValid_conca,
         'yProbaValidBinary_conca': yProbaValidBinary_conca,
         'AMS_treshold_train2': AMS_treshold_train2,
         'AMS_ratio_global_train2': AMS_ratio_global_train2,
         'AMS_treshold_valid': AMS_treshold_valid,
         'AMS_ratio_global_valid': AMS_ratio_global_valid,
         'best_treshold_global': best_treshold_global,
         'best_ratio_global': best_ratio_global,
         'classif_succ_treshold': classif_succ_treshold,
         'classif_succ_ratio_global': classif_succ_ratio_global,
         'method': method_name,
         'parameters': kwargs}

    if by_group:
        d['yPredictedValid_conca_ratio_combinaison'] = \
            yPredictedValid_conca_ratio_combinaison
        d['AMS_ratio_combinaison_train2'] = AMS_ratio_combinaison_train2
        # Stored as plain values: stray trailing commas used to wrap these
        # two entries in 1-tuples.
        d['AMS_ratio_combinaison_valid'] = AMS_ratio_combinaison_valid
        d['best_ratio_combinaison'] = best_ratio_combinaison
        d['classif_succ_ratio_combinaison'] = classif_succ_ratio_combinaison

    return d