Exemplo n.º 1
0
def best_ratio(yProba, yValidation, weightsValidation, pas = 0.01):
    """
    Returns the best AMS and the ratio that reaches it.

    Every ratio of np.arange(0., 0.99, pas) is tried in turn: the predictions
    are built with get_yPredicted_ratio(), s and b are computed with
    submission.get_s_b() and the score with hbc.AMS().

    yProba : vector of the probabilities computed with a classifier
    yValidation : vector of the true labels of the data
    weightsValidation : vector of the weights
    pas : size of the interval between two ratios tested

    Returns the tuple (best_ams, best_ratio). When several ratios reach the
    same AMS the largest one is kept. If no ratio gives a usable score,
    (0., 0.) is returned.
    """
    best_ams = 0.
    # Explicit default so that the return statement cannot hit an unbound
    # local when every ratio is rejected.
    best_ratio_found = 0.

    for ratio in np.arange(0., 0.99, pas):
        yPredicted = get_yPredicted_ratio(yProba, ratio)
        s, b = submission.get_s_b(yPredicted, yValidation, weightsValidation)

        # A negative (or NaN) s or b cannot be scored: warn and really skip
        # the ratio instead of computing the AMS anyway.
        if not (b >= 0. and s >= 0.):
            if b < 0.:
                print("WARNING: For a ratio of %f, b < 0 (b= %f)." % (ratio, b))
            else:
                print("WARNING: For a ratio of %f, s < 0 (s= %f)." % (ratio, s))
            print("This ratio has been ignored.")
            continue

        ams = hbc.AMS(s, b)
        if ams >= best_ams:
            best_ratio_found = ratio
            best_ams = ams

    return best_ams, best_ratio_found
Exemplo n.º 2
0
def best_treshold(yProba, yValidation, weightsValidation, pas = 0.01):
    """
    Returns the best AMS and the treshold that maximises it for the vectors
    of proba given.

    yProba : vectors of the probabilities computed with a classifier
    yValidation : vectors of the true label of the data
    weightsValidation : vectors of the weights
    pas : size of the interval between two tresholds tested
    The weights must be balanced !

    Returns the tuple (best_ams, best_treshold). When several tresholds reach
    the same AMS the largest one is kept. If no treshold gives an AMS >= 0,
    (0., 0.) is returned.

    Raises ValueError if the thresholded predictions are 2-dimensional with a
    number of columns different from 5.
    """
    best_ams = 0.
    # Explicit default so that the return statement cannot hit an unbound
    # local when no treshold is selected.
    best_treshold_found = 0.

    for treshold in np.arange(0., 1.0, pas):
        yPredicted_prov = get_yPredicted_treshold(yProba, treshold)

        # if we work with multi-class:
        if len(yPredicted_prov.shape) == 2:
            if yPredicted_prov.shape[1] != 5:
                # Going on would use an undefined (or stale) prediction
                # vector, so fail with the explicit message instead.
                raise ValueError("Error: in best_treshold() the shape of "
                                 "the input isn't correct")
            # Reduce multiclass to binary: a row is set to 0 when its fifth
            # column is 0, and to 1 otherwise.
            yPredicted = np.ones(yPredicted_prov.shape[0])
            yPredicted[yPredicted_prov[:, 4] == 0] = 0
        else:
            yPredicted = yPredicted_prov

        s, b = submission.get_s_b(yPredicted, yValidation, weightsValidation)
        ams = hbc.AMS(s, b)
        if ams >= best_ams:
            best_treshold_found = treshold
            best_ams = ams

    return best_ams, best_treshold_found
Exemplo n.º 3
0
def evaluate_AMS(final_prediction, valid_s):
    """
    Computes and prints the expected AMS of a final prediction.

    final_prediction : either a sequence of per-group entries (in which case
        final_prediction[2] is a list and the third element of every entry is
        used as the predicted vector of the group), or a single entry whose
        third element is the predicted vector
    valid_s : validation set; valid_s[2] and valid_s[3] are passed to
        submission.get_s_b() together with the predictions

    Returns (final_s, final_b, AMS, AMS_s) where final_s and final_b are the
    rescaled s and b, AMS the global score and AMS_s the list of the scores
    of each group (or AMS itself when there is no group).
    """
    # Get s and b for each group (s_s, b_s) and the final final_s and
    # final_b:
    per_group = isinstance(final_prediction[2], list)

    if per_group:
        # Third element of every entry (same as transposing and taking row 2)
        y_predicted_s = [entry[2] for entry in final_prediction]
        final_s, final_b, s_s, b_s = submission.get_s_b(y_predicted_s,
                                                        valid_s[2],
                                                        valid_s[3])
    else:
        y_predicted_s = final_prediction[2]
        final_s, final_b = submission.get_s_b(y_predicted_s, valid_s[2],
                                              valid_s[3])

    # Balance the s and b
    final_s *= 250000. / 25000
    final_b *= 250000. / 25000

    # AMS final:
    AMS = hbc.AMS(final_s, final_b)
    print("Expected AMS score for the 'on-top' classifier : %f" % AMS)

    # AMS by group
    if per_group:
        AMS_s = []
        for i, (s, b) in enumerate(zip(s_s, b_s)):
            # Float division: an integer division would truncate the scale
            # factor whenever the group size does not divide 250000.
            scale = 250000. / y_predicted_s[i].shape[0]
            score = hbc.AMS(s * scale, b * scale)
            AMS_s.append(score)
            print("Expected AMS score for the 'on-top' classifer: group %i : %f"
                  % (i, score))

            print(" ")

    else:
        AMS_s = AMS

    return final_s, final_b, AMS, AMS_s
Exemplo n.º 4
0
def best_ratio_combinaison(yProba_s, yValidation_s, weightsValidation_s, ratio_s):
    """
    Exhaustive search of the best combination of ratios, one ratio per group.

    yProba_s : list of the probability vectors of each group. When the first
        one is a 2-D array with 5 columns, every element of the list is
        replaced in place by preTreatment.multiclass2binary() of itself
    yValidation_s : list of the true labels of each group
    weightsValidation_s : list of the weights of each group
    ratio_s : list of the lists of the ratios to test for each group (the
        first 8 lists are used). The size of each list should not exceed 4
        for computational time issues

    Returns (AMS_max, best_ratio_comb): the highest AMS found and the list of
    the 8 ratios reaching it ([0.]*8 and 0. when no combination scores
    above 0).
    """
    # Example of grids that can be given, one line per group:
    #   [0.06, 0.08, 0.10, 0.12], [0.15, 0.16, 0.17, 0.18],
    #   [0.36, 0.38, 0.40, 0.42], [0.16, 0.18, 0.2, 0.22],
    #   [0.007, 0.008, 0.009, 0.01], [0.003, 0.004, 0.005, 0.006],
    #   [0.003, 0.004, 0.005, 0.006], [0.007, 0.008, 0.009, 0.01]
    candidate_s = itertools.product(*[ratio_s[group] for group in range(8)])

    # if we work with multi-class, reduce every group to binary first
    first_shape = yProba_s[0].shape
    if len(first_shape) == 2 and first_shape[1] == 5:
        for index in range(len(yProba_s)):
            yProba_s[index] = preTreatment.multiclass2binary(yProba_s[index])

    top_score = 0.
    top_combination = [0.] * 8

    for candidate in candidate_s:
        ratios = list(candidate)

        yPredicted_s, _ = get_yPredicted_ratio_8(yProba_s, ratios)
        finals, finalb, _, _ = submission.get_s_b(yPredicted_s,
                                                  yValidation_s,
                                                  weightsValidation_s)

        score = hbc.AMS(finals, finalb)
        # Strict comparison: the first combination reaching the maximum wins
        if score > top_score:
            top_score, top_combination = score, ratios

    return top_score, top_combination
Exemplo n.º 5
0
def _treshold_scores(yProba_s, yPredicted_s, valid_s, yValid_conca,
                     weights_conca):
    """
    Thresholds one set of per-group probabilities globally and group by group.

    yProba_s : list of the 8 probability vectors (one per group)
    yPredicted_s : list of the 8 predicted vectors; each entry is replaced in
        place by the prediction obtained with the best treshold of its group
    valid_s : validation set (valid_s[2] = labels, valid_s[3] = weights)
    yValid_conca, weights_conca : concatenated labels and weights

    Returns (global_ams, global_treshold, group_s) where group_s is the list
    of the (ams, treshold) couples of each group.

    NOTE(review): assumes tresholding.best_treshold() returns a single
    treshold value here - confirm against its definition.
    """
    # Best treshold global
    yProba_conca = preTreatment.concatenate_vectors(yProba_s)
    global_treshold = tresholding.best_treshold(yProba_conca, yValid_conca,
                                                weights_conca)
    yPredicted_treshold = tresholding.get_yPredicted_treshold(yProba_conca,
                                                              global_treshold)
    s, b = submission.get_s_b(yPredicted_treshold, yValid_conca, weights_conca)
    global_ams = hbc.AMS(s * 10, b * 10)

    # Best treshold group by group
    group_s = []
    for i in range(8):
        treshold = tresholding.best_treshold(yProba_s[i], valid_s[2][i],
                                             valid_s[3][i])
        yPredicted_s[i] = tresholding.get_yPredicted_treshold(yProba_s[i],
                                                              treshold)
        s, b = submission.get_s_b(yPredicted_s[i], valid_s[2][i],
                                  valid_s[3][i])
        # Float division: an integer division would truncate the factor.
        scale = 250000. / yPredicted_s[i].shape[0]
        group_s.append((hbc.AMS(s * scale, b * scale), treshold))

    return global_ams, global_treshold, group_s


def main():
    """
    Trains every method, combines them and looks for the best tresholds.

    Steps: extraction of the data sets, pre-treatment statistics, analysis of
    each method (results stored in a dict of dict), selection of the best
    combination of classifiers, then thresholding of the combined classifier
    and of each method, globally and group by group. The best scores are
    printed.

    Returns (best_yPredicted_s, valid_s): the list of the best predicted
    vector of each of the 8 groups and the validation set.
    """
    ###############
    ### IMPORT ####
    ###############
    # Importation parameters:
    split = True
    normalize = True
    noise_var = 0.
    ratio_train = 0.9

    # Import the training data:
    print("Extracting the data sets...")
    start = time.clock()
    train_s, valid_s, test_s = tokenizer.extract_data(
                                                split= split,
                                                normalize= normalize,
                                                noise_variance= noise_var,
                                                ratio_train= ratio_train)

    yValid_conca = preTreatment.concatenate_vectors(valid_s[2])
    weights_conca = preTreatment.concatenate_vectors(valid_s[3])

    stop = time.clock()
    print("Extraction time: %i s" % (stop - start))

    print(" ")
    print(" ")

    # Create the elected vectors for each group (best AMS score)
    best_yPredicted_s = [np.zeros(valid_s[2][i].shape[0]) for i in range(8)]
    best_yProba_s = [np.zeros(valid_s[2][i].shape[0]) for i in range(8)]
    best_AMS_s = [0.] * 8
    best_method_s = [0] * 8
    best_ratio_s = [0] * 8
    best_AMS_1_method = 0.
    best_method = "methode"
    # Numeric default: it is printed with %f even when no method beats 0.
    best_ratio = 0.

    ######################
    ### PRE-TREATMENT ####
    ######################
    print("------------------------- Pre-treatment --------------------------")
    ### Average number of signal per subset:
    print("Train subsets signal average:")
    preTreatment.ratio_sig_per_dataset(train_s[2])
    print(" ")
    print("Valid subsets signal average:")
    preTreatment.ratio_sig_per_dataset(valid_s[2])

    print(" ")
    print(" ")

    ############
    # ANALYSES #
    ############

    # Dictionnary that will contain all the data for each methods. In the end
    # we'll have a dict of dict
    # Keys of the methods : {naiveBayes, svm, kNeighbors, lda, qda, adaBoost,
    #                       randomForest, gradientBoosting}
    dMethods = {}

    # NAIVE BAYES (analysed once: running it twice only repeated the work)
    kwargs_bayes = {}
    dMethods['naiveBayes'] = analyse.analyse(train_s, valid_s, 'naiveBayes',
                                             kwargs_bayes)

    # SVM
    kwargs_svm = {}
    dMethods['svm'] = analyse.analyse(train_s, valid_s, 'svm', kwargs_svm)

    # K NEIGHBORS
    kwargs_tuning_kn = {'n_neighbors': [20, 50]}
    dTuning = tuningModel.parameters_grid_search(train_s, valid_s,
                                                 'kNeighbors',
                                                 kwargs_tuning_kn)
    dMethods['kNeighbors'] = combineClassifiers.select_best_classifiers(
                                                                dTuning,
                                                                valid_s)

    # LDA
    kwargs_lda = {}
    dMethods['lda'] = analyse.analyse(train_s, valid_s, 'lda', kwargs_lda)

    # QDA
    kwargs_qda = {}
    dMethods['qda'] = analyse.analyse(train_s, valid_s, 'qda', kwargs_qda)

    # ADABOOST
    kwargs_ada = {'n_estimators': 50,
                  'learning_rate': 1.,
                  'algorithm': 'SAMME.R',
                  'random_state': None}
    dMethods['adaBoost'] = analyse.analyse(train_s, valid_s, 'adaBoost',
                                           kwargs_ada)

    # RANDOM FOREST:
    kwargs_tuning_rdf = {'n_estimators': [10, 50, 100]}
    dTuning = tuningModel.parameters_grid_search(train_s, valid_s,
                                                 'randomForest',
                                                 kwargs_tuning_rdf)
    dMethods['randomForest'] = combineClassifiers.select_best_classifiers(
                                                                dTuning,
                                                                valid_s)

    # GRADIENT BOOSTING
    # NOTE(review): the result of this first analysis is overwritten by the
    # grid search just below - confirm whether this run is still wanted.
    kwargs_gradB = {}
    dMethods['gradientBoosting'] = analyse.analyse(train_s, valid_s,
                                                   'gradientBoosting',
                                                   kwargs_gradB)

    kwargs_tuning_gradB = {'loss': ['deviance'], 'learning_rate': [0.1],
                           'n_estimators': [100], 'subsample': [1.0],
                           'min_samples_split': [2], 'min_samples_leaf': [1],
                           'max_depth': [10], 'init': [None],
                           'random_state': [None], 'max_features': [None],
                           'verbose': [0]}
    dTuning = tuningModel.parameters_grid_search(train_s, valid_s,
                                                 'gradientBoosting',
                                                 kwargs_tuning_gradB)
    dMethods['gradientBoosting'] = combineClassifiers.select_best_classifiers(
                                                                dTuning,
                                                                valid_s)

    print(" ")

    ##################
    # POST-TREATMENT #
    ##################
    print("-------------------- Best overall combination --------------------")

    dCombine = combineClassifiers.select_best_classifiers(dMethods, valid_s)

    print("-------------------------- Thresholding --------------------------")

    # The report file is created (truncated) but nothing is written to it
    # here; close it right away so that the handle is not leaked.
    open("Tests/test_treshold_combined.txt", "w").close()

    # Candidates, in evaluation order: the combined classifiers first, then
    # each method. Each entry is (results, label used for the global score,
    # labels used for each group).
    candidate_s = [(dCombine, str(dCombine['method']), dCombine['method'])]
    for method in dMethods:
        candidate_s.append((dMethods[method], str(method), [str(method)] * 8))

    for result, global_label, group_label_s in candidate_s:
        yProba_s = result['yProba_s']
        yPredicted_s = result['yPredicted_s']

        global_ams, global_treshold, group_s = _treshold_scores(
                                                            yProba_s,
                                                            yPredicted_s,
                                                            valid_s,
                                                            yValid_conca,
                                                            weights_conca)

        # Best treshold global
        if global_ams > best_AMS_1_method:
            best_AMS_1_method = global_ams
            best_method = global_label
            best_ratio = global_treshold

        # Best treshold group by group
        for i, (ams, treshold) in enumerate(group_s):
            if ams > best_AMS_s[i]:
                best_yPredicted_s[i] = yPredicted_s[i]
                best_yProba_s[i] = yProba_s[i]
                best_AMS_s[i] = ams
                best_method_s[i] = group_label_s[i]
                best_ratio_s[i] = treshold

    # Let's concatenate the 8 vectors which performs the best on each on
    # each of the sub group and tresholding it
    # NOTE(review): the treshold search below runs on the concatenated
    # predicted labels while best_yProba_s is collected but never used -
    # confirm whether the probabilities were intended here.
    best_yPredicted_conca = preTreatment.concatenate_vectors(best_yPredicted_s)
    best_treshold_conca = tresholding.best_treshold(best_yPredicted_conca,
                                                    yValid_conca,
                                                    weights_conca)
    best_yPredicted_conca_treshold = tresholding.get_yPredicted_treshold(
                                                        best_yPredicted_conca,
                                                        best_treshold_conca)

    best_final_s, best_final_b, best_s_s, best_b_s = submission.get_s_b_8(
                                                            best_yPredicted_s,
                                                            valid_s[2],
                                                            valid_s[3])
    best_s_treshold, best_b_treshold = submission.get_s_b(
                                                best_yPredicted_conca_treshold,
                                                yValid_conca,
                                                weights_conca)

    best_final_s *= 10
    best_final_b *= 10
    best_s_treshold *= 10
    best_b_treshold *= 10
    best_AMS = hbc.AMS(best_final_s, best_final_b)
    best_AMS_treshold = hbc.AMS(best_s_treshold, best_b_treshold)

    print("Best AMS using one of the methods : %f" % best_AMS_1_method)
    # Report the method that actually reached the best score, not the last
    # one visited by the loop.
    print("    method : %s" % best_method)
    print("    ratio : %f" % best_ratio)
    print(" ")
    print("Best AMS final : %f" % best_AMS)
    print("Best AMS final after final tresholding : %f" % best_AMS_treshold)
    print("best ratio on the concatenated vector : %f" % best_treshold_conca)
    print(" ")

    for n in range(8):
        print("Best AMS group %i: %f - method %s - ratio %f"
              % (n, best_AMS_s[n], best_method_s[n], best_ratio_s[n]))

    return best_yPredicted_s, valid_s
Exemplo n.º 6
0
        predProba_Valid2_s = xgBoost.predict_proba(predictor_s, valid_RM_s_2[1])

        # Thresholding the predictions:
        predProba_Valid2 = preTreatment.concatenate_vectors(predProba_Valid2_s)
        predLabel5_Valid2 = tresholding.get_yPredicted_ratio(predProba_Valid2,
                                                             best_ratio)

        # Binarize the prediction:
        predLabel_Valid2 = preTreatment.multiclass2binary(predLabel5_Valid2)

        # Concatenate data:
        yValid2 = preTreatment.concatenate_vectors(valid_RM_s_2[2])
        weightsValidation = preTreatment.concatenate_vectors(valid_RM_s_2[3])

        # Estimation the AMS:
        s, b = submission.get_s_b(predLabel_Valid2, yValid2, weightsValidation)
        s *= 250000/predLabel_Valid2.shape[0]
        b *= 250000/predLabel_Valid2.shape[0]
        ams = hbc.AMS(s,b)

        print "Valid_RM_2 - ratio : %f - best ams : %f" %(best_ratio, ams)
        print(" ")

        # Saving the model if it's better:
        if ams > best_ams:
            best_ams = ams
            best_imp_lim = importance_lim
            best_best_ratio = best_ratio
            best_n_removeFeatures = n_removeFeatures
            best_predictor_s = predictor_s
Exemplo n.º 7
0
def train(max_depth, n_rounds):
    """
    Trains the XGBoost classifiers and estimates the AMS on the valid set.

    The data are extracted in multiclass mode, the classifiers are trained on
    the train set, then the train2 and valid sets are classified. The best
    global ratio is searched on the train2 set and applied to the valid set;
    the AMS obtained with the fixed ratio 0.15 is computed as well. The
    scores are printed.

    max_depth : value given to the 'bst:max_depth' booster parameter
    n_rounds : value given to the 'n_rounds' entry of the xgBoost parameters

    Returns the AMS obtained on the valid set with the ratio 0.15.
    """
    ###############
    ### IMPORT ####
    ###############
    # Importation parameters:
    extraction_kwargs = {'split': True,
                         'normalize': True,
                         'remove_999': False,
                         'noise_variance': 0.,
                         'n_classes': "multiclass",
                         'train_size': 200000,
                         'train_size2': 25000,
                         'valid_size': 25000}

    # Import the training data:
    print("Extracting the data sets...")
    start = time.clock()
    train_s, train2_s, valid_s, test_s = tokenizer.extract_data(
                                                        **extraction_kwargs)

    # XGBOOST (the random forest alternative is disabled)
    print("Training on the train set ...")
    bst_parameters = {'booster_type': 0,
                      'objective': 'multi:softprob',
                      'num_class': 5,
                      'bst:eta': 0.1, # the bigger the more conservative
                      'bst:subsample': 1, # prevent over fitting if <1
                      'bst:max_depth': max_depth,
                      'eval_metric': 'auc',
                      'silent': 1,
                      'nthread': 8}
    kwargs_xgb = {'bst_parameters': bst_parameters, 'n_rounds': n_rounds}

    predictor_s = xgBoost.train_classifier(train_s[1], train_s[2], train_s[3],
                                           550000, kwargs_xgb)

    # The test / submission step (classification of the test set, ranking of
    # the signals and writing of the submission file) is disabled here.

    # TRAIN AND VALID
    print("Classifying the train2 set...")
    yProbaTrain2_s = [xgBoost.predict_proba(predictor_s[group],
                                            train2_s[1][group])
                      for group in range(8)]
    print("Classifying the valid set...")
    yProbaValid_s = [xgBoost.predict_proba(predictor_s[group],
                                           valid_s[1][group])
                     for group in range(8)]

    # Binary probability of each event: 1 - (first column of the
    # multiclass probabilities), stored as a float vector.
    print("Making the binary proba vector...")
    yProbaTrain2Binary_s = [(1 - proba[:, 0]).astype(float)
                            for proba in yProbaTrain2_s]
    yProbaValidBinary_s = [(1 - proba[:, 0]).astype(float)
                           for proba in yProbaValid_s]

    print("Concatenating the vectors...")
    concatenate = preTreatment.concatenate_vectors
    yProbaTrain2Binary = concatenate(yProbaTrain2Binary_s)
    yProbaValidBinary = concatenate(yProbaValidBinary_s)
    yTrain2 = concatenate(train2_s[2])
    yValid = concatenate(valid_s[2])
    weightsTrain2 = concatenate(train2_s[3])
    weightsValid = concatenate(valid_s[3])

    print("Putting all the real labels to 1")
    yTrain2 = preTreatment.multiclass2binary(yTrain2)
    yValid = preTreatment.multiclass2binary(yValid)

    # Only the global ratio is searched; the search of a combination of
    # ratios (one per group) is disabled.
    print("Getting the best ratios...")
    best_ams_train2_global, best_ratio_global = tresholding.best_ratio(
                                                        yProbaTrain2Binary,
                                                        yTrain2,
                                                        weightsTrain2)

    yPredictedValid = tresholding.get_yPredicted_ratio(yProbaValidBinary,
                                                       0.15)
    yPredictedValid_best_ratio_global = tresholding.get_yPredicted_ratio(
                                                        yProbaValidBinary,
                                                        best_ratio_global)

    # Let's compute the predicted AMS
    s, b = submission.get_s_b(yPredictedValid, yValid, weightsValid)
    AMS = hbc.AMS(s, b)
    s_best_ratio_global, b_best_ratio_global = submission.get_s_b(
                                            yPredictedValid_best_ratio_global,
                                            yValid,
                                            weightsValid)
    AMS_best_ratio_global = hbc.AMS(s_best_ratio_global, b_best_ratio_global)

    print("AMS 0.15 = %f" % AMS)
    print(" ")
    print(" ")
    print("best AMS valid ratio global= %f" % AMS_best_ratio_global)
    print("best AMS train2 ratio global= %f" % best_ams_train2_global)
    print("best ratio global train2 = %f" % best_ratio_global)

    return AMS
Exemplo n.º 8
0
def _binarize_labels(label_s):
    """Sets to 1, in place, every label >= 1 of each vector of label_s."""
    for labels in label_s:
        labels[labels >= 1] = 1


def _signal_proba(yProba):
    """Returns 1 - (first column of yProba) as a float vector."""
    return (1 - yProba[:, 0]).astype(float)


def _concatenate_if_list(vector_s):
    """Concatenates a list of vectors; anything else is returned as is."""
    if isinstance(vector_s, list):
        return preTreatment.concatenate_vectors(vector_s)
    return vector_s


def analyse(train_s, train2_s, valid_s, method_name, kwargs=None):
    """
    Trains a method and evaluates it with several thresholding strategies.

    methode name = string, name of the method (eg :"naiveBayes")
    kwargs = dictionnary of the parameters of the method (default: empty)
    train_s = training set for the classifier(s)
    train2_s = training set for the meta parameters (eg : the best treshold)
    valid_s : validation set
    None of the set must be empty !

    The best global treshold, the best global ratio and (when train_s[2] is
    a list) the best combination of ratios are searched on the train2 set
    with rebalanced weights, then applied to the validation set.

    Side effect: when they are lists, the label vectors valid_s[2] and
    train2_s[2] are reduced in place to binary labels (every label >= 1
    becomes 1). The weights of the caller are not modified.

    Returns a dictionary with the classifier(s), the probabilities, the
    predictions, the AMS scores, the best treshold / ratio(s), the
    classification scores, the method name and its parameters. The
    '*_ratio_combinaison*' keys are only present when train_s[2] is a list.
    """
    # A fresh dict per call: a mutable default would be shared between calls
    if kwargs is None:
        kwargs = {}

    # Prediction on the validation set:
    print("------------------- Analyse: %s -----------------------"
          % method_name)

    # NOTE(review): eval() executes method_name as code to find the module of
    # the method; method_name must never come from an untrusted source.
    method_module = eval(method_name)

    classifier_s = method_module.train_classifier(train_s[1], train_s[2],
                                                  kwargs)

    yProbaTrain2_s = method_module.predict_proba(classifier_s, train2_s[1])
    yProbaValid_s = method_module.predict_proba(classifier_s, valid_s[1])

    # Convert the four 's' classes of the validation and train2 vectors into
    # one single 's' class
    if isinstance(valid_s[2], list):
        _binarize_labels(valid_s[2])
    if isinstance(train2_s[2], list):
        _binarize_labels(train2_s[2])

    # Let's define the vectors of probabilities of being 's'
    # Train2 set
    if isinstance(yProbaTrain2_s, list):
        yProbaTrain2Binary_s = [_signal_proba(p) for p in yProbaTrain2_s]
    else:
        yProbaTrain2Binary_s = _signal_proba(yProbaTrain2_s)
    # Validation set
    if isinstance(yProbaValid_s, list):
        yProbaValidBinary_s = [_signal_proba(p) for p in yProbaValid_s]
    else:
        yProbaValidBinary_s = _signal_proba(yProbaValid_s)

    # If we work with lists, let's get the concatenated vectors:
    # TRAIN SET
    weightsTrain_conca = _concatenate_if_list(train_s[3])
    # VALID SET
    yValid_conca = _concatenate_if_list(valid_s[2])
    weightsValid_conca = _concatenate_if_list(valid_s[3])
    yProbaValidBinary_conca = _concatenate_if_list(yProbaValidBinary_s)
    yProbaValid_conca = _concatenate_if_list(yProbaValid_s)
    # TRAIN2 SET
    yTrain2_conca = _concatenate_if_list(train2_s[2])
    weightsTrain2_conca = _concatenate_if_list(train2_s[3])
    yProbaTrain2Binary_conca = _concatenate_if_list(yProbaTrain2Binary_s)
    yProbaTrain2_conca = _concatenate_if_list(yProbaTrain2_s)

    # Let's rebalance the weight so their sum is equal to the total sum
    # of the train set. The factors are computed BEFORE any scaling
    # (computed afterwards they are always 1) and applied to copies, so that
    # calling analyse() several times on the same sets does not compound
    # the scaling.
    sumWeightsTrain2 = sum(weightsTrain2_conca)
    sumWeightsValid = sum(weightsValid_conca)
    sumWeightsTotal = sum(weightsTrain_conca) + sumWeightsTrain2 \
                      + sumWeightsValid
    factorTrain2 = sumWeightsTotal / sumWeightsTrain2
    factorValid = sumWeightsTotal / sumWeightsValid
    weightsTrain2_conca = weightsTrain2_conca * factorTrain2
    weightsValid_conca = weightsValid_conca * factorValid

    # Let's get the best global treshold on the train2 set
    AMS_treshold_train2, best_treshold_global = tresholding.best_treshold(
                                                    yProbaTrain2Binary_conca,
                                                    yTrain2_conca,
                                                    weightsTrain2_conca)

    yPredictedValid_conca_treshold = tresholding.get_yPredicted_treshold(
                                                    yProbaValidBinary_conca,
                                                    best_treshold_global)

    # Let's get the best ratio treshold on the train2 set
    AMS_ratio_global_train2, best_ratio_global = tresholding.best_ratio(
                                                    yProbaTrain2Binary_conca,
                                                    yTrain2_conca,
                                                    weightsTrain2_conca)

    yPredictedValid_conca_ratio_global = tresholding.get_yPredicted_ratio(
                                                    yProbaValidBinary_conca,
                                                    best_ratio_global)

    # Let's compute the final s and b, the final AMS and the classification
    # score of the two global strategies
    s_treshold, b_treshold = submission.get_s_b(
                                            yPredictedValid_conca_treshold,
                                            yValid_conca,
                                            weightsValid_conca)
    s_ratio_global, b_ratio_global = submission.get_s_b(
                                            yPredictedValid_conca_ratio_global,
                                            yValid_conca,
                                            weightsValid_conca)

    AMS_treshold_valid = hbc.AMS(s_treshold, b_treshold)
    AMS_ratio_global_valid = hbc.AMS(s_ratio_global, b_ratio_global)

    classif_succ_treshold = method_module.get_classification_error(
                                            yPredictedValid_conca_treshold,
                                            yValid_conca,
                                            normalize= True)

    classif_succ_ratio_global = method_module.get_classification_error(
                                            yPredictedValid_conca_ratio_global,
                                            yValid_conca,
                                            normalize= True)

    d = {'classifier_s': classifier_s,
         'yPredictedValid_conca_treshold': yPredictedValid_conca_treshold,
         'yPredictedValid_conca_ratio_global':
                 yPredictedValid_conca_ratio_global,
         'yProbaTrain2_s': yProbaTrain2_s,
         'yProbaTrain2Binary_s': yProbaTrain2Binary_s,
         'yProbaTrain2_conca': yProbaTrain2_conca,
         'yProbaTrain2Binary_conca': yProbaTrain2Binary_conca,
         'yProbaValid_s': yProbaValid_s,
         'yProbaValidBinary_s': yProbaValidBinary_s,
         'yProbaValid_conca': yProbaValid_conca,
         'yProbaValidBinary_conca': yProbaValidBinary_conca,
         'AMS_treshold_train2': AMS_treshold_train2,
         'AMS_ratio_global_train2': AMS_ratio_global_train2,
         'AMS_treshold_valid': AMS_treshold_valid,
         'AMS_ratio_global_valid': AMS_ratio_global_valid,
         'best_treshold_global': best_treshold_global,
         'best_ratio_global': best_ratio_global,
         'classif_succ_treshold': classif_succ_treshold,
         'classif_succ_ratio_global': classif_succ_ratio_global,
         'method': method_name,
         'parameters': kwargs}

    # Let's get the best ratios combinaison (only when working with groups;
    # everything that depends on it stays inside this branch)
    if isinstance(train_s[2], list):
        # Rebalanced copies of the weights of each train2 group
        weightsTrain2_s = [weights * factorTrain2 for weights in train2_s[3]]

        AMS_ratio_combinaison_train2, best_ratio_combinaison = tresholding.\
                                best_ratio_combinaison_global(
                                                        yProbaTrain2Binary_s,
                                                        train2_s[2],
                                                        weightsTrain2_s,
                                                        30)

        yPredictedValid_ratio_comb_s, yPredictedValid_conca_ratio_combinaison =\
                                tresholding.get_yPredicted_ratio_8(
                                                    yProbaValidBinary_s,
                                                    best_ratio_combinaison)

        s_ratio_combinaison, b_ratio_combinaison = submission.get_s_b(
                                        yPredictedValid_conca_ratio_combinaison,
                                        yValid_conca,
                                        weightsValid_conca)

        AMS_ratio_combinaison_valid = hbc.AMS(s_ratio_combinaison,
                                              b_ratio_combinaison)

        classif_succ_ratio_combinaison = method_module.\
                                get_classification_error(
                                        yPredictedValid_conca_ratio_combinaison,
                                        yValid_conca,
                                        normalize= True)

        # The values are stored as they are (a trailing comma would wrap
        # them in a 1-tuple).
        d['yPredictedValid_conca_ratio_combinaison'] = \
                                        yPredictedValid_conca_ratio_combinaison
        d['AMS_ratio_combinaison_train2'] = AMS_ratio_combinaison_train2
        d['AMS_ratio_combinaison_valid'] = AMS_ratio_combinaison_valid
        d['best_ratio_combinaison'] = best_ratio_combinaison
        d['classif_succ_ratio_combinaison'] = classif_succ_ratio_combinaison

    return d
Exemplo n.º 9
0
def main():
    """
    Runs the whole experiment and prints the AMS scores obtained.
    Steps:
      1. extract the train / train2 / valid / test sets with
         tokenizer.extract_data (parameters are hard-coded below)
      2. run analyse.analyse for naiveBayes, lda, qda and two randomForest
         configurations; results are stored in the dict dMethods
      3. print the feature importances of the 'randomForest2' predictor(s)
      4. build an 'on-top' classifier with onTopClassifier.SL_classification
      5. look for the best global treshold for the on-top classifier and for
         every method of dMethods, and print the corresponding AMS
    The submission step at the end is currently disabled.
    Returns 0.
    """
    ###############
    ### IMPORT ####
    ###############
    # Importation parameters:
    split = True
    normalize = True
    noise_var = 0.
    n_classes = "binary"
    train_size = 200000
    train_size2 = 25000
    valid_size = 25000

    # Import the training data:
    print("Extracting the data sets...")
    # NOTE: time.clock() no longer exists on Python >= 3.8; it is kept
    # because the rest of this file is written for Python 2.
    start = time.clock()
    train_s, train2_s, valid_s, test_s = tokenizer.extract_data(
                                                split=split,
                                                normalize=normalize,
                                                noise_variance=noise_var,
                                                n_classes=n_classes,
                                                train_size=train_size,
                                                train_size2=train_size2,
                                                valid_size=valid_size)

    # Remerging the y and weights of the validation if necessary.
    # NOTE(review): everything after the analyses indexes valid_s[2][i] and
    # uses yValid_conca / weights_conca, i.e. it only works with the split
    # (list) layout.
    if isinstance(valid_s[2], list):
        yValid_conca = preTreatment.concatenate_vectors(valid_s[2])
        weights_conca = preTreatment.concatenate_vectors(valid_s[3])

    stop = time.clock()
    print("Extraction time: %i s" % (stop - start))

    print(" ")
    print(" ")

    ######################
    ### PRE-TREATMENT ####
    ######################
    print("------------------------- Pre-treatment --------------------------")
    ### Average number of signal per subset:
    # The return values are not used here; presumably ratio_sig_per_dataset
    # prints the averages itself (only the headers are printed in main).
    print("Train subsets signal average:")
    preTreatment.ratio_sig_per_dataset(train_s[2])
    print(" ")
    print("Valid subsets signal average:")
    preTreatment.ratio_sig_per_dataset(valid_s[2])

    print(" ")
    print(" ")

    ############
    # ANALYSES #
    ############

    # Dictionnary that will contain all the data for each methods. In the end
    # we'll have a dict of dict
    # Keys of the methods : {naiveBayes, svm, kNeighbors, lda, qda, adaBoost,
    #                       randomForest}
    dMethods = {}

    # NAIVE BAYES:
    kwargs_bayes = {}
    dMethods['naiveBayes'] = analyse.analyse(train_s=train_s,
                                             train2_s=train2_s,
                                             valid_s=valid_s,
                                             method_name='naiveBayes',
                                             kwargs=kwargs_bayes)
    # SVM
    """
    kwargs_svm ={}
    dMethods['svm'] = analyse.analyse(train_s, valid_s,'svm', kwargs_svm)
    """
    """
    # K NEIGHBORS
    kwargs_kn = {'n_neighbors':50}
    dMethods['kNeighbors'] = analyse.analyse(train_s, valid_s, 'kNeighbors',
                                             kwargs_kn)
    """
    # LDA
    kwargs_lda = {}
    dMethods['lda'] = analyse.analyse(train_s=train_s,
                                      train2_s=train2_s,
                                      valid_s=valid_s,
                                      method_name='lda',
                                      kwargs=kwargs_lda)

    # QDA
    kwargs_qda = {}
    dMethods['qda'] = analyse.analyse(train_s=train_s,
                                      train2_s=train2_s,
                                      valid_s=valid_s,
                                      method_name='qda',
                                      kwargs=kwargs_qda)
    """
    # ADABOOST
    kwargs_ada= {   'n_estimators': 50,
                    'learning_rate': 1.,
                    'algorithm': 'SAMME.R',
                    'random_state':None}
    dMethods['adaBoost'] = analyse.analyse(train_s, valid_s, 'adaBoost',
                                           kwargs_ada)
    """
    # RANDOM FOREST:
    kwargs_randomForest = {'n_estimators': 10}
    dMethods['randomForest'] = analyse.analyse(train_s=train_s,
                                               train2_s=train2_s,
                                               valid_s=valid_s,
                                               method_name='randomForest',
                                               kwargs=kwargs_randomForest)

    # RANDOM FOREST 2:
    kwargs_randomForest = {'n_estimators': 100}
    dMethods['randomForest2'] = analyse.analyse(train_s=train_s,
                                                train2_s=train2_s,
                                                valid_s=valid_s,
                                                method_name='randomForest',
                                                kwargs=kwargs_randomForest)
    """
    # ADABOOST2
    kwargs_ada= {   'n_estimators': 100,
                    'learning_rate': .5,
                    'algorithm': 'SAMME.R',
                    'random_state':None}
    dMethods['adaBoost2'] = analyse.analyse(train_s, valid_s, 'adaBoost',
                                           kwargs_ada)

    # RANDOM FOREST 3:
    kwargs_randomForest= {'n_estimators': 100}
    dMethods['randomForest3'] = analyse.analyse(train_s= train_s, train2_s= train2_s,
                                              valid_s= valid_s,
                                              method_name = 'randomForest',
                                              kwargs = kwargs_randomForest)

    # RANDOM FOREST 4:
    kwargs_randomForest= {'n_estimators': 100}
    dMethods['randomForest4'] = analyse.analyse(train_s= train_s, train2_s= train2_s,
                                              valid_s= valid_s,
                                              method_name = 'randomForest',
                                              kwargs = kwargs_randomForest)

    # RANDOM FOREST 5:
    kwargs_randomForest= {'n_estimators': 100}
    dMethods['randomForest5'] = analyse.analyse(train_s= train_s, train2_s= train2_s,
                                              valid_s= valid_s,
                                              method_name = 'randomForest',
                                              kwargs = kwargs_randomForest)

    # GRADIENT BOOSTING:
    kwargs_gradB = {'loss': 'deviance', 'learning_rate': 0.1,
                    'n_estimators': 100, 'subsample': 1.0,
                    'min_samples_split': 2, 'min_samples_leaf': 200,
                    'max_depth': 10, 'init': None, 'random_state': None,
                    'max_features': None, 'verbose': 0}

    dMethods['gradientBoosting'] = analyse.analyse(train_s, valid_s,
                                                'gradientBoosting', kwargs_gradB)
    """
    print(" ")

    ##################
    # POST-TREATMENT #
    ##################
    print("------------------------ Feaure importance: -----------------------")

    # One predictor per subset (list) or a single predictor for the dataset.
    rf2_predictor_s = dMethods['randomForest2']['predictor_s']
    if isinstance(rf2_predictor_s, list):
        for i, predictor in enumerate(rf2_predictor_s):
            print("Subset %i:" % i)
            print(predictor.feature_importances_)
    else:
        print("Dataset: ")
        print(rf2_predictor_s.feature_importances_)


    print("------------------------ On-top predictor -----------------------")
    # Classifiers to be ignored:
    #ignore = ['randomForest2', 'randomForest']
    ignore = []
    clf_onTop = 'randomForest'
    parameters = {}#{'C': 0.5, 'kernel': 'rbf', 'degree': 3, 'gamma': 0.0,
                 # 'coef0': 0.0, 'shrinking':True, 'probability':True,
                 # 'tol': 0.001, 'cache_size': 200, 'class_weight': None}


    print("We will use an 'on-top' predictor on %i classifiers using a %s."
          % (len(dMethods) - len(ignore), clf_onTop))

    final_prediction_s, dOnTop = onTopClassifier.SL_classification(
                                        dMethods,
                                        valid_s, train_s,
                                        ignore=ignore,
                                        method=clf_onTop,
                                        parameters=parameters)

    print("-------------------------- Tresholding -------------------------")
    ### ON THE 'ON-TOP' CLASSIFIER:
    OT_yProba_s = dOnTop['yProba_s']
    OT_yPredicted_s = dOnTop['yPredicted_s']

    #Let's concatenate the vectors
    OT_yProba_conca = preTreatment.concatenate_vectors(OT_yProba_s)
    OT_yPredicted_conca = preTreatment.concatenate_vectors(OT_yPredicted_s)

    # Best treshold global
    # NOTE(review): the result of tresholding.best_treshold is used as a
    # single number here (passed on as the treshold and printed with %f);
    # its return statement is not visible from this function - confirm.
    OT_best_ratio = tresholding.best_treshold(OT_yProba_conca, yValid_conca,
                                              weights_conca)
    OT_yPredicted_treshold = tresholding.get_yPredicted_treshold(
                                                OT_yProba_conca,
                                                OT_best_ratio)

    OT_s, OT_b = submission.get_s_b(OT_yPredicted_treshold, yValid_conca,
                                    weights_conca)
    # NOTE(review): the factor 10 presumably rescales s and b from the
    # validation sample to the full data set (the disabled per-group code
    # below uses 250000 / n instead) - confirm.
    OT_s *= 10
    OT_b *= 10
    OT_best_AMS = hbc.AMS(OT_s, OT_b)


    # COMPARISON BEST TRESHOLD IN DMETHOD
    # FOR EACH METHOD:
    # The per-group containers below are only filled by the disabled
    # "group by group" block further down.
    best_yPredicted_s = [np.zeros(valid_s[2][i].shape[0]) for i in range(8)]
    best_yProba_s = [np.zeros(valid_s[2][i].shape[0]) for i in range(8)]
    best_AMS_s = [0. for i in range(8)]
    best_method_s = [0 for i in range(8)]
    best_ratio_s = [0 for i in range(8)]
    best_AMS_1_method = 0.
    best_method = "methode"
    # Must be a number: it is printed with %f even when no method improves
    # on the initial AMS of 0.
    best_ratio = 0.


    for method in dMethods:

        yProba_s = dMethods[method]['yProba_s']
        yPredicted_s = dMethods[method]['yPredicted_s']

        #Let's concatenate the vectors
        yProba_conca = preTreatment.concatenate_vectors(yProba_s)
        yPredicted_conca = preTreatment.concatenate_vectors(yPredicted_s)

        # Best treshold global
        best_treshold = tresholding.best_treshold(yProba_conca, yValid_conca,
                                                  weights_conca)
        yPredicted_treshold = tresholding.get_yPredicted_treshold(
                                                yProba_conca, best_treshold)

        s, b = submission.get_s_b(yPredicted_treshold, yValid_conca,
                                  weights_conca)
        s *= 10
        b *= 10
        ams = hbc.AMS(s, b)
        # Remember the method (and its treshold) with the highest AMS so far.
        if ams > best_AMS_1_method:
            best_AMS_1_method = ams
            best_method = str(method)
            best_ratio = best_treshold


    # Let's concatenate the 8 vectors which performs the best on each on
    # each of the sub group and tresholding it
    # NOTE(review): nothing in the active code writes to best_yPredicted_s
    # (the per-group selection is disabled), so the "concatenate" scores
    # below are computed on all-zero vectors.
    best_yPredicted_conca = preTreatment.concatenate_vectors(best_yPredicted_s)
    best_treshold_conca = tresholding.best_treshold(best_yPredicted_conca,
                                                    yValid_conca,
                                                    weights_conca)
    best_yPredicted_conca_treshold = tresholding.get_yPredicted_treshold(
                                                best_yPredicted_conca,
                                                best_treshold_conca)

    best_final_s, best_final_b, best_s_s, best_b_s = submission.get_s_b_8(
                                                best_yPredicted_s,
                                                valid_s[2], valid_s[3])
    best_s_treshold, best_b_treshold = submission.get_s_b(
                                                best_yPredicted_conca_treshold,
                                                yValid_conca, weights_conca)

    best_final_s *= 10
    best_final_b *= 10
    best_s_treshold *= 10
    best_b_treshold *= 10
    best_AMS = hbc.AMS(best_final_s, best_final_b)
    best_AMS_treshold = hbc.AMS(best_s_treshold, best_b_treshold)


    print("Best AMS using one of the methods : %f" % best_AMS_1_method)
    # Report the winning method, not the last key iterated by the loop above.
    print("    method : %s" % best_method)
    print("    ratio : %f" % best_ratio)
    print(" ")
    print("Best AMS concatenate: %f" % best_AMS)
    print("Best AMS concatenate  after final tresholding : %f" % best_AMS_treshold)
    print("best ratio on the concatenated vector : %f" % best_treshold_conca)
    print(" ")
    print("Best AMS on-top : %f" % OT_best_AMS)
    print("Best ratio on the concatenated vector : %f" % OT_best_ratio)
    print(" ")



    """
    # Best treshold group by group
    for i in range(8):
        OT_best_treshold_s = tresholding.best_treshold(OT_yProba_s[i],
                                                       valid_s[2][i],
                                                       valid_s[3][i])

        OT_yPredicted_s[i] = tresholding.get_yPredicted_treshold(OT_yProba_s[i],
                                                              OT_best_treshold_s)

        s, b = submission.get_s_b(OT_yPredicted_s[i], valid_s[2][i],
                                  valid_s[3][i])

        s *= 250000/yPredicted_s[i].shape[0]
        b *= 250000/yPredicted_s[i].shape[0]

        ams = hbc.AMS(s,b)
        if ams > best_AMS_s[i]:
            best_yPredicted_s[i] = yPredicted_s[i]
            best_yProba_s[i] = yProba_s[i]
            best_AMS_s[i] = ams
            best_method_s[i] = dOnTop['method']
            best_ratio_s[i] = best_treshold
            best_sum_s_s[i] = s
            best_sum_b_s[i] =  b

    for n in range(8):
        print "Best AMS group %i: %f - method %s - ratio %f" \
                %(n, best_AMS_s[n], best_method_s[n], best_ratio_s[n])

    print "Best AMS : %f" %best_AMS_1_method
    print "    ratio : %f" %(best_ratio)
    print " "
    """



    """
    ##############
    # SUBMISSION #
    ##############
    print("-------------------------- Submission ---------------------------")

    # Prediction on the test set:
    # method used for the submission
    # TODO : Verifier que le nom de la method a bien la bonne forme(
    # creer une liste de noms de methodes)

    #method = "randomForest"

    #test_prediction_s, test_proba_s = eval(method).get_test_prediction(
    #                                            dMethods[method]['predictor_s'],
    #                                            test_s[1])

    test_prediction_s, test_proba_s = onTopClassifier.get_SL_test_prediction(
                                                dMethods, dOnTop, test_s[1])


    print("Test subsets signal average:")
    test_s_average = preTreatment.ratio_sig_per_dataset(test_prediction_s)
    print(" ")

    #RankOrder = np.arange(1,550001)

    if type(test_prediction_s) == list:
        test_prediction_s = np.concatenate(test_prediction_s)
        test_proba_s = np.concatenate(test_proba_s)
        RankOrder = onTopClassifier.rank_signals(test_proba_s)
        ID = np.concatenate(test_s[0])
    else:
        ID = test_s[0]

    # Create a submission file:
    sub = submission.print_submission(ID, RankOrder , test_prediction_s)

    return sub
    """
    return 0