Example #1
File: main.py  Project: world4jason/MITWS
def nips_validation(data, best_subsets, mi_scores, params_norm, used_bins,
                    cost, gamma, settings):
    dataset_name = settings.dataset_name
    ##read the 3 sets of data: train, validation and test
    if settings.dataset_type == "dense":  ##dense type from NIPS
        data_train = prepare_data.import_nips_dense(settings.file_train[0],
                                                    settings.file_train[1])
        data_valid = prepare_data.import_nips_dense(settings.file_valid[0],
                                                    settings.file_valid[1])
        data_test = prepare_data.import_nips_dense(settings.file_test, "")
    elif settings.dataset_type == "sparse_binary":  ##sparse_binary type from NIPS
        data_train = prepare_data.import_nips_sparse_binary(
            settings.file_train[0], settings.file_train[1],
            settings.number_features)
        data_valid = prepare_data.import_nips_sparse_binary(
            settings.file_valid[0], settings.file_valid[1],
            settings.number_features)
        data_test = prepare_data.import_nips_sparse_binary(
            settings.file_test[0], "", settings.number_features)
    elif settings.dataset_type == "sparse_integer":  ##sparse_integer type from NIPS
        data_train = prepare_data.import_nips_sparse_integer(
            settings.file_train[0], settings.file_train[1],
            settings.number_features)
        data_valid = prepare_data.import_nips_sparse_integer(
            settings.file_valid[0], settings.file_valid[1],
            settings.number_features)
        data_test = prepare_data.import_nips_sparse_integer(
            settings.file_test[0], "", settings.number_features)

    ##normalize the 3 sets with the normalization parameters used during the feature selection process
    data_train = prepare_data.apply_normalization(data_train, params_norm)
    data_valid = prepare_data.apply_normalization(data_valid, params_norm)
    data_test = prepare_data.apply_normalization(data_test, params_norm)

    validation_results = {}  ##save results of validation

    ##create variables to test and find the accuracy of the train and valid sets
    aux_data_1 = data + data_train
    folds_1 = [(range(0, len(data)),
                range(len(data), len(data) + len(data_train)))]

    aux_data_2 = data + data_valid
    folds_2 = [(range(0, len(data)),
                range(len(data), len(data) + len(data_valid)))]
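    ##each folds list is a single (train indices, test indices) pair: the classifier trains on the selection data and is scored on the appended set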
    for i in range(0, len(best_subsets)):  ##test every subset and check which generalizes best
        acc_train = classification_part.classify(folds_1, aux_data_1,
                                                 best_subsets[i][0], cost,
                                                 gamma, settings.svm_kernel)
        acc_valid = classification_part.classify(folds_2, aux_data_2,
                                                 best_subsets[i][0], cost,
                                                 gamma, settings.svm_kernel)
        validation_results[i] = (acc_train, acc_valid)

    ##select the subset that obtained the best score on both sets... this criterion could be changed
    top_score_1 = 0.0
    top_score_2 = 0.0
    top_subset = ""
    top_score = 0.0
    for i in validation_results:
        print best_subsets[i][0], validation_results[i]
        score_1 = validation_results[i][0]
        score_2 = validation_results[i][1]
        if score_1 + score_2 > top_score:
            top_score = score_1 + score_2
            top_score_1 = score_1
            top_score_2 = score_2
            top_subset = best_subsets[i][0]
        elif score_1 + score_2 == top_score:  ##case where they have the same combined score
            if abs(score_1 - score_2) < abs(top_score_1 - top_score_2):
                top_score = score_1 + score_2
                top_score_1 = score_1
                top_score_2 = score_2
                top_subset = best_subsets[i][0]

    print top_score_1, top_score_2, "selected subset:", top_subset

    ##create the nips file for each set
    classify_data(data, top_subset, dataset_name + "_train", data_train, cost,
                  gamma, settings.svm_kernel)
    classify_data(data, top_subset, dataset_name + "_valid", data_valid, cost,
                  gamma, settings.svm_kernel)
    classify_data(data, top_subset, dataset_name + "_test", data_test, cost,
                  gamma, settings.svm_kernel)

    ##write the selected features to the file using the MI score as sort criterion
    top_subset = order_importance_of_features(top_subset, mi_scores)
    f_fts = open("results/" + dataset_name + ".feat", "a")

    for ft in top_subset:
        f_fts.write(str(int(ft) + 1) + "\n")
    f_fts.close()
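A minimal, self-contained sketch of the selection rule applied above: every candidate subset gets a train and a validation accuracy, the subset with the highest combined accuracy wins, and ties are broken in favour of the smaller gap between the two accuracies. The function name pick_best_subset and the candidates structure are illustrative, not part of the repository.

def pick_best_subset(candidates):
    ##candidates: list of (subset, acc_train, acc_valid) tuples
    top_subset, top_train, top_valid, top_sum = None, 0.0, 0.0, 0.0
    for subset, acc_train, acc_valid in candidates:
        total = acc_train + acc_valid
        if total > top_sum:
            top_subset, top_train, top_valid, top_sum = subset, acc_train, acc_valid, total
        elif total == top_sum and abs(acc_train - acc_valid) < abs(top_train - top_valid):
            ##same combined score: prefer the more balanced pair of accuracies
            top_subset, top_train, top_valid = subset, acc_train, acc_valid
    return top_subset

print(pick_best_subset([([3, 7], 0.90, 0.80), ([1, 4, 9], 0.85, 0.85)]))  ##-> [1, 4, 9]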
Example #2
File: main.py  Project: JSilva90/MITWS
def nips_validation(data, best_subsets, mi_scores, params_norm, used_bins, cost, gamma, settings):
    dataset_name = settings.dataset_name
    ##read the 3 sets of data: train, validation and test
    if settings.dataset_type == "dense": ##dense type from NIPS
        data_train = prepare_data.import_nips_dense(settings.file_train[0], settings.file_train[1])
        data_valid = prepare_data.import_nips_dense(settings.file_valid[0], settings.file_valid[1])
        data_test = prepare_data.import_nips_dense(settings.file_test, "")
    elif settings.dataset_type == "sparse_binary": ##sparse_binary type from NIPS
        data_train = prepare_data.import_nips_sparse_binary(settings.file_train[0], settings.file_train[1], settings.number_features)
        data_valid = prepare_data.import_nips_sparse_binary(settings.file_valid[0], settings.file_valid[1], settings.number_features)
        data_test = prepare_data.import_nips_sparse_binary(settings.file_test[0], "", settings.number_features)
    elif settings.dataset_type == "sparse_integer": ##sparse_integer type from NIPS
        data_train = prepare_data.import_nips_sparse_integer(settings.file_train[0], settings.file_train[1], settings.number_features)
        data_valid = prepare_data.import_nips_sparse_integer(settings.file_valid[0], settings.file_valid[1], settings.number_features)
        data_test = prepare_data.import_nips_sparse_integer(settings.file_test[0], "", settings.number_features)
    
    ##normalize the 3 sets with the normalization parameters used during the feature selection process
    data_train = prepare_data.apply_normalization(data_train, params_norm)
    data_valid = prepare_data.apply_normalization(data_valid, params_norm)
    data_test = prepare_data.apply_normalization(data_test, params_norm)
    
    
    validation_results = {} ##save results of validation
    
    ##create variables to test and find the accuracy of the train and valid sets
    aux_data_1 = data + data_train
    folds_1 = [(range(0,len(data)), range(len(data), len(data) + len(data_train)))]
    
    aux_data_2 = data + data_valid
    folds_2 = [(range(0,len(data)), range(len(data), len(data) + len(data_valid)))]
    for i in range(0, len(best_subsets)): ##test every subset and check which generalizes best    
        acc_train = classification_part.classify(folds_1, aux_data_1, best_subsets[i][0], cost, gamma, settings.svm_kernel)
        acc_valid = classification_part.classify(folds_2, aux_data_2, best_subsets[i][0], cost, gamma, settings.svm_kernel)
        validation_results[i] = (acc_train, acc_valid)
    
    ##select the subset that obtained the best score on both sets... this criterion could be changed
    top_score_1 = 0.0
    top_score_2 = 0.0
    top_subset = ""
    top_score = 0.0
    for i in validation_results:
        print best_subsets[i][0], validation_results[i]
        score_1 = validation_results[i][0]
        score_2 = validation_results[i][1]
        if score_1 + score_2 > top_score:
            top_score = score_1 + score_2
            top_score_1 = score_1
            top_score_2 = score_2
            top_subset = best_subsets[i][0]
        elif score_1 + score_2 == top_score: ##case where they have the same combined score
            if abs(score_1 - score_2) < abs(top_score_1 - top_score_2):
                top_score = score_1 + score_2
                top_score_1 = score_1
                top_score_2 = score_2
                top_subset = best_subsets[i][0]
                
    
    print top_score_1, top_score_2 , "selected subset:", top_subset
    
    ##create the nips file for each set
    classify_data(data, top_subset, dataset_name + "_train", data_train, cost, gamma, settings.svm_kernel)
    classify_data(data, top_subset, dataset_name + "_valid", data_valid, cost, gamma, settings.svm_kernel)
    classify_data(data, top_subset, dataset_name + "_test", data_test, cost, gamma, settings.svm_kernel)
    
    ##write the selected features to the file using the MI score as sort criterion
    top_subset = order_importance_of_features(top_subset, mi_scores)
    f_fts = open("results/" + dataset_name + ".feat", "a")
    
    for ft in top_subset:
        f_fts.write(str(int(ft)+1) + "\n")
    f_fts.close()
Example #3
File: main.py  Project: world4jason/MITWS
def main():
    if __name__ == '__main__':
        start_t = time.time()

        ##read settings from the xml file
        settings = read_settings()
        #quit()

        print "Using ", settings.number_proc, " processes"

        ##read data according to xml file settings
        if settings.dataset_type == "csv":  ##dados separados por , sendo a ultima coluna a label
            data = prepare_data.import_data_csv(settings.file_train[0], "")
        elif settings.dataset_type == "dense":  ##dense type from NIPS
            data = prepare_data.import_nips_dense(settings.file_train[0],
                                                  settings.file_train[1])
        elif settings.dataset_type == "sparse_binary":  ##sparse_binary type from NIPS
            data = prepare_data.import_nips_sparse_binary(
                settings.file_train[0], settings.file_train[1],
                settings.number_features)
        elif settings.dataset_type == "sparse_integer":  ##sparse_integer type from NIPS
            data = prepare_data.import_nips_sparse_integer(
                settings.file_train[0], settings.file_train[1],
                settings.number_features)
        else:
            print "Not a valid option for dataset type. Current accepted values: csv, dense, sparse_binary, sparse_integer"
            quit()

        print "Read data with size ", len(data), " and ", len(
            data[0].values), " features."

        #create_smaller_dataset(data)

        ##normalize data
        params_norm = []
        data, params_norm = prepare_data.normalize_data_range(data)  ##return the params used for normalization to apply on future data

        ##filter the irrelevant features
        features, used_bins, mi_scores = filter_part.filter_features(
            data, settings)  ##save the used bins to calculate future data

        print "selected _features:\n", features

        ##call the wrapper part
        cost, gamma = wrapper_part.wrapper(
            data, features, settings)  ##returns the used cost and gamma
        ##wrapper part is over
        print "program took: ", time.time() - start_t

        ##each process saves the top 5 subsets to a file
        f_res = open("res.csv", "r")
        lines = f_res.readlines()
        f_res.close()

        total_nodes = 0
        removed_nodes_by_cut = 0
        wasted_time = 0.0
        send_time = 0.0
        times_request_work = 0
        results = []
        times_work_not_sent = 0

        for res in lines:
            res = res.split(",")
            if "PROC" in res[0]:  ##ignore info lines
                total_nodes += int(res[2])
                removed_nodes_by_cut += int(res[4])
                wasted_time += float(res[5])
                send_time += float(res[6])
                times_request_work = int(res[7])
                times_work_not_sent = int(res[8])
                continue
            score = float(res[len(res) - 1])
            solution = res[:len(res) - 1]
            aux_solution = []
            for s in solution:  ##convert to ints
                aux_solution.append(int(s))
            results.append((aux_solution, score))
            #if score > best_score:
            #   best_score = score
            #  best_set = res=[:1]
        results.sort(key=lambda tup: tup[1])  ##order by score
        results.reverse()  ##Descend

        ##save the best subsets into a file
        outfile = open("bestsets.txt", "a")
        for res in results:
            outfile.write(str(res[0]) + "," + str(res[1]) + "\n")
        outfile.close()

        ##got the results, now run the validation part
        print "Tested a total of: ", total_nodes, "nodes removed by cut mec:", removed_nodes_by_cut
        print "Wasted time receiving:", wasted_time / float(
            settings.number_proc), " sending:", send_time / float(
                settings.number_proc
            ), " requested work:", times_request_work / float(
                settings.number_proc
            ), " times work not sent: ", times_work_not_sent
        print "Using c and g as parameters: ", cost, gamma
        print "best set ", results[0], " fts:", len(results[0][0])

        #quit()

        ## The validation consists of selecting the subset that generalizes best to unseen data. This only works for the NIPS challenge and needs to be adapted for other datasets.
        ##change this to test all the results on the validation set and use the best one##
        nips_validation(data, results, mi_scores, params_norm, used_bins, cost,
                        gamma, settings)
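For reference, main() expects res.csv to contain two kinds of comma-separated lines: stat lines whose first field contains "PROC" (fields 2, 4, 5, 6, 7 and 8 carry the per-process counters read above) and result lines made of feature indices followed by a score. Below is a minimal sketch of that parsing under those assumptions; the helper name parse_results is illustrative, not from the repository.

def parse_results(path="res.csv"):
    ##split res.csv into per-process stat rows and (subset, score) results
    stats, results = [], []
    with open(path) as f:
        for line in f:
            fields = line.strip().split(",")
            if "PROC" in fields[0]:  ##per-process counter line
                stats.append(fields)
            else:  ##feature indices followed by the subset score
                results.append(([int(x) for x in fields[:-1]], float(fields[-1])))
    results.sort(key=lambda tup: tup[1], reverse=True)  ##best score first
    return stats, results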
Example #4
File: main.py  Project: JSilva90/MITWS
def main():
    if __name__ == '__main__':
        start_t = time.time()
        
        ##read settings from the xml file
        settings = read_settings()
        #quit()
        
        print "Using ", settings.number_proc, " processes"
        
        
        ##read data according to xml file settings
        if settings.dataset_type == "csv": ##dados separados por , sendo a ultima coluna a label
            data = prepare_data.import_data_csv(settings.file_train[0], "")
        elif settings.dataset_type == "dense": ##dense type from NIPS
            data = prepare_data.import_nips_dense(settings.file_train[0], settings.file_train[1])
        elif settings.dataset_type == "sparse_binary": ##sparse_binary type from NIPS
            data = prepare_data.import_nips_sparse_binary(settings.file_train[0], settings.file_train[1], settings.number_features)
        elif settings.dataset_type == "sparse_integer": ##sparse_integer type from NIPS
            data = prepare_data.import_nips_sparse_integer(settings.file_train[0], settings.file_train[1], settings.number_features)
        else:
            print "Not a valid option for dataset type. Current accepted values: csv, dense, sparse_binary, sparse_integer"
            quit()
        
        print "Read data with size ", len(data), " and ", len(data[0].values), " features."
        
        
        
        #create_smaller_dataset(data)
        
        ##normalize data
        params_norm = []
        data, params_norm = prepare_data.normalize_data_range(data) ##return the params used for normalization to apply on future data
        
        ##filter the irrelevant features
        features, used_bins, mi_scores = filter_part.filter_features(data, settings) ##save the used bins to calculate future data
        
        print "selected _features:\n", features 
        
        ##call the wrapper part
        cost, gamma = wrapper_part.wrapper(data, features, settings) ##returns the used cost and gamma
        ##wrapper part is over
        print "program took: ", time.time() - start_t 
        
        ##each process saves the top 5 subsets to a file
        f_res = open("res.csv", "r")
        lines = f_res.readlines()
        f_res.close()
        
        total_nodes = 0
        removed_nodes_by_cut = 0
        wasted_time = 0.0
        send_time = 0.0
        times_request_work = 0
        results = []
        times_work_not_sent = 0
        
        for res in lines:
            res = res.split(",")
            if "PROC" in res[0]: ##ignore info lines
                total_nodes += int(res[2])
                removed_nodes_by_cut += int(res[4])
                wasted_time += float(res[5])
                send_time += float(res[6])
                times_request_work = int(res[7])
                times_work_not_sent = int(res[8])
                continue
            score = float(res[len(res)-1])
            solution = res[:len(res)-1]
            aux_solution = []
            for s in solution: ##convert to ints
                aux_solution.append(int(s))
            results.append((aux_solution, score))
            #if score > best_score:
             #   best_score = score
              #  best_set = res=[:1]
        results.sort(key=lambda tup: tup[1]) ##order by score
        results.reverse() ##Descend
        
        ##save the best subsets into a file
        outfile = open("bestsets.txt", "a")
        for res in results:
            outfile.write(str(res[0]) + "," + str(res[1]) + "\n")
        outfile.close()
        
        
        ##got the results, now run the validation part
        print "Tested a total of: ", total_nodes, "nodes removed by cut mec:", removed_nodes_by_cut
        print "Wasted time receiving:", wasted_time / float(settings.number_proc), " sending:", send_time/float(settings.number_proc), " requested work:", times_request_work/float(settings.number_proc), " times work not sent: ", times_work_not_sent
        print "Using c and g as parameters: ", cost, gamma
        print "best set ", results[0], " fts:", len(results[0][0])
        
        #quit()
        
        ## The validation consists of selecting the subset that generalizes best to unseen data. This only works for the NIPS challenge and needs to be adapted for other datasets.
        ##change this to test all the results on the validation set and use the best one##
        nips_validation(data, results, mi_scores, params_norm, used_bins, cost, gamma, settings)