Example no. 1
import multiprocessing as mp

import classification_part  ##project-local module providing classify()
import utils  ##project-local module providing generate_random_sets()


def get_improve_prob(features, n_tests, dataset, folds, n_proc, p_features, shared_lock, com, cost, gamma, svm_kernel):
    test_sets = utils.generate_random_sets(features, n_tests, 3, 10)
    accs = []
    for test in test_sets:  ##test each random subset against the classifier to get a baseline accuracy
        acc = classification_part.classify(folds, dataset, test, cost, gamma, svm_kernel)
        accs.append(acc)
    ##split tests among processes
    #print "accuracies subsets:", acc
    workers = []
    for i in range(1, n_proc):
        p = mp.Process(target=calculate_improve_prob, args=(i, p_features[i], dataset, test_sets, accs, folds, shared_lock, com, cost, gamma, svm_kernel))
        workers.append(p)
        p.start()
    
    ##process 0 handles its share of the features in the main process
    calculate_improve_prob(0, p_features[0], dataset, test_sets, accs, folds, shared_lock, com, cost, gamma, svm_kernel)
    
    for w in workers:
        w.join()
    return
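
The driver above expects the caller to have already split the candidate subsets into n_proc chunks (the p_features argument) and to have created the shared lock and pipe. A minimal sketch of that setup, using only the standard library; make_shared_state is a hypothetical helper name, not part of the project:

import multiprocessing as mp

def make_shared_state(all_subsets, n_proc):
    ##round-robin partition: one chunk of candidate subsets per process
    p_features = [all_subsets[i::n_proc] for i in range(n_proc)]
    shared_lock = mp.Lock()  ##serializes access to the result pipe
    com = mp.Pipe()  ##duplex connection pair, unpacked by the workers as (output, input)
    return p_features, shared_lock, com
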
def calculate_improve_prob(id, my_features, dataset, test_sets, accuracies, folds, lock, com, cost, gamma, svm_kernel):
    ##my_features is a list of subset objects; it's easier to collect their feature lists into fts first
    fts = []
    for i in my_features:
        fts.append(i.features)

    probs = []
    for ft in fts:  ##test every candidate feature (a single-feature list) against every subset
        improvements = 0
        for i in range(0, len(test_sets)):
            new_set = list(test_sets[i])
            if ft[0] in new_set:  ##if the feature was already in the set, remove it and check whether accuracy got worse
                new_set.remove(ft[0])  ##assumes each ft holds a single feature id at ft[0], as the ft[0] below implies
                added = False
            else:  ##if the feature was not in the set, add it and check whether accuracy improved
                new_set = new_set + ft
                added = True
            acc = classification_part.classify(folds, dataset, new_set, cost, gamma, svm_kernel)
            if added:
                if acc > accuracies[i]:
                    improvements += 1
            else:
                if acc < accuracies[i]:
                    improvements += 1
            #print "obtained acc:", acc, "other acc:", accuracies[i], improvements

        prob = round((improvements / float(len(test_sets))), 2)
        probs.append((ft[0], prob))
    ##send results through the pipe, merging with whatever earlier processes already wrote
    lock.acquire()
    pipe_out, pipe_in = com
    if pipe_out.poll():  ##results already on the pipe: prepend them to ours
        probs = pipe_out.recv() + probs
    pipe_in.send(probs)
    lock.release()
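
The fan-in at the end of calculate_improve_prob is a small accumulator pattern: under the lock, drain whatever an earlier worker left on the pipe, merge it with your own results, and put the combined list back. A self-contained toy of just that pattern (toy_worker is hypothetical; no classification involved):

import multiprocessing as mp

def toy_worker(wid, lock, com):
    results = [(wid, wid * 10)]  ##pretend per-worker results
    lock.acquire()
    pipe_out, pipe_in = com
    if pipe_out.poll():  ##merge with results already on the pipe
        results = pipe_out.recv() + results
    pipe_in.send(results)
    lock.release()

if __name__ == "__main__":
    lock = mp.Lock()
    com = mp.Pipe()
    workers = [mp.Process(target=toy_worker, args=(i, lock, com)) for i in range(4)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    print(com[0].recv())  ##one merged list with all four workers' results
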
Example no. 5
import classification_part  ##project-local module
import prepare_data  ##project-local module; classify_data and order_importance_of_features are project-local helpers


def nips_validation(data, best_subsets, mi_scores, params_norm, used_bins, cost, gamma, settings):
    dataset_name = settings.dataset_name
    ##read the 3 sets of data: train, validation and test
    if settings.dataset_type == "dense":  ##dense type from NIPS
        data_train = prepare_data.import_nips_dense(settings.file_train[0], settings.file_train[1])
        data_valid = prepare_data.import_nips_dense(settings.file_valid[0], settings.file_valid[1])
        data_test = prepare_data.import_nips_dense(settings.file_test, "")
    elif settings.dataset_type == "sparse_binary":  ##sparse_binary type from NIPS
        data_train = prepare_data.import_nips_sparse_binary(settings.file_train[0], settings.file_train[1], settings.number_features)
        data_valid = prepare_data.import_nips_sparse_binary(settings.file_valid[0], settings.file_valid[1], settings.number_features)
        data_test = prepare_data.import_nips_sparse_binary(settings.file_test[0], "", settings.number_features)
    elif settings.dataset_type == "sparse_integer":  ##sparse_integer type from NIPS
        data_train = prepare_data.import_nips_sparse_integer(settings.file_train[0], settings.file_train[1], settings.number_features)
        data_valid = prepare_data.import_nips_sparse_integer(settings.file_valid[0], settings.file_valid[1], settings.number_features)
        data_test = prepare_data.import_nips_sparse_integer(settings.file_test[0], "", settings.number_features)

    ##normalize the 3 sets with the normalization parameters used during the feature selection process
    data_train = prepare_data.apply_normalization(data_train, params_norm)
    data_valid = prepare_data.apply_normalization(data_valid, params_norm)
    data_test = prepare_data.apply_normalization(data_test, params_norm)

    validation_results = {}  ##save results of validation

    ##create variables to test and find the accuracy of the train and valid sets
    aux_data_1 = data + data_train
    folds_1 = [(range(0, len(data)), range(len(data), len(data) + len(data_train)))]

    aux_data_2 = data + data_valid
    folds_2 = [(range(0, len(data)), range(len(data), len(data) + len(data_valid)))]
    for i in range(0, len(best_subsets)):  ##test every subset and check which generalizes best
        acc_train = classification_part.classify(folds_1, aux_data_1, best_subsets[i][0], cost, gamma, settings.svm_kernel)
        acc_valid = classification_part.classify(folds_2, aux_data_2, best_subsets[i][0], cost, gamma, settings.svm_kernel)
        validation_results[i] = (acc_train, acc_valid)

    ##select the subset that obtained the best combined score on both sets... this criterion could be changed
    top_score_1 = 0.0
    top_score_2 = 0.0
    top_subset = ""
    top_score = 0.0
    for i in validation_results:
        print best_subsets[i][0], validation_results[i]
        score_1 = validation_results[i][0]
        score_2 = validation_results[i][1]
        if score_1 + score_2 > top_score:
            top_score = score_1 + score_2
            top_score_1 = score_1
            top_score_2 = score_2
            top_subset = best_subsets[i][0]
        elif score_1 + score_2 == top_score:  ##tie: prefer the subset whose two accuracies are closest
            if abs(score_1 - score_2) < abs(top_score_1 - top_score_2):
                top_score = score_1 + score_2
                top_score_1 = score_1
                top_score_2 = score_2
                top_subset = best_subsets[i][0]

    print top_score_1, top_score_2, "selected subset:", top_subset

    ##create the nips file for each set
    classify_data(data, top_subset, dataset_name + "_train", data_train, cost,
                  gamma, settings.svm_kernel)
    classify_data(data, top_subset, dataset_name + "_valid", data_valid, cost,
                  gamma, settings.svm_kernel)
    classify_data(data, top_subset, dataset_name + "_test", data_test, cost,
                  gamma, settings.svm_kernel)

    ##write the selected features to the file using the MI score as sort criterion
    top_subset = order_importance_of_features(top_subset, mi_scores)
    f_fts = open("results/" + dataset_name + ".feat", "a")

    for ft in top_subset:
        f_fts.write(str(int(ft) + 1) + "\n")  ##feature ids are written 1-based
    f_fts.close()
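
The selection loop in nips_validation maximizes the summed train+validation accuracy and breaks exact ties in favor of the subset whose two accuracies are closest. A self-contained sketch of that criterion (pick_best and the toy scores are illustrative, not project code):

def pick_best(validation_results):
    ##validation_results maps subset index -> (acc_train, acc_valid)
    best_idx, best_sum, best_gap = None, 0.0, float("inf")
    for i, (a1, a2) in validation_results.items():
        total, gap = a1 + a2, abs(a1 - a2)
        if total > best_sum or (total == best_sum and gap < best_gap):
            best_idx, best_sum, best_gap = i, total, gap
    return best_idx

##subset 1 wins the tie with subset 0 because its two accuracies are closer
print(pick_best({0: (0.90, 0.80), 1: (0.86, 0.84), 2: (0.70, 0.60)}))  ##-> 1
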
Example no. 7
import time

import classification_part  ##project-local module; ask_for_work, send_work, expand_node, check_cutting, checkExpand, update_score and update_my_work are project-local helpers


def worker_classification(id, work_list, comb_memory, data, features, folds, settings, pipes, lock, global_info, probs, cost, gamma, score_lock, work_lock):
    n_proc = settings.number_proc
    t_work_ask = 0
    worker_start = time.time()
    mec_cut_nodes = 0  ##number of nodes removed by the second-phase cutting mechanism
    #filename = "outputs/out" + str(id) + ".txt"  ##to save outputs in case it's needed
    best_sets = {}
    update_rate = 10
    count = update_rate
    number_of_tests = 0
    tested_sets = {}  ##cache of subsets already tested by this process
    #info = []
    rts = []
    depth = False
    last_update = 0.0  ##last time the best global accuracy was updated
    wasted_time = 0.0  ##debug: time spent asking for work
    send_time = 0.0
    times_work_not_sent = 0
    
    while True:
        rt = time.time()  ##to measure how long each subset takes to process
        ##if the process has no work, ask for some
        if work_list == []:
            t_work_ask += 1  ##count how many times this process requested work
            ask_time = time.time()
            ##don't let a process ask right after start: give the others a few seconds to generate enough work to share
            if number_of_tests < 25:  ##the last process generates mostly repeated expansions and runs out of work quickly, so make it wait before asking
                time.sleep(4)
            work_list = ask_for_work(id, global_info, lock, pipes, n_proc, work_lock)
            wasted_time += round(time.time() - ask_time, 2)
            if work_list == []:  ##no work available anywhere: this worker is done
                break
            
        ##take a subset to test and remove it from the work list
        test_subset = work_list.pop()  ##popping from the end of a list is much cheaper than removing from the front
        
        if depth:
            work_list, last_update, aux_cut = check_cutting(id, last_update, global_info, work_list, settings.search_cutting_time)
            mec_cut_nodes += aux_cut
        else:  ##not in depth-first mode yet: check whether to switch stages
            if len(test_subset.features) > settings.change_search_size:  ##switch to depth-first search and activate the sampling
                print "AT_SWITCH:" + str(number_of_tests) + "," + str(len(work_list))
                last_update = time.time()  ##start measuring the updates
                depth = True
            
        
        ##classify the subset
        score = classification_part.classify(folds, data, test_subset.features, cost, gamma, settings.svm_kernel)

        #info.append((test_subset.features, score))  ##to save all results if needed
        test_subset.parents_scores.append(score)
        
        number_of_tests += 1 ##increase number of tests
        
        
        if checkExpand(test_subset, global_info, depth, settings):  ##expand the node only if it looks worth it
            work_list = expand_node(id, work_list, comb_memory, test_subset, features, n_proc, tested_sets, depth, probs, settings.estimate_probs)

        last_update = update_score(global_info, score, test_subset, best_sets, score_lock, last_update)  ##update the top scores
        rts.append(time.time() - rt)
        
        ##update global information 
        count -= 1
        
        ##if this process was chosen to send work to someone
        if global_info.chosen_to_send == id:
            stime = time.time()
            work_list, aux = send_work(id, work_list, global_info, pipes, lock, work_lock)
            times_work_not_sent += aux
            send_time += round(time.time() - stime, 2)
            count = update_rate  ##force a work-list size update below
        
        
        if count < 0:
            ##update the globally visible size of this process's work list
            ##also used to print periodic feedback for the user
            count = update_rate  ##number of tests until the next update
            update_my_work(id, work_list, global_info, work_lock)

            ##debug info: average processing time per subset since the last report
            total = 0.0
            for r in rts:
                total += r
            avg = round(total / float(len(rts)), 4)
            print id, ",", avg, ",", max(best_sets), ",", int(time.time() - worker_start), ",", len(work_list), ",", str(best_sets[max(best_sets)].features)
            rts = []
            
            
    total_working_time = time.time() - worker_start

    ##append this worker's summary and its best sets to the shared results file
    lock.acquire()
    out_file = open("res.csv", "a")
    out_file.write("PROCESS" + str(id) + "," + str(total_working_time) + "," + str(number_of_tests) + "," + str(max(best_sets)) + "," + str(mec_cut_nodes) + "," + str(wasted_time) + "," + str(send_time) + "," + str(t_work_ask) + "," + str(times_work_not_sent) + "\n")
    
    for score in best_sets:
        set_info = ""
        for ft in best_sets[score].features:
            set_info += str(ft) + ","
        set_info += str(score) + "\n"
        out_file.write(set_info)
    
    out_file.close()
    lock.release()
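
worker_classification is, at its core, a work-sharing depth-first search: pop a node from the end of the local list, score it, push its expansions back, and ask the other workers for more nodes when the list runs dry. A single-process toy of that skeleton (score and expand are stand-ins for classification_part.classify and expand_node, not the real project code):

def toy_search(root, max_size):
    score = lambda subset: len(subset)  ##hypothetical scoring: bigger subset, better score
    expand = lambda subset, n: [subset + [n]]  ##hypothetical single-child expansion

    work_list = [root]
    best = (float("-inf"), None)
    while work_list:
        subset = work_list.pop()  ##pop from the end: cheap, and keeps the search depth-first
        s = score(subset)
        if s > best[0]:
            best = (s, subset)
        if len(subset) < max_size:  ##stand-in for checkExpand's pruning test
            work_list.extend(expand(subset, len(subset)))
    return best

print(toy_search([], 3))  ##-> (3, [0, 1, 2])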