Example #1
def main(argv):

    __TOTAL_RUNNINGS = 100
    __CLASSIFIER = 'knn'  #'nbayes'

    # KNN Parameters
    K_VALUE = 3
    KNN_DEBUG = False
    pool = False
    cv_type = 'kcv'
    #cv_type = ''

    # Seeds test
    USE_KNOWN_GOOD_SLICE_GROUPING = True

    # Use these arguments to set the input directory of the attribute files
    __USE_SAMPLE_DATA_DIR = True
    __SAMPLE_DATA_DIR = "../../attributes_amostra"
    __FULL_DATA_DIR = "../../attributes2"

    attributes_dir = __FULL_DATA_DIR
    csv_file = './ADNI1_Complete_All_Yr_3T.csv'

    if __USE_SAMPLE_DATA_DIR:
        attributes_dir = __SAMPLE_DATA_DIR

    # Getting all data

    start_time = time.time()
    print('Loading all attributes data... ', end='')
    attribs, body_planes, slice_num, slice_amounts, output_classes = loadattribs.load_all_data(
        attributes_dir, csv_file)
    end_time = time.time()
    total_time = end_time - start_time
    print('done (total time to load: {0})'.format(total_time))

    import deap_alzheimer
    min_slices_values = loadattribs.getSliceLimits(slice_amounts)[0]
    valid_bplanes = loadattribs.getBplanes(slice_amounts)

    print('Slice Limits:', min_slices_values)
    print('valid_bplanes=', valid_bplanes)
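    # A slice grouping is encoded as [bplane, start_slice, total_slices]: the body
    # plane, the index of the first slice, and how many consecutive slices to take.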

    if USE_KNOWN_GOOD_SLICE_GROUPING:
        print('\n* Using a specific known good slice grouping... ', end='')

        bplane, start_slice, total_slices = [2, 120, 20]
    else:
        print('\n* Building a random valid slice grouping... ', end='')

        bplane, start_slice, total_slices = deap_alzheimer.buildRandomSliceGrouping(
            planes=valid_bplanes,
            length=int(random.random() * 20),
            max_indexes=min_slices_values,
            dbug=False)

    print('\nDone! Slice grouping: [{0}, {1}, {2}]'.format(
        bplane, start_slice, total_slices))

    start_time = time.time()

    # Getting some data partition
    print(
        '\n* Getting the specific data partition using this slice grouping {0}... '
        .format([bplane, start_slice, total_slices]),
        end='')
    data_partition = loadattribs.getAttribsPartitionFromSingleSlicesGrouping(
        attribs, slice_amounts, bplane, start_slice, total_slices)
    end_time = time.time()
    total_time = end_time - start_time
    print('done.\n\tTotal time to get the data partition= {0}'.format(
        total_time))
    print('\n* Data Partition\'s shape= ', data_partition.shape)

    start_time = time.time()
    print(
        '\n* Starting to run the {1} classifier {0} times to evaluate this data partition...'
        .format(__TOTAL_RUNNINGS, __CLASSIFIER))
    all_acc = []
    all_cmat = []

    for r in range(__TOTAL_RUNNINGS):
        accuracy, conf_matrix = runKNN(data_partition,
                                       output_classes,
                                       __CLASSIFIER,
                                       K_VALUE,
                                       knn_debug=KNN_DEBUG,
                                       use_smote=True,
                                       use_rescaling=True,
                                       cv_type=cv_type,
                                       use_Pool=pool)
        all_acc.append(accuracy)
        all_cmat.append(conf_matrix)

    end_time = time.time()
    total_time = end_time - start_time
    print('done.')

    all_acc = np.array(all_acc)
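    # Locate the best and worst runs so their confusion matrices can be reported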

    best_acc = all_acc.max()
    best_acc_pos = all_acc.argmax()
    best_acc_cmat = all_cmat[best_acc_pos]
    worst_acc_cmat = all_cmat[all_acc.argmin()]

    print('\n* Results after {1} runs:\n{0}'.format(all_acc, __TOTAL_RUNNINGS))
    print('\ttime to run classifier={0}'.format(total_time))
    print('\tclassifier={0}'.format(__CLASSIFIER))
    print('\tmean={0}'.format(np.mean(all_acc)))
    print('\tvariance={0}'.format(all_acc.var()))
    print('\tstd={0}'.format(all_acc.std()))
    print('\tmax={0}'.format(best_acc))
    print('\tmin={0}'.format(all_acc.min()))
    print('\tConfusion matrix of the best result:\n', best_acc_cmat)
    print('\tConfusion matrix of the worst result:\n', worst_acc_cmat)
    return 0
Example #2
def main(argv):

    # runtime parameters
    __TOTAL_RUNNINGS = 1
    __MULTIPROCESS = False
    __USE_SAMPLE_DATA_DIR = True  # Use these arguments to set the input directory of the attribute files
    USE_FIXED_SLICE_GROUPING = True  # Seeds test
    __VERBOSE = True

    # Models Parameters
    __USE_PCA = True
    __USE_STRATIFIED_KFOLD = True
    knn_k_value = 3
    lr_solver = 'sag'
    lr_multiclass = 'ovr'
    kcv_folds = 11

    import warnings
    from sklearn.exceptions import ConvergenceWarning  # needed for the filter below
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    warnings.filterwarnings("ignore", category=FutureWarning)
    warnings.filterwarnings("ignore", category=ConvergenceWarning)
    warnings.filterwarnings("ignore", category=UserWarning)

    FIXED_SLICE_GROUPING = [2, 105, 8]

    # Use these arguments to set the input directory of the attribute files
    __SAMPLE_DATA_DIR = "../../attributes_amostra"
    __FULL_DATA_DIR = "../../attributes2"
    attributes_dir = __FULL_DATA_DIR
    csv_file = './ADNI1_Complete_All_Yr_3T.csv'

    if __USE_SAMPLE_DATA_DIR:
        attributes_dir = __SAMPLE_DATA_DIR

    # Getting all data

    start_time = time.time()
    print('Loading all attributes data... ', end='')
    attribs, body_planes, slice_num, slice_amounts, output_classes, all_genders, all_ages, demographics_dic = loadattribs.load_all_data(
        attributes_dir, csv_file)
    end_time = time.time()
    total_time = end_time - start_time
    print('done (total time to load: {0:.2f}s)'.format(total_time))

    import deap_alzheimer
    min_slices_values = loadattribs.getSliceLimits(slice_amounts)[0]
    valid_bplanes = loadattribs.getBplanes(slice_amounts)

    #print('Slice Limits:',min_slices_values)
    #print('valid_bplanes=',valid_bplanes)

    if USE_FIXED_SLICE_GROUPING:
        #print('* Using a specific known good slice grouping... ', end='')

        bplane, start_slice, total_slices = FIXED_SLICE_GROUPING
    else:
        #print('* Building a random valid slice grouping... ', end='')

        bplane, start_slice, total_slices = deap_alzheimer.buildRandomSliceGrouping(
            planes=valid_bplanes,
            length=int(random.random() * 20),
            max_indexes=min_slices_values,
            dbug=False)

    #print('Done!\n* Slice grouping created: [{0}, {1}, {2}]'.format(bplane,start_slice,total_slices))

    #start_time = time.time()

    # Getting some data partition
    #print('* Getting the specific data partition using this slice grouping {0}... '.format([bplane,start_slice,total_slices]),end='')
    data_partition = loadattribs.getAttribsPartitionFromSingleSlicesGrouping(
        attribs, slice_amounts, bplane, start_slice, total_slices)
    #end_time = time.time()
    #total_time = end_time - start_time
    #print('done.\n* Total time to get the data partition= {0}'.format(total_time,))

    # Preparing data to use with PANDAS
    # Data preparation
    #print('* Current Data Partition\'s shape= ',data_partition.shape)
    try:
        new_dimensions = (data_partition.shape[0],
                          data_partition.shape[1] * data_partition.shape[2])
    except IndexError:
        print(
            '** IndexError: data_partition.shape={0} output_classes.shape={1}'
            .format(data_partition.shape, output_classes.shape))
        sys.exit(-1)

    # Reshaping X_data: collapse the last two axes into a single feature vector per sample
    X_reshaped = np.reshape(data_partition, new_dimensions)
    #print('* New Data Partition\'s shape= ',X_reshaped.shape)
    #print('* New dimensions (must be equal to X_reshaped.shape): ',new_dimensions)

    X_pandas = pd.DataFrame(data=X_reshaped)
    y_pandas = pd.DataFrame(data=output_classes)
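    # X_pandas holds one flattened feature row per subject; y_pandas holds the matching class labels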

    # Getting models list
    models_names = []
    models_names.append('KNN')
    #    models_names.append('LDA')
    #    models_names.append('CART')
    #    models_names.append('NB')
    #    models_names.append('SVM')
    #    models_names.append('RF')
    #    models_names.append('LR')
    #build_models_list(knn_k_value,lr_solver,lr_multiclass)

    # All results pool
    all_models_results = []

    #    # Initializing pool of results
    #    for model_name in models_names:
    #        model_result = []
    #        metrics_values = all_metrics_values()
    #        for metric in metrics_values:
    #            model_result.append([])
    #        all_models_results.append(model_result)

    #    all_mean_acc = []
    #    all_median_acc = []
    #    all_median_cmat = []
    #    all_time = []

    for n in range(__TOTAL_RUNNINGS):
        ## all_models_results is a LIST of LISTS of dictionaries whose keys
        ## include 'mean_acc', 'median_acc', 'median_cmat' and 'total_time'

        ############################################################################################
        #        n_experiment_results = evaluate_all(X_pandas, y_pandas,
        #                                            knn_k_value,lr_solver,lr_multiclass,
        #                                            kcv_folds=kcv_folds,
        #                                            use_multiprocess=__MULTIPROCESS)
        ############################################################################################
        cv_seed = 7
        cv_shuffle = True
        use_multiprocess = __MULTIPROCESS
        use_smote = True
        use_rescaling = True

        # Getting how many CPUs are available
        cores_num = multiprocessing.cpu_count()  # used by n_jobs

        # Validation setup
        cv = model_selection.KFold(n_splits=kcv_folds,
                                   random_state=cv_seed,
                                   shuffle=cv_shuffle)
        # Materialize the CV splits so they can be reused by every model (a bare generator would be exhausted after the first)
        both_indexes = list(cv.split(X_pandas))

        # Current Experiment: Results from each model
        experiment_results = []

        if use_multiprocess and __name__ == "__main__":
            with Pool(cores_num) as p:
                from functools import partial
                experiment_results = p.map(
                    partial(evaluate_model_using_smote_and_rescaling,
                            all_train_and_test_indexes=both_indexes,
                            X_data=X_pandas,
                            y_data=y_pandas,
                            folds=kcv_folds,
                            smote=use_smote,
                            rescaling=use_rescaling,
                            cores_num=cores_num,
                            maximization=True,
                            pca=__USE_PCA,
                            stratified_kfold=__USE_STRATIFIED_KFOLD),
                    models_names)

        else:
            for model in models_names:
                #print('* Evaluation {0} model...'.format(model[0]))
                model_results = evaluate_model_using_smote_and_rescaling(
                    both_indexes,
                    X_pandas,
                    y_pandas,
                    model,
                    kcv_folds,
                    smote=use_smote,
                    rescaling=use_rescaling,
                    cores_num=cores_num,
                    stratified_kfold=__USE_STRATIFIED_KFOLD,
                    pca=__USE_PCA)
                experiment_results.append(model_results)

        # Collect this experiment's results (from either execution path)
        all_models_results.append(experiment_results)

        #print (all_results['KNN'])


########################################################
# Compiling result data from this experiment

#        model_mean_acc = []
#        model_median_acc = []
#        model_median_cmat = []
#        model_time = []
#
#        for model_results in n_experiment_results:
#            # 'model_results' is a dictionary
#
#            model_mean_acc.append(model_results['mean_acc'])
#            model_median_acc.append(model_results['median_acc'])
#            model_median_cmat.append(model_results['median_cmat'])
#            model_time.append(model_results['total_time'])
#
#        np_mean_acc = np.array(model_mean_acc)
#        np_time = np.array(model_time)
#
#        all_mean_acc.append(np_mean_acc.mean())
#        all_median_acc.append(model_median_acc[len(model_median_acc)//2])
#        all_median_cmat.append(model_median_cmat[len(model_median_cmat)//2])
#        all_time.append(np_time.mean)
#
#    models = build_models_list()
#
#    # Printing all compiled data
#    print('* All {0:03d} experiments results:'.format(__TOTAL_RUNNINGS))
#
#
#
#    for model in range(len(models)):
#        name = models[model][0]
#        print('model {0}:'.format(name),end='')
##        #print('all_mean_acc[0].__class__.__name__=',all_mean_acc[0].__class__.__name__)
#
#        #np_all_acc_mean = np.array(all_mean_acc[model])
#        #print(' acc_mean=',np_all_acc_mean,end='')
#
#        print(' acc_median={0}'.format(all_median_acc[model]),end='')
#
#
#        print('\n')

#        mean_acc = np_all_mean.mean()
#        std_acc = np_all_mean.std()
#        max_acc = np.argmax(np_all_mean)
#        min_acc = np.argmin(np_all_mean)
#
#        median_pos = kcv_folds // 2 if kcv_folds % 2 == 1 else None
#        median_acc = all_median_acc[exp][median_pos] if kcv_folds % 2 == 1 else None
#        median_cmat = (all_median_cmat[exp][median_pos] if kcv_folds % 2 == 1 else None)
#
#        np_all_time = np.array(all_time[exp])
#        total_time = np_all_time.mean()
#
#        print('{0}:\tmean_acc={1:.4f} mean_std={2:.4f} median_acc={3:.4f} median_cmat={4} max_acc={5:.4f} min_acc={6:.4f} total_time={7:.4f}s'.format(name, mean_acc, std_acc, median_acc, str(median_cmat), max_acc, min_acc, total_time))

    return 0
Example #3
def main(argv):

    # KNN Parameters
    K_VALUE = 5

    # Seeds test
    USE_KNOWN_GOOD_SLICE_GROUPING = True

    # Use these arguments to set the input directory of the attribute files
    __USE_SAMPLE_DATA_DIR = False
    __SAMPLE_DATA_DIR = "../../attributes_amostra"
    __FULL_DATA_DIR = "../../attributes2"

    attributes_dir = __FULL_DATA_DIR
    csv_file = './ADNI1_Complete_All_Yr_3T.csv'

    if __USE_SAMPLE_DATA_DIR:
        attributes_dir = __SAMPLE_DATA_DIR

    # Getting all data

    start_time = time.time()
    print('Loading all attributes data... ', end='')
    attribs, body_planes, slice_num, slice_amounts, output_classes = loadattribs.load_all_data(
        attributes_dir, csv_file)
    end_time = time.time()
    total_time = end_time - start_time
    print('done (total time to load: {0})'.format(total_time))

    import deap_alzheimer
    min_slices_values = loadattribs.getSliceLimits(slice_amounts)[0]
    valid_bplanes = loadattribs.getBplanes(slice_amounts)

    print('Slice Limits:', min_slices_values)
    print('valid_bplanes=', valid_bplanes)

    #    def getRandomSliceGrouping(all_slice_amounts,
    #                           planes = __DEFAULT_BPLANES,
    #                           max_length = __DEFAULT_MAX_CONSEC_SLICES,
    #                           max_indexes = __DEFAULT_MAX_SLICES_VALUES,    # Maximum value for the first slice index
    #                           dbug=__DEFAULT_DEBUG):

    if USE_KNOWN_GOOD_SLICE_GROUPING:
        print('* Using a specific known good slice grouping... ', end='')

        bplane, start_slice, total_slices = [2, 114, 15]
    else:
        print('* Building a random valid slice grouping... ', end='')

        bplane, start_slice, total_slices = deap_alzheimer.buildRandomSliceGrouping(
            planes=valid_bplanes,
            length=30,
            max_indexes=min_slices_values,
            dbug=False)

    print('done. Slice grouping: [{0}, {1}, {2}]'.format(
        bplane, start_slice, total_slices))

    start_time = time.time()

    # Getting some data partition
    print(
        '* Getting the data partition using this slice grouping {0}... '.
        format([bplane, start_slice, total_slices]),
        end='')
    data_partition = loadattribs.getAttribsPartitionFromSingleSlicesGrouping(
        attribs, slice_amounts, bplane, start_slice, total_slices)
    end_time = time.time()
    total_time = end_time - start_time
    print('done.\n\tTotal time to get the data partition= {0}'.format(
        total_time))
    print('* Data Partition\'s shape= ', data_partition.shape)

    start_time = time.time()
    print(
        '* Starting to run the KNN classifier to evaluate this data partition...')
    accuracy, conf_matrix = runKNN(data_partition,
                                   output_classes,
                                   K_VALUE,
                                   knn_debug=True,
                                   use_smote=True,
                                   use_rescaling=True)

    end_time = time.time()
    total_time = end_time - start_time
    print('done. Total time to run classifier= {0}'.format(total_time))

    print('\n* Confusion matrix was:\n', conf_matrix)
    print('* KNN Accuracy with K={0} was: {1}'.format(K_VALUE, accuracy))

    return 0
Example #4
def main(argv):
    __USE_DATA_SAMPLE = True
    
    
    # KNN Parameters
    K_VALUE = 5
    
    # Use these arguments to set the input directory of the attribute files
    attributes_dir = "../../attributes_amostra"
    
    if not __USE_DATA_SAMPLE:
        attributes_dir = "../../attributes2"
    csv_file = './ADNI1_Complete_All_Yr_3T.csv'
    
    # Getting all data
    
    start_time = time.time()
    print('Loading all attributes data...')
    attribs, body_planes, slice_num, slice_amounts, output_classes = loadattribs.load_all_data(attributes_dir, csv_file)
    end_time = time.time()
    total_time = end_time - start_time
    print('...done (total time to load: {0})'.format(total_time))
    
    import deap_alzheimer
    min_slices_values = loadattribs.getSliceLimits(slice_amounts)[0]
    valid_bplanes = loadattribs.getBplanes(slice_amounts)

    print('Slice Limits:',min_slices_values)
    print('valid_bplanes=',valid_bplanes)
    
    
#    def getRandomSliceGrouping(all_slice_amounts,
#                           planes = __DEFAULT_BPLANES,
#                           max_length = __DEFAULT_MAX_CONSEC_SLICES,
#                           max_indexes = __DEFAULT_MAX_SLICES_VALUES,    # Maximum value for the first slice index 
#                           dbug=__DEFAULT_DEBUG):
    
    print('Getting a random valid slice grouping...')
    bplane, start_slice, total_slices = deap_alzheimer.buildRandomSliceGrouping(
            planes=valid_bplanes,
            length = 30,
            max_indexes = min_slices_values,
            dbug=False)
    print('...done')
    
    print('slice grouping found:\n\tbplane={0},first_slice={1},total_slices={2}'.format(bplane,start_slice,total_slices))
    print('Individual analysed: [{0}, {1}, {2}]'.format(bplane,start_slice,total_slices))
    
    
    start_time = time.time()
     
    # Getting some data partition 
    print('Getting some data partition using this last slice grouping ({0})...'.format((bplane,start_slice,total_slices)))
    data_partition = loadattribs.getAttribsPartitionFromSingleSlicesGrouping(attribs,
                                                          slice_amounts,
                                                          bplane,
                                                          start_slice,
                                                          total_slices)
    
    
    end_time = time.time()
    total_time = end_time - start_time
    print('...done \nTotal time to get the data partition (bplane={1},first_slice={2},total_slices={3}): {0}'.format(total_time,bplane,start_slice,total_slices))
    

    start_time = time.time()
    print('Starting to run knn classifier to evaluate this partition of data...')
    accuracy, conf_matrix = knn_alzheimer.runKNN(data_partition, output_classes, K_VALUE,use_smote=False,use_rescaling=False)
    
    end_time = time.time()
    total_time = end_time - start_time
    print('...done (total time to run classifier: {0})'.format(total_time))
    
    print('Individual analysed: [{0}, {1}, {2}]'.format(bplane,start_slice,total_slices))
    print('\nConfusion matrix was:\n', conf_matrix)
    print('KNN Accuracy with K={0} was: {1}'.format(K_VALUE, accuracy))
    
    
    ###########################################################################
    
    K_VALUE = 5
    # SMOTE BEGINS HERE
    smote_debug = True
    print('* a partition data shape=',data_partition.shape)
    
    # Data preparation
    try:
        # Transforming 3D data to a 2D data
        new_dimensions = (data_partition.shape[0],
                          data_partition.shape[1]*data_partition.shape[2])
    except IndexError:
        print('** IndexError exception')
        print('\tdata_partition.shape=',data_partition.shape)
        print('\toutput_classes.shape=',output_classes.shape)
        sys.exit(-1)
    
    
    new_partition = np.reshape(data_partition, new_dimensions)
    
    #scaled_new_partition = preprocessing.scale(new_partition)

    
    used_partition = new_partition
    
    for i in range(2):
        if i == 1:
            print('\n*** NOW we will do the same however using scaled and balanced data:')
            #used_partition = scaled_new_partition
        

        if smote_debug and False: 
            print('* DIMENSION for the new partition array=',new_dimensions)
            print('* the new partition data shape=',used_partition.shape)
            print('* the output array shape=',output_classes.shape)
            print('* shape of an input instance retrieved from the new partition=', used_partition[0].shape)
    
        ## KNN preparation
        X_pandas = pd.DataFrame(data=used_partition)
        #print('X_pandas=\n',X_pandas)
        #_pandas = pd.DataFrame(data=np.ravel(output_classes,order='C'))
        y_pandas = pd.DataFrame(data=output_classes)
        y_pandas.columns = ['Class']
        #print('y_pandas=\n',y_pandas)
        #print('y_pandas values (without balancing)=\n',pd.value_counts(y_pandas['Class']))
        if i == 1: # STANDARDIZING...
            # Get column names first
            #names = ['Class']
            # Create the Scaler object
            scaler = preprocessing.StandardScaler()
            # Fit your data on the scaler object
            X_pandas = scaler.fit_transform(X_pandas)
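            # Note: the scaler is fitted on the full dataset here, before the train/test split below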
            #X_pandas = pd.DataFrame(scaled_df, columns=names)
         
        # STEP 1: split data between test and train sets
        X_train, X_test, y_train, y_test = train_test_split(X_pandas, np.ravel(y_pandas), test_size=0.3, random_state=12)
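        # In the second pass (i == 1), SMOTE below is applied only to the training split,
        # so the test set keeps its original class distribution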
        
        #import matplotlib.pyplot as plt
        if i == 0: 
            print('classes count(before SMOTE)= ',(sum(y_train==0),sum(y_train==1),sum(y_train==2)))
            #pd.value_counts(y_pandas['Class']).plot.bar()
            #plt.title('Unbalanced Alzheimer class histogram')
            #plt.xlabel('Class')
            #plt.ylabel('Frequency')
   
        elif i == 1:
            #pd.value_counts(y_pandas['Class']).plot.bar()
            #plt.title('Balanced and Normalized Alzheimer class histogram')
            #plt.xlabel('Class')
            #plt.ylabel('Frequency')

            
        
            from imblearn.over_sampling import SMOTE
            smt = SMOTE()
            X_train, y_train = smt.fit_resample(X_train, y_train)  # older imblearn releases call this fit_sample
            #print('classes count(after SMOTE)=\n',np.bincount(y_train))
            print('classes count(after SMOTE)=',(sum(y_train==0),sum(y_train==1),sum(y_train==2)))
        
        
        # STEP 2: train the model on the training set
        knn = KNeighborsClassifier(n_neighbors=K_VALUE)
        knn.fit(X_train, y_train)
        
        # STEP 3: make predictions on the testing set
        y_pred = knn.predict(X_test)
        #if smote_debug: 
            #print('y_pred=\n',y_pred)
            #print('y_pred.shape:',y_pred.shape)
        
        # compare actual response values (y_test) with predicted response values (y_pred)
        accuracy = metrics.accuracy_score(y_test, y_pred) 
        confusion_matrix = metrics.confusion_matrix(y_test,y_pred,labels=None,sample_weight=None)
        
        print('Individual analysed: [{0}, {1}, {2}]'.format(bplane,start_slice,total_slices))
        print('KNN Accuracy with K={0} was: {1}'.format(K_VALUE, accuracy))
        print('confusion matrix:\n',confusion_matrix)
    

    
    
    
    '''
    # STEP 1: split data between test and train sets
    X_train, X_test, y_train, y_test = train_test_split(X_pandas, y_pandas, test_size=0.3, random_state=12)
    
    # print the shapes of the new X objects
    if smote_debug: 
        print('X_train.shape:', X_train.shape)
        print('X_test.shape:', X_test.shape)
    
    y_train = np.ravel(y_train)
    y_test = np.ravel(y_test)
    
    # print the shapes of the new y objects
    if smote_debug: 
        print('y_train.shape:',y_train.shape)
        print('y_test.shape:',y_test.shape)
    
    # STEP 1: adjust shape of y vectors
    np.ravel(y_train)
    
    # STEP 2: train the model on the training set
    knn = KNeighborsClassifier(n_neighbors=k_value)
    knn.fit(X_train, y_train)
    
    # STEP 3: make predictions on the testing set
    y_pred = knn.predict(X_test)
    if smote_debug: 
        print('y_pred=\n',y_pred)
        print('y_pred.shape:',y_pred.shape)
    
    # compare actual response values (y_test) with predicted response values (y_pred)
    accuracy = metrics.accuracy_score(y_test, y_pred) 
    confusion_matrix = metrics.confusion_matrix(y_test,y_pred,labels=None,sample_weight=None)
    '''

    
    
    return 0
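
These snippets define main(argv) only; the enclosing module (imports of time, sys, random, numpy as np, pandas as pd, the project modules loadattribs, deap_alzheimer and knn_alzheimer, and the scikit-learn/imblearn helpers they call) is not shown. A minimal entry-point guard, assuming the usual convention for such scripts, might look like:

if __name__ == '__main__':
    import sys
    sys.exit(main(sys.argv))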