def main():
    """Mean-distance classifier (method 1) for the physics dataset.

    Loads the training data, removes degenerate columns, computes one mean
    vector per class, then labels each test observation by its distance to
    the class means, writing "<id> <class>" lines to
    /tmp/test_mean_method.txt (ids start at 50001).
    """
    getopts()
    # Read the physics dataset
    phy_data = get_data(
        '/Users/nareshs/Documents/projects/base/datasets/phy_train.dat.zip',
        'phy_train.dat')
    # Remove degenerate columns: all-zero columns, constant columns, and
    # binary columns where one of the two values is that column's outlier
    # marker.
    # Fix: collect the indexes over *all* classes first, then delete the
    # same set from every class.  Previously `idxs` kept accumulating while
    # deletion happened inside the per-class loop, so different classes
    # could end up with different column sets.
    idxs = set()
    for class_id in phy_data:
        all_phy_data = np.array(phy_data[class_id])
        sum_arr = np.sum(all_phy_data, axis=0)
        for idx, val in enumerate(sum_arr):
            if abs(val) == 0.0:
                idxs.add(idx)
        for idx, col in enumerate(all_phy_data.T):
            uniq = np.unique(col)
            if len(uniq) == 1 or (len(uniq) == 2 and get_col_outlier_value(idx) in col):
                idxs.add(idx)
    idxs = list(idxs)
    for class_id in phy_data:
        phy_data[class_id] = np.delete(np.array(phy_data[class_id]), idxs, 1)

    # One mean vector per class; get_class() compares against these.
    class_2_mean_arr = {}
    for class_id in phy_data:
        info('Processing data for class: ' + str(class_id))
        class_2_mean_arr[class_id] = get_mean_arr(phy_data[class_id])

    # First method: Classify on the basis of distance from the mean.
    # This is a very primitive classifier
    phy_test_data = get_data(
        '/Users/nareshs/Documents/projects/base/datasets/phy_test.dat.zip',
        'phy_test.dat')
    for class_id in phy_test_data:
        # Drop the same degenerate columns from the test rows so their
        # width matches the (filtered) training means.
        # NOTE(review): assumes get_processed_obs expects the filtered
        # width — TODO confirm against its definition.
        filtered = np.delete(np.array(phy_test_data[class_id]), idxs, 1)
        mean_arr = get_mean_arr(filtered)
        phy_test_data[class_id] = [get_processed_obs(x, mean_arr) for x in filtered]

    # Emit the predictions; `with` guarantees the handle is closed.
    with open('/tmp/test_mean_method.txt', 'w') as fh:
        cnt = 50001
        for x in phy_test_data:
            for arr in phy_test_data[x]:
                fh.write(str(cnt) + ' ' + str(get_class(class_2_mean_arr, arr)) + '\n')
                cnt = cnt + 1
def main():
    """Classify the physics test set by distance from per-class means.

    Pipeline: load training data, drop degenerate columns, compute class
    means, then write one "<id> <predicted class>" line per test row to
    /tmp/test_mean_method.txt, numbering rows from 50001.
    """
    getopts()
    # Read the physics dataset
    phy_data = get_data(
        '/Users/nareshs/Documents/projects/base/datasets/phy_train.dat.zip',
        'phy_train.dat')
    # Columns to drop: all-zero, constant, or binary with the column's
    # outlier marker as one of the two values.
    # Fix: gather indexes across every class before deleting, and delete
    # the identical set from each class.  The original deleted inside the
    # gathering loop while `idxs` grew, so later classes lost more columns
    # than earlier ones.
    bad_cols = set()
    for class_id in phy_data:
        class_rows = np.array(phy_data[class_id])
        col_sums = np.sum(class_rows, axis=0)
        for col_idx, col_sum in enumerate(col_sums):
            if abs(col_sum) == 0.0:
                bad_cols.add(col_idx)
        for col_idx, col in enumerate(class_rows.T):
            distinct = np.unique(col)
            if len(distinct) == 1 or (len(distinct) == 2 and get_col_outlier_value(col_idx) in col):
                bad_cols.add(col_idx)
    idxs = list(bad_cols)
    for class_id in phy_data:
        phy_data[class_id] = np.delete(np.array(phy_data[class_id]), idxs, 1)

    # Per-class mean vectors used by get_class() below.
    class_2_mean_arr = {}
    for class_id in phy_data:
        info('Processing data for class: ' + str(class_id))
        class_2_mean_arr[class_id] = get_mean_arr(phy_data[class_id])

    # First method: Classify on the basis of distance from the mean.
    # This is a very primitive classifier
    phy_test_data = get_data(
        '/Users/nareshs/Documents/projects/base/datasets/phy_test.dat.zip',
        'phy_test.dat')
    for class_id in phy_test_data:
        # Apply the same column deletion so test rows match the width of
        # the training means.  NOTE(review): presumes get_processed_obs
        # operates on the filtered width — verify against its definition.
        rows = np.delete(np.array(phy_test_data[class_id]), idxs, 1)
        mean_arr = get_mean_arr(rows)
        phy_test_data[class_id] = [get_processed_obs(r, mean_arr) for r in rows]

    with open('/tmp/test_mean_method.txt', 'w') as fh:
        cnt = 50001
        for key in phy_test_data:
            for arr in phy_test_data[key]:
                fh.write(str(cnt) + ' ' + str(get_class(class_2_mean_arr, arr)) + '\n')
                cnt = cnt + 1
def main():
    """Load the physics training data, drop degenerate columns, and compute
    a mean vector per class.

    This variant stops after preprocessing; `class_2_mean_arr` is built but
    not yet consumed.
    """
    getopts()
    # Read the physics dataset
    phy_data = get_data(
        '/Users/nareshs/Documents/projects/base/datasets/phy_train.dat.zip',
        'phy_train.dat')
    # Remove degenerate columns (all-zero, constant, or binary with the
    # per-column outlier marker present).
    # Fix: collect the column indexes over all classes first, then delete
    # the same set everywhere.  Previously deletion happened per class
    # while `idxs` accumulated, leaving classes with mismatched widths.
    idxs = set()
    for class_id in phy_data:
        all_phy_data = np.array(phy_data[class_id])
        sum_arr = np.sum(all_phy_data, axis=0)
        for idx, val in enumerate(sum_arr):
            if abs(val) == 0.0:
                idxs.add(idx)
        for idx, col in enumerate(all_phy_data.T):
            uniq = np.unique(col)
            if len(uniq) == 1 or (len(uniq) == 2 and get_col_outlier_value(idx) in col):
                idxs.add(idx)
    idxs = list(idxs)
    for class_id in phy_data:
        phy_data[class_id] = np.delete(np.array(phy_data[class_id]), idxs, 1)

    # One mean vector per class.  (Dead per-class `mean_arr` computed
    # during preprocessing in the original was removed.)
    class_2_mean_arr = {}
    for class_id in phy_data:
        info('Processing data for class: ' + str(class_id))
        class_2_mean_arr[class_id] = get_mean_arr(phy_data[class_id])
def main():
    """Preprocess the physics training data.

    Drops degenerate columns from every class and computes the per-class
    mean vectors; this variant performs no classification yet.
    """
    getopts()
    # Read the physics dataset
    phy_data = get_data(
        '/Users/nareshs/Documents/projects/base/datasets/phy_train.dat.zip',
        'phy_train.dat')
    # Identify columns to drop: zero-sum columns, constant columns, and
    # binary columns containing the column's outlier marker.
    # Fix: two passes — gather indexes over all classes, then delete the
    # identical set from each class.  The original interleaved gathering
    # and deleting, so each class lost a different (growing) set.
    drop = set()
    for class_id in phy_data:
        rows = np.array(phy_data[class_id])
        for col_idx, col_sum in enumerate(np.sum(rows, axis=0)):
            if abs(col_sum) == 0.0:
                drop.add(col_idx)
        for col_idx, col in enumerate(rows.T):
            distinct = np.unique(col)
            if len(distinct) == 1 or (len(distinct) == 2 and get_col_outlier_value(col_idx) in col):
                drop.add(col_idx)
    idxs = list(drop)
    for class_id in phy_data:
        phy_data[class_id] = np.delete(np.array(phy_data[class_id]), idxs, 1)

    # Per-class mean vectors for downstream classifiers.
    class_2_mean_arr = {}
    for class_id in phy_data:
        info('Processing data for class: ' + str(class_id))
        class_2_mean_arr[class_id] = get_mean_arr(phy_data[class_id])
def main():
    """Linear least-squares classifier (method 2) for the physics dataset.

    Fits coefficients b so that y ~= x.dot(b) over the training data via
    the normal equations, then writes "<id> <score>" lines for the test
    set to /tmp/test_linalg.txt (ids start at 50001).
    """
    getopts()
    # Read the physics dataset
    phy_data = get_data(
        '/Users/nareshs/Documents/projects/base/datasets/phy_train.dat.zip',
        'phy_train.dat')
    # Remove degenerate columns (all-zero, constant, or binary with the
    # per-column outlier marker).
    # Fix: collect indexes over all classes first, then delete uniformly;
    # the original deleted per class while `idxs` accumulated.
    idxs = set()
    for class_id in phy_data:
        all_phy_data = np.array(phy_data[class_id])
        sum_arr = np.sum(all_phy_data, axis=0)
        for idx, val in enumerate(sum_arr):
            if abs(val) == 0.0:
                idxs.add(idx)
        for idx, col in enumerate(all_phy_data.T):
            uniq = np.unique(col)
            if len(uniq) == 1 or (len(uniq) == 2 and get_col_outlier_value(idx) in col):
                idxs.add(idx)
    idxs = list(idxs)
    for class_id in phy_data:
        phy_data[class_id] = np.delete(np.array(phy_data[class_id]), idxs, 1)

    class_2_mean_arr = {}
    for class_id in phy_data:
        info('Processing data for class: ' + str(class_id))
        class_2_mean_arr[class_id] = get_mean_arr(phy_data[class_id])

    # Second method: Linear regression
    # Simply put, we will be calculating [b1, b2, b3 .. bn] such that
    # for the ith observation:
    #   yi = b1 * x1i + b2 * x2i ... + bn * xni
    # where n is the number of distinct attributes
    # More details at: http://en.wikipedia.org/wiki/Regression_analysis
    # Derivation at: http://en.wikipedia.org/wiki/Linear_least_squares_(mathematics)
    y = []  # class id per observation
    x = []  # processed observations
    for class_id in phy_data:
        for obs in phy_data[class_id]:
            y.append(int(class_id))
            x.append(get_processed_obs(obs, class_2_mean_arr[class_id]))
    x = np.array(x)
    y = np.array(y)
    info('Created x and y np arrays')
    # Solve the normal equations (x.T x) b = x.T y directly.
    # np.linalg.solve is numerically better behaved (and cheaper) than
    # forming the explicit inverse as the original did.
    b = np.linalg.solve(x.T.dot(x), x.T.dot(y))
    info('Solved the normal equations for b')

    # Bug fix: this variant never loaded the test set, so the loop below
    # raised a NameError on `phy_test_data`.
    phy_test_data = get_data(
        '/Users/nareshs/Documents/projects/base/datasets/phy_test.dat.zip',
        'phy_test.dat')
    with open('/tmp/test_linalg.txt', 'w') as fh:
        cnt = 50001
        for key in phy_test_data:
            for arr in phy_test_data[key]:
                # Drop the same degenerate columns before scoring.
                l = len(arr)
                arr = np.delete(np.array(arr).reshape(1, l), idxs, 1)
                val = np.sum(b * arr)
                fh.write(str(cnt) + ' ' + str(val) + '\n')
                cnt = cnt + 1
def main():
    """Score the physics test set with a linear least-squares fit.

    Builds the design matrix from the processed training observations,
    solves the normal equations for the coefficient vector, and writes one
    "<id> <score>" line per test row to /tmp/test_linalg.txt.
    """
    getopts()
    # Read the physics dataset
    phy_data = get_data(
        '/Users/nareshs/Documents/projects/base/datasets/phy_train.dat.zip',
        'phy_train.dat')
    # Degenerate columns: zero-sum, constant, or binary with the column's
    # outlier marker present.
    # Fix: gather across every class, then delete the identical set from
    # each class (the original deleted per class with a growing `idxs`).
    bad_cols = set()
    for class_id in phy_data:
        class_rows = np.array(phy_data[class_id])
        for col_idx, col_sum in enumerate(np.sum(class_rows, axis=0)):
            if abs(col_sum) == 0.0:
                bad_cols.add(col_idx)
        for col_idx, col in enumerate(class_rows.T):
            distinct = np.unique(col)
            if len(distinct) == 1 or (len(distinct) == 2 and get_col_outlier_value(col_idx) in col):
                bad_cols.add(col_idx)
    idxs = list(bad_cols)
    for class_id in phy_data:
        phy_data[class_id] = np.delete(np.array(phy_data[class_id]), idxs, 1)

    class_2_mean_arr = {}
    for class_id in phy_data:
        info('Processing data for class: ' + str(class_id))
        class_2_mean_arr[class_id] = get_mean_arr(phy_data[class_id])

    # Second method: Linear regression
    # We compute [b1 .. bn] so that yi = b1*x1i + b2*x2i + ... + bn*xni.
    # More details at: http://en.wikipedia.org/wiki/Regression_analysis
    # Derivation at: http://en.wikipedia.org/wiki/Linear_least_squares_(mathematics)
    targets = []
    design = []
    for class_id in phy_data:
        for obs in phy_data[class_id]:
            targets.append(int(class_id))
            design.append(get_processed_obs(obs, class_2_mean_arr[class_id]))
    x = np.array(design)
    y = np.array(targets)
    info('Created x and y np arrays')
    # (x.T x) b = x.T y — solved directly rather than via the explicit
    # inverse the original computed; solve() is faster and more stable.
    b = np.linalg.solve(x.T.dot(x), x.T.dot(y))
    info('Solved the normal equations for b')

    # Bug fix: `phy_test_data` was referenced below but never assigned in
    # this variant (NameError at runtime).
    phy_test_data = get_data(
        '/Users/nareshs/Documents/projects/base/datasets/phy_test.dat.zip',
        'phy_test.dat')
    with open('/tmp/test_linalg.txt', 'w') as fh:
        cnt = 50001
        for key in phy_test_data:
            for arr in phy_test_data[key]:
                width = len(arr)
                row = np.delete(np.array(arr).reshape(1, width), idxs, 1)
                fh.write(str(cnt) + ' ' + str(np.sum(b * row)) + '\n')
                cnt = cnt + 1
def main():
    """Perceptron classifier (method 3) for the physics dataset.

    Trains a single-layer perceptron with a bias input until the average
    training error drops below a threshold, then prints "<id> <score>" for
    each test observation (ids start at 50001).
    """
    getopts()
    # Read the physics dataset
    phy_data = get_data(
        '/Users/nareshs/Documents/projects/base/datasets/phy_train.dat.zip',
        'phy_train.dat')
    # Remove degenerate columns (all-zero, constant, or binary with the
    # per-column outlier marker).
    # Fix: collect indexes over all classes first, then delete uniformly;
    # the original deleted per class while `idxs` accumulated.
    idxs = set()
    for class_id in phy_data:
        all_phy_data = np.array(phy_data[class_id])
        sum_arr = np.sum(all_phy_data, axis=0)
        for idx, val in enumerate(sum_arr):
            if abs(val) == 0.0:
                idxs.add(idx)
        for idx, col in enumerate(all_phy_data.T):
            uniq = np.unique(col)
            if len(uniq) == 1 or (len(uniq) == 2 and get_col_outlier_value(idx) in col):
                idxs.add(idx)
    idxs = list(idxs)
    for class_id in phy_data:
        phy_data[class_id] = np.delete(np.array(phy_data[class_id]), idxs, 1)

    class_2_mean_arr = {}
    for class_id in phy_data:
        info('Processing data for class: ' + str(class_id))
        class_2_mean_arr[class_id] = get_mean_arr(phy_data[class_id])

    # Third method: Perceptron learning algorithm
    # http://en.wikipedia.org/wiki/Perceptron#Learning_algorithm
    # http://page.mi.fu-berlin.de/rojas/neural/chapter/K4.pdf
    # More detailed: http://hagan.okstate.edu/4_Perceptron.pdf
    num_features = phy_data['1'][0].shape[0]
    # One weight per feature plus a leading bias weight.
    # Fix: the original appended every intermediate weight vector to a
    # list that grew without bound; only the latest vector is ever used.
    w = np.zeros(num_features + 1)
    info('Initialized weights array of length ' + str(num_features))
    gamma = 0.01  # convergence threshold on the average error
    alpha = 0.1   # learning rate
    all_data = []
    for class_id in phy_data:
        for row in phy_data[class_id]:
            # Prepend the constant 1.0 bias input to every observation.
            all_data.append((class_id, np.array([1.0] + list(row))))
    iter_num = 1
    converged = False
    print('Entering the while loop')
    while not converged:
        print('%s: Iterating over all data. No.: %d' % (datetime.now(), iter_num))
        for row_num, (class_id, features) in enumerate(all_data):
            expected_val = float(class_id)
            f_val = 1 if np.sum(w * features) > 0 else 0
            # Standard perceptron update rule.
            w = w + alpha * (expected_val - f_val) * features
            # Check the average error over the full training set.
            # NOTE(review): doing this after every single row is O(n^2)
            # per epoch, as in the original — kept to preserve behavior.
            total_num_obs = len(all_data)
            sum_of_errors = 0.0
            for test_class_id, test_features in all_data:
                test_f_val = 1 if np.sum(test_features * w) > 0 else 0
                sum_of_errors += abs(float(test_class_id) - test_f_val)
            print('%s: Got sum of errors: %s for %d data points'
                  % (datetime.now(), sum_of_errors, total_num_obs))
            avg_error = sum_of_errors / total_num_obs
            if avg_error < gamma:
                # Fix: a bare `break` here only exited this inner loop,
                # so the enclosing `while True` never terminated.
                converged = True
                break
            print('After processing %d got avg_error %s' % (row_num, avg_error))
        iter_num = iter_num + 1

    # Bug fix: this variant never loaded the test set, so `phy_test_data`
    # raised a NameError below.
    phy_test_data = get_data(
        '/Users/nareshs/Documents/projects/base/datasets/phy_test.dat.zip',
        'phy_test.dat')
    cnt = 50001
    for key in phy_test_data:
        for arr in phy_test_data[key]:
            width = len(arr)
            row = np.delete(np.array(arr).reshape(1, width), idxs, 1)
            # Fix: prepend the bias input so the row width matches the
            # (n+1)-length weight vector.
            row = np.hstack(([[1.0]], row))
            print('%d %s' % (cnt, np.sum(w * row)))
            cnt = cnt + 1
def main():
    """Train a perceptron on the physics data and score the test set.

    Uses the classic perceptron update with a bias input; training stops
    once the average error over the whole training set falls below gamma.
    Predictions are printed as "<id> <score>" lines from id 50001.
    """
    getopts()
    # Read the physics dataset
    phy_data = get_data(
        '/Users/nareshs/Documents/projects/base/datasets/phy_train.dat.zip',
        'phy_train.dat')
    # Degenerate columns: zero-sum, constant, or binary with the column's
    # outlier marker present.
    # Fix: gather across every class, then delete the identical set from
    # each class (the original deleted per class with a growing `idxs`).
    bad_cols = set()
    for class_id in phy_data:
        class_rows = np.array(phy_data[class_id])
        for col_idx, col_sum in enumerate(np.sum(class_rows, axis=0)):
            if abs(col_sum) == 0.0:
                bad_cols.add(col_idx)
        for col_idx, col in enumerate(class_rows.T):
            distinct = np.unique(col)
            if len(distinct) == 1 or (len(distinct) == 2 and get_col_outlier_value(col_idx) in col):
                bad_cols.add(col_idx)
    idxs = list(bad_cols)
    for class_id in phy_data:
        phy_data[class_id] = np.delete(np.array(phy_data[class_id]), idxs, 1)

    class_2_mean_arr = {}
    for class_id in phy_data:
        info('Processing data for class: ' + str(class_id))
        class_2_mean_arr[class_id] = get_mean_arr(phy_data[class_id])

    # Third method: Perceptron learning algorithm
    # http://en.wikipedia.org/wiki/Perceptron#Learning_algorithm
    # http://page.mi.fu-berlin.de/rojas/neural/chapter/K4.pdf
    # More detailed: http://hagan.okstate.edu/4_Perceptron.pdf
    n_features = phy_data['1'][0].shape[0]
    # Single weight vector (bias weight first).  Fix: the original stored
    # the entire history of weight vectors, growing memory without bound.
    weights = np.zeros(n_features + 1)
    info('Initialized weights array of length ' + str(n_features))
    gamma = 0.01  # stop once average error is below this
    alpha = 0.1   # learning rate
    all_data = []
    for class_id in phy_data:
        for row in phy_data[class_id]:
            # Bias input 1.0 goes in front of each observation.
            all_data.append((class_id, np.array([1.0] + list(row))))
    iter_num = 1
    done = False
    print('Entering the while loop')
    while not done:
        print('%s: Iterating over all data. No.: %d' % (datetime.now(), iter_num))
        for row_num, (label, features) in enumerate(all_data):
            target = float(label)
            predicted = 1 if np.sum(weights * features) > 0 else 0
            # Perceptron update.
            weights = weights + alpha * (target - predicted) * features
            # Full-dataset error check (per row, as in the original).
            total_num_obs = len(all_data)
            sum_of_errors = 0.0
            for chk_label, chk_features in all_data:
                chk_pred = 1 if np.sum(chk_features * weights) > 0 else 0
                sum_of_errors += abs(float(chk_label) - chk_pred)
            print('%s: Got sum of errors: %s for %d data points'
                  % (datetime.now(), sum_of_errors, total_num_obs))
            avg_error = sum_of_errors / total_num_obs
            if avg_error < gamma:
                # Fix: the original's bare `break` left the outer
                # `while True` spinning forever.
                done = True
                break
            print('After processing %d got avg_error %s' % (row_num, avg_error))
        iter_num = iter_num + 1

    # Bug fix: the test set was never loaded in this variant, so the loop
    # below hit a NameError on `phy_test_data`.
    phy_test_data = get_data(
        '/Users/nareshs/Documents/projects/base/datasets/phy_test.dat.zip',
        'phy_test.dat')
    cnt = 50001
    for key in phy_test_data:
        for arr in phy_test_data[key]:
            width = len(arr)
            row = np.delete(np.array(arr).reshape(1, width), idxs, 1)
            # Fix: add the bias input so widths match the weight vector.
            row = np.hstack(([[1.0]], row))
            print('%d %s' % (cnt, np.sum(weights * row)))
            cnt = cnt + 1