def main():
    """Mean-distance classifier (method 1) for the physics dataset.

    Loads the training data, removes degenerate columns, computes one mean
    vector per class, then labels each test observation by its distance to
    the class means, writing "<id> <class>" lines to
    /tmp/test_mean_method.txt (ids start at 50001).
    """
    getopts()
    # Read the physics dataset
    phy_data = get_data(
        '/Users/nareshs/Documents/projects/base/datasets/phy_train.dat.zip',
        'phy_train.dat')
    # Remove degenerate columns: all-zero columns, constant columns, and
    # binary columns where one of the two values is that column's outlier
    # marker.
    # Fix: collect the indexes over *all* classes first, then delete the
    # same set from every class.  Previously `idxs` kept accumulating while
    # deletion happened inside the per-class loop, so different classes
    # could end up with different column sets.
    idxs = set()
    for class_id in phy_data:
        all_phy_data = np.array(phy_data[class_id])
        sum_arr = np.sum(all_phy_data, axis=0)
        for idx, val in enumerate(sum_arr):
            if abs(val) == 0.0:
                idxs.add(idx)
        for idx, col in enumerate(all_phy_data.T):
            uniq = np.unique(col)
            if len(uniq) == 1 or (len(uniq) == 2 and get_col_outlier_value(idx) in col):
                idxs.add(idx)
    idxs = list(idxs)
    for class_id in phy_data:
        phy_data[class_id] = np.delete(np.array(phy_data[class_id]), idxs, 1)

    # One mean vector per class; get_class() compares against these.
    class_2_mean_arr = {}
    for class_id in phy_data:
        info('Processing data for class: ' + str(class_id))
        class_2_mean_arr[class_id] = get_mean_arr(phy_data[class_id])

    # First method: Classify on the basis of distance from the mean.
    # This is a very primitive classifier
    phy_test_data = get_data(
        '/Users/nareshs/Documents/projects/base/datasets/phy_test.dat.zip',
        'phy_test.dat')
    for class_id in phy_test_data:
        # Drop the same degenerate columns from the test rows so their
        # width matches the (filtered) training means.
        # NOTE(review): assumes get_processed_obs expects the filtered
        # width — TODO confirm against its definition.
        filtered = np.delete(np.array(phy_test_data[class_id]), idxs, 1)
        mean_arr = get_mean_arr(filtered)
        phy_test_data[class_id] = [get_processed_obs(x, mean_arr) for x in filtered]

    # Emit the predictions; `with` guarantees the handle is closed.
    with open('/tmp/test_mean_method.txt', 'w') as fh:
        cnt = 50001
        for x in phy_test_data:
            for arr in phy_test_data[x]:
                fh.write(str(cnt) + ' ' + str(get_class(class_2_mean_arr, arr)) + '\n')
                cnt = cnt + 1
def main():
    """Classify the physics test set by distance from per-class means.

    Pipeline: load training data, drop degenerate columns, compute class
    means, then write one "<id> <predicted class>" line per test row to
    /tmp/test_mean_method.txt, numbering rows from 50001.
    """
    getopts()
    # Read the physics dataset
    phy_data = get_data(
        '/Users/nareshs/Documents/projects/base/datasets/phy_train.dat.zip',
        'phy_train.dat')
    # Columns to drop: all-zero, constant, or binary with the column's
    # outlier marker as one of the two values.
    # Fix: gather indexes across every class before deleting, and delete
    # the identical set from each class.  The original deleted inside the
    # gathering loop while `idxs` grew, so later classes lost more columns
    # than earlier ones.
    bad_cols = set()
    for class_id in phy_data:
        class_rows = np.array(phy_data[class_id])
        col_sums = np.sum(class_rows, axis=0)
        for col_idx, col_sum in enumerate(col_sums):
            if abs(col_sum) == 0.0:
                bad_cols.add(col_idx)
        for col_idx, col in enumerate(class_rows.T):
            distinct = np.unique(col)
            if len(distinct) == 1 or (len(distinct) == 2 and get_col_outlier_value(col_idx) in col):
                bad_cols.add(col_idx)
    idxs = list(bad_cols)
    for class_id in phy_data:
        phy_data[class_id] = np.delete(np.array(phy_data[class_id]), idxs, 1)

    # Per-class mean vectors used by get_class() below.
    class_2_mean_arr = {}
    for class_id in phy_data:
        info('Processing data for class: ' + str(class_id))
        class_2_mean_arr[class_id] = get_mean_arr(phy_data[class_id])

    # First method: Classify on the basis of distance from the mean.
    # This is a very primitive classifier
    phy_test_data = get_data(
        '/Users/nareshs/Documents/projects/base/datasets/phy_test.dat.zip',
        'phy_test.dat')
    for class_id in phy_test_data:
        # Apply the same column deletion so test rows match the width of
        # the training means.  NOTE(review): presumes get_processed_obs
        # operates on the filtered width — verify against its definition.
        rows = np.delete(np.array(phy_test_data[class_id]), idxs, 1)
        mean_arr = get_mean_arr(rows)
        phy_test_data[class_id] = [get_processed_obs(r, mean_arr) for r in rows]

    with open('/tmp/test_mean_method.txt', 'w') as fh:
        cnt = 50001
        for key in phy_test_data:
            for arr in phy_test_data[key]:
                fh.write(str(cnt) + ' ' + str(get_class(class_2_mean_arr, arr)) + '\n')
                cnt = cnt + 1
def main():
    """Load the physics training data, drop degenerate columns, and compute
    a mean vector per class.

    This variant stops after preprocessing; `class_2_mean_arr` is built but
    not yet consumed.
    """
    getopts()
    # Read the physics dataset
    phy_data = get_data(
        '/Users/nareshs/Documents/projects/base/datasets/phy_train.dat.zip',
        'phy_train.dat')
    # Remove degenerate columns (all-zero, constant, or binary with the
    # per-column outlier marker present).
    # Fix: collect the column indexes over all classes first, then delete
    # the same set everywhere.  Previously deletion happened per class
    # while `idxs` accumulated, leaving classes with mismatched widths.
    idxs = set()
    for class_id in phy_data:
        all_phy_data = np.array(phy_data[class_id])
        sum_arr = np.sum(all_phy_data, axis=0)
        for idx, val in enumerate(sum_arr):
            if abs(val) == 0.0:
                idxs.add(idx)
        for idx, col in enumerate(all_phy_data.T):
            uniq = np.unique(col)
            if len(uniq) == 1 or (len(uniq) == 2 and get_col_outlier_value(idx) in col):
                idxs.add(idx)
    idxs = list(idxs)
    for class_id in phy_data:
        phy_data[class_id] = np.delete(np.array(phy_data[class_id]), idxs, 1)

    # One mean vector per class.  (Dead per-class `mean_arr` computed
    # during preprocessing in the original was removed.)
    class_2_mean_arr = {}
    for class_id in phy_data:
        info('Processing data for class: ' + str(class_id))
        class_2_mean_arr[class_id] = get_mean_arr(phy_data[class_id])
def main():
    """Preprocess the physics training data.

    Drops degenerate columns from every class and computes the per-class
    mean vectors; this variant performs no classification yet.
    """
    getopts()
    # Read the physics dataset
    phy_data = get_data(
        '/Users/nareshs/Documents/projects/base/datasets/phy_train.dat.zip',
        'phy_train.dat')
    # Identify columns to drop: zero-sum columns, constant columns, and
    # binary columns containing the column's outlier marker.
    # Fix: two passes — gather indexes over all classes, then delete the
    # identical set from each class.  The original interleaved gathering
    # and deleting, so each class lost a different (growing) set.
    drop = set()
    for class_id in phy_data:
        rows = np.array(phy_data[class_id])
        for col_idx, col_sum in enumerate(np.sum(rows, axis=0)):
            if abs(col_sum) == 0.0:
                drop.add(col_idx)
        for col_idx, col in enumerate(rows.T):
            distinct = np.unique(col)
            if len(distinct) == 1 or (len(distinct) == 2 and get_col_outlier_value(col_idx) in col):
                drop.add(col_idx)
    idxs = list(drop)
    for class_id in phy_data:
        phy_data[class_id] = np.delete(np.array(phy_data[class_id]), idxs, 1)

    # Per-class mean vectors for downstream classifiers.
    class_2_mean_arr = {}
    for class_id in phy_data:
        info('Processing data for class: ' + str(class_id))
        class_2_mean_arr[class_id] = get_mean_arr(phy_data[class_id])
def main():
    """Linear least-squares classifier (method 2) for the physics dataset.

    Fits coefficients b so that y ~= x.dot(b) over the training data via
    the normal equations, then writes "<id> <score>" lines for the test
    set to /tmp/test_linalg.txt (ids start at 50001).
    """
    getopts()
    # Read the physics dataset
    phy_data = get_data(
        '/Users/nareshs/Documents/projects/base/datasets/phy_train.dat.zip',
        'phy_train.dat')
    # Remove degenerate columns (all-zero, constant, or binary with the
    # per-column outlier marker).
    # Fix: collect indexes over all classes first, then delete uniformly;
    # the original deleted per class while `idxs` accumulated.
    idxs = set()
    for class_id in phy_data:
        all_phy_data = np.array(phy_data[class_id])
        sum_arr = np.sum(all_phy_data, axis=0)
        for idx, val in enumerate(sum_arr):
            if abs(val) == 0.0:
                idxs.add(idx)
        for idx, col in enumerate(all_phy_data.T):
            uniq = np.unique(col)
            if len(uniq) == 1 or (len(uniq) == 2 and get_col_outlier_value(idx) in col):
                idxs.add(idx)
    idxs = list(idxs)
    for class_id in phy_data:
        phy_data[class_id] = np.delete(np.array(phy_data[class_id]), idxs, 1)

    class_2_mean_arr = {}
    for class_id in phy_data:
        info('Processing data for class: ' + str(class_id))
        class_2_mean_arr[class_id] = get_mean_arr(phy_data[class_id])

    # Second method: Linear regression
    # Simply put, we will be calculating [b1, b2, b3 .. bn] such that
    # for the ith observation:
    #   yi = b1 * x1i + b2 * x2i ... + bn * xni
    # where n is the number of distinct attributes
    # More details at: http://en.wikipedia.org/wiki/Regression_analysis
    # Derivation at: http://en.wikipedia.org/wiki/Linear_least_squares_(mathematics)
    y = []  # class id per observation
    x = []  # processed observations
    for class_id in phy_data:
        for obs in phy_data[class_id]:
            y.append(int(class_id))
            x.append(get_processed_obs(obs, class_2_mean_arr[class_id]))
    x = np.array(x)
    y = np.array(y)
    info('Created x and y np arrays')
    # Solve the normal equations (x.T x) b = x.T y directly.
    # np.linalg.solve is numerically better behaved (and cheaper) than
    # forming the explicit inverse as the original did.
    b = np.linalg.solve(x.T.dot(x), x.T.dot(y))
    info('Solved the normal equations for b')

    # Bug fix: this variant never loaded the test set, so the loop below
    # raised a NameError on `phy_test_data`.
    phy_test_data = get_data(
        '/Users/nareshs/Documents/projects/base/datasets/phy_test.dat.zip',
        'phy_test.dat')
    with open('/tmp/test_linalg.txt', 'w') as fh:
        cnt = 50001
        for key in phy_test_data:
            for arr in phy_test_data[key]:
                # Drop the same degenerate columns before scoring.
                l = len(arr)
                arr = np.delete(np.array(arr).reshape(1, l), idxs, 1)
                val = np.sum(b * arr)
                fh.write(str(cnt) + ' ' + str(val) + '\n')
                cnt = cnt + 1
def main():
    """Score the physics test set with a linear least-squares fit.

    Builds the design matrix from the processed training observations,
    solves the normal equations for the coefficient vector, and writes one
    "<id> <score>" line per test row to /tmp/test_linalg.txt.
    """
    getopts()
    # Read the physics dataset
    phy_data = get_data(
        '/Users/nareshs/Documents/projects/base/datasets/phy_train.dat.zip',
        'phy_train.dat')
    # Degenerate columns: zero-sum, constant, or binary with the column's
    # outlier marker present.
    # Fix: gather across every class, then delete the identical set from
    # each class (the original deleted per class with a growing `idxs`).
    bad_cols = set()
    for class_id in phy_data:
        class_rows = np.array(phy_data[class_id])
        for col_idx, col_sum in enumerate(np.sum(class_rows, axis=0)):
            if abs(col_sum) == 0.0:
                bad_cols.add(col_idx)
        for col_idx, col in enumerate(class_rows.T):
            distinct = np.unique(col)
            if len(distinct) == 1 or (len(distinct) == 2 and get_col_outlier_value(col_idx) in col):
                bad_cols.add(col_idx)
    idxs = list(bad_cols)
    for class_id in phy_data:
        phy_data[class_id] = np.delete(np.array(phy_data[class_id]), idxs, 1)

    class_2_mean_arr = {}
    for class_id in phy_data:
        info('Processing data for class: ' + str(class_id))
        class_2_mean_arr[class_id] = get_mean_arr(phy_data[class_id])

    # Second method: Linear regression
    # We compute [b1 .. bn] so that yi = b1*x1i + b2*x2i + ... + bn*xni.
    # More details at: http://en.wikipedia.org/wiki/Regression_analysis
    # Derivation at: http://en.wikipedia.org/wiki/Linear_least_squares_(mathematics)
    targets = []
    design = []
    for class_id in phy_data:
        for obs in phy_data[class_id]:
            targets.append(int(class_id))
            design.append(get_processed_obs(obs, class_2_mean_arr[class_id]))
    x = np.array(design)
    y = np.array(targets)
    info('Created x and y np arrays')
    # (x.T x) b = x.T y — solved directly rather than via the explicit
    # inverse the original computed; solve() is faster and more stable.
    b = np.linalg.solve(x.T.dot(x), x.T.dot(y))
    info('Solved the normal equations for b')

    # Bug fix: `phy_test_data` was referenced below but never assigned in
    # this variant (NameError at runtime).
    phy_test_data = get_data(
        '/Users/nareshs/Documents/projects/base/datasets/phy_test.dat.zip',
        'phy_test.dat')
    with open('/tmp/test_linalg.txt', 'w') as fh:
        cnt = 50001
        for key in phy_test_data:
            for arr in phy_test_data[key]:
                width = len(arr)
                row = np.delete(np.array(arr).reshape(1, width), idxs, 1)
                fh.write(str(cnt) + ' ' + str(np.sum(b * row)) + '\n')
                cnt = cnt + 1
def main():
    """Perceptron classifier (method 3) for the physics dataset.

    Trains a single-layer perceptron with a bias input until the average
    training error drops below a threshold, then prints "<id> <score>" for
    each test observation (ids start at 50001).
    """
    getopts()
    # Read the physics dataset
    phy_data = get_data(
        '/Users/nareshs/Documents/projects/base/datasets/phy_train.dat.zip',
        'phy_train.dat')
    # Remove degenerate columns (all-zero, constant, or binary with the
    # per-column outlier marker).
    # Fix: collect indexes over all classes first, then delete uniformly;
    # the original deleted per class while `idxs` accumulated.
    idxs = set()
    for class_id in phy_data:
        all_phy_data = np.array(phy_data[class_id])
        sum_arr = np.sum(all_phy_data, axis=0)
        for idx, val in enumerate(sum_arr):
            if abs(val) == 0.0:
                idxs.add(idx)
        for idx, col in enumerate(all_phy_data.T):
            uniq = np.unique(col)
            if len(uniq) == 1 or (len(uniq) == 2 and get_col_outlier_value(idx) in col):
                idxs.add(idx)
    idxs = list(idxs)
    for class_id in phy_data:
        phy_data[class_id] = np.delete(np.array(phy_data[class_id]), idxs, 1)

    class_2_mean_arr = {}
    for class_id in phy_data:
        info('Processing data for class: ' + str(class_id))
        class_2_mean_arr[class_id] = get_mean_arr(phy_data[class_id])

    # Third method: Perceptron learning algorithm
    # http://en.wikipedia.org/wiki/Perceptron#Learning_algorithm
    # http://page.mi.fu-berlin.de/rojas/neural/chapter/K4.pdf
    # More detailed: http://hagan.okstate.edu/4_Perceptron.pdf
    num_features = phy_data['1'][0].shape[0]
    # One weight per feature plus a leading bias weight.
    # Fix: the original appended every intermediate weight vector to a
    # list that grew without bound; only the latest vector is ever used.
    w = np.zeros(num_features + 1)
    info('Initialized weights array of length ' + str(num_features))
    gamma = 0.01  # convergence threshold on the average error
    alpha = 0.1   # learning rate
    all_data = []
    for class_id in phy_data:
        for row in phy_data[class_id]:
            # Prepend the constant 1.0 bias input to every observation.
            all_data.append((class_id, np.array([1.0] + list(row))))
    iter_num = 1
    converged = False
    print('Entering the while loop')
    while not converged:
        print('%s: Iterating over all data. No.: %d' % (datetime.now(), iter_num))
        for row_num, (class_id, features) in enumerate(all_data):
            expected_val = float(class_id)
            f_val = 1 if np.sum(w * features) > 0 else 0
            # Standard perceptron update rule.
            w = w + alpha * (expected_val - f_val) * features
            # Check the average error over the full training set.
            # NOTE(review): doing this after every single row is O(n^2)
            # per epoch, as in the original — kept to preserve behavior.
            total_num_obs = len(all_data)
            sum_of_errors = 0.0
            for test_class_id, test_features in all_data:
                test_f_val = 1 if np.sum(test_features * w) > 0 else 0
                sum_of_errors += abs(float(test_class_id) - test_f_val)
            print('%s: Got sum of errors: %s for %d data points'
                  % (datetime.now(), sum_of_errors, total_num_obs))
            avg_error = sum_of_errors / total_num_obs
            if avg_error < gamma:
                # Fix: a bare `break` here only exited this inner loop,
                # so the enclosing `while True` never terminated.
                converged = True
                break
            print('After processing %d got avg_error %s' % (row_num, avg_error))
        iter_num = iter_num + 1

    # Bug fix: this variant never loaded the test set, so `phy_test_data`
    # raised a NameError below.
    phy_test_data = get_data(
        '/Users/nareshs/Documents/projects/base/datasets/phy_test.dat.zip',
        'phy_test.dat')
    cnt = 50001
    for key in phy_test_data:
        for arr in phy_test_data[key]:
            width = len(arr)
            row = np.delete(np.array(arr).reshape(1, width), idxs, 1)
            # Fix: prepend the bias input so the row width matches the
            # (n+1)-length weight vector.
            row = np.hstack(([[1.0]], row))
            print('%d %s' % (cnt, np.sum(w * row)))
            cnt = cnt + 1
def main():
    """Train a perceptron on the physics data and score the test set.

    Uses the classic perceptron update with a bias input; training stops
    once the average error over the whole training set falls below gamma.
    Predictions are printed as "<id> <score>" lines from id 50001.
    """
    getopts()
    # Read the physics dataset
    phy_data = get_data(
        '/Users/nareshs/Documents/projects/base/datasets/phy_train.dat.zip',
        'phy_train.dat')
    # Degenerate columns: zero-sum, constant, or binary with the column's
    # outlier marker present.
    # Fix: gather across every class, then delete the identical set from
    # each class (the original deleted per class with a growing `idxs`).
    bad_cols = set()
    for class_id in phy_data:
        class_rows = np.array(phy_data[class_id])
        for col_idx, col_sum in enumerate(np.sum(class_rows, axis=0)):
            if abs(col_sum) == 0.0:
                bad_cols.add(col_idx)
        for col_idx, col in enumerate(class_rows.T):
            distinct = np.unique(col)
            if len(distinct) == 1 or (len(distinct) == 2 and get_col_outlier_value(col_idx) in col):
                bad_cols.add(col_idx)
    idxs = list(bad_cols)
    for class_id in phy_data:
        phy_data[class_id] = np.delete(np.array(phy_data[class_id]), idxs, 1)

    class_2_mean_arr = {}
    for class_id in phy_data:
        info('Processing data for class: ' + str(class_id))
        class_2_mean_arr[class_id] = get_mean_arr(phy_data[class_id])

    # Third method: Perceptron learning algorithm
    # http://en.wikipedia.org/wiki/Perceptron#Learning_algorithm
    # http://page.mi.fu-berlin.de/rojas/neural/chapter/K4.pdf
    # More detailed: http://hagan.okstate.edu/4_Perceptron.pdf
    n_features = phy_data['1'][0].shape[0]
    # Single weight vector (bias weight first).  Fix: the original stored
    # the entire history of weight vectors, growing memory without bound.
    weights = np.zeros(n_features + 1)
    info('Initialized weights array of length ' + str(n_features))
    gamma = 0.01  # stop once average error is below this
    alpha = 0.1   # learning rate
    all_data = []
    for class_id in phy_data:
        for row in phy_data[class_id]:
            # Bias input 1.0 goes in front of each observation.
            all_data.append((class_id, np.array([1.0] + list(row))))
    iter_num = 1
    done = False
    print('Entering the while loop')
    while not done:
        print('%s: Iterating over all data. No.: %d' % (datetime.now(), iter_num))
        for row_num, (label, features) in enumerate(all_data):
            target = float(label)
            predicted = 1 if np.sum(weights * features) > 0 else 0
            # Perceptron update.
            weights = weights + alpha * (target - predicted) * features
            # Full-dataset error check (per row, as in the original).
            total_num_obs = len(all_data)
            sum_of_errors = 0.0
            for chk_label, chk_features in all_data:
                chk_pred = 1 if np.sum(chk_features * weights) > 0 else 0
                sum_of_errors += abs(float(chk_label) - chk_pred)
            print('%s: Got sum of errors: %s for %d data points'
                  % (datetime.now(), sum_of_errors, total_num_obs))
            avg_error = sum_of_errors / total_num_obs
            if avg_error < gamma:
                # Fix: the original's bare `break` left the outer
                # `while True` spinning forever.
                done = True
                break
            print('After processing %d got avg_error %s' % (row_num, avg_error))
        iter_num = iter_num + 1

    # Bug fix: the test set was never loaded in this variant, so the loop
    # below hit a NameError on `phy_test_data`.
    phy_test_data = get_data(
        '/Users/nareshs/Documents/projects/base/datasets/phy_test.dat.zip',
        'phy_test.dat')
    cnt = 50001
    for key in phy_test_data:
        for arr in phy_test_data[key]:
            width = len(arr)
            row = np.delete(np.array(arr).reshape(1, width), idxs, 1)
            # Fix: add the bias input so widths match the weight vector.
            row = np.hstack(([[1.0]], row))
            print('%d %s' % (cnt, np.sum(weights * row)))
            cnt = cnt + 1