def main(input_file, adjective_file, train_feature_pkl, test_feature_pkl, ensemble_test_feature_pkl, all_classifiers_pkl, scaler_pkl, bolt_feature_obj_pkl): # Load data into the pipeline. First check # for feature object pkl files print "Loading data from file\n" # if train_feature_pkl == None or test_feature_pkl == None or ensemble_test_feature_pkl == None: if bolt_feature_obj_pkl == None: # If no features, load data from either an # h5 and adjective file or directly from # a saved pkl file if input_file.endswith(".h5"): all_data = loadDataFromH5File(input_file, adjective_file) else: all_data = utilities.loadBoltObjFile(input_file) print "Loaded data\n" """ # Remove the duplicated MDF_320, and save a new all_data.pkl all_data_new = dict() toremove = [290, 291, 292, 293, 294, 295, 296, 297, 298, 299] for motion_name in all_data: all_data_new[motion_name] = np.delete(all_data[motion_name], toremove) cPickle.dump(all_data_new, open("all_data.pkl", "w"), cPickle.HIGHEST_PROTOCOL) import pdb; pdb.set_trace() pass """ # Split the data by leaving one object out as ensemble_test_data for each time and cycle through all objects # Generate the stratifications(labels) for picking out a object obj_id_vector = [] for num in np.arange(len(all_data['tap'])): obj_id_vector.append(all_data['tap'][num].object_id) lol = cross_validation.LeaveOneLabelOut(np.array(obj_id_vector)) obj_id_list = np.unique(obj_id_vector).tolist() # We may pickle this cross validation generator "lol" later train_set = dict() ensemble_test_set = dict() for train_index, test_index in lol: print "TRAIN_INDEX: %s TEST_INDEX: %s" % (train_index, test_index) train_data = dict() ensemble_test_data = dict() for motion_name in all_data: train_data_array = np.array( all_data[motion_name])[train_index] ensemble_test_data_array = np.array(all_data[motion_name])[test_index] obj_id = ensemble_test_data_array[0].object_id train_data[motion_name] = train_data_array.tolist() ensemble_test_data[motion_name] = 
ensemble_test_data_array.tolist() train_set[obj_id] = train_data ensemble_test_set[obj_id] = ensemble_test_data #cPickle.dump(train_data, open("train_data_"+str(obj_id)+".pkl", "w"), cPickle.HIGHEST_PROTOCOL) #cPickle.dump(ensemble_test_data,open("ensemble_test_data_"+%(obj_id)+".pkl","w"), cPickle.HIGHEST_PROTOCOL) #cPickle.dump(train_set, open("train_set.pkl", "w"), cPickle.HIGHEST_PROTOCOL)) #cPickle.dump(ensemble_test_set, open("ensemble_test_set.pkl"), "w", cPickle.HIGHEST_PROTOCOL)) # Split the data into train and final test # train_data, ensemble_test_data = utilities.split_data(all_data, 0.9) for obj_id in train_set: # Split the train data again into train and test train_data, test_data = utilities.split_data(train_set[obj_id], 0.7) # Fit PCA for electrodes on training data print "Fitting PCA for electrode data\n" electrode_pca_dict = fit_electrodes_pca(train_data) # Store off PCA pkl cPickle.dump(electrode_pca_dict, open("pca_pkls/pca_"+str(obj_id)+".pkl","w"), cPickle.HIGHEST_PROTOCOL) print "PCA transforms stored as 'pca.pkl'\n" # Convert motion objects into feature objects print "Generating feature object dictionaries\n" train_all_features_obj_dict = BoltMotionObjToFeatureObj(train_data, electrode_pca_dict) test_all_features_obj_dict = BoltMotionObjToFeatureObj(test_data, electrode_pca_dict) ensemble_test_all_features_obj_dict = BoltMotionObjToFeatureObj(ensemble_test_data, electrode_pca_dict) # Store off feature object pkls cPickle.dump(train_all_features_obj_dict, open("train_pkls/train_feature_objs_"+str(obj_id)+".pkl","w"), cPickle.HIGHEST_PROTOCOL) print "Feature object dictionary stored as 'train_feature_objs.pkl'\n" cPickle.dump(test_all_features_obj_dict, open("test_pkls/test_feature_objs_"+str(obj_id)+".pkl","w"), cPickle.HIGHEST_PROTOCOL) print "Feature object dictionary stored as 'test_feature_objs.pkl'\n" cPickle.dump(ensemble_test_all_features_obj_dict, open("ensemble_pkls/ensemble_test_feature_objs_"+str(obj_id)+".pkl","w"), 
cPickle.HIGHEST_PROTOCOL) print "Feature object dictionary stored as 'ensemble_test_feature_objs.pkl'\n" import pdb; pdb.set_trace() pass else: # Load pkl'd feature object dictionaries all_feature_obj_dict = cPickle.load(open(bolt_feature_obj_pkl,"r")) ''' train_all_features_obj_dict = cPickle.load(open(train_feature_pkl,"r")) test_all_features_obj_dict = cPickle.load(open(test_feature_pkl,"r")) ensemble_test_all_features_obj_dict = cPickle.load(open(ensemble_test_feature_pkl,"r")) ''' print "Loaded data\n" # 1st split: pick out 5 objects for final testing obj_leave_out = [101, 316, 702, 508, 601] five_test_feature_obj, all_train_feature_obj = PickOutObjects(all_feature_obj_dict, obj_leave_out) # 2nd split: pick 6 objects out for testing kNN/SVM classifiers and creating proba test_obj_leave_out = [315, 602, 115, 216, 213, 309] ensemble_train_feature_obj, train_feature_obj = PickOutObjects(all_train_feature_obj, test_obj_leave_out) # Specify feature to be extracted feature_name_list = ["pdc_rise_count", "pdc_area", "pdc_max", "pac_energy", "pac_sc", "pac_sv", "pac_ss", "pac_sk", "tac_area", "tdc_exp_fit", "gripper_min", "gripper_mean", "transform_distance", "electrode_polyfit"] if all_classifiers_pkl == None or scaler_pkl == None: # Pull desired features from feature objects train_feature_vector_dict, train_adjective_dict = bolt_obj_2_feature_vector(train_feature_obj, feature_name_list) test_feature_vector_dict, test_adjective_dict = bolt_obj_2_feature_vector(ensemble_train_feature_obj, feature_name_list) print("Created feature vector containing %s\n" % feature_name_list) # Create Scalers scaler_dict = create_scalers(train_feature_vector_dict) # Store off scaler dictionary cPickle.dump(scaler_dict, open("scaler.pkl","w"), cPickle.HIGHEST_PROTOCOL) print "Feature vector scalers stored as 'scaler.pkl'\n" # Run full train #all_knn_classifiers, all_svm_classifiers = full_train(train_feature_vector_dict, train_adjective_dict, test_feature_vector_dict, 
test_adjective_dict, scaler_dict) import pdb; pdb.set_trace() pass # Select which algorithm to use in the ensemble phase all_classifiers_dict = all_svm_classifiers else: # Load pkl'd classifiers, probabilities and scores all_classifiers_dict = cPickle.load(open(all_classifiers_pkl,"r")) # Load pkl'd scaler dictionary scaler_dict = cPickle.load(open(scaler_pkl,"r")) # Get test labels, to be used as ensemble train labels test_all_features_obj_dict = cPickle.load(open(test_feature_pkl,"r")) test_feature_vector_dict, test_adjective_dict = bolt_obj_2_feature_vector(test_all_features_obj_dict, feature_name_list) # Pull desired bolt features from ensemble test data ensemble_test_feature_vector_dict, ensemble_test_adjective_dict = bolt_obj_2_feature_vector(five_test_feature_obj_dict, feature_name_list) # Create ensemble feature vectors out of probabilities ensemble_train_feature_vector_dict, ensemble_test_feature_vector_dict = extract_ensemble_features(all_classifiers_dict, ensemble_test_feature_vector_dict, ensemble_test_adjective_dict, scaler_dict) # Ensemble train labels are previous test labels ensemble_train_adjective_dict = test_adjective_dict import pdb; pdb.set_trace() for adj in ensemble_train_adjective_dict: count = np.sum(ensemble_train_adjective_dict[adj]) import pdb; pdb.set_trace() print adj+": %d " %count # Remove the adjectives 'warm' and 'sparse' from the labels dictionaries del ensemble_train_adjective_dict['springy'] del ensemble_test_adjective_dict['springy'] del ensemble_train_adjective_dict['elastic'] del ensemble_test_adjective_dict['elastic'] del ensemble_train_adjective_dict['meshy'] del ensemble_test_adjective_dict['meshy'] del ensemble_train_adjective_dict['gritty'] del ensemble_test_adjective_dict['gritty'] del ensemble_train_adjective_dict['textured'] del ensemble_test_adjective_dict['textured'] del ensemble_train_adjective_dict['absorbant'] del ensemble_test_adjective_dict['absorbant'] del ensemble_train_adjective_dict['crinkly'] del 
ensemble_test_adjective_dict['crinkly'] del ensemble_train_adjective_dict['porous'] del ensemble_test_adjective_dict['porous'] del ensemble_train_adjective_dict['grainy'] del ensemble_test_adjective_dict['grainy'] del ensemble_train_adjective_dict['warm'] del ensemble_test_adjective_dict['warm'] del ensemble_train_adjective_dict['sparse'] del ensemble_test_adjective_dict['sparse'] # Combine motion-specific classifiers for each adjective all_ensemble_classifiers = full_ensemble_train(ensemble_train_feature_vector_dict, ensemble_train_adjective_dict, ensemble_test_feature_vector_dict, ensemble_test_adjective_dict) # Store off combined classifiers cPickle.dump(all_ensemble_classifiers, open("all_ensemble_classifiers.pkl","w"), cPickle.HIGHEST_PROTOCOL)
# NOTE(review): this is a byte-level (auto-formatted) duplicate of the main()
# defined earlier in this file; being defined later, it shadows that copy.
# The two should be merged -- TODO confirm which copy is canonical.
def main(input_file, adjective_file, train_feature_pkl, test_feature_pkl, ensemble_test_feature_pkl, all_classifiers_pkl, scaler_pkl, bolt_feature_obj_pkl):
    # Purpose: run the BOLT adjective training pipeline -- either rebuild
    # feature objects from raw data (bolt_feature_obj_pkl == None) or load
    # pkl'd feature objects and train/ensemble the adjective classifiers.
    # Side effects: writes many pkl files into the working directory.
    # Load data into the pipeline. First check
    # for feature object pkl files
    print "Loading data from file\n"
    # if train_feature_pkl == None or test_feature_pkl == None or ensemble_test_feature_pkl == None:
    if bolt_feature_obj_pkl == None:
        # If no features, load data from either an
        # h5 and adjective file or directly from
        # a saved pkl file
        if input_file.endswith(".h5"):
            all_data = loadDataFromH5File(input_file, adjective_file)
        else:
            all_data = utilities.loadBoltObjFile(input_file)
        print "Loaded data\n"
        """
        # Remove the duplicated MDF_320, and save a new all_data.pkl
        all_data_new = dict()
        toremove = [290, 291, 292, 293, 294, 295, 296, 297, 298, 299]
        for motion_name in all_data:
            all_data_new[motion_name] = np.delete(all_data[motion_name], toremove)
        cPickle.dump(all_data_new, open("all_data.pkl", "w"), cPickle.HIGHEST_PROTOCOL)
        import pdb; pdb.set_trace()
        pass
        """
        # Split the data by leaving one object out as ensemble_test_data for each time and cycle through all objects
        # Generate the stratifications(labels) for picking out a object
        obj_id_vector = []
        for num in np.arange(len(all_data['tap'])):
            obj_id_vector.append(all_data['tap'][num].object_id)
        lol = cross_validation.LeaveOneLabelOut(np.array(obj_id_vector))
        obj_id_list = np.unique(obj_id_vector).tolist()
        # We may pickle this cross validation generator "lol" later
        train_set = dict()
        ensemble_test_set = dict()
        for train_index, test_index in lol:
            print "TRAIN_INDEX: %s TEST_INDEX: %s" % (train_index, test_index)
            train_data = dict()
            ensemble_test_data = dict()
            for motion_name in all_data:
                train_data_array = np.array(all_data[motion_name])[train_index]
                ensemble_test_data_array = np.array(
                    all_data[motion_name])[test_index]
                # All held-out trials share one object id by construction
                obj_id = ensemble_test_data_array[0].object_id
                train_data[motion_name] = train_data_array.tolist()
                ensemble_test_data[
                    motion_name] = ensemble_test_data_array.tolist()
            train_set[obj_id] = train_data
            ensemble_test_set[obj_id] = ensemble_test_data
            #cPickle.dump(train_data, open("train_data_"+str(obj_id)+".pkl", "w"), cPickle.HIGHEST_PROTOCOL)
            #cPickle.dump(ensemble_test_data,open("ensemble_test_data_"+%(obj_id)+".pkl","w"), cPickle.HIGHEST_PROTOCOL)
        #cPickle.dump(train_set, open("train_set.pkl", "w"), cPickle.HIGHEST_PROTOCOL))
        #cPickle.dump(ensemble_test_set, open("ensemble_test_set.pkl"), "w", cPickle.HIGHEST_PROTOCOL))
        # Split the data into train and final test
        # train_data, ensemble_test_data = utilities.split_data(all_data, 0.9)
        for obj_id in train_set:
            # Split the train data again into train and test
            train_data, test_data = utilities.split_data(
                train_set[obj_id], 0.7)
            # Fit PCA for electrodes on training data
            print "Fitting PCA for electrode data\n"
            electrode_pca_dict = fit_electrodes_pca(train_data)
            # Store off PCA pkl
            cPickle.dump(electrode_pca_dict,
                         open("pca_pkls/pca_" + str(obj_id) + ".pkl", "w"),
                         cPickle.HIGHEST_PROTOCOL)
            print "PCA transforms stored as 'pca.pkl'\n"
            # Convert motion objects into feature objects
            print "Generating feature object dictionaries\n"
            train_all_features_obj_dict = BoltMotionObjToFeatureObj(
                train_data, electrode_pca_dict)
            test_all_features_obj_dict = BoltMotionObjToFeatureObj(
                test_data, electrode_pca_dict)
            # FIXME(review): 'ensemble_test_data' here is the leftover value
            # from the last iteration of the split loop above -- presumably
            # ensemble_test_set[obj_id] was intended; verify.
            ensemble_test_all_features_obj_dict = BoltMotionObjToFeatureObj(
                ensemble_test_data, electrode_pca_dict)
            # Store off feature object pkls
            cPickle.dump(
                train_all_features_obj_dict,
                open("train_pkls/train_feature_objs_" + str(obj_id) + ".pkl", "w"),
                cPickle.HIGHEST_PROTOCOL)
            print "Feature object dictionary stored as 'train_feature_objs.pkl'\n"
            cPickle.dump(
                test_all_features_obj_dict,
                open("test_pkls/test_feature_objs_" + str(obj_id) + ".pkl", "w"),
                cPickle.HIGHEST_PROTOCOL)
            print "Feature object dictionary stored as 'test_feature_objs.pkl'\n"
            cPickle.dump(
                ensemble_test_all_features_obj_dict,
                open(
                    "ensemble_pkls/ensemble_test_feature_objs_" + str(obj_id) +
                    ".pkl", "w"), cPickle.HIGHEST_PROTOCOL)
            print "Feature object dictionary stored as 'ensemble_test_feature_objs.pkl'\n"
            # NOTE(review): leftover debug breakpoint -- halts every iteration
            import pdb
            pdb.set_trace()
            pass
    else:
        # Load pkl'd feature object dictionaries
        all_feature_obj_dict = cPickle.load(open(bolt_feature_obj_pkl, "r"))
        '''
        train_all_features_obj_dict = cPickle.load(open(train_feature_pkl,"r"))
        test_all_features_obj_dict = cPickle.load(open(test_feature_pkl,"r"))
        ensemble_test_all_features_obj_dict = cPickle.load(open(ensemble_test_feature_pkl,"r"))
        '''
        print "Loaded data\n"
        # 1st split: pick out 5 objects for final testing
        obj_leave_out = [101, 316, 702, 508, 601]
        five_test_feature_obj, all_train_feature_obj = PickOutObjects(
            all_feature_obj_dict, obj_leave_out)
        # 2nd split: pick 6 objects out for testing kNN/SVM classifiers and creating proba
        test_obj_leave_out = [315, 602, 115, 216, 213, 309]
        ensemble_train_feature_obj, train_feature_obj = PickOutObjects(
            all_train_feature_obj, test_obj_leave_out)
        # Specify feature to be extracted
        feature_name_list = [
            "pdc_rise_count", "pdc_area", "pdc_max", "pac_energy", "pac_sc",
            "pac_sv", "pac_ss", "pac_sk", "tac_area", "tdc_exp_fit",
            "gripper_min", "gripper_mean", "transform_distance",
            "electrode_polyfit"
        ]
        if all_classifiers_pkl == None or scaler_pkl == None:
            # Pull desired features from feature objects
            train_feature_vector_dict, train_adjective_dict = bolt_obj_2_feature_vector(
                train_feature_obj, feature_name_list)
            test_feature_vector_dict, test_adjective_dict = bolt_obj_2_feature_vector(
                ensemble_train_feature_obj, feature_name_list)
            print("Created feature vector containing %s\n" % feature_name_list)
            # Create Scalers
            scaler_dict = create_scalers(train_feature_vector_dict)
            # Store off scaler dictionary
            cPickle.dump(scaler_dict, open("scaler.pkl", "w"),
                         cPickle.HIGHEST_PROTOCOL)
            print "Feature vector scalers stored as 'scaler.pkl'\n"
            # Run full train
            #all_knn_classifiers, all_svm_classifiers = full_train(train_feature_vector_dict, train_adjective_dict, test_feature_vector_dict, test_adjective_dict, scaler_dict)
            # NOTE(review): leftover debug breakpoint
            import pdb
            pdb.set_trace()
            pass
            # Select which algorithm to use in the ensemble phase
            # FIXME(review): all_svm_classifiers is undefined on this path --
            # the full_train(...) call that would create it is commented out
            # above, so this line raises NameError at runtime.
            all_classifiers_dict = all_svm_classifiers
        else:
            # Load pkl'd classifiers, probabilities and scores
            all_classifiers_dict = cPickle.load(open(all_classifiers_pkl, "r"))
            # Load pkl'd scaler dictionary
            scaler_dict = cPickle.load(open(scaler_pkl, "r"))
            # Get test labels, to be used as ensemble train labels
            test_all_features_obj_dict = cPickle.load(open(test_feature_pkl, "r"))
            test_feature_vector_dict, test_adjective_dict = bolt_obj_2_feature_vector(
                test_all_features_obj_dict, feature_name_list)
        # Pull desired bolt features from ensemble test data
        # FIXME(review): 'five_test_feature_obj_dict' is undefined -- the name
        # created by PickOutObjects above is 'five_test_feature_obj'; this
        # line raises NameError at runtime.
        ensemble_test_feature_vector_dict, ensemble_test_adjective_dict = bolt_obj_2_feature_vector(
            five_test_feature_obj_dict, feature_name_list)
        # Create ensemble feature vectors out of probabilities
        ensemble_train_feature_vector_dict, ensemble_test_feature_vector_dict = extract_ensemble_features(
            all_classifiers_dict, ensemble_test_feature_vector_dict,
            ensemble_test_adjective_dict, scaler_dict)
        # Ensemble train labels are previous test labels
        ensemble_train_adjective_dict = test_adjective_dict
        # NOTE(review): leftover debug breakpoint
        import pdb
        pdb.set_trace()
        # Print the number of positive examples per adjective
        for adj in ensemble_train_adjective_dict:
            count = np.sum(ensemble_train_adjective_dict[adj])
            # NOTE(review): leftover breakpoint fires on EVERY loop iteration
            import pdb
            pdb.set_trace()
            print adj + ": %d " % count
        # Remove the adjectives 'warm' and 'sparse' from the labels dictionaries
        # NOTE(review): the comment above undersells this -- eleven adjectives
        # are removed from both the train and test label dictionaries.
        del ensemble_train_adjective_dict['springy']
        del ensemble_test_adjective_dict['springy']
        del ensemble_train_adjective_dict['elastic']
        del ensemble_test_adjective_dict['elastic']
        del ensemble_train_adjective_dict['meshy']
        del ensemble_test_adjective_dict['meshy']
        del ensemble_train_adjective_dict['gritty']
        del ensemble_test_adjective_dict['gritty']
        del ensemble_train_adjective_dict['textured']
        del ensemble_test_adjective_dict['textured']
        del ensemble_train_adjective_dict['absorbant']
        del ensemble_test_adjective_dict['absorbant']
        del ensemble_train_adjective_dict['crinkly']
        del ensemble_test_adjective_dict['crinkly']
        del ensemble_train_adjective_dict['porous']
        del ensemble_test_adjective_dict['porous']
        del ensemble_train_adjective_dict['grainy']
        del ensemble_test_adjective_dict['grainy']
        del ensemble_train_adjective_dict['warm']
        del ensemble_test_adjective_dict['warm']
        del ensemble_train_adjective_dict['sparse']
        del ensemble_test_adjective_dict['sparse']
        # Combine motion-specific classifiers for each adjective
        all_ensemble_classifiers = full_ensemble_train(
            ensemble_train_feature_vector_dict, ensemble_train_adjective_dict,
            ensemble_test_feature_vector_dict, ensemble_test_adjective_dict)
        # Store off combined classifiers
        cPickle.dump(all_ensemble_classifiers,
                     open("all_ensemble_classifiers.pkl", "w"),
                     cPickle.HIGHEST_PROTOCOL)
def main(input_file, adjective_file, train_feature_pkl, test_feature_plk): print "Loading data from file" # If no features, load data from either an # h5 and adjective file or directly from # a saved pkl file if input_file.endswith(".h5"): all_data = loadDataFromH5File(input_file, adjective_file) else: all_data = utilities.loadBoltObjFile(input_file) print "loaded data" # Split the data into train and test train_data, test_data = utilities.split_data(all_data, 0.9) # Convert motion objects into feature objects test_all_features_obj_dict = BoltMotionObjToFeatureObj(test_data) print "loaded data" # Take loaded data and extract out features feature_name_list = ["pdc_rise_count", "pdc_area", "pdc_max", "pac_energy", "pac_sc", "pac_sv", "pac_ss", "pac_sk", "tac_area", "tdc_exp_fit"] # Pull desired features from feature objects test_feature_vector, test_adjective_dictionary = bolt_obj_2_feature_vector(test_all_features_obj_dict, feature_name_list) # Preprocess the data by scaling test_feature_vector_scaled = preprocessing.scale(test_feature_vector) print("Created feature vector containing %s" % feature_name_list) report_file = open("Test_results.txt","a") results = dict() # adjective_list has NOT been created for adj in test_adjective_dictionary print "Start testing on adjective %s" %(adj) labels = dict() knn_clf_ptr = open('adjective_classifiers/'+adj+'_knn.pkl', "r") svm_clf_ptr = open('adjective_classifiers/'+adj+'_svm.pkl', "r") # Load the pickle file which is the corresponding adjective classifier adj_clf_knn = cPickle.load(knn_clf_ptr) adj_clf_svm = cPickle.load(svm_clf_ptr) report_file.write('----- Adjective: ') report_file.write(adj) report_file.write(' -----\n') for motion_name in test_feature_vector knn_predicted = adj_clf_knn[motion_name].predict_proba(test_feature_vector_scaled[motion_name]) svm_predicted = adj_clf_svm[motion_name].predict_proba(test_feature_vector_scaled[motion_name]) report_file.write('Motion: '+motion_name+'\n') # Is proba a list of float 
values?? report_file.write('KNN labels with proba: ') report_file.write('SVM labels with proba: ') labels[motion_name] = [knn_predicted, svm_predicted] results[adj] = labels # In the future, we may store the results by motion in the order how well it performs print "Tesing on adjective %s is DONE" %(adj) file_name = "test_result.pkl" cPickle.dump(results, open(file_name, "w"), cPickle.HIGHEST_PROTOCOL) # Use the output from classifiers by motions to create a single classifier for each adjective final_classifier = AdjectiveClassifiers(test_adjective_dictionary, test_feature_vector)