def feature_selection(self,data_set,feature_selector,fit=True):
    """
    Perform feature selection on ``data_set``, replacing ``data_set.X`` with the
    output of ``feature_selector.transform``.

    Must be done before loading testing sets.

    :param data_set: object exposing ``X``, ``y`` and ``ref_index`` attributes; ``X`` must
        have a ``shape`` whose second entry is the number of features.
    :param feature_selector: object with a ``transform`` method (and a ``fit`` method when
        ``fit`` is true).
    :param fit: when true, fit ``feature_selector`` on the flattened form of ``data_set``
        before transforming; when false, the selector is used as-is.
    :return: None; ``data_set.X`` is overwritten with the transformed matrix.
    """
    # NOTE: kept as an assert so the exception type seen by callers is unchanged;
    # be aware it is stripped when Python runs with -O.
    assert hasattr(feature_selector,"transform")

    clustering_logger.info("Pre feature selection: num features: {}".format(data_set.X.shape[1]))

    if fit:
        # Remember the un-flattened data: the flattened form is only wanted for fitting.
        # (flatten_train_set presumably rewrites data_set in place -- confirm against its definition.)
        original = (data_set.X, data_set.y, data_set.ref_index)
        try:
            flatten_train_set(data_set)
            feature_selector.fit(data_set.X, data_set.y)
        finally:
            # Restore even when flattening/fitting fails, so the caller's data_set is
            # never left in the temporary flattened state.
            data_set.X, data_set.y, data_set.ref_index = original

    selected = feature_selector.transform(data_set.X)
    clustering_logger.info("Post feature selection: num features: {}".format(selected.shape[1]))

    data_set.X = selected
def load_training_testing(Xs,ys,ref_indexes,settings,train_set_index,test_set_index):
    """
    Load training and testing set based on indexes provided by crossvalidation.

    One ``Training`` and one ``Testing`` object is created per entry of ``settings``;
    entry ``c`` is populated from ``Xs[c]``, ``ys[c]`` and ``ref_indexes[c]``. Every
    training set is then passed through ``flatten_train_set``, and finally the labels
    and reference indexes are checked to be identical across all settings.

    :param Xs: per-setting feature containers, indexed with the train/test index
        (assumed to be NumPy-style arrays supporting index-array selection -- confirm).
    :param ys: per-setting label containers, indexed the same way.
    :param ref_indexes: per-setting reference-index containers, indexed the same way.
    :param settings: iterable of setting objects; each must expose ``pickle_dir``.
    :param train_set_index: index selecting the training rows.
    :param test_set_index: index selecting the testing rows.
    :return: List of train_set and test_set objs, as the tuple ``(train_sets, test_sets)``
    :raises AttributeError: if labels or reference indexes differ between settings. The
        number in the message is the failing group: 0 = train y, 1 = train ref_index,
        2 = test y, 3 = test ref_index.
    """
    train_sets = []
    test_sets = []

    for position, setting in enumerate(settings):
        train_set = Training(setting, pickle_dir=setting.pickle_dir)
        train_set.X = Xs[position][train_set_index]
        train_set.y = ys[position][train_set_index]
        train_set.ref_index = ref_indexes[position][train_set_index]

        test_set = Testing(setting, pickle_dir=setting.pickle_dir)
        test_set.X = Xs[position][test_set_index]
        test_set.y = ys[position][test_set_index]
        test_set.ref_index = ref_indexes[position][test_set_index]

        train_sets.append(train_set)
        test_sets.append(test_set)

    #flatten training
    for train_set in train_sets:
        flatten_train_set(train_set)

    #make sure the sets match
    classification_logger.info("Checking the sets match")

    # Kept as plain lists (no np.array wrapper) so entries of differing length cannot
    # break array construction before the comparison below gets a chance to report them.
    groups = (
        [train_set.y for train_set in train_sets],
        [train_set.ref_index for train_set in train_sets],
        [test_set.y for test_set in test_sets],
        [test_set.ref_index for test_set in test_sets],
    )

    for position, group in enumerate(groups):
        if len(group) < 2:
            # Zero or one setting: nothing to compare against.
            continue

        reference = group[0]
        # np.array_equal treats a shape mismatch as "not equal" instead of failing or
        # broadcasting, so every kind of mismatch ends up in the error below.
        if not all(np.array_equal(reference, other) for other in group[1:]):
            raise AttributeError("NOT MATCH FOR {} ELEMENT".format(position))

    return train_sets,test_sets

# def select_training_testing_sets(settings,Xs,y,ref_index,num,do_pickle=True):
#     """
#     Randomly choose from a super set of data and split it into a training set of size num. The remainder will become
#     the Test set.
#     Uses _pick_random_samples
#
#     :param setting:
#     :param X:
#     :param y:
#     :param ref_index:
#     :param num:
#     :return: tuple_training,tuple_testing
#     """
#
#     selector=np.not_equal(ref_index,None)
#     ref_index=ref_index[selector]
#     Xs=[X[selector] for X in Xs]
#     y=y[selector]
#
#     train_Xs,train_y,train_ref_index,test_Xs,test_y,test_ref_index=_pick_random_samples(Xs,y,ref_index,num)
#
#     train_objs=[]
#     test_objs=[]
#     if do_pickle:
#         for c,setting in enumerate(settings):
#             train_X=train_Xs[c]
#             test_X=test_Xs[c]
#
#             _pickle_training_testing(setting,train_X,train_y,train_ref_index,test_X,test_y,test_ref_index)
#
#             training_obj=Training(label=setting,pickle_dir=setting.pickle_dir)
#             training_obj.set_data(train_X,train_y,train_ref_index)
#
#             testing_obj=Testing(label=setting,pickle_dir=setting.pickle_dir)
#             testing_obj.set_data(test_X,test_y,test_ref_index)
#
#             train_objs.append(training_obj)
#             test_objs.append(testing_obj)
#
#     return train_objs,test_objs