def test_default_training_set_based_feature_selection_for_raw_fingerprint_representations_of_training_and_test_set(self):
    # Purpose: convert the contrived train/test raw-fingerprint files into
    # svmlight format, run filter_features_for_svmlight_format_files() keeping
    # the top 2 features (no scoring function is passed, so the callee's
    # default applies - presumably chi2, judging by the expected output file
    # names), then compare every generated file with its ' - Copy' reference.
    #
    # (A '#' comment is used rather than a docstring so that unittest's
    # verbose test description is left unchanged.)

    # Single-argument print(...) gives identical output under the Python 2
    # print statement and the Python 3 print function (the two spaces after
    # the colon reproduce what "print 'text: ', value" emitted).
    print('Running unittests for this project:  %s' % (project_name,))
    print('Running this unittest:  %s' % (self._testMethodName,))

    from ml_input_utils import descriptorsFilesProcessor
    from ml_functions import filter_features_for_svmlight_format_files

    # All fixture and output files live next to this test module. Using
    # os.path instead of splitting on a literal backslash gives the same
    # result on Windows and also works on other platforms.
    test_dir = os.path.dirname(os.path.abspath(__file__))

    def local_path(file_name):
        # Absolute path of file_name inside this test module's directory.
        return os.path.join(test_dir, file_name)

    # Intent (from the original author): in the training fingerprint file one
    # feature (f1) is only found in class 1, never in class 0, so it should be
    # selected - but that feature is absent from the test set.
    id2TrainClass = {'mA': 1, 'mB': 1, 'mC': 0, 'mD': 1, 'mE': 0, 'mG': 0, 'mF': 0, 'mH': 0}
    id2TestClass = {'mX': 1, 'mY': 1, 'mZ': 1}

    train_fp_file = local_path('contrived_fp_train_file.txt')
    test_fp_file = local_path('contrived_fp_test_file.txt')
    all_feats_svmlight_train_file = local_path('contrived_svmlight_train_file.txt')
    all_feats_svmlight_test_file = local_path('contrived_svmlight_test_file.txt')

    print('Preparing original files (pre-feature selection) in svmlight format.')
    processor = descriptorsFilesProcessor()
    # The value returned for the training set is passed back in when the test
    # set is written (training MUST be processed first); presumably this makes
    # the test set reuse the training feature index - verify in ml_input_utils.
    feat2index_files = [None]
    for id2class, fp_file, svmlight_file in (
        (id2TrainClass, train_fp_file, all_feats_svmlight_train_file),
        (id2TestClass, test_fp_file, all_feats_svmlight_test_file),
    ):
        feat2index_files = processor.write_svmlight_format_modellingFile_from_multiple_descriptors_files(
            list_of_descriptors_files=[fp_file],
            corresponding_list_of_whether_descriptors_file_is_actually_a_raw_fp_file=[True],
            corresponding_list_of_whether_descriptors_file_is_actually_a_jCompoundMapperStringFeatures_file=[False],
            descriptors_file_name=svmlight_file,
            id2responseVariable=id2class,
            corresponding_list_of_unique_features_files=feat2index_files
        )
    del processor
    print('PREPARED original files (pre-feature selection) in svmlight format.')

    filter_features_for_svmlight_format_files(
        svmlight_format_train_file=all_feats_svmlight_train_file,
        svmlight_format_test_file=all_feats_svmlight_test_file,
        number_of_features_to_retain=2
    )

    # Files expected to exist after the steps above.
    generated_files = [
        all_feats_svmlight_train_file,
        all_feats_svmlight_test_file,
        local_path('contrived_svmlight_train_file_fs_chi2_top_2.txt'),
        local_path('contrived_svmlight_test_file_fs_chi2_top_2.txt'),
        local_path('contrived_fp_train_file_fpFeat2InitialIndex.csv'),
    ]

    # Each generated '<stem>.<ext>' is checked against the stored reference
    # '<stem> - Copy.<ext>'.
    reference_files = []
    for new_file in generated_files:
        stem, ext = os.path.splitext(new_file)
        orig_file = '%s - Copy%s' % (stem, ext)
        reference_files.append(orig_file)
        self.compareOriginalAndNewFiles(orig_file, new_file)

    # Inputs and reference copies are passed as the files to keep.
    files_not_to_delete = [train_fp_file, test_fp_file] + reference_files
    self.clean_up_if_all_checks_passed(specific_files_not_to_delete=files_not_to_delete)
def test_15_convert_svmlight_to_csv(self):
    # Purpose: convert the two test15 svmlight files (train and test) found in
    # current_dir to CSV via descriptorsFilesProcessor.convert_svmlight_to_csv,
    # then compare expected and actual files in that directory.
    #
    # (A '#' comment is used rather than a docstring so that unittest's
    # verbose test description is left unchanged.)
    import os  # local import: this block did not previously rely on os

    # Single-argument print(...) gives identical output under the Python 2
    # print statement and the Python 3 print function.
    print('Running unittests for this project:  %s' % (project_name,))
    print('Running this unittest:  %s' % (self._testMethodName,))

    # Snapshot the directory BEFORE anything is generated: everything already
    # present is passed to the clean-up step as files to keep.
    do_not_to_delete = glob.glob(os.path.join(current_dir, '*'))

    import ml_input_utils

    # c.f. generate_modelling_input.py
    converter = ml_input_utils.descriptorsFilesProcessor()
    svmlight_files = [
        os.path.join(current_dir, 'test15_t14copied_svmlight_REG_train_file_nonDefault_fs_f_regression_top_1.txt'),
        os.path.join(current_dir, 'test15_t14copied_svmlight_REG_test_file_nonDefault_fs_f_regression_top_1.txt'),
    ]
    for svmlight_file in svmlight_files:
        # Same name with the trailing '.txt' swapped for '.csv'.
        csv_file = os.path.splitext(svmlight_file)[0] + '.csv'
        converter.convert_svmlight_to_csv(svmlight_file, csv_file)

    self.compareAllExpectedAndActualFiles(current_dir)
    self.clean_up_if_all_checks_passed(current_dir, specific_files_not_to_delete=do_not_to_delete)
def test_convert_svmlight_to_csv(self):
    # Purpose: run descriptorsFilesProcessor.convert_svmlight_to_csv on the
    # contrived feature-selected training file and compare the resulting CSV
    # with its stored ' - Copy' reference.
    #
    # NOTE(review): a second method with exactly this name is defined later in
    # this file; if both belong to the same class the later one replaces this
    # one and this body never runs - confirm and remove one of them.
    #
    # (A '#' comment is used rather than a docstring so that unittest's
    # verbose test description is left unchanged.)

    # Single-argument print(...) gives identical output under the Python 2
    # print statement and the Python 3 print function.
    print("Running unittests for this project:  %s" % (project_name,))
    print("Running this unittest:  %s" % (self._testMethodName,))

    # Originally copied from trial_runs\descriptor_utils\..\test_svmlight_2_csv.py,
    # with the descriptor_utils import replaced by ml_input_utils.
    from ml_input_utils import descriptorsFilesProcessor

    # Fixture files sit next to this test module; os.path replaces splitting
    # on a literal backslash so the lookup also works outside Windows.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    svmlight_file = os.path.join(module_dir, "contrived_svmlight_train_file_fs_chi2_top_2.txt")
    # No output name is passed to the converter; the test expects the CSV at
    # the same location with a '.csv' extension.
    csv_file = os.path.join(module_dir, "contrived_svmlight_train_file_fs_chi2_top_2.csv")

    converter = descriptorsFilesProcessor()
    converter.convert_svmlight_to_csv(svmlight_file)
    del converter
    del descriptorsFilesProcessor

    # '<stem>.csv' is compared with the stored '<stem> - Copy.csv'.
    stem, ext = os.path.splitext(csv_file)
    orig_file = "%s - Copy%s" % (stem, ext)
    self.compareOriginalAndNewFiles(orig_file, csv_file)

    # The input and the reference copy are passed as the files to keep.
    self.clean_up_if_all_checks_passed(specific_files_not_to_delete=[svmlight_file, orig_file])
def test_convert_svmlight_to_csv(self):
    # Purpose: convert the contrived feature-selected svmlight training file
    # to CSV with descriptorsFilesProcessor.convert_svmlight_to_csv and check
    # the output against its stored ' - Copy' reference file.
    #
    # NOTE(review): an earlier method with exactly this name is defined in
    # this file; if both belong to the same class this definition replaces
    # the earlier one - confirm and remove the redundant copy.
    #
    # (A '#' comment is used rather than a docstring so that unittest's
    # verbose test description is left unchanged.)

    # Single-argument print(...) gives identical output under the Python 2
    # print statement and the Python 3 print function.
    print('Running unittests for this project:  %s' % (project_name,))
    print('Running this unittest:  %s' % (self._testMethodName,))

    # Originally copied from trial_runs\descriptor_utils\..\test_svmlight_2_csv.py,
    # with the descriptor_utils import replaced by ml_input_utils.
    from ml_input_utils import descriptorsFilesProcessor

    # Resolve fixtures relative to this test module. os.path.dirname/join
    # give the same paths as the old backslash split on Windows and, unlike
    # it, also work on other platforms.
    here = os.path.dirname(os.path.abspath(__file__))
    svmlight_file = os.path.join(here, 'contrived_svmlight_train_file_fs_chi2_top_2.txt')
    csv_file = os.path.join(here, 'contrived_svmlight_train_file_fs_chi2_top_2.csv')

    # Only the input path is given; the CSV is expected alongside it.
    our_descriptorsFilesProcessor = descriptorsFilesProcessor()
    our_descriptorsFilesProcessor.convert_svmlight_to_csv(svmlight_file)
    del our_descriptorsFilesProcessor
    del descriptorsFilesProcessor

    all_input_files_required_for_unittesting = [svmlight_file]
    all_orig_output_files_to_be_compared_as_required_for_unittesting = []
    for new_file in [csv_file]:
        # Reference file name: '<stem> - Copy<.ext>'.
        root, extension = os.path.splitext(new_file)
        orig_file = root + ' - Copy' + extension
        all_orig_output_files_to_be_compared_as_required_for_unittesting.append(orig_file)
        self.compareOriginalAndNewFiles(orig_file, new_file)

    files_not_to_delete = (
        all_input_files_required_for_unittesting
        + all_orig_output_files_to_be_compared_as_required_for_unittesting
    )
    self.clean_up_if_all_checks_passed(specific_files_not_to_delete=files_not_to_delete)
def test_univariate_training_set_based_feature_selection_for_raw_fp_representations_of_training_and_test_set_for_a_REGRESSION_dataset(self):
    # Purpose: regression variant of the training-set-based feature selection
    # test. The contrived train/test raw-fingerprint files are written in
    # svmlight format with continuous y-values, the top 2 features are kept
    # using f_regression as the univariate scoring function, and every
    # generated file is compared with its stored ' - Copy' reference.
    #
    # History (17/03/13): uses exactly the same input files as test_4. On the
    # first run the outputs were not cleaned up and the comparison was
    # switched off; those outputs were copied to become the reference files.
    #
    # (A '#' comment is used rather than a docstring so that unittest's
    # verbose test description is left unchanged.)

    # Single-argument print(...) gives identical output under the Python 2
    # print statement and the Python 3 print function.
    print('Running unittests for this project:  %s' % (project_name,))
    print('Running this unittest:  %s' % (self._testMethodName,))

    # The main steps below were taken from ..\trial_runs\...\trial_run_fs_2.py,
    # with the descriptor_utils import replaced by ml_input_utils.
    from ml_input_utils import descriptorsFilesProcessor
    from ml_functions import filter_features_for_svmlight_format_files, f_regression

    # Fixture and output files live next to this test module. os.path is used
    # instead of splitting on a literal backslash: same result on Windows, and
    # it also works on other platforms.
    test_dir = os.path.dirname(os.path.abspath(__file__))

    def local_path(file_name):
        # Absolute path of file_name inside this test module's directory.
        return os.path.join(test_dir, file_name)

    # Intent (17/03/13, from the original author): f2 should correlate
    # perfectly with the y-values in the training set, but show no such
    # correlation in the test set because the test y-values are constant.
    id2TrainYValue = {'mA': 1.8, 'mB': 1.8, 'mC': 0.1, 'mD': 1.8, 'mE': 0.1, 'mG': 1.8, 'mF': 0.1, 'mH': 1.8}
    id2TestYValue = {'mX': 1.0, 'mY': 1.0, 'mZ': 1.0}

    train_fp_file = local_path('contrived_fp_train_file.txt')
    test_fp_file = local_path('contrived_fp_test_file.txt')
    all_feats_svmlight_train_file = local_path('contrived_svmlight_train_file.txt')
    all_feats_svmlight_test_file = local_path('contrived_svmlight_test_file.txt')

    print('Preparing original files (pre-feature selection) in svmlight format.')
    processor = descriptorsFilesProcessor()
    # The value returned for the training set is passed back in when the test
    # set is written (training MUST be processed first); presumably this makes
    # the test set reuse the training feature index - verify in ml_input_utils.
    feat2index_files = [None]
    for id2response, fp_file, svmlight_file in (
        (id2TrainYValue, train_fp_file, all_feats_svmlight_train_file),
        (id2TestYValue, test_fp_file, all_feats_svmlight_test_file),
    ):
        feat2index_files = processor.write_svmlight_format_modellingFile_from_multiple_descriptors_files(
            list_of_descriptors_files=[fp_file],
            corresponding_list_of_whether_descriptors_file_is_actually_a_raw_fp_file=[True],
            corresponding_list_of_whether_descriptors_file_is_actually_a_jCompoundMapperStringFeatures_file=[False],
            descriptors_file_name=svmlight_file,
            id2responseVariable=id2response,
            corresponding_list_of_unique_features_files=feat2index_files
        )
    del processor
    print('PREPARED original files (pre-feature selection) in svmlight format.')
    # Author's note (10/10/12 16:45): manual inspection of the output above
    # confirmed write_svmlight_format_modellingFile_from_multiple_descriptors_files(...) works.

    filter_features_for_svmlight_format_files(
        svmlight_format_train_file=all_feats_svmlight_train_file,
        svmlight_format_test_file=all_feats_svmlight_test_file,
        univariate_scoring_function=f_regression,
        number_of_features_to_retain=2
    )

    # Files expected to exist after the steps above.
    generated_files = [
        all_feats_svmlight_train_file,
        all_feats_svmlight_test_file,
        local_path('contrived_svmlight_train_file_fs_f_regression_top_2.txt'),
        local_path('contrived_svmlight_test_file_fs_f_regression_top_2.txt'),
    ]

    # Author's note (23/03/13): the comparison below was disabled for the
    # first trial runs and enabled once the output had been checked and copied.
    # Each generated '<stem>.<ext>' is checked against '<stem> - Copy.<ext>'.
    reference_files = []
    for new_file in generated_files:
        stem, ext = os.path.splitext(new_file)
        orig_file = '%s - Copy%s' % (stem, ext)
        reference_files.append(orig_file)
        self.compareOriginalAndNewFiles(orig_file, new_file)

    # Inputs and reference copies are passed as the files to keep.
    files_not_to_delete = [train_fp_file, test_fp_file] + reference_files
    self.clean_up_if_all_checks_passed(specific_files_not_to_delete=files_not_to_delete)