def loadLabel(self, filename, verbose=True):
    ''' Get the solution/truth values'''
    if verbose:
        print("========= Reading " + filename)
    start = time.time()
    if self.use_pickle and os.path.exists(
            os.path.join(self.tmp_dir, os.path.basename(filename) + ".pickle")):
        with open(os.path.join(self.tmp_dir,
                               os.path.basename(filename) + ".pickle"), "rb") as pickle_file:
            vprint(verbose, "Loading pickle file : "
                   + os.path.join(self.tmp_dir, os.path.basename(filename) + ".pickle"))
            return pickle.load(pickle_file)
    if 'task' not in self.info.keys():
        self.getTypeProblem(filename)
    # IG: Here change to accommodate the new multiclass label format
    if self.info['task'] == 'multilabel.classification':
        label = data_io.data(filename)
    elif self.info['task'] == 'multiclass.classification':
        label = data_converter.convert_to_num(data_io.data(filename))
    else:
        label = np.ravel(data_io.data(filename))  # get a column vector
        # label = np.array([np.ravel(data_io.data(filename))]).transpose()  # get a column vector
    if self.use_pickle:
        with open(os.path.join(self.tmp_dir,
                               os.path.basename(filename) + ".pickle"), "wb") as pickle_file:
            vprint(verbose, "Saving pickle file : "
                   + os.path.join(self.tmp_dir, os.path.basename(filename) + ".pickle"))
            p = pickle.Pickler(pickle_file)
            p.fast = True
            p.dump(label)
    end = time.time()
    if verbose:
        print("[+] Success in %5.2f sec" % (end - start))
    return label
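# The method above caches parsed labels on disk. Below is a minimal standalone
# sketch of that pickle-caching pattern (hypothetical helper, not part of the
# original class; assumes `tmp_dir` exists and `compute` returns a picklable object).
def load_or_compute_pickled(tmp_dir, key, compute):
    import os
    import pickle
    path = os.path.join(tmp_dir, key + ".pickle")
    if os.path.exists(path):
        with open(path, "rb") as f:   # binary mode, as pickle.load requires
            return pickle.load(f)
    value = compute()
    with open(path, "wb") as f:
        pickler = pickle.Pickler(f)
        pickler.fast = True           # disable memoization for speed, as loadLabel above does
        pickler.dump(value)
    return value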
def loadData(self, filename, verbose=True, replace_missing=True):
    """ Get the data from a text file in one of 3 formats:
        matrix, sparse, binary_sparse
        Potentially does not load the data if it is too large """
    logger.info("Reading %s", filename)
    start = time.time()
    if not os.path.exists(filename):
        return None
    if 'format' not in self.info.keys():
        self.getFormatData(filename)
    if 'feat_num' not in self.info.keys():
        self.getNbrFeatures(filename)
    data_func = {'dense': data_io.data,
                 'sparse': data_io.data_sparse,
                 'sparse_binary': data_io.data_binary_sparse}
    data = data_func[self.info['format']](filename, self.info['feat_num'])
    # IMPORTANT: when we replace missing values we double the number of variables
    if self.info['format'] == 'dense' and replace_missing and np.any(np.isnan(data)):
        vprint(verbose, "Replace missing values by 0 (slow, sorry)")
        data = data_converter.replace_missing(data)
    end = time.time()
    if verbose:
        print("[+] Success in %5.2f sec" % (end - start))
    return data
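# Minimal NumPy sketch of the "replace missing values by 0" step mentioned
# above (illustrative only; the real call is data_converter.replace_missing,
# which, per the comment above, also doubles the number of variables).
def fill_missing_with_zero(X):
    import numpy as np
    X = np.array(X, dtype=float, copy=True)  # work on a copy so the caller's data is untouched
    X[np.isnan(X)] = 0.0                      # zero-fill every NaN entry
    return X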
def getInfo(self, filename, verbose=True):
    ''' Get all information {attribute = value} pairs from the filename
        (public.info file), if it exists, otherwise, output default values'''
    if filename is None:
        basename = self.basename
        input_dir = self.input_dir
    else:
        basename = os.path.basename(filename).rsplit('_')[0]
        input_dir = os.path.dirname(filename)
    if os.path.exists(filename):
        self.getInfoFromFile(filename)
        vprint(verbose, "Info file found : " + os.path.abspath(filename))
        # Finds the data format ('dense', 'sparse', or 'sparse_binary')
        self.getFormatData(os.path.join(input_dir, basename + '_train.data'))
    else:
        vprint(verbose, "Info file NOT found : " + os.path.abspath(filename))
        # Hopefully this never happens because this is done in a very inefficient way
        # reading the data multiple times...
        self.info['usage'] = 'No Info File'
        self.info['name'] = basename
        # Get the data format and sparsity
        self.getFormatData(os.path.join(input_dir, basename + '_train.data'))
        # Assume no categorical variable and no missing value (we'll deal with that later)
        self.info['has_categorical'] = 0
        self.info['has_missing'] = 0
        # Get the target number, label number, target type and task
        self.getTypeProblem(os.path.join(input_dir, basename + '_train.solution'))
        if self.info['task'] == 'regression':
            self.info['metric'] = 'r2_metric'
        else:
            self.info['metric'] = 'auc_metric'
        # Feature type: Numerical, Categorical, or Binary
        # Can also be determined from [filename].type
        self.info['feat_type'] = 'Mixed'
        # Get the number of features and patterns
        self.getNbrFeatures(os.path.join(input_dir, basename + '_train.data'),
                            os.path.join(input_dir, basename + '_test.data'),
                            os.path.join(input_dir, basename + '_valid.data'))
        self.getNbrPatterns(basename, input_dir, 'train')
        self.getNbrPatterns(basename, input_dir, 'valid')
        self.getNbrPatterns(basename, input_dir, 'test')
        # Set default time budget
        self.info['time_budget'] = 600
    return self.info
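# Minimal sketch of reading the "{attribute = value}" pairs that the docstring
# above describes (illustrative only; the class actually delegates this to
# getInfoFromFile, whose exact parsing rules are not shown here).
def read_info_pairs(path):
    info = {}
    with open(path) as f:
        for line in f:
            if '=' in line:
                key, value = line.split('=', 1)
                info[key.strip()] = value.strip().strip("'")
    return info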
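# Hypothetical end-to-end use of the loaders above (assumes they are methods of
# a DataManager-like object `dm`; the constructor and attribute names here are
# illustrative, not taken from this file):
#
#   info    = dm.getInfo(os.path.join(input_dir, basename + '_public.info'))
#   X_train = dm.loadData(os.path.join(input_dir, basename + '_train.data'))
#   y_train = dm.loadLabel(os.path.join(input_dir, basename + '_train.solution'))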
    output_dir = default_output_dir
else:
    input_dir = argv[1]
    output_dir = os.path.abspath(argv[2])

# Move old results and create a new output directory
data_io.mvdir(output_dir, output_dir + '_' + the_date)
data_io.mkdir(output_dir)

#### INVENTORY DATA (and sort dataset names alphabetically)
datanames = data_io.inventory_data(input_dir)

#### DEBUG MODE: Show dataset list and STOP
if debug_mode >= 3:
    data_io.show_io(input_dir, output_dir)
    print('\n****** Sample code version ' + str(version) + ' ******\n\n'
          + '========== DATASETS ==========\n')
    data_io.write_list(datanames)
    datanames = []  # Do not proceed with learning and testing

# ==================== @RESULT SUBMISSION (KEEP THIS) =====================
# Always keep this code to enable result submission of pre-calculated results
# deposited in the res/ subdirectory.
if len(datanames) > 0:
    vprint(verbose, "************************************************************************")
    vprint(verbose, "****** Attempting to copy files (from res/) for RESULT submission ******")
    vprint(verbose, "************************************************************************")
    OK = data_io.copy_results(datanames, res_dir, output_dir, verbose)  # DO NOT REMOVE!
    if OK:
        vprint(verbose, "[+] Success")
        datanames = []  # Do not proceed with learning and testing
    else:
        vprint(verbose, "======== Some missing results on current datasets!")
        vprint(verbose, "======== Proceeding to train/test:\n")
# =================== End @RESULT SUBMISSION (KEEP THIS) ==================

if zipme and not running_on_codalab:
    vprint(verbose, "========= Zipping this directory to prepare for submit ==============")
    ignoredirs = [os.path.abspath(x) for x in glob.glob('./output_*')]