def __init__(self, basename, input_dir, verbose=False, replace_missing=True,
             filter_features=False, only_info=False):
    '''Locate the dataset, read its info file and, unless only_info is set, load it.

    Parameters:
        basename: dataset name, used as the prefix of every file that is read
            (basename + '_public.info', basename + '_train.data', ...).
        input_dir: directory to read from. It is used as is when basename
            occurs in it (substring test); otherwise the files are looked up
            in the basename sub-directory of input_dir.
        verbose: forwarded to loadType, loadData, loadLabel and to
            data_converter.tp_filter.
        replace_missing: forwarded to loadData for the train, validation and
            test matrices.
        filter_features: when true, keep only the columns returned by
            data_converter.tp_filter (feat_num is capped at 1000).
        only_info: when true, return right after getInfo has been called;
            feat_type, data and feat_idx are then left unset.

    Attributes set: use_pickle, basename, input_dir, info and, unless
    only_info is true, feat_type, data and feat_idx. tmp_dir is only set when
    use_pickle is true. data always receives 'X_train' and 'Y_train';
    'X_valid', 'X_test' and 'Y_test' are stored only when the corresponding
    loader call did not return None.
    '''
    self.use_pickle = False  # Turn this to true to save data as pickle (inefficient)
    self.basename = basename
    if basename in input_dir:
        self.input_dir = input_dir
    else:
        # The empty last component keeps the trailing separator that the
        # previous string concatenation produced.
        self.input_dir = os.path.join(input_dir, basename, "")

    if self.use_pickle:
        # Reuse an existing scratch directory, else create one locally.
        if os.path.exists("tmp"):
            self.tmp_dir = "tmp"
        elif os.path.exists("../tmp"):
            self.tmp_dir = "../tmp"
        else:
            os.makedirs("tmp")
            self.tmp_dir = "tmp"

    def dataset_file(suffix):
        # Path of one dataset file inside the resolved input directory.
        return os.path.join(self.input_dir, basename + suffix)

    self.info = {}
    self.getInfo(dataset_file('_public.info'))

    # Check to see if we should do anything other than gather info.
    if only_info:
        return

    self.feat_type = self.loadType(dataset_file('_feat.type'), verbose=verbose)
    self.data = {}
    Xtr = self.loadData(dataset_file('_train.data'), verbose=verbose,
                        replace_missing=replace_missing)
    Ytr = self.loadLabel(dataset_file('_train.solution'), verbose=verbose)
    Xva = self.loadData(dataset_file('_valid.data'), verbose=verbose,
                        replace_missing=replace_missing)
    Xte = self.loadData(dataset_file('_test.data'), verbose=verbose,
                        replace_missing=replace_missing)
    # NOTE(review): the test solution is read with loadData, whereas the
    # training solution goes through loadLabel -- confirm this is intended.
    Yte = self.loadData(dataset_file('_test.solution'), verbose=verbose)

    # Normally, feature selection should be done as part of a pipeline.
    # However, here we do it as a preprocessing step for efficiency reasons.
    idx = []
    if filter_features:  # ad hoc feature selection, for the example...
        fn = min(Xtr.shape[1], 1000)
        idx = data_converter.tp_filter(Xtr, Ytr, feat_num=fn, verbose=verbose)
        Xtr = Xtr[:, idx]
        if Xva is not None:
            Xva = Xva[:, idx]
        if Xte is not None:
            Xte = Xte[:, idx]
    # Stays an empty array when no filtering was requested.
    self.feat_idx = np.array(idx).ravel()

    self.data['X_train'] = Xtr
    self.data['Y_train'] = Ytr
    if Xva is not None:
        self.data['X_valid'] = Xva
    if Xte is not None:
        self.data['X_test'] = Xte
    if Yte is not None:
        self.data['Y_test'] = Yte
def __init__(self, basename="", input_dir="", verbose=False, replace_missing=True,
             filter_features=False, max_samples=float('inf')):
    '''Locate the dataset, read its info and feature types, and load its data.

    Parameters:
        basename: dataset name, used as the prefix of every file that is read
            (basename + '_public.info', basename + '_train.data', ...).
        input_dir: directory to read from. It is used as is when basename
            occurs in it (substring test) or when it directly contains
            basename + '_train.data'; otherwise the basename sub-directory
            of input_dir is used.
        verbose: forwarded to loadType, loadData, loadLabel and to
            data_converter.tp_filter.
        replace_missing: forwarded to loadData for the train, validation and
            test matrices.
        filter_features: when true, keep only the columns returned by
            data_converter.tp_filter (feat_num is capped at 1000).
        max_samples: upper bound on the number of leading training rows
            (and labels) that are kept; the default keeps all of them.

    Attributes set: use_pickle, basename, input_dir, info, feat_type, data
    and feat_idx. tmp_dir is only set when use_pickle is true. data receives
    'X_train', 'Y_train', 'X_valid' and 'X_test'; 'Y_valid' and 'Y_test' are
    added only when GOD_VIEW is truthy and both solutions load without error.
    '''
    self.use_pickle = False  # Turn this to true to save data as pickle (inefficient)
    self.basename = basename
    if basename in input_dir or os.path.isfile(
            os.path.join(input_dir, basename + '_train.data')):
        self.input_dir = input_dir
    else:
        self.input_dir = os.path.join(input_dir, basename)

    if self.use_pickle:
        # Reuse an existing scratch directory, else create one locally.
        if os.path.exists("tmp"):
            self.tmp_dir = "tmp"
        elif os.path.exists("../tmp"):
            self.tmp_dir = "../tmp"
        else:
            os.makedirs("tmp")
            self.tmp_dir = "tmp"

    def dataset_file(suffix):
        # Path of one dataset file inside the resolved input directory.
        return os.path.join(self.input_dir, basename + suffix)

    self.info = {}
    self.getInfo(dataset_file('_public.info'))
    self.feat_type = self.loadType(dataset_file('_feat.type'), verbose=verbose)
    self.data = {}

    Xtr = self.loadData(dataset_file('_train.data'), verbose=verbose,
                        replace_missing=replace_missing)
    Ytr = self.loadLabel(dataset_file('_train.solution'), verbose=verbose)
    # min() already yields an int for the default float('inf'); int() also
    # makes a finite float argument usable as a slice bound.
    n_keep = int(min(Xtr.shape[0], max_samples))
    Xtr = Xtr[:n_keep]
    Ytr = Ytr[:n_keep]
    Xva = self.loadData(dataset_file('_valid.data'), verbose=verbose,
                        replace_missing=replace_missing)
    Xte = self.loadData(dataset_file('_test.data'), verbose=verbose,
                        replace_missing=replace_missing)

    # Normally, feature selection should be done as part of a pipeline.
    # However, here we do it as a preprocessing step for efficiency reasons.
    idx = []
    if filter_features:  # ad hoc feature selection, for the example...
        fn = min(Xtr.shape[1], 1000)
        idx = data_converter.tp_filter(Xtr, Ytr, feat_num=fn, verbose=verbose)
        Xtr = Xtr[:, idx]
        # Leave a matrix that is None untouched instead of failing on the index.
        if Xva is not None:
            Xva = Xva[:, idx]
        if Xte is not None:
            Xte = Xte[:, idx]
    # Stays an empty array when no filtering was requested.
    self.feat_idx = np.array(idx).ravel()

    self.data['X_train'] = Xtr
    self.data['Y_train'] = Ytr
    self.data['X_valid'] = Xva
    self.data['X_test'] = Xte

    if GOD_VIEW:
        # Best effort: the solutions are stored only if both of them load.
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit still propagate.
        try:
            Yva = self.loadLabel(dataset_file('_valid.solution'), verbose=verbose)
            Yte = self.loadLabel(dataset_file('_test.solution'), verbose=verbose)
        except Exception:
            print("Sadly you are not really the god so can't load solutions for validation and test.")
        else:
            self.data['Y_valid'] = Yva
            self.data['Y_test'] = Yte
def __init__(self, basename, input_dir, verbose=False, replace_missing=True,
             filter_features=False, only_info=False):
    '''Locate the dataset, read its info file and, unless only_info is set, load it.

    Parameters:
        basename: dataset name, used as the prefix of every file that is read
            (basename + '_public.info', basename + '_train.data', ...).
        input_dir: directory to read from. It is used as is when basename
            occurs in it (substring test); otherwise the files are looked up
            in the basename sub-directory of input_dir.
        verbose: forwarded to loadType, loadData, loadLabel and to
            data_converter.tp_filter.
        replace_missing: forwarded to every loadData call.
        filter_features: when true, keep only the columns returned by
            data_converter.tp_filter (feat_num is capped at 1000).
        only_info: when true, return right after getInfo has been called;
            feat_type, data and feat_idx are then left unset.

    Attributes set: use_pickle, basename, input_dir, info and, unless
    only_info is true, feat_type, data and feat_idx. tmp_dir is only set when
    use_pickle is true. data always receives 'X_train' and 'Y_train';
    'X_valid' and 'X_test' are stored only when loadData did not return None
    for them.
    '''
    self.use_pickle = False  # Turn this to true to save data as pickle (inefficient)
    self.basename = basename
    if basename in input_dir:
        self.input_dir = input_dir
    else:
        # The empty last component keeps the trailing separator that the
        # previous string concatenation produced.
        self.input_dir = os.path.join(input_dir, basename, "")

    if self.use_pickle:
        # Reuse an existing scratch directory, else create one locally.
        if os.path.exists("tmp"):
            self.tmp_dir = "tmp"
        elif os.path.exists("../tmp"):
            self.tmp_dir = "../tmp"
        else:
            os.makedirs("tmp")
            self.tmp_dir = "tmp"

    def dataset_file(suffix):
        # Path of one dataset file inside the resolved input directory.
        return os.path.join(self.input_dir, basename + suffix)

    self.info = {}
    self.getInfo(dataset_file('_public.info'))

    # Check to see if we should do anything other than gather info.
    if only_info:
        return

    self.feat_type = self.loadType(dataset_file('_feat.type'), verbose=verbose)
    self.data = {}
    Xtr = self.loadData(dataset_file('_train.data'), verbose=verbose,
                        replace_missing=replace_missing)
    Ytr = self.loadLabel(dataset_file('_train.solution'), verbose=verbose)
    Xva = self.loadData(dataset_file('_valid.data'), verbose=verbose,
                        replace_missing=replace_missing)
    Xte = self.loadData(dataset_file('_test.data'), verbose=verbose,
                        replace_missing=replace_missing)

    # Normally, feature selection should be done as part of a pipeline.
    # However, here we do it as a preprocessing step for efficiency reasons.
    idx = []
    if filter_features:  # ad hoc feature selection, for the example...
        fn = min(Xtr.shape[1], 1000)
        idx = data_converter.tp_filter(Xtr, Ytr, feat_num=fn, verbose=verbose)
        Xtr = Xtr[:, idx]
        if Xva is not None:
            Xva = Xva[:, idx]
        if Xte is not None:
            Xte = Xte[:, idx]
    # Stays an empty array when no filtering was requested.
    self.feat_idx = np.array(idx).ravel()

    self.data['X_train'] = Xtr
    self.data['Y_train'] = Ytr
    if Xva is not None:
        self.data['X_valid'] = Xva
    if Xte is not None:
        self.data['X_test'] = Xte
def __init__(self, basename="", input_dir="", verbose=False, replace_missing=True,
             filter_features=False, max_samples=float('inf')):
    '''Locate the dataset, read its info and feature types, and load its data.

    Parameters:
        basename: dataset name, used as the prefix of every file that is read
            (basename + '_public.info', basename + '_train.data', ...).
        input_dir: directory to read from. It is used as is when basename
            occurs in it (substring test) or when it directly contains
            basename + '_train.data'; otherwise the basename sub-directory
            of input_dir is used.
        verbose: forwarded to loadType, loadData, loadLabel and to
            data_converter.tp_filter.
        replace_missing: forwarded to every loadData call.
        filter_features: when true, keep only the columns returned by
            data_converter.tp_filter (feat_num is capped at 1000).
        max_samples: upper bound on the number of leading training rows
            (and labels) that are kept; the default keeps all of them.

    Attributes set: use_pickle, basename, input_dir, info, feat_type, data
    (keys 'X_train', 'Y_train', 'X_valid', 'X_test') and feat_idx. tmp_dir is
    only set when use_pickle is true.
    '''
    self.use_pickle = False  # Turn this to true to save data as pickle (inefficient)
    self.basename = basename
    if basename in input_dir or os.path.isfile(
            os.path.join(input_dir, basename + '_train.data')):
        self.input_dir = input_dir
    else:
        self.input_dir = os.path.join(input_dir, basename)

    if self.use_pickle:
        # Reuse an existing scratch directory, else create one locally.
        if os.path.exists("tmp"):
            self.tmp_dir = "tmp"
        elif os.path.exists("../tmp"):
            self.tmp_dir = "../tmp"
        else:
            os.makedirs("tmp")
            self.tmp_dir = "tmp"

    def dataset_file(suffix):
        # Path of one dataset file inside the resolved input directory.
        return os.path.join(self.input_dir, basename + suffix)

    self.info = {}
    self.getInfo(dataset_file('_public.info'))
    self.feat_type = self.loadType(dataset_file('_feat.type'), verbose=verbose)
    self.data = {}

    Xtr = self.loadData(dataset_file('_train.data'), verbose=verbose,
                        replace_missing=replace_missing)
    Ytr = self.loadLabel(dataset_file('_train.solution'), verbose=verbose)
    # min() already yields an int for the default float('inf'); int() also
    # makes a finite float argument usable as a slice bound.
    n_keep = int(min(Xtr.shape[0], max_samples))
    Xtr = Xtr[:n_keep]
    Ytr = Ytr[:n_keep]
    Xva = self.loadData(dataset_file('_valid.data'), verbose=verbose,
                        replace_missing=replace_missing)
    Xte = self.loadData(dataset_file('_test.data'), verbose=verbose,
                        replace_missing=replace_missing)

    # Normally, feature selection should be done as part of a pipeline.
    # However, here we do it as a preprocessing step for efficiency reasons.
    idx = []
    if filter_features:  # ad hoc feature selection, for the example...
        fn = min(Xtr.shape[1], 1000)
        idx = data_converter.tp_filter(Xtr, Ytr, feat_num=fn, verbose=verbose)
        Xtr = Xtr[:, idx]
        # Leave a matrix that is None untouched instead of failing on the index.
        if Xva is not None:
            Xva = Xva[:, idx]
        if Xte is not None:
            Xte = Xte[:, idx]
    # Stays an empty array when no filtering was requested.
    self.feat_idx = np.array(idx).ravel()

    self.data['X_train'] = Xtr
    self.data['Y_train'] = Ytr
    self.data['X_valid'] = Xva
    self.data['X_test'] = Xte
def __init__(self, basename="", input_dir="", verbose=False, replace_missing=True,
             filter_features=False, max_samples=float('inf')):
    '''Locate the dataset, read its info and feature types, and load its data.

    Parameters:
        basename: dataset name, used as the prefix of every file that is read
            (basename + '_public.info', basename + '_train.data', ...).
        input_dir: directory to read from. It is used as is when basename
            occurs in it (substring test); otherwise the basename
            sub-directory of input_dir is used.
        verbose: forwarded to loadType, loadData, loadLabel and to
            data_converter.tp_filter.
        replace_missing: forwarded to every loadData call.
        filter_features: when true, run data_converter.tp_filter with
            feat_num capped at 100 and, if that cap is reached, replace the
            matrices by their projection onto 100 PCA components fitted on
            the training matrix.
        max_samples: upper bound on the number of leading training rows
            (and labels) that are kept; the default keeps all of them.

    Attributes set: use_pickle, basename, input_dir, info, feat_type, data
    (keys 'X_train', 'Y_train', 'X_valid', 'X_test') and feat_idx. tmp_dir is
    only set when use_pickle is true.
    '''
    self.use_pickle = False  # Turn this to true to save data as pickle (inefficient)
    self.basename = basename
    if basename in input_dir:
        self.input_dir = input_dir
    else:
        self.input_dir = os.path.join(input_dir, basename)

    if self.use_pickle:
        # Reuse an existing scratch directory, else create one locally.
        if os.path.exists("tmp"):
            self.tmp_dir = "tmp"
        elif os.path.exists("../tmp"):
            self.tmp_dir = "../tmp"
        else:
            os.makedirs("tmp")
            self.tmp_dir = "tmp"

    def dataset_file(suffix):
        # Path of one dataset file inside the resolved input directory.
        return os.path.join(self.input_dir, basename + suffix)

    self.info = {}
    self.getInfo(dataset_file('_public.info'))
    self.feat_type = self.loadType(dataset_file('_feat.type'), verbose=verbose)
    self.data = {}

    Xtr = self.loadData(dataset_file('_train.data'), verbose=verbose,
                        replace_missing=replace_missing)
    Ytr = self.loadLabel(dataset_file('_train.solution'), verbose=verbose)
    # min() already yields an int for the default float('inf'); int() also
    # makes a finite float argument usable as a slice bound.
    n_keep = int(min(Xtr.shape[0], max_samples))
    Xtr = Xtr[:n_keep]
    Ytr = Ytr[:n_keep]
    Xva = self.loadData(dataset_file('_valid.data'), verbose=verbose,
                        replace_missing=replace_missing)
    Xte = self.loadData(dataset_file('_test.data'), verbose=verbose,
                        replace_missing=replace_missing)

    # Normally, feature selection should be done as part of a pipeline.
    # However, here we do it as a preprocessing step for efficiency reasons.
    idx = []
    if filter_features:  # ad hoc feature selection, for the example...
        fn = min(Xtr.shape[1], 100)
        # NOTE(review): idx only ends up in self.feat_idx; the selected
        # columns are never applied to the matrices in this variant.
        idx = data_converter.tp_filter(Xtr, Ytr, feat_num=fn, verbose=verbose)
        # Perform a PCA once the cap of 100 is reached.
        # NOTE(review): this also fires for exactly 100 features, although
        # the original comment said "> 100" -- confirm which one is meant.
        if fn == 100:
            pca = PCA(n_components=int(fn))
            Xtr = pca.fit_transform(Xtr)
            # Project the other splits with the basis fitted on the training
            # data; a matrix that is None is left untouched.
            if Xva is not None:
                Xva = pca.transform(Xva)
            if Xte is not None:
                Xte = pca.transform(Xte)
    # Stays an empty array when no filtering was requested.
    self.feat_idx = np.array(idx).ravel()

    self.data['X_train'] = Xtr
    self.data['Y_train'] = Ytr
    self.data['X_valid'] = Xva
    self.data['X_test'] = Xte