def convert(raw_dir, max_features):
    """Return the converted dataset with two uninformative features dropped."""
    converted = util.convert_uci_classif(info, raw_dir, file_name, y_first=True)
    # Feature 15 is constant, hence carries no information.
    # Feature 4 dominates (per random-forest importance); with it the task is
    # so easy that most decent learners classify the test set perfectly.
    return util.remove_features(converted, [4, 15])
def convert(raw_dir, max_features):
    """Return a dictionary containing the required fields for the dataset."""
    dataset_dict = util.convert_uci_classif(
        info, raw_dir, file_name_list, delimiter=' ')
    return dataset_dict
def convert(raw_dir, max_features):
    """Return a dictionary containing the required fields for the dataset."""
    # Column 0 holds a date string; convert it with the shared date helper.
    date_converters = {0: util.convert_date}
    return util.convert_uci_classif(
        info, raw_dir, file_name, converters=date_converters)
def convert(raw_dir, max_features):
    """Extract the census archive and convert both splits to a dataset dict.

    Loads the train and test files of the Census-Income (KDD) dataset,
    dropping feature 24 (the instance weight, which is not a predictor).
    """
    util.untar(raw_dir, file_name)
    # BUG FIX: the second entry previously duplicated "census-income.data",
    # so the test split ("census-income.test") was never loaded and the
    # training data appeared twice.
    file_name_list = ["census-income.data", "census-income.test"]
    # We remove feature 24 (instance weight).
    # list(...) + list(...) keeps this valid on both Python 2 and Python 3;
    # on Python 3, range objects cannot be concatenated with `+`.
    columns = list(range(24)) + list(range(25, 42))
    return util.convert_uci_classif(info, raw_dir, file_name_list,
                                    delimiter=", ", usecols=columns)
def convert(raw_dir, max_features):
    """Return the info dictionary augmented with the loaded 'x' and 'y' data.

    NOTE(review): unlike the sibling converters, this one passes the type
    fields rather than the whole ``info`` dict to ``convert_uci_classif`` —
    confirm the intended helper signature before refactoring.
    """
    x, y = util.convert_uci_classif(
        info['x_type'], info['y_type'], raw_dir, 'anneal.data')
    info['x'] = x
    info['y'] = y
    return info
def convert(raw_dir, max_features):
    """Return a dictionary containing the required fields for the dataset."""
    prove_dir = os.path.join(raw_dir, 'ml-prove')
    split_files = ['test.csv', 'validation.csv', 'train.csv']
    converted = util.convert_uci_classif(info, prove_dir, split_files)
    # Drop features 51 and 53 (reason not documented upstream).
    return util.remove_features(converted, [51, 53])
def convert(raw_dir, max_features):
    """Return a dictionary containing the required fields for the dataset."""
    # The archive unpacks its three CSV splits under the 'ml-prove' folder.
    data_dir = os.path.join(raw_dir, 'ml-prove')
    dataset_dict = util.convert_uci_classif(
        info, data_dir, ['test.csv', 'validation.csv', 'train.csv'])
    # Features 51 and 53 are excluded (reason not documented upstream).
    return util.remove_features(dataset_dict, [51, 53])
def convert(raw_dir, max_features):
    """Extract the PAMAP2 archive and convert the protocol recordings.

    Loads the nine subject files, subsampling every 4th row (stride=4),
    with the label in the first column.
    """
    util.unzip(raw_dir, file_name)
    data_dir = os.path.join(raw_dir, "PAMAP2_Dataset", "Protocol")
    file_name_list = ["subject10%d.dat" % x for x in range(1, 10)]
    # We remove features 0 (timestamp), 16-19, 33-36 and 50-53 (invalid).
    # BUG FIX: `range(a, b) + range(c, d)` only works on Python 2; wrapping
    # each range in list() keeps the concatenation valid on Python 3 too.
    columns = list(range(1, 16)) + list(range(20, 33)) + list(range(37, 50))
    return util.convert_uci_classif(info, data_dir, file_name_list, stride=4,
                                    delimiter=" ", y_first=True,
                                    usecols=columns)
def convert(raw_dir, max_features):
    """Load the dataset and strip two unhelpful features before returning it."""
    ds = util.convert_uci_classif(info, raw_dir, file_name, y_first=True)
    # Feature 15 never varies, so it is useless.
    # Feature 4 is so predictive (random-forest importance) that keeping it
    # makes the test set trivially separable for most decent learners.
    return util.remove_features(ds, [4, 15])
def convert(raw_dir, max_features):
    """Return a dictionary containing the required fields for the dataset."""
    # Column 0 is skipped; columns 1 through 31 are kept as features,
    # with the label stored first in each row.
    return util.convert_uci_classif(info, raw_dir, file_name, y_first=True,
                                    usecols=range(1, 32))
def convert(raw_dir, max_features):
    """Subsample the gzipped KDD Cup data, then convert the reduced file.

    Only every 10th line is kept, because the full dataset fails to load
    in memory (using numpy.loadtxt with str data type).
    """
    file_name_ = "kddcup_sub.data"
    with gzip.open( path.join(raw_dir, file_name ), 'r') as fd_read:
        with open( path.join(raw_dir, file_name_), 'w') as fd_write:
            for i,line in enumerate(fd_read):
                # Keep one line in ten to shrink the working set tenfold.
                if i%10 == 0:
                    fd_write.write( line )
    # NOTE(review): gzip.open(..., 'r') yields bytes on Python 3, which
    # would break the text-mode write below — this presumably targets
    # Python 2; confirm before porting.
    return util.convert_uci_classif(info, raw_dir, file_name_)
def convert(raw_dir, max_features):
    """Return a dictionary containing the required fields for the dataset.

    In the raw data file a single instance is split over two physical
    lines; a repaired one-instance-per-line copy is written first, then
    converted.
    """
    fixed_file_name = file_name + '2'
    with open(path.join(raw_dir, file_name)) as f_in:
        with open(path.join(raw_dir, fixed_file_name), 'w') as f_out:
            for raw_line in f_in.readlines():
                raw_line = raw_line.strip()
                f_out.write(raw_line)
                # A trailing comma means the record continues on the next
                # physical line, so the newline is withheld until then.
                if len(raw_line) > 0 and raw_line[-1] != ',':
                    f_out.write('\n')
    return util.convert_uci_classif(info, raw_dir, fixed_file_name)
def convert(raw_dir, max_features):
    """Return a dictionary containing the required fields for the dataset.

    One instance spans two lines in the raw file; this writes a repaired
    copy (suffix '2') where each instance occupies one line, then converts
    that copy.
    """
    fixed_file_name = file_name + '2'
    with open(path.join(raw_dir, file_name)) as f_in:
        with open(path.join(raw_dir, fixed_file_name), 'w') as f_out:
            for line in f_in:
                stripped = line.strip()
                f_out.write(stripped)
                # Lines ending in ',' continue onto the next line, so the
                # newline is only emitted once the record is complete.
                if stripped and not stripped.endswith(','):
                    f_out.write('\n')
    return util.convert_uci_classif(info, raw_dir, fixed_file_name)
def convert(raw_dir, max_features):
    """Return a dictionary containing the required fields for the dataset."""
    dataset_dict = util.convert_uci_classif(info, raw_dir, file_name)
    return dataset_dict
def convert(raw_dir, max_features):
    """Convert the raw UCI file into the standard dataset dictionary."""
    return util.convert_uci_classif(info, raw_dir, file_name)
def convert(raw_dir, max_features):
    """Return a dictionary containing the required fields for the dataset."""
    # Skip column 0; use columns 1-31 as features (label comes first).
    kept_columns = range(1, 32)
    dataset_dict = util.convert_uci_classif(
        info, raw_dir, file_name, y_first=True, usecols=kept_columns)
    return dataset_dict