def load_data():
    """Load an ARFF-style train/test pair and binarize the response column.

    Parses "@ATTRIBUTE name TYPE" declarations from the training file into an
    ordered header -> converter map (float for NUMERIC, anything else str),
    converts the data rows of both files, rewrites the response column to "1"
    (original value "3" or "4") or "0", then re-splits the combined rows by
    the module-level `train_percentage`.

    Relies on module-level `train_filename`, `test_filename`, `response`,
    `train_percentage`, and `split_by_percent`.

    Returns:
        (headers, train, test): list of header names plus the two row lists.
    """
    header_types = OrderedDict()
    data = []
    with open(train_filename) as f:
        for line in f:
            if "@ATTRIBUTE" in line:
                _, header, arff_type = line.split()
                header_types[header] = float if arff_type == "NUMERIC" else str
            elif "@RELATION" in line or "@DATA" in line or line == "\n":
                # Structural ARFF lines carry no row data; previously these
                # fell through and were parsed as data rows (same skip as the
                # lowercase-marker loader elsewhere in this file).
                pass
            else:
                row = line[:-1].split(",")  # TODO: naive split; breaks on quoted commas.
                row = [header_types[h](v) for h, v in zip(header_types, row)]
                data.append(row)
    with open(test_filename) as f:
        for line in f:
            if ("@ATTRIBUTE" not in line and "@RELATION" not in line
                    and "@DATA" not in line and line != "\n"):
                row = line[:-1].split(",")  # TODO: naive split; breaks on quoted commas.
                row = [header_types[h](v) for h, v in zip(header_types, row)]
                data.append(row)
    # list() so .index() works: dict.keys() is a view (no .index) on Python 3.
    headers = list(header_types.keys())
    # Translate the response into a binary.
    index = headers.index(response)
    translate = lambda row: row[:index] + ["1" if row[index] in "34" else "0"] + row[index+1:]
    # Materialize: map() is a one-shot iterator on Python 3.
    data = [translate(row) for row in data]
    train, test = split_by_percent(data, train_percentage)
    return headers, train, test
def load_DRP(data):
    """Load the DRP dataset described by the `preloaded[data]` config entry.

    Parses "@attribute name type" declarations from the training file into an
    ordered header -> converter map (float for numeric, anything else str),
    converts the data rows of both the train and test ARFF files, then
    re-splits the combined rows by the configured train percentage.

    Args:
        data: key into the module-level `preloaded` configuration dict.

    Returns:
        (headers, train, test, response_header, features_to_ignore)
    """
    train_filename = preloaded[data]["filepath"]
    test_filename = preloaded[data]["testdata"]
    train_percentage = preloaded[data]["train_percentage"]
    response_header = preloaded[data]["response_header"]
    features_to_ignore = preloaded[data]["features_to_ignore"]
    header_types = OrderedDict()
    rows = []  # distinct name: don't shadow the `data` parameter
    with open(train_filename) as f:
        for line in f:
            if "@attribute" in line:
                # NOTE: loop-local name; the original clobbered the
                # response header variable here, returning the last
                # attribute name instead of the configured response header.
                _, attr_name, arff_type = line.split()
                header_types[attr_name] = float if arff_type == "numeric" else str
            elif "@relation" in line or "@data" in line or line == "\n":
                pass  # structural ARFF lines carry no row data
            else:
                row = line[:-1].split(",")  # TODO: naive split; breaks on quoted commas.
                row = [header_types[h](v) for h, v in zip(header_types, row)]
                rows.append(row)
    with open(test_filename) as f:
        for line in f:
            if ("@attribute" not in line and "@relation" not in line
                    and "@data" not in line and line != "\n"):
                row = line[:-1].split(",")  # TODO: naive split; breaks on quoted commas.
                row = [header_types[h](v) for h, v in zip(header_types, row)]
                rows.append(row)
    # list() for Python 3 (dict views don't index), matching the other loaders.
    headers = list(header_types.keys())
    train, test = split_by_percent(rows, train_percentage)
    return headers, train, test, response_header, features_to_ignore
def load_data(data):
    """Load a preconfigured dataset by name.

    Looks `data` up in the module-level `preloaded` registry, reads its CSV
    file, coerces every column with the configured type converters, and
    produces a train/test split — either by percentage or from a separate
    test file when one is configured.

    Args:
        data: key into the module-level `preloaded` configuration dict.

    Raises:
        KeyError: if `data` names no available dataset.

    Returns:
        (headers, train, test, response_header, features_to_ignore)
    """
    if data not in preloaded:
        raise KeyError("{} is not an available dataset".format(data))

    # DRP ships as ARFF, not CSV; delegate to its dedicated loader.
    if data == "DRP":
        return load_DRP(data)

    config = preloaded[data]
    testdata = config["testdata"]
    correct_types = config["correct_types"]
    train_percentage = config["train_percentage"]
    response_header = config["response_header"]
    features_to_ignore = config["features_to_ignore"]

    with open(config["filepath"]) as f:
        rows = list(csv.reader(f))
    headers = rows.pop(0)

    # Coerce every cell in place with its column's converter.
    for row in rows:
        for j, coerce in enumerate(correct_types):
            row[j] = coerce(row[j])

    if testdata is None:
        train, test = split_by_percent(rows, train_percentage)
    else:
        train = rows
        with open(testdata) as f:
            test = list(csv.reader(f))[1:]  # Ignore headers.
        for row in test:
            for j, coerce in enumerate(correct_types):
                row[j] = coerce(row[j])

    return headers, train, test, response_header, features_to_ignore
def load_data():
    """Load an ARFF-style train/test pair and re-split it by percentage.

    Builds an ordered header -> converter map from "@attribute name type"
    lines in the training file (float for numeric, anything else str),
    converts the data rows of both files, and splits the combined rows by
    the module-level `train_percentage`.

    Relies on module-level `train_filename`, `test_filename`,
    `train_percentage`, and `split_by_percent`.

    Returns:
        (headers, train, test): list of header names plus the two row lists.
    """
    header_types = OrderedDict()
    rows = []

    def is_structural(line):
        # ARFF bookkeeping lines that carry no row data.
        return "@relation" in line or "@data" in line or line == "\n"

    def parse_row(line):
        # TODO: naive comma split; breaks on quoted commas.
        values = line[:-1].split(",")
        return [header_types[h](v) for h, v in zip(header_types, values)]

    with open(train_filename) as f:
        for line in f:
            if "@attribute" in line:
                _, name, arff_type = line.split()
                header_types[name] = float if arff_type == "numeric" else str
            elif not is_structural(line):
                rows.append(parse_row(line))

    with open(test_filename) as f:
        for line in f:
            if "@attribute" not in line and not is_structural(line):
                rows.append(parse_row(line))

    headers = list(header_types.keys())
    train, test = split_by_percent(rows, train_percentage)
    return headers, train, test
def load_data():
    """Generate a synthetic dataset of N rows with a two-class outcome.

    Three informative features (i, 2i, -i), one constant feature, one random
    feature, and an outcome that is "A" for the first half of the rows and
    "B" for the second half. Split into train/test by the module-level
    `train_percentage` via `split_by_percent`.

    Returns:
        (headers, train, test): header names plus the two row lists.
    """
    N = 6000
    headers = [
        "Feature A (i)", "Feature B (2i)", "Feature C (-i)",
        "Constant Feature", "Random Feature", "Outcome"
    ]
    # N // 2: range() requires ints — N / 2 is a float on Python 3 and raises
    # TypeError.
    half = N // 2
    data = [[i, 2*i, -i, 1, random.random(), "A"] for i in range(0, half)] + \
           [[i, 2*i, -i, 1, random.random(), "B"] for i in range(half, N)]
    train, test = split_by_percent(data, train_percentage)
    return headers, train, test
def load_data():
    """Load a CSV dataset, optionally subsample it, and split train/test.

    Reads the module-level `filename`, pops the header row, draws a random
    sample of `max_entries` rows when that setting is truthy, coerces each
    column with the module-level `correct_types` converters, and splits by
    `train_percentage` via `split_by_percent`.

    Returns:
        (headers, train, test): header names plus the two row lists.
    """
    with open(filename) as f:
        rows = list(csv.reader(f))
    headers = rows.pop(0)

    if max_entries:
        # Cap dataset size with a uniform random sample (without replacement).
        rows = random.sample(rows, max_entries)

    # Coerce every cell in place with its column's converter.
    for row in rows:
        for j, coerce in enumerate(correct_types):
            row[j] = coerce(row[j])

    train, test = split_by_percent(rows, train_percentage)
    return headers, train, test
def load_data():
    """Load a private CSV dataset where every column is categorical.

    The data file is not publicly available, so the path is blank here.
    All columns are treated as strings (categorical), then the rows are
    split by the module-level `train_percentage` via `split_by_percent`.

    Returns:
        (headers, train, test): header names plus the two row lists.
    """
    filename = ""  # file not publicly available
    with open(filename) as f:
        rows = list(csv.reader(f))
    headers = rows.pop(0)

    correct_types = [str] * len(headers)  # All categorical.
    for row in rows:
        for j, coerce in enumerate(correct_types):
            row[j] = coerce(row[j])

    train, test = split_by_percent(rows, train_percentage)
    return headers, train, test
def load_data():
    """Load a CSV dataset, bin the age column, and split train/test.

    Reads the module-level `filename`, pops the header row, optionally
    subsamples to `max_entries` rows, coerces each column via the
    module-level `correct_types`, replaces the numeric age column with
    "old"/"young" categories, and splits by `train_percentage`.

    Returns:
        (headers, train, test): header names plus the two row lists.
    """
    with open(filename) as f:
        rows = list(csv.reader(f))
    headers = rows.pop(0)

    if max_entries:
        rows = random.sample(rows, max_entries)

    for row in rows:
        # Coerce in place; the age comparison below therefore sees the
        # already-converted value, as in the original.
        for j, coerce in enumerate(correct_types):
            row[j] = coerce(row[j])
        # Replace the numeric age with "young" and "old" categories.
        # Threshold based on: F. Kamiran and T. Calders. Classifying without discriminating.
        row[AGE_COL] = "old" if row[AGE_COL] > 25 else "young"

    train, test = split_by_percent(rows, train_percentage)
    return headers, train, test
def load_from_file(datafile, testdata=None, correct_types=None,
                   train_percentage=2.0 / 3.0, response_header=None,
                   features_to_ignore=None, missing_data_symbol=""):
    """Load a CSV dataset from an arbitrary path with sensible defaults.

    Reads `datafile`, pops the header row, and coerces every column. When
    `correct_types` is not supplied, converters are inferred from the data
    via the module-level `get_types`. When `testdata` is given, it is read
    as the test set (its header row dropped); otherwise the rows are split
    by `train_percentage` via `split_by_percent`.

    Args:
        datafile: path to the training CSV (first row is the header).
        testdata: optional path to a separate test CSV.
        correct_types: optional list of per-column converter callables.
        train_percentage: train fraction used when `testdata` is None.
        response_header: response column name; defaults to the last header.
        features_to_ignore: feature names to skip; defaults to [].
        missing_data_symbol: token passed to `get_types` for missing cells.

    Returns:
        (headers, train, test, response_header, features_to_ignore)
    """
    with open(datafile) as f:
        rows = list(csv.reader(f))
    headers = rows.pop(0)

    # Set defaults in case they are not handed in as arguments
    if response_header is None:
        response_header = headers[-1]
    if features_to_ignore is None:
        features_to_ignore = []
    if correct_types is None:
        correct_types = get_types(rows, [None] * len(headers),
                                  missing_data_symbol)

    def coerce_all(table):
        # Convert every cell in place with its column's converter.
        for row in table:
            for j, coerce in enumerate(correct_types):
                row[j] = coerce(row[j])

    coerce_all(rows)

    if testdata is None:
        train, test = split_by_percent(rows, train_percentage)
    else:
        train = rows
        with open(testdata) as f:
            test = list(csv.reader(f))[1:]  # Ignore headers.
        coerce_all(test)

    return headers, train, test, response_header, features_to_ignore