# Example 1
def load_data():
  """Load ARFF-style train/test files, binarize the response, and split.

  Reads module-level ``train_filename`` and ``test_filename``: lines
  containing "@ATTRIBUTE" declare a column name and type (NUMERIC -> float,
  anything else -> str); every other line is a comma-separated data row.
  The ``response`` column is rewritten to "1" when its value is contained
  in "34", else "0".

  Returns:
    (headers, train, test): list of column names plus the typed rows split
    by the module-level ``train_percentage`` via ``split_by_percent``.
  """
  header_types = OrderedDict()
  data = []
  with open(train_filename) as f:
    for line in f:
      if "@ATTRIBUTE" in line:
        _, header, arff_type = line.split()
        header_types[header] = float if arff_type == "NUMERIC" else str
      else:
        # TODO: This is a naive way of splitting (breaks on quoted commas).
        row = line[:-1].split(",")
        row = [header_types[h](v) for h, v in zip(header_types, row)]
        data.append(row)

  with open(test_filename) as f:
    for line in f:
      if "@ATTRIBUTE" not in line:
        # TODO: This is a naive way of splitting (breaks on quoted commas).
        row = line[:-1].split(",")
        row = [header_types[h](v) for h, v in zip(header_types, row)]
        data.append(row)

  # list(...) so .index() below works on Python 3 (dict_keys has no .index).
  headers = list(header_types.keys())

  # Translate the response into a binary.
  index = headers.index(response)
  translate = lambda row: row[:index] + ["1" if row[index] in "34" else "0"] + row[index+1:]
  # list(...) because map() is lazy on Python 3; downstream code expects a list.
  data = list(map(translate, data))

  train, test = split_by_percent(data, train_percentage)

  return headers, train, test
# Example 2
def load_DRP(data):
    """Load the DRP dataset described by ``preloaded[data]`` (ARFF format).

    "@attribute" lines declare column names/types (numeric -> float, else
    str); "@relation"/"@data" markers and blank lines are skipped; all other
    lines are comma-separated data rows.

    Returns:
      (headers, train, test, response_header, features_to_ignore)
    """
    train_filename = preloaded[data]["filepath"]
    test_filename = preloaded[data]["testdata"]
    train_percentage = preloaded[data]["train_percentage"]
    header = preloaded[data]["response_header"]
    features_to_ignore = preloaded[data]["features_to_ignore"]

    header_types = OrderedDict()
    data = []  # NOTE: shadows the ``data`` parameter from here on.
    with open(train_filename) as f:
        for line in f:
            if "@attribute" in line:
                # BUG FIX: loop variable was previously named ``header``,
                # clobbering the response header loaded above, so the
                # returned ``header`` was the last attribute name.
                _, attr_name, arff_type = line.split()
                header_types[attr_name] = float if arff_type == "numeric" else str
            elif "@relation" in line or "@data" in line or line == "\n":
                pass
            else:
                row = line[:-1].split(
                    ",")  #TODO: This is a naive way of splitting, captain.
                row = [header_types[h](v) for h, v in zip(header_types, row)]
                data.append(row)

    with open(test_filename) as f:
        for line in f:
            if "@attribute" not in line and "@relation" not in line and "@data" not in line and line != "\n":
                row = line[:-1].split(
                    ",")  #TODO: This is a naive way of splitting, captain.
                row = [header_types[h](v) for h, v in zip(header_types, row)]
                data.append(row)

    # list(...) so callers receive an indexable list on Python 3.
    headers = list(header_types.keys())

    train, test = split_by_percent(data, train_percentage)

    return headers, train, test, header, features_to_ignore
# Example 3
def load_data(data):
    """Load a named preloaded dataset, dispatching "DRP" to ``load_DRP``.

    Looks up ``data`` in the module-level ``preloaded`` registry, reads the
    CSV file(s) it points at, coerces each column with ``correct_types``,
    and splits into train/test (either by percentage or via a separate
    test file).

    Returns:
      (headers, train, test, response_header, features_to_ignore)

    Raises:
      KeyError: if ``data`` is not a known dataset name.
    """
    if data not in preloaded:
        raise KeyError("{} is not an available dataset".format(data))
    if data == "DRP":
        return load_DRP(data)

    entry = preloaded[data]
    filename = entry["filepath"]
    testdata = entry["testdata"]
    correct_types = entry["correct_types"]
    train_percentage = entry["train_percentage"]
    response_header = entry["response_header"]
    features_to_ignore = entry["features_to_ignore"]

    with open(filename) as infile:
        data = list(csv.reader(infile))
        headers = data.pop(0)

        # Coerce each column in place with its declared type.
        for record in data:
            for col, cast in enumerate(correct_types):
                record[col] = cast(record[col])

        if testdata is None:
            train, test = split_by_percent(data, train_percentage)
        else:
            train = data
            with open(testdata) as testfile:
                test = list(csv.reader(testfile))[1:]  # Ignore headers.
                for record in test:
                    for col, cast in enumerate(correct_types):
                        record[col] = cast(record[col])
    return headers, train, test, response_header, features_to_ignore
# Example 4
def load_data():
    """Parse the ARFF train and test files into typed rows and split them.

    "@attribute" lines declare column names/types (numeric -> float, else
    str); "@relation"/"@data" markers and blank lines are skipped; all
    other lines are comma-separated data rows. Uses the module-level
    ``train_filename``, ``test_filename`` and ``train_percentage``.

    Returns:
      (headers, train, test)
    """
    header_types = OrderedDict()
    rows = []

    with open(train_filename) as fh:
        for line in fh:
            if "@attribute" in line:
                _, name, arff_kind = line.split()
                header_types[name] = float if arff_kind == "numeric" else str
            elif "@relation" in line or "@data" in line or line == "\n":
                continue
            else:
                # TODO: naive comma split; breaks on quoted commas.
                fields = line[:-1].split(",")
                rows.append([header_types[h](v)
                             for h, v in zip(header_types, fields)])

    with open(test_filename) as fh:
        for line in fh:
            is_marker = ("@attribute" in line or "@relation" in line
                         or "@data" in line or line == "\n")
            if not is_marker:
                # TODO: naive comma split; breaks on quoted commas.
                fields = line[:-1].split(",")
                rows.append([header_types[h](v)
                             for h, v in zip(header_types, fields)])

    headers = list(header_types.keys())

    # Response binarization is intentionally disabled in this variant.

    train, test = split_by_percent(rows, train_percentage)

    return headers, train, test
# Example 5
def load_data():
    """Generate a synthetic 6000-row dataset and split it into train/test.

    Rows in the first half are labeled "A", the second half "B". Features
    are linear functions of the row index plus one constant and one random
    column. Splitting uses the module-level ``train_percentage``.

    Returns:
      (headers, train, test)
    """
    N = 6000
    headers = [
        "Feature A (i)", "Feature B (2i)", "Feature C (-i)",
        "Constant Feature", "Random Feature", "Outcome"
    ]

    # BUG FIX: N // 2 (integer division) — range() rejects the float that
    # N / 2 produces on Python 3.
    half = N // 2
    data = [[i, 2*i, -i, 1, random.random(), "A"] for i in range(0, half)] + \
            [[i, 2*i, -i, 1, random.random(), "B"] for i in range(half, N)]

    train, test = split_by_percent(data, train_percentage)

    return headers, train, test
# Example 6
def load_data():
    """Load the module-level CSV ``filename``, type the columns, and split.

    Optionally subsamples to ``max_entries`` rows (when truthy) before
    coercing each column with ``correct_types`` and splitting by
    ``train_percentage``.

    Returns:
      (headers, train, test)
    """
    with open(filename) as source:
        data = list(csv.reader(source))
        headers = data.pop(0)

        # Cap the dataset size with a random subsample when requested.
        if max_entries:
            data = random.sample(data, max_entries)

        # Coerce each column in place with its declared type.
        for record in data:
            for col, cast in enumerate(correct_types):
                record[col] = cast(record[col])

        train, test = split_by_percent(data, train_percentage)

    return headers, train, test
# Example 7
def load_data():
    """Load a private CSV dataset where every column is categorical.

    The file path is blank because the data is not publicly available;
    calling this without filling it in will fail at ``open``.

    Returns:
      (headers, train, test)
    """
    filename = ""  # file not publicly available
    with open(filename) as source:
        data = list(csv.reader(source))
        headers = data.pop(0)

        # Every column is categorical, so coerce each value to str.
        casts = [str] * len(headers)
        for record in data:
            for col, cast in enumerate(casts):
                record[col] = cast(record[col])

        train, test = split_by_percent(data, train_percentage)

    return headers, train, test
# Example 8
def load_data():
    """Load the module-level CSV ``filename``, bucket age, and split.

    Optionally subsamples to ``max_entries`` rows, coerces each column via
    ``correct_types``, replaces the numeric age column with "old"/"young"
    categories, and splits by ``train_percentage``.

    Returns:
      (headers, train, test)
    """
    with open(filename) as source:
        data = list(csv.reader(source))
        headers = data.pop(0)

        if max_entries:
            data = random.sample(data, max_entries)

        for record in data:
            # Type conversion mutates the record in place, so the age
            # comparison below sees the already-converted value.
            for col, cast in enumerate(correct_types):
                record[col] = cast(record[col])

            # Replace the numeric age with "young" and "old" categories.
            # Threshold based on: F. Kamiran and T. Calders. Classifying without discriminating.
            # NOTE(review): assumes correct_types[AGE_COL] yields a number;
            # a str here would raise on the > comparison in Python 3 — confirm.
            record[AGE_COL] = "old" if record[AGE_COL] > 25 else "young"

        train, test = split_by_percent(data, train_percentage)

    return headers, train, test
# Example 9
def load_from_file(datafile,
                   testdata=None,
                   correct_types=None,
                   train_percentage=2.0 / 3.0,
                   response_header=None,
                   features_to_ignore=None,
                   missing_data_symbol=""):
    """Load a CSV dataset from ``datafile`` with sensible defaults.

    When arguments are omitted: the response header defaults to the last
    column, no features are ignored, and column types are inferred via
    ``get_types``. If ``testdata`` is given, it is read as the test set;
    otherwise ``datafile`` is split by ``train_percentage``.

    Returns:
      (headers, train, test, response_header, features_to_ignore)
    """
    with open(datafile) as infile:
        data = list(csv.reader(infile))
        headers = data.pop(0)

        # Fill in defaults that depend on the file contents.
        if response_header is None:
            response_header = headers[-1]
        if features_to_ignore is None:
            features_to_ignore = []
        if correct_types is None:
            correct_types = get_types(data, [None] * len(headers),
                                      missing_data_symbol)

        # Coerce each column in place with its (given or inferred) type.
        for record in data:
            for col, cast in enumerate(correct_types):
                record[col] = cast(record[col])

        if testdata is None:
            train, test = split_by_percent(data, train_percentage)
        else:
            train = data
            with open(testdata) as testfile:
                test = list(csv.reader(testfile))[1:]  # Ignore headers.
                for record in test:
                    for col, cast in enumerate(correct_types):
                        record[col] = cast(record[col])

    return headers, train, test, response_header, features_to_ignore