Exemplo n.º 1
0
def main():
    """Build a decision tree from an attribute file and an example file.

    Command line: ``<script> <attributes-file> <examples-file>``. When fewer
    than two file names are supplied, a notice is printed and the default
    file names ``car.c45-names.txt`` and ``car.data`` are used instead.

    The loaded examples are split at random into a training and a testing
    set according to ``PROPORTION``; the tree is computed from the training
    set and written to ``results/tree.txt``.
    """
    args = sys.argv
    # args[0] is the script path itself, so the file names are args[1] and
    # args[2]; both must be present to override the defaults.
    if len(args) < 3:
        print("You should provide a filename to data.")
        filename1 = 'car.c45-names.txt'  # attributes
        filename2 = 'car.data'  # data examples
    else:
        filename1 = str(args[1])  # attributes
        filename2 = str(args[2])  # data examples

    dataset = Data()
    dataset.attr_file = filename1
    dataset.data_file = filename2

    dataset.read_attr_data()
    dataset.read_examples_data()

    # Proportion training set to testing set (1 means only training set)
    PROPORTION = 1

    # Both subsets start as full copies of the dataset; their example lists
    # are replaced below.
    train_dtset = copy.deepcopy(dataset)
    test_dtset = copy.deepcopy(dataset)

    total = len(dataset.examples)

    # Populate the training set from a random sample of distinct positions.
    # range() is used rather than xrange() so this runs on Python 2 and 3.
    train_index_list = random.sample(range(total), int(total * PROPORTION))
    train_dtset.examples = [
        dataset.examples[index] for index in train_index_list
    ]

    # Populate the testing set with every example whose position was not
    # sampled. Splitting by position (instead of by value equality) keeps
    # duplicate examples from being dropped and avoids a quadratic scan.
    sampled_positions = set(train_index_list)
    test_dtset.examples = [
        example for position, example in enumerate(dataset.examples)
        if position not in sampled_positions
    ]

    print("Computing tree...")
    root = compute_tree(train_dtset, None, None)
    tree_filename = 'results/tree.txt'
    with open(tree_filename, "w") as tree_file:
        write_tree(root, 0, tree_file)
Exemplo n.º 2
0
def _read_int_option(args, flag, label):
    """Return the integer following ``flag`` in ``args``, or None on failure.

    A diagnostic is printed and None is returned when the flag is absent,
    when nothing follows it, or when the following value is not an integer.
    ``label`` is the parameter name shown in the "not defined" message.
    """
    if flag not in args:
        print('Parametr for ' + label + ' is not defined.')
        return None
    try:
        return int(args[args.index(flag) + 1])
    except (IndexError, ValueError) as e:
        # IndexError: the flag is the last argument; ValueError: not an int.
        print(e)
        return None


def main():
    """Rank attributes by how often they appear across repeatedly built trees.

    Required command-line flags (the function prints a message and returns
    without doing any work if one is missing or malformed):

    * ``-n N``   -- length of the random subset passed to
      ``processing_for_ranking``.
    * ``-m M``   -- number of ``processing_for_ranking`` calls per round.
    * ``-nm NM`` -- number of rounds.

    For each round the per-attribute counts and the ranking derived from
    them are collected, then both series are handed to
    ``diagram_printing_num`` together with a CSV file name under
    ``results/``.
    """
    args = sys.argv

    filename1 = 'car.c45-names.txt'  # attributes
    filename2 = 'car.data'  # data examples

    # The flags are validated in this fixed order; the first problem found
    # aborts the run. `is None` is used so that a legitimate 0 is accepted.
    N = _read_int_option(args, "-n", "N")
    if N is None:
        return

    M = _read_int_option(args, "-m", "M")
    if M is None:
        return

    NM = _read_int_option(args, "-nm", "NM")
    if NM is None:
        return

    dataset = Data()
    dataset.attr_file = filename1
    dataset.data_file = filename2

    dataset.read_attr_data()
    dataset.read_examples_data()

    print("Computing tree...")

    # Counts each appearance of an attribute; seeded with every attribute
    # name at zero.
    ranking_list_counting = Counter(dict.fromkeys(dataset.attr_names, 0))

    # Per-round results: the derived rankings and the raw counters.
    ranking_list_final = []
    num_attributes = []

    for _ in range(NM):
        # Start every round from a fresh counter that keeps the known keys
        # (in their existing order) but resets all counts to zero.
        ranking_list_counting = Counter(
            dict.fromkeys(ranking_list_counting, 0))

        for _ in range(M):
            # function that includes creating tree
            processing_for_ranking(dataset, N, ranking_list_counting)

        num_attributes.append(ranking_list_counting)
        ranking_list_final.append(
            final_rank(ranking_list_counting.most_common()))

    title_rank = 'Ranking atrybutów drzewa wykonujacego sie ' + str(
        M) + ' razy i rozmiar podzbioru losowego ' + str(N)
    y_label = "Ranking atrytutów (1 - najczęsciej pojawiający się)"
    filename = 'results/ranking_N' + str(N) + '_M' + str(M) + '.csv'
    print(
        "Ranking attributes for {0} tree repetations and {1} length of random subset."
        .format(M, N))
    diagram_printing_num(ranking_list_final, filename, title_rank, y_label)

    title_count = 'Ilość atrybutów w drzewie` wykonujacego sie ' + str(
        M) + ' razy i rozmiar podzbioru losowego ' + str(N)
    y_label = "Ilość pojawianie się atr. (im wyższy - tym więcej)"
    filename = 'results/attrs_number_N' + str(N) + '_M' + str(M) + '.csv'
    print(
        "Ranking attributes for {0} tree repetations and {1} length of random subset."
        .format(M, N))
    diagram_printing_num(num_attributes, filename, title_count, y_label)