def test_fit_func_ranking(self):
    """Train a tree on a random 80% of ``self.dataset`` and print its test result.

    Two fresh ``Data`` objects are created and given the attribute
    description from ``self.filename1``. The examples of ``self.dataset``
    are then partitioned by position: 80% (rounded down) are sampled at
    random for training and every remaining example is held out for
    testing. The value returned by ``test_examples`` for the held-out part
    is printed.
    """
    # Training and testing datasets share the same attribute description.
    train_dtset = Data()
    test_dtset = Data()
    train_dtset.attr_file = self.filename1
    test_dtset.attr_file = self.filename1
    train_dtset.read_attr_data()
    test_dtset.read_attr_data()

    examples = self.dataset.examples
    total = len(examples)

    # random.sample returns distinct positions, so no further de-duplication
    # is needed. The previous comprehension compared against
    # train_dtset.examples as it was *before* the assignment, which cannot
    # see the items being built.
    train_index_list = random.sample(range(total), int(total * 0.8))
    train_positions = set(train_index_list)

    train_dtset.examples = [examples[index] for index in train_index_list]

    # Partition by position rather than by value equality: rows that merely
    # compare equal to a training row still belong to the held-out 20%, and
    # the set lookup keeps the split linear in the dataset size.
    test_dtset.examples = [
        ex for index, ex in enumerate(examples)
        if index not in train_positions
    ]

    root = compute_tree(train_dtset, None, None)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(test_examples(root, test_dtset))
def main():
    """Build a decision tree from the data files and write it to results/tree.txt.

    Command line: ``<script> <attributes-file> <data-file>``. When fewer
    than two file names are given, a hint is printed and the bundled
    ``car.c45-names.txt`` / ``car.data`` files are used instead.

    NOTE(review): this file contains more than one ``def main()``; the last
    definition executed is the one that stays bound - confirm which entry
    point is intended.
    """
    import os  # local import: only needed to prepare the output directory

    args = sys.argv
    # sys.argv[0] is the script path itself, so the user-supplied file names
    # live at indices 1 and 2 (the old code read indices 0 and 1 and thus
    # used the script as the attribute file).
    if len(args) < 3:
        print("You should provide a filename to data.")
        filename1 = 'car.c45-names.txt'  # attributes
        filename2 = 'car.data'  # data examples
    else:
        filename1 = str(args[1])  # attributes
        filename2 = str(args[2])  # data examples

    dataset = Data()
    dataset.attr_file = filename1
    dataset.data_file = filename2
    dataset.read_attr_data()
    dataset.read_examples_data()

    # Proportion training set to testing set (1 means only training set)
    PROPORTION = 1

    # Copies keep the attribute metadata; their examples are replaced below.
    train_dtset = copy.deepcopy(dataset)
    test_dtset = copy.deepcopy(dataset)
    train_dtset.examples, test_dtset.examples = [], []

    total = len(dataset.examples)

    # random.sample yields distinct positions, so the training list needs no
    # extra membership filter.
    train_index_list = random.sample(range(total), int(total * PROPORTION))
    train_positions = set(train_index_list)
    train_dtset.examples = [dataset.examples[index] for index in train_index_list]

    # Everything not sampled is held out. Splitting by position (not by value
    # equality) keeps duplicate rows and avoids a quadratic scan.
    # The held-out set is not evaluated in this function.
    test_dtset.examples = [
        ex for index, ex in enumerate(dataset.examples)
        if index not in train_positions
    ]

    print("Computing tree...")
    root = compute_tree(train_dtset, None, None)

    tree_filename = 'results/tree.txt'
    # open(..., "w") does not create missing directories.
    out_dir = os.path.dirname(tree_filename)
    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    with open(tree_filename, "w") as tree_file:
        write_tree(root, 0, tree_file)
def _parse_int_flag(args, flag, label):
    """Return the integer that follows *flag* in *args*, or None on failure.

    A missing flag prints the "not defined" message for *label*; a missing
    or non-integer value prints the raised exception. Both cases return None.
    """
    if flag not in args:
        print('Parametr for {0} is not defined.'.format(label))
        return None
    try:
        return int(args[args.index(flag) + 1])
    except (IndexError, ValueError) as e:
        # IndexError: the flag is the last argument; ValueError: not an integer.
        print(e)
        return None


def main():
    """Rank attributes by how often they appear in repeatedly built trees.

    Required command-line flags (checked in this order; the function prints
    a message and returns as soon as one is missing or invalid):

    * ``-n N``   - subset length of random elements passed to
      ``processing_for_ranking``
    * ``-m M``   - number of tree creations per round
    * ``-nm NM`` - number of rounds (each round repeats tree creation M times)

    For every round the per-attribute counter is reset, filled by M calls to
    ``processing_for_ranking``, and recorded both raw and as the result of
    ``final_rank`` on its ``most_common()`` ordering. Both series are handed
    to ``diagram_printing_num`` together with a CSV file name under
    ``results/``.

    NOTE(review): this file contains more than one ``def main()``; the last
    definition executed is the one that stays bound - confirm which entry
    point is intended.
    """
    args = sys.argv
    filename1 = 'car.c45-names.txt'  # attributes
    filename2 = 'car.data'  # data examples

    N = _parse_int_flag(args, "-n", "N")
    if N is None:
        return
    M = _parse_int_flag(args, "-m", "M")
    if M is None:
        return
    NM = _parse_int_flag(args, "-nm", "NM")
    if NM is None:
        return

    dataset = Data()
    dataset.attr_file = filename1
    dataset.data_file = filename2
    dataset.read_attr_data()
    dataset.read_examples_data()

    print("Computing tree...")

    # Counts each appearance of an attribute across the trees of one round.
    ranking_list_counting = Counter()
    for attr in dataset.attr_names:
        ranking_list_counting[attr] = 0

    ranking_list_final = []  # one final_rank(...) result per round
    num_attributes = []      # one raw counter per round

    for _ in range(NM):
        # Fresh zeroed counter per round so earlier rounds stored in
        # num_attributes are not mutated.
        ranking_list_counting = Counter(
            {key: 0 for key in ranking_list_counting})
        for _ in range(M):
            # Function that includes creating the tree.
            processing_for_ranking(dataset, N, ranking_list_counting)
        num_attributes.append(ranking_list_counting)
        ranking_list_final.append(
            final_rank(ranking_list_counting.most_common()))

    title_rank = 'Ranking atrybutów drzewa wykonujacego sie ' + str(
        M) + ' razy i rozmiar podzbioru losowego ' + str(N)
    y_label = "Ranking atrytutów (1 - najczęsciej pojawiający się)"
    filename = 'results/ranking_N' + str(N) + '_M' + str(M) + '.csv'
    print(
        "Ranking attributes for {0} tree repetations and {1} length of random subset."
        .format(M, N))
    diagram_printing_num(ranking_list_final, filename, title_rank, y_label)

    title_count = 'Ilość atrybutów w drzewie` wykonujacego sie ' + str(
        M) + ' razy i rozmiar podzbioru losowego ' + str(N)
    y_label = "Ilość pojawianie się atr. (im wyższy - tym więcej)"
    filename = 'results/attrs_number_N' + str(N) + '_M' + str(M) + '.csv'
    # NOTE(review): this progress message is identical to the ranking one
    # above although this call outputs raw counts - confirm the wording.
    print(
        "Ranking attributes for {0} tree repetations and {1} length of random subset."
        .format(M, N))
    diagram_printing_num(num_attributes, filename, title_count, y_label)