def test_big_tree():
    """Train on the full data set and report test accuracy around pruning.

    Loads the four-way split from ``data.load_decision_tree_data()``, trains
    a ``DecisionTree`` on the training part, prints the test accuracy, prunes
    with ``Utils.reduced_error_prunning`` (using the test split), prints the
    accuracy again and finally prints the pruned tree.
    """
    train_x, train_y, X_test, y_test = data.load_decision_tree_data()

    dTree = decision_tree.DecisionTree()
    # The training arrays are converted with .tolist() before training.
    dTree.train(train_x.tolist(), train_y.tolist())

    # Accuracy of the unpruned tree.
    print('test_accu', accuracy_score(dTree.predict(X_test), y_test))

    # Prune in place, then measure again on the same split.
    Utils.reduced_error_prunning(dTree, X_test, y_test)
    print('test_accu', accuracy_score(dTree.predict(X_test), y_test))

    Utils.print_tree(dTree)
def pruning_decision_tree_test():
    """Print the learned tree next to a reference tree, before and after pruning.

    The reference dumps are hard-coded strings; comparison is left to the
    person reading the output.
    """
    X_train, y_train, X_test, y_test = data.sample_decision_tree_pruning()

    dTree = decision_tree.DecisionTree()
    dTree.train(X_train, y_train)

    # Reference dump of the tree before pruning.
    reference_unpruned = (
        'branch 0{\n\tdeep: 0\n\tnum of samples for each class: 5 : 9 \n\tsplit by dim 0\n\tbranch 0->0{\n\t\tdeep: 1'
        '\n\t\tnum of samples for each class: 3 : 2 \n\t\tsplit by dim 1\n\t\tbranch 0->0->0{\n\t\t\tdeep: 2\n\t\t\t'
        'num of samples for each class: 3 \n\t\t\tclass:0\n\t\t}\n\t\tbranch 0->0->1{\n\t\t\tdeep: 2\n\t\t\tnum of '
        'samples for each class: 2 \n\t\t\tclass:1\n\t\t}\n\t}\n\tbranch 0->1{\n\t\tdeep: 1\n\t\tnum of samples for '
        'each class: 4 \n\t\tclass:1\n\t}\n\tbranch 0->2{\n\t\tdeep: 1\n\t\tnum of samples for each class: 2 : 3 '
        '\n\t\tsplit by dim 2\n\t\tbranch 0->2->0{\n\t\t\tdeep: 2\n\t\t\tnum of samples for each class: 3 \n\t\t\t'
        'class:1\n\t\t}\n\t\tbranch 0->2->1{\n\t\t\tdeep: 2\n\t\t\tnum of samples for each class: 2 \n\t\t\tclass:0'
        '\n\t\t}\n\t}\n}'
    )
    # Reference dump after pruning: branch 0->2 has become a leaf.
    reference_pruned = (
        'branch 0{\n\tdeep: 0\n\tnum of samples for each class: 5 : 9 \n\tsplit by dim 0\n\tbranch 0->0{\n\t\tdeep: '
        '1\n\t\tnum of samples for each class: 3 : 2 \n\t\tsplit by dim 1\n\t\tbranch 0->0->0{\n\t\t\tdeep: 2\n\t\t\t'
        'num of samples for each class: 3 \n\t\t\tclass:0\n\t\t}\n\t\tbranch 0->0->1{\n\t\t\tdeep: 2\n\t\t\tnum of '
        'samples for each class: 2 \n\t\t\tclass:1\n\t\t}\n\t}\n\tbranch 0->1{\n\t\tdeep: 1\n\t\tnum of samples for '
        'each class: 4 \n\t\tclass:1\n\t}\n\tbranch 0->2{\n\t\tdeep: 1\n\t\tnum of samples for each class: 2 : 3 '
        '\n\t\tclass:1\n\t}\n}'
    )

    print('Your decision tree:')
    Utils.print_tree(dTree)
    print('My decision tree:')
    print(reference_unpruned)

    Utils.reduced_error_prunning(dTree, X_test, y_test)

    print('Your decision tree after pruning:')
    Utils.print_tree(dTree)
    print('My decision tree after pruning:')
    print(reference_pruned)
def decision_tree_test():
    """Print the learned tree and its predictions next to hard-coded references."""
    features, labels = data.sample_decision_tree_data()

    dTree = decision_tree.DecisionTree()
    dTree.train(features, labels)

    # Reference dump for the sample data set.
    reference_tree = (
        'branch 0{\n\tdeep: 0\n\tnum of samples for each class: 2 : 2 \n\tsplit by dim 0\n\tbranch 0->0{\n\t\tdeep: '
        '1\n\t\tnum of samples for each class: 1 \n\t\tclass:0\n\t}\n\tbranch 0->1{\n\t\tdeep: 1\n\t\tnum of '
        'samples for each class: 1 : 1 \n\t\tsplit by dim 0\n\t\tbranch 0->1->0{\n\t\t\tdeep: 2\n\t\t\tnum of '
        'samples for each class: 1 \n\t\t\tclass:0\n\t\t}\n\t\tbranch 0->1->1{\n\t\t\tdeep: 2\n\t\t\tnum of '
        'samples for each class: 1 \n\t\t\tclass:1\n\t\t}\n\t}\n\tbranch 0->2{\n\t\tdeep: 1\n\t\tnum of '
        'samples for each class: 1 \n\t\tclass:1\n\t}\n}'
    )

    print('Your decision tree: ')
    Utils.print_tree(dTree)
    print('My decision tree: ')
    print(reference_tree)

    # Only the test features are used; the labels are printed as a literal below.
    X_test, _ = data.sample_decision_tree_test()
    print('Your estimate test: ', dTree.predict(X_test))
    print('My estimate test: ', [0, 0, 1])
def check_game_tree():
    """Build a single 'kocsis' game tree (d=3, bf=2) and print its 'value' data."""
    depth, branching = 3, 2
    data_sets = GameTree(
        probability=[0.8, 0.6],
        d=depth,
        bf=branching,
        data_size=1,
        tree_name='kocsis',
    )
    # Print the last generated tree (the only one, as data_size is 1).
    utils.print_tree(tree=data_sets.tree[-1], d=depth, bf=branching, data_name='value')
def t2():
    """Train on the first 1500 rows of 'car.data', then print the tree before and after pruning.

    Column 0 of the file is used as the label and the remaining columns as
    features. Rows from index 1500 onward are the set handed to
    ``U.reduced_error_prunning``.
    """
    raw = np.loadtxt('car.data', delimiter=',')
    frame = pd.DataFrame(raw)
    labels = frame[0].tolist()
    rows = np.array(frame.drop([0], axis=1)).tolist()

    split = 1500
    x_train, x_test = rows[:split], rows[split:]
    y_train, y_test = labels[:split], labels[split:]

    tree = DecisionTree()
    tree.train(x_train, y_train)
    tree.predict(x_train)  # result is not used; the call is kept as in the original flow

    U.print_tree(decisionTree=tree)
    U.reduced_error_prunning(decisionTree=tree, X_test=x_test, y_test=y_test)
    print('---------------------------')
    U.print_tree(decisionTree=tree)
def get_minimax_path(tree, bf, d, draw=False):
    """Return the node ids obtained by descending ``d`` levels from node 0.

    The tree is first evaluated with ``minimax_algo_nx``; at every level the
    child with the largest 'value' attribute is taken (first one on ties, as
    ``np.argmax`` does).

    Args:
        tree: tree passed through to ``minimax_algo_nx``.
        bf: branching factor, forwarded to the solver and the printer.
        d: number of levels to descend; also the length of the result.
        draw: when true, print the evaluated tree via ``utils.print_tree``.

    Returns:
        A list of ``d`` node ids, one per level.
    """
    solved = minimax_algo_nx(tree, bf, d)
    if draw:
        utils.print_tree(tree=solved, d=d, bf=bf)

    frontier = list(solved.successors(0))
    # Read but not used further; kept so a root without 'value' still fails here.
    root_value = solved.nodes[0]['value']

    path = []
    for _ in range(d):
        values = [solved.nodes[child]['value'] for child in frontier]
        best = frontier[np.argmax(values)]
        path.append(best)
        frontier = list(solved.successors(best))
    return path
def main():
    """Run UCT on generated 'kocsis' game trees and print per-rollout mean accuracy.

    For every generated tree the minimax path is computed as the reference
    answer, ``mcts`` is run for ``rollout_num`` rollouts, and progress plus the
    tree's 'ucb' data are printed. Afterwards the accuracy is averaged over
    trees for each rollout index.
    """
    d, bf = 5, 2
    data_size = 1
    rollout_num = 50
    data_set = GameTree(probability=[0.8, 0.6], d=d, bf=bf,
                        data_size=data_size, tree_name='kocsis')

    results = []
    accuracy = []
    for i in range(data_size):
        game = data_set.tree[i]
        ans_path = mini_max.get_minimax_path(tree=game, d=d, bf=bf, draw=True)
        outcome, scores = mcts(tree=game, n=rollout_num, ans_path=ans_path,
                               algo_name='UCT')
        results.append(outcome)
        accuracy.append(scores)
        progress = (float(i) / data_size) * 100
        print("{}%done, ans={},results={}".format(progress, ans_path, results[i]))
        utils.print_tree(tree=data_set.tree[-1], d=d, bf=bf, data_name='ucb')

    # One mean per rollout index, taken across all trees.
    accuracy = np.array(accuracy)
    means = np.zeros(rollout_num)
    for step in range(rollout_num):
        means[step] = np.mean(accuracy[:, step])

    print("result = {}".format(results[-1]))
    print("means = {}".format(means))
def test_tree():
    """Train on the sample data, print the tree, and report accuracy around pruning."""
    features, labels = data.sample_decision_tree_data()

    dTree = decision_tree.DecisionTree()
    dTree.train(features, labels)
    Utils.print_tree(dTree)

    X_test, y_test = data.sample_decision_tree_test()

    # Accuracy before pruning.
    print('test_accu', accuracy_score(dTree.predict(X_test), y_test))

    # Prune in place with the same split, then measure again.
    Utils.reduced_error_prunning(dTree, X_test, y_test)
    print('test_accu', accuracy_score(dTree.predict(X_test), y_test))
# Boolean switches, registered in the order they appear in the usage text.
for flags, text in (
    (('-t', '--tree'), 'prints the abstract syntax tree'),
    (('-o', '--optimize'), 'optimizes the emitted code'),
    (('-r', '--recompile'), 'recompiles the standard library'),
):
    arg_parser.add_argument(*flags, help=text, action='store_true')
arg_parser.add_argument('src', help='source file')
arg_parser.add_argument('dest', help='destination file', nargs='?', default=None)
args = arg_parser.parse_args()

# Read the source; on an OS-level failure report it and exit with the usage text.
try:
    with open(args.src) as cmm_file:
        code = cmm_file.read()
except OSError as e:
    print(e, file=sys.stderr)
    sys.exit(arg_parser.format_usage())

# Pipeline: tokenize -> parse -> generate. `code` is rebound to the emitted output.
try:
    stdlib = load_stdlib()
    tokens = Lexer().tokenize(code)
    tree = Parser(tokens).parse(os.path.basename(args.src))
    code = CodeGenerator(tree, stdlib).generate(args.optimize)

    to_stdout = args.dest is None
    if args.tree:
        print_tree(tree)
        if to_stdout:
            # Blank line between the tree dump and the code printed below.
            print()

    if to_stdout:
        print(code)
    else:
        with open(args.dest, 'w') as out_file:
            out_file.write(code)
except CompilerError as e:
    # With args.debug set the error propagates with its traceback.
    if args.debug:
        raise e
    sys.exit(e)
# Toy data set: each row is [color, diameter, label].
training_data = [
    ['Green', 3, 'Apple'],
    ['Yellow', 3, 'Apple'],
    ['Red', 1, 'Grape'],
    ['Red', 1, 'Grape'],
    ['Yellow', 3, 'Lemon'],
]

# Column labels.
# These are used only to print the tree.
header = ["color", "diameter", "label"]

my_tree = build_tree(training_data)
print_tree(my_tree)

# Evaluate: same layout as the training rows; the last column is the true label.
testing_data = [
    ['Green', 3, 'Apple'],
    ['Yellow', 4, 'Apple'],
    ['Red', 2, 'Grape'],
    ['Red', 1, 'Grape'],
    ['Yellow', 3, 'Lemon'],
]

for row in testing_data:
    outcome = (row[-1], print_results(my_tree.classify(row)))
    print("Actual: %s. Predicted: %s" % outcome)
q = queue.Queue() root = pre_list[0] q.put(root) while not q.empty(): root = q.get() left_index = mid_list.index(root.val) pre_list = pre_list[1:left_index + 1] mid_list = mid_list[left_index:] ''' 根据后序序列,中序序列构造二叉树 递归方法解决 ''' from utils import TreeNode, print_tree def buildTree(postorder, inorder): if not postorder or not inorder: return None tree_val = postorder[-1] root = TreeNode(tree_val) left_index = inorder.index(tree_val) root.left = buildTree(postorder[:left_index], inorder[:left_index]) root.right = buildTree(postorder[left_index:-1], inorder[left_index + 1:]) return root root = buildTree([4, 5, 2, 6, 7, 3, 1], [4, 2, 5, 1, 6, 3, 7]) print_tree(root)
:type head: ListNode :rtype: TreeNode """ def sortedArrayToBST(nums): if not nums: return None n = len(nums) if n == 1: # add these 2 line will speed up, of course return TreeNode(nums[0]) i = int(n / 2) node = TreeNode(nums[i]) node.left = sortedArrayToBST(nums[:i]) node.right = sortedArrayToBST(nums[i + 1:]) return node # to use the ascending order property, transform into index-based array _array = [] while head: _array.append(head.val) head = head.next return sortedArrayToBST(_array) s = Solution() for _ in range(5): st = random.randint(1, 100) gap = random.randint(1, 1000) lst = sorted(random.sample(range(st, st + gap), min(random.randint(1, 20), gap))) print lst head = listToLinkedlist(lst) res = s.sortedListToBST(head) print_tree(res)
def main():
    """Prepare SST data and embeddings, build the DMN model and run ``args.mode``.

    Steps, in order:
      1. Parse arguments and fill in defaults for ``mem_dim`` / ``num_classes``.
      2. Load the cached train/dev/test datasets and the embedding matrix from
         ``args.data``; any cache that is missing is built and saved, and the
         program then quits so that it can be re-run against the caches.
      3. Build the model, embedding layer, optimizer and trainer.
      4. Dispatch on ``args.mode``: 'DEBUG', 'PRINT_TREE', 'EVALUATE',
         'EXPERIMENT', or (anything else) a plain train/evaluate loop.

    Side effects: rebinds the module-level ``args``, reads and writes files
    under ``args.data`` and ``args.saved``, prints progress, and may call
    ``quit()``.
    """
    global args
    args = parse_args(type=1)
    print(args.name)
    print(args.model_name)
    args.input_dim = 300
    # mem_dim == 0 means "not set": pick a value per model type.
    if args.mem_dim == 0:
        if args.model_name == 'dependency':
            args.mem_dim = 168
        elif args.model_name == 'constituency':
            args.mem_dim = 150
        elif args.model_name == 'lstm':
            args.mem_dim = 168
        elif args.model_name == 'bilstm':
            args.mem_dim = 168
    # num_classes == 0 means "not set": derive it from fine_grain.
    if args.num_classes == 0:
        if args.fine_grain:
            args.num_classes = 5  # 0 1 2 3 4
        else:
            args.num_classes = 3  # 0 1 2 (1 neutral)
    elif args.num_classes == 2:
        # assert False # this will not work
        assert not args.fine_grain
    args.cuda = args.cuda and torch.cuda.is_available()
    # args.cuda = False
    print(args)
    # torch.manual_seed(args.seed)
    # if args.cuda:
    #     torch.cuda.manual_seed(args.seed)
    train_dir = os.path.join(args.data, 'train/')
    dev_dir = os.path.join(args.data, 'dev/')
    test_dir = os.path.join(args.data, 'test/')
    # write unique words from all token files
    # (token_files is only referenced by the disabled build_vocab call below)
    token_files = [
        os.path.join(split, 'sents.toks')
        for split in [train_dir, dev_dir, test_dir]
    ]
    vocab_file = os.path.join(args.data, 'vocab-cased.txt')  # use vocab-cased
    # build_vocab(token_files, vocab_file) NO, DO NOT BUILD VOCAB, USE OLD VOCAB
    # get vocab object from vocab file previously written
    vocab = Vocab(filename=vocab_file)
    print('==> SST vocabulary size : %d ' % vocab.size())
    # Load SST dataset splits
    is_preprocessing_data = False  # let program turn off after preprocess data
    # train
    train_file = os.path.join(args.data, 'sst_train.pth')
    if os.path.isfile(train_file):
        train_dataset = torch.load(train_file)
    else:
        train_dataset = SSTDataset(train_dir, vocab, args.num_classes,
                                   args.fine_grain, args.model_name)
        torch.save(train_dataset, train_file)
        is_preprocessing_data = True
    # dev
    dev_file = os.path.join(args.data, 'sst_dev.pth')
    if os.path.isfile(dev_file):
        dev_dataset = torch.load(dev_file)
    else:
        dev_dataset = SSTDataset(dev_dir, vocab, args.num_classes,
                                 args.fine_grain, args.model_name)
        torch.save(dev_dataset, dev_file)
        is_preprocessing_data = True
    # test
    test_file = os.path.join(args.data, 'sst_test.pth')
    if os.path.isfile(test_file):
        test_dataset = torch.load(test_file)
    else:
        test_dataset = SSTDataset(test_dir, vocab, args.num_classes,
                                  args.fine_grain, args.model_name)
        torch.save(test_dataset, test_file)
        is_preprocessing_data = True
    criterion = nn.NLLLoss()
    # initialize model, criterion/loss_function, optimizer
    model = DMNWraper(args.cuda, args.input_dim, args.mem_dim, criterion,
                      args.train_subtrees, args.num_classes, args.embdrop)
    embedding_model = nn.Embedding(vocab.size(), args.input_dim)
    if args.cuda:
        embedding_model = embedding_model.cuda()
    if args.cuda:
        model.cuda(), criterion.cuda()
    # for words common to dataset vocab and GLOVE, use GLOVE vectors
    # for other words in dataset vocab, use random normal vectors
    # Each embedding choice fixes the cache file name and the vector file,
    # whose '.txt' form must exist.
    if args.embedding == 'glove':
        emb_torch = 'sst_embed.pth'
        emb_vector = 'glove.840B.300d'
        emb_vector_path = os.path.join(args.glove, emb_vector)
        assert os.path.isfile(emb_vector_path + '.txt')
    elif args.embedding == 'paragram':
        emb_torch = 'sst_embed_paragram.pth'
        emb_vector = 'paragram_300_sl999'
        emb_vector_path = os.path.join(args.paragram, emb_vector)
        assert os.path.isfile(emb_vector_path + '.txt')
    elif args.embedding == 'paragram_xxl':
        emb_torch = 'sst_embed_paragram_xxl.pth'
        emb_vector = 'paragram-phrase-XXL'
        emb_vector_path = os.path.join(args.paragram, emb_vector)
        assert os.path.isfile(emb_vector_path + '.txt')
    else:
        assert False
    emb_file = os.path.join(args.data, emb_torch)
    if os.path.isfile(emb_file):
        emb = torch.load(emb_file)
    else:
        # load glove embeddings and vocab
        glove_vocab, glove_emb = load_word_vectors(emb_vector_path)
        print('==> Embedding vocabulary size: %d ' % glove_vocab.size())
        emb = torch.zeros(vocab.size(), glove_emb.size(1))
        for word in vocab.labelToIdx.keys():
            # NOTE(review): a word whose index is 0 is falsy here and would be
            # treated as missing; confirm what getIndex returns for unknown words.
            if glove_vocab.getIndex(word):
                emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex(
                    word)]
            else:
                # NOTE(review): normal_ takes (mean, std), so this samples with
                # mean -0.05 and std 0.05; confirm a uniform range was not intended.
                emb[vocab.getIndex(word)] = torch.Tensor(
                    emb[vocab.getIndex(word)].size()).normal_(-0.05, 0.05)
        torch.save(emb, emb_file)
        is_preprocessing_data = True  # flag to quit
        print('done creating emb, quit')
    # If any cache had to be created above, stop here; the next run loads the caches.
    if is_preprocessing_data:
        print('quit program')
        quit()
    # plug these into embedding matrix inside model
    if args.cuda:
        emb = emb.cuda()
    embedding_model.state_dict()['weight'].copy_(emb)
    # NOTE(review): no branch assigns `optimizer` for other args.optim values,
    # so the SentimentTrainer construction below would fail with an unbound name.
    if args.optim == 'adam':
        optimizer = optim.Adam(model.parameters(),
                               lr=args.lr,
                               weight_decay=args.wd)
    elif args.optim == 'adagrad':
        # optimizer = optim.Adagrad(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd)
        optimizer = optim.Adagrad(model.parameters(),
                                  lr=args.lr,
                                  weight_decay=args.wd)
    elif args.optim == 'adam_combine':
        # Two parameter groups: the model and the embedding, each with its own
        # learning rate and weight decay.
        optimizer = optim.Adam([{
            'params': model.parameters(),
            'lr': args.lr,
            'weight_decay': args.wd
        }, {
            'params': embedding_model.parameters(),
            'lr': args.emblr,
            'weight_decay': args.embwd
        }])
        args.manually_emb = 0
    elif args.optim == 'adagrad_combine':
        optimizer = optim.Adagrad([{
            'params': model.parameters(),
            'lr': args.lr,
            'weight_decay': args.wd
        }, {
            'params': embedding_model.parameters(),
            'lr': args.emblr,
            'weight_decay': args.embwd
        }])
        args.manually_emb = 0
    elif args.optim == 'adam_combine_v2':
        # Attach the embedding to the model before asking for model.parameters().
        model.embedding_model = embedding_model
        optimizer = optim.Adam(model.parameters(),
                               lr=args.lr,
                               weight_decay=args.wd)
        args.manually_emb = 0
    metrics = Metrics(args.num_classes)
    utils.count_param(model)
    trainer = SentimentTrainer(args, model, embedding_model, criterion,
                               optimizer)
    trainer.set_initial_emb(emb)
    # The vocabulary index of the word 'sentiment' is handed to the trainer as the question.
    question_idx = vocab.labelToIdx['sentiment']
    question_idx = torch.Tensor([question_idx])
    trainer.set_question(question_idx)
    # trainer = SentimentTrainer(args, model, embedding_model ,criterion, optimizer)
    mode = args.mode
    if mode == 'DEBUG':
        for epoch in range(args.epochs):
            # print a tree
            tree, sent, label = dev_dataset[3]
            utils.print_span(tree, sent, vocab)
            # quit() here means the statements below in this loop never run.
            quit()
            dev_loss = trainer.train(dev_dataset)
            dev_loss, dev_pred, _ = trainer.test(dev_dataset)
            test_loss, test_pred, _ = trainer.test(test_dataset)
            dev_acc = metrics.sentiment_accuracy_score(dev_pred,
                                                       dev_dataset.labels)
            test_acc = metrics.sentiment_accuracy_score(
                test_pred, test_dataset.labels)
            print('==> Dev loss : %f \t' % dev_loss, end="")
            print('Epoch ', epoch, 'dev percentage ', dev_acc)
    elif mode == "PRINT_TREE":
        # Print the trees of the first ten dev samples, then exit.
        for i in range(0, 10):
            ttree, tsent, tlabel = dev_dataset[i]
            utils.print_tree(ttree, 0)
            print('_______________')
        print('break')
        quit()
    elif mode == 'EVALUATE':
        # Load the checkpoint saved for epoch number args.epochs and evaluate it on test.
        filename = args.name + '.pth'
        epoch = args.epochs
        model_name = str(epoch) + '_model_' + filename
        embedding_name = str(epoch) + '_embedding_' + filename
        model = torch.load(os.path.join(args.saved, model_name))
        embedding_model = torch.load(os.path.join(args.saved, embedding_name))
        trainer = SentimentTrainer(args, model, embedding_model, criterion,
                                   optimizer)
        trainer.set_question(question_idx)
        test_loss, test_pred, subtree_metrics = trainer.test(test_dataset)
        test_acc = metrics.sentiment_accuracy_score(
            test_pred, test_dataset.labels, num_classes=args.num_classes)
        print('Epoch with max dev:' + str(epoch) + ' |test percentage ' +
              str(test_acc))
        print('____________________' + str(args.name) + '___________________')
        print_list = subtree_metrics.print_list
        torch.save(print_list,
                   os.path.join(args.saved, args.name + 'printlist.pth'))
        utils.print_trees_file(args,
                               vocab,
                               test_dataset,
                               print_list,
                               name='tree')
    elif mode == "EXPERIMENT":
        # dev_loss, dev_pred = trainer.test(dev_dataset)
        # dev_acc = metrics.sentiment_accuracy_score(dev_pred, dev_dataset.labels, num_classes=args.num_classes)
        # Train for args.epochs, checkpoint whenever dev accuracy improves,
        # then evaluate the best checkpoint on the test set.
        max_dev = 0
        max_dev_epoch = 0
        filename = args.name + '.pth'
        for epoch in range(args.epochs):
            # train_loss, train_pred, _ = trainer.test(train_dataset)
            train_loss_while_training = trainer.train(train_dataset)
            train_loss, train_pred, _ = trainer.test(train_dataset)
            dev_loss, dev_pred, _ = trainer.test(dev_dataset)
            dev_acc = metrics.sentiment_accuracy_score(
                dev_pred, dev_dataset.labels, num_classes=args.num_classes)
            train_acc = metrics.sentiment_accuracy_score(
                train_pred, train_dataset.labels, num_classes=args.num_classes)
            print('==> Train loss : %f \t' % train_loss_while_training, end="")
            print('Epoch ', epoch, 'dev percentage ', dev_acc)
            print('Epoch %d dev percentage %f ' % (epoch, dev_acc))
            print('Train acc %f ' % (train_acc))
            if dev_acc > max_dev:
                print('update best dev acc %f ' % (dev_acc))
                max_dev = dev_acc
                max_dev_epoch = epoch
                utils.mkdir_p(args.saved)
                torch.save(
                    model,
                    os.path.join(args.saved,
                                 str(epoch) + '_model_' + filename))
                torch.save(
                    embedding_model,
                    os.path.join(args.saved,
                                 str(epoch) + '_embedding_' + filename))
            gc.collect()
        print('epoch ' + str(max_dev_epoch) + ' dev score of ' + str(max_dev))
        print('eva on test set ')
        model = torch.load(
            os.path.join(args.saved,
                         str(max_dev_epoch) + '_model_' + filename))
        embedding_model = torch.load(
            os.path.join(args.saved,
                         str(max_dev_epoch) + '_embedding_' + filename))
        trainer = SentimentTrainer(args, model, embedding_model, criterion,
                                   optimizer)
        trainer.set_question(question_idx)
        test_loss, test_pred, _ = trainer.test(test_dataset)
        test_acc = metrics.sentiment_accuracy_score(
            test_pred, test_dataset.labels, num_classes=args.num_classes)
        print('Epoch with max dev:' + str(max_dev_epoch) +
              ' |test percentage ' + str(test_acc))
        print('____________________' + str(args.name) + '___________________')
    else:
        # Any other mode: train and report train/dev/test accuracy every epoch.
        for epoch in range(args.epochs):
            train_loss = trainer.train(train_dataset)
            train_loss, train_pred, _ = trainer.test(train_dataset)
            dev_loss, dev_pred, _ = trainer.test(dev_dataset)
            test_loss, test_pred, _ = trainer.test(test_dataset)
            train_acc = metrics.sentiment_accuracy_score(
                train_pred, train_dataset.labels)
            dev_acc = metrics.sentiment_accuracy_score(dev_pred,
                                                       dev_dataset.labels)
            test_acc = metrics.sentiment_accuracy_score(
                test_pred, test_dataset.labels)
            print('==> Train loss : %f \t' % train_loss, end="")
            print('Epoch ', epoch, 'train percentage ', train_acc)
            print('Epoch ', epoch, 'dev percentage ', dev_acc)
            print('Epoch ', epoch, 'test percentage ', test_acc)
def __init__(self, x): self.val = x self.left = None self.right = None class Solution: def sortedArrayToBST(self, nums: List[int]) -> TreeNode: if not nums: return mid = len(nums) // 2 # mid is element of the root root = TreeNode(nums[mid]) # left subtree of root has all # values <arr[mid] root.left = self.sortedArrayToBST(nums[:mid]) # right subtree of root has all # values >arr[mid] root.right = self.sortedArrayToBST(nums[mid + 1:]) return root solution = Solution() res = solution.sortedArrayToBST([-10, -3, 0, 5, 9]) print_tree(res)
# Remaining command-line arguments: validation file, test file, prune factor.
validation_data_file = sys.argv[2]
test_data_file = sys.argv[3]
prune_factor = float(sys.argv[4])

print('Use training data from %s' % training_data_file)
print('Use validation data from %s' % validation_data_file)
# This banner reports the test file (it previously said "training data").
print('Use test data from %s' % test_data_file)
print('Use prune factor: %s' % prune_factor)
print('')

training_data = utils.read_data(training_data_file)
validation_data = utils.read_data(validation_data_file)
test_data = utils.read_data(test_data_file)

# id3.train returns the tree root together with node and leaf counts.
root, node_num, leaf_num = id3.train(training_data)
utils.print_tree(root, training_data)
print('')

print('Pre-Pruned Accuracy')
print('- - - - - - - - - - - - -')
# id3.test's result is scaled by 100 and reported as a percentage.
train_accuracy = id3.test(root, training_data) * 100
print('Number of training instances = %s' % len(training_data))
print('Number of training attributes = %s' % len(training_data[0].feature_map))
print('Total number of nodes in the tree = %s' % node_num)
print('Number of leaf nodes in the tree = %s' % leaf_num)
print('Accuracy of the model on the training dataset = %.1f%%' % train_accuracy)

validation_accuracy = id3.test(root, validation_data) * 100
print('')
print('Number of validation instances = %s' % len(validation_data))
layer = node_iter.right break node_iter = node_iter.next return s = Solution() head = TreeLinkNode(0) root = TreeLinkNode(1) head.right = root root.left = TreeLinkNode(2) root.right = TreeLinkNode(3) root.left.left = TreeLinkNode(4) root.left.right = TreeLinkNode(5) root.right.right = TreeLinkNode(7) root.left.left.right = TreeLinkNode(8) root.right.right.left = TreeLinkNode(9) root.left.left.right.right = TreeLinkNode(6) print_tree(head) s.connect(head) inspect = head layer, nxt = [head], [] while layer: for node in layer: print node.val, node.next.val if node.next else 'None' if node.left: nxt.append(node.left) if node.right: nxt.append(node.right) nxt, layer = [], nxt
#best_model, best_k, best_function, best_scaler = model_selection_with_transformation(distance_funcs, scaling_classes, Xtrain, ytrain, Xval, yval) import data import hw1_dt as decision_tree import utils as Utils from sklearn.metrics import accuracy_score features, labels = data.sample_decision_tree_data() # build the tree dTree = decision_tree.DecisionTree() dTree.train(features, labels) # print Utils.print_tree(dTree) # data X_test, y_test = data.sample_decision_tree_test() # testing y_est_test = dTree.predict(X_test) test_accu = accuracy_score(y_est_test, y_test) print('test_accu', test_accu) """ """ #load data X_train, y_train, X_test, y_test = data.load_decision_tree_data()
def main(write_to):
    """Train a TreeLSTM sentiment model on the SST 'dev' split and log losses and timings.

    Only the 'EXPERIMENT' branch runs, because ``mode`` is hard-coded below.
    After training, one loss per epoch plus the preparation time and the
    average time per epoch are written to ``write_to``.

    Args:
        write_to: path of the text file that receives the report (opened
            with mode "w").

    Side effects: rebinds the module-level ``args``, reads and writes cache
    files under ``args.data``, prints progress, and calls ``quit()`` if a
    cache had to be created.
    """
    startTime = time.time()
    global args
    args = parse_args(type=1)
    args.input_dim = 300
    if args.model_name == 'dependency':
        args.mem_dim = 168
    elif args.model_name == 'constituency':
        args.mem_dim = 150
    if args.fine_grain:
        args.num_classes = 5  # 0 1 2 3 4
    else:
        args.num_classes = 3  # 0 1 2 (1 neutral)
    args.cuda = args.cuda and torch.cuda.is_available()
    # args.cuda = False
    print(args)
    # torch.manual_seed(args.seed)
    # if args.cuda:
    #     torch.cuda.manual_seed(args.seed)
    # train_dir = os.path.join(args.data,'train/')
    train_dir = os.path.join(
        args.data, 'dev/')  # Fei: wants to train on a smaller data set
    # dev_dir = os.path.join(args.data,'dev/')
    # test_dir = os.path.join(args.data,'test/')
    # write unique words from all token files
    # (token_files is only referenced by the disabled build_vocab calls below)
    token_files = [os.path.join(split, 'sents.toks') for split in [train_dir]]
    vocab_file = os.path.join(args.data, 'vocab-cased.txt')  # use vocab-cased
    # build_vocab(token_files, vocab_file) NO, DO NOT BUILD VOCAB, USE OLD VOCAB
    # vocab_file = os.path.join(args.data, 'vocab-cased-dev.txt')
    # build_vocab(token_files, vocab_file)
    # get vocab object from vocab file previously written
    vocab = Vocab(filename=vocab_file)
    print('==> SST vocabulary size : %d ' % vocab.size())
    # Load SST dataset splits
    is_preprocessing_data = False  # let program turn off after preprocess data
    # train
    # NOTE(review): the cache is named 'sst_train.pth' although train_dir points
    # at 'dev/'; an existing cache built from the real train split would be loaded.
    train_file = os.path.join(args.data, 'sst_train.pth')
    if os.path.isfile(train_file):
        train_dataset = torch.load(train_file)
    else:
        train_dataset = SSTDataset(train_dir, vocab, args.num_classes,
                                   args.fine_grain, args.model_name)
        torch.save(train_dataset, train_file)
        is_preprocessing_data = True
    # dev
    # dev_file = os.path.join(args.data,'sst_dev.pth')
    # if os.path.isfile(dev_file):
    #     dev_dataset = torch.load(dev_file)
    # else:
    #     dev_dataset = SSTDataset(dev_dir, vocab, args.num_classes, args.fine_grain, args.model_name)
    #     torch.save(dev_dataset, dev_file)
    #     is_preprocessing_data = True
    # test
    # test_file = os.path.join(args.data,'sst_test.pth')
    # if os.path.isfile(test_file):
    #     test_dataset = torch.load(test_file)
    # else:
    #     test_dataset = SSTDataset(test_dir, vocab, args.num_classes, args.fine_grain, args.model_name)
    #     torch.save(test_dataset, test_file)
    #     is_preprocessing_data = True
    criterion = nn.NLLLoss()
    # initialize model, criterion/loss_function, optimizer
    model = TreeLSTMSentiment(args.cuda, vocab.size(), args.input_dim,
                              args.mem_dim, args.num_classes, args.model_name,
                              criterion)
    embedding_model = nn.Embedding(vocab.size(), args.input_dim)
    # Fei: don't optimize embedding
    embedding_model.weight.requires_grad = False
    if args.cuda:
        embedding_model = embedding_model.cuda()
    if args.cuda:
        model.cuda(), criterion.cuda()
    # Only parameters with requires_grad set are handed to the optimizer.
    # NOTE(review): `optimizer` stays unbound for other args.optim values.
    if args.optim == 'adam':
        optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                      model.parameters()),
                               lr=args.lr,
                               weight_decay=args.wd)
    elif args.optim == 'adagrad':
        # optimizer = optim.Adagrad(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd)
        optimizer = optim.Adagrad(
            [{
                'params': filter(lambda p: p.requires_grad,
                                 model.parameters()),
                'lr': args.lr
            }  # Fei: filter non_trainable
             ],
            lr=args.lr,
            weight_decay=args.wd)
    metrics = Metrics(args.num_classes)
    utils.count_param(model)
    # for words common to dataset vocab and GLOVE, use GLOVE vectors
    # for other words in dataset vocab, use random normal vectors
    emb_file = os.path.join(args.data, 'sst_embed.pth')
    if os.path.isfile(emb_file):
        emb = torch.load(emb_file)
    else:
        # load glove embeddings and vocab
        glove_vocab, glove_emb = load_word_vectors(
            os.path.join(args.glove, 'glove.840B.300d'))
        print('==> GLOVE vocabulary size: %d ' % glove_vocab.size())
        emb = torch.zeros(vocab.size(), glove_emb.size(1))
        for word in vocab.labelToIdx.keys():
            # NOTE(review): a word whose index is 0 is falsy here and would be
            # treated as missing; confirm what getIndex returns for unknown words.
            if glove_vocab.getIndex(word):
                emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex(
                    word)]
            else:
                # NOTE(review): normal_ takes (mean, std); confirm these arguments.
                emb[vocab.getIndex(word)] = torch.Tensor(
                    emb[vocab.getIndex(word)].size()).normal_(-0.05, 0.05)
        torch.save(emb, emb_file)
        is_preprocessing_data = True  # flag to quit
        print('done creating emb, quit')
    if is_preprocessing_data:
        print('done preprocessing data, quit program to prevent memory leak')
        print('please run again')
        quit()
    # plug these into embedding matrix inside model
    if args.cuda:
        emb = emb.cuda()
    # model.childsumtreelstm.emb.state_dict()['weight'].copy_(emb)
    embedding_model.state_dict()['weight'].copy_(emb)
    # create trainer object for training and testing
    trainer = SentimentTrainer(args, model, embedding_model, criterion,
                               optimizer)
    loopStart = time.time()
    #print('prepare time is %s ' % (loopStart - startTime))
    loss_save = []
    # Hard-coded, so only the "EXPERIMENT" branch below can run. The other
    # branches use dev_dataset / test_dataset, which this function never
    # assigns (their loading is commented out above).
    mode = 'EXPERIMENT'
    if mode == 'DEBUG':
        for epoch in range(args.epochs):
            dev_loss = trainer.train(dev_dataset)
            dev_loss, dev_pred = trainer.test(dev_dataset)
            test_loss, test_pred = trainer.test(test_dataset)
            dev_acc = metrics.sentiment_accuracy_score(dev_pred,
                                                       dev_dataset.labels)
            test_acc = metrics.sentiment_accuracy_score(
                test_pred, test_dataset.labels)
            print('==> Dev loss : %f \t' % dev_loss, end="")
            print('Epoch ', epoch, 'dev percentage ', dev_acc)
    elif mode == "PRINT_TREE":
        for i in range(0, 10):
            ttree, tsent, tlabel = dev_dataset[i]
            utils.print_tree(ttree, 0)
            print('_______________')
        print('break')
        quit()
    elif mode == "EXPERIMENT":
        # max_dev, max_dev_epoch and filename are only used by the disabled
        # evaluation code in the comments below.
        max_dev = 0
        max_dev_epoch = 0
        filename = args.name + '.pth'
        for epoch in range(args.epochs):
            train_loss = trainer.train(train_dataset)
            #dev_loss, dev_pred = trainer.test(dev_dataset)
            #dev_acc = metrics.sentiment_accuracy_score(dev_pred, dev_dataset.labels)
            print('==> Train loss : %f \t' % train_loss, end="")
            loss_save.append(train_loss)
            #print('Epoch ', epoch, 'dev percentage ', dev_acc)
            #torch.save(model, args.saved + str(epoch) + '_model_' + filename)
            #torch.save(embedding_model, args.saved + str(epoch) + '_embedding_' + filename)
            #if dev_acc > max_dev:
            #    max_dev = dev_acc
            #    max_dev_epoch = epoch
            #gc.collect()
        print("done")
        #print('epoch ' + str(max_dev_epoch) + ' dev score of ' + str(max_dev))
        #print('eva on test set ')
        #model = torch.load(args.saved + str(max_dev_epoch) + '_model_' + filename)
        #embedding_model = torch.load(args.saved + str(max_dev_epoch) + '_embedding_' + filename)
        #trainer = SentimentTrainer(args, model, embedding_model, criterion, optimizer)
        #test_loss, test_pred = trainer.test(test_dataset)
        #test_acc = metrics.sentiment_accuracy_score(test_pred, test_dataset.labels)
        #print('Epoch with max dev:' + str(max_dev_epoch) + ' |test percentage ' + str(test_acc))
        #print('____________________' + str(args.name) + '___________________')
    else:
        for epoch in range(args.epochs):
            train_loss = trainer.train(train_dataset)
            train_loss, train_pred = trainer.test(train_dataset)
            dev_loss, dev_pred = trainer.test(dev_dataset)
            test_loss, test_pred = trainer.test(test_dataset)
            train_acc = metrics.sentiment_accuracy_score(
                train_pred, train_dataset.labels)
            dev_acc = metrics.sentiment_accuracy_score(dev_pred,
                                                       dev_dataset.labels)
            test_acc = metrics.sentiment_accuracy_score(
                test_pred, test_dataset.labels)
            print('==> Train loss : %f \t' % train_loss, end="")
            print('Epoch ', epoch, 'train percentage ', train_acc)
            print('Epoch ', epoch, 'dev percentage ', dev_acc)
            print('Epoch ', epoch, 'test percentage ', test_acc)
    loopEnd = time.time()
    print('looptime is %s ' % (loopEnd - loopStart))
    prepareTime = loopStart - startTime
    loopTime = loopEnd - loopStart
    # NOTE(review): raises ZeroDivisionError when args.epochs is 0.
    timePerEpoch = loopTime / args.epochs
    # Report: a unit header, one loss per line, then "run time: <prepare> <per-epoch>".
    with open(write_to, "w") as f:
        f.write("unit: " + "1 epoch\n")
        for loss in loss_save:
            f.write(str(loss) + "\n")
        f.write("run time: " + str(prepareTime) + " " + str(timePerEpoch) +
                "\n")
# Alternative, layer-by-layer version (disabled). It does not use constant
# extra space, but it also works for imperfect trees:
#
# if not root:
#     return
# layer, nextlayer = [root], []
# while layer:
#     n = len(layer)
#     for i in xrange(n - 1):
#         layer[i].next = layer[i + 1]
#         if layer[i].left:
#             nextlayer.append(layer[i].left)
#         if layer[i].right:
#             nextlayer.append(layer[i].right)
#     if layer[-1].left:
#         nextlayer.append(layer[-1].left)
#     if layer[-1].right:
#         nextlayer.append(layer[-1].right)
#     layer, nextlayer = nextlayer, []

s = Solution()

# Three-level tree: 1 -> (2, 3), 2 -> (4, 7), 3 -> (5, 6).
root = TreeLinkNode(1)
root.left, root.right = TreeLinkNode(2), TreeLinkNode(3)
root.left.left, root.left.right = TreeLinkNode(4), TreeLinkNode(7)
root.right.left, root.right.right = TreeLinkNode(5), TreeLinkNode(6)

print_tree(root)
s.connect(root)
# After connecting, show the value reached through `next` from node 7.
print(root.left.right.next.val)
# The information gain of a single attribute can be inspected with calls such
# as the following (disabled); the same pattern applies to 'Humor Docente',
# 'Horario', 'Dificultad' and 'Humedad':
# S_information_gain = utils.information_gain(S, 'Dedicacion', 'Salva', False)
# print('Information gain del atributo Dedicación: ', S_information_gain)

# ID3 applied to the first training set.
tree = utils.ID3_algorithm(
    S, ['Dedicacion', 'Dificultad', 'Horario', 'Humedad', 'Humor Docente'],
    'Salva', True, False)
utils.print_tree(tree, tree['data'], None, True, '')

print()
print()
print('Aplicacion de ID3 a un segundo conjunto de entrenamiento')
print()

# ID3 applied to the second training set.
tree2 = utils.ID3_algorithm(
    S2, ['Dedicacion', 'Dificultad', 'Horario', 'Humedad', 'Humor Docente'],
    'Salva', True, False)
# Print tree2 with its own data (this call previously passed tree['data'],
# i.e. the data of the first tree).
utils.print_tree(tree2, tree2['data'], None, True, '')

#############################################
# Exercise with the lab data set            #
def _relabel(example, label):
    """Return a copy of ``example`` whose last element is replaced by ``label``."""
    row = list(example)
    row[-1] = label
    # Keep the container type of the input row (tuple stays tuple).
    return tuple(row) if isinstance(example, tuple) else row


def main():
    """Run repeated ID3 vs. prior-probability classification trials.

    Command line: ``<data file> <training set size> <number of trials>
    <verbose 0|1>``. Each trial shuffles the examples in place, trains a tree
    on the first ``training set size`` of them, classifies the rest with both
    the tree and the prior-probability baseline, and prints both correct
    rates. With verbose set to 1 the data sets and both labelings are printed
    as well. Mean rates over all trials are printed at the end.

    Exits through ``sys.exit`` with a message when the arguments are invalid.

    The verbose output is built from copies of the test rows, so the
    predictions never overwrite the true labels held in ``examples``.
    Every ``print`` takes a single pre-formatted string, so the output is the
    same under Python 2 and Python 3.
    """
    if len(sys.argv) != 5:
        sys.exit("invalid command-line arguments format")

    # handling command-line arguments
    data = DataReader2(sys.argv[1])
    data.init_examples()
    training_set_size = int(sys.argv[2])
    num_trials = int(sys.argv[3])
    verbose = int(sys.argv[4])
    if verbose not in (0, 1):
        sys.exit("invalid command-line argument")
    if num_trials < 1:
        sys.exit("invalid command-line argument")

    # extract examples and attributes
    # an example = a feature vector + a label
    examples = data.get_examples()  # a list of examples
    attributes = data.get_attributes()  # a list of attribute names
    if training_set_size >= len(examples):
        sys.exit("invalid command-line argument")

    # lists of classification performances (correct rates)
    # e.g.: [1.0, 0.95, 0.83, ...]
    correct_rates_id3 = []
    correct_rates_prior = []
    stars = '*' * 10

    for i in range(0, num_trials):
        # a single trial
        print('TRIAL NUMBER: %s' % (i + 1,))
        print('-' * 30)

        # randomly pick a training set of size *training_set_size*
        random.shuffle(examples)
        training_examples = examples[0:training_set_size]
        testing_examples = examples[training_set_size:]

        # a list of actual labels of testing examples
        actuals = utils.extract_labels(testing_examples)

        # build a decision tree based on these training examples
        tree = id3.DTL(training_examples, range(0, len(attributes)), True)

        # print the structure of the decision tree built from the training set
        print('DECISION TREE STRUCTURE')
        utils.print_tree(tree, attributes)

        # predicted labels using id3 and using prior probability
        output_id3_1 = utils.trial_id3(tree, testing_examples)
        output_prior_1 = utils.trial_priorprob(training_examples,
                                               testing_examples)

        # computes and prints correct rate of this trial
        correct_rate_id3 = utils.correct_rate(output_id3_1, actuals)
        correct_rates_id3.append(correct_rate_id3)
        correct_rate_prior = utils.correct_rate(output_prior_1, actuals)
        correct_rates_prior.append(correct_rate_prior)

        print('\n')
        print('proportion of correct classification')
        print('decision tree: %s' % (correct_rate_id3,))
        print('prior probability: %s' % (correct_rate_prior,))
        print('\n')

        if verbose == 1:
            # Relabelled *copies* of the test rows. The rows themselves are
            # shared with `examples`, so writing into them would overwrite the
            # true labels for the printout below and for later trials.
            output_id3_2 = [
                _relabel(example, label)
                for example, label in zip(testing_examples, output_id3_1)
            ]
            output_prior_2 = [
                _relabel(example, label)
                for example, label in zip(testing_examples, output_prior_1)
            ]
            print(' '.join([stars, 'examples in the training set: ', stars]))
            utils.print_dataset(training_examples, attributes)
            print(' '.join([stars, 'examples in the testing set: ', stars]))
            utils.print_dataset(testing_examples, attributes)
            print(' '.join(
                [stars, 'classification by the decision tree: ', stars]))
            utils.print_dataset(output_id3_2, attributes)
            print(' '.join(
                [stars, 'classification by prior probability: ', stars]))
            utils.print_dataset(output_prior_2, attributes)

    # other outputs
    print(' '.join(['*' * 5, 'information', '*' * 5]))
    print('file:' + sys.argv[1])
    print('training set size: %s' % (sys.argv[2],))
    print('testing set size: %s' % (len(examples) - int(sys.argv[2]),))
    print('number of trials: %s' % (num_trials,))
    mean_tree = utils.mean(correct_rates_id3)
    mean_prior = utils.mean(correct_rates_prior)
    print('mean classification performance (decision tree): %s' %
          (mean_tree,))
    print('mean classification performance (prior probability): %s' %
          (mean_prior,))
class Solution(object):
    def generateTrees(self, n):
        """Generate all structurally unique BSTs that store the values 1...n.

        :type n: int
        :rtype: List[TreeNode]

        Returns an empty list when ``n < 1``. Trees are produced in order of
        increasing root value; for a fixed root, left subtrees vary slowest.
        Subtree objects are shared between the returned trees, so the result
        should be treated as read-only.
        """

        def g(st, ed):
            # All BSTs over the half-open value range [st, ed).
            if st == ed:
                return [None]
            res = []
            # `range` rather than `xrange`, so this runs on Python 2 and 3.
            for i in range(st, ed):
                lefts = g(st, i)
                # Built once per root instead of once per left subtree.
                rights = g(i + 1, ed)
                for l in lefts:
                    for r in rights:
                        root = TreeNode(i)
                        root.left = l
                        root.right = r
                        res.append(root)
            return res

        if n < 1:
            return []
        return g(1, n + 1)


s = Solution()
for tn in s.generateTrees(3):
    print_tree(tn)
from bplustree import Bplustree
from utils import print_tree

# Demo: insert keys one at a time and print the tree after every insert.
tree = Bplustree(4)
# print(tree.root.keys)
# tree.insert(5)
for key in (7, 8, 9, 10, 13):
    tree.insert(key)
    print("\n")
    print_tree(tree.root, ' ', 0)