def main_tree():
    """Run the tree pipeline: load data, build the tree, save it, evaluate.

    Steps, in order:
      1. load the queries, shuffle them and split 70/30 into train/test;
      2. generate the instruction set from the move/write instructions;
      3. build the tree from the training split;
      4. pickle the tree to ``myTree2.pickle``;
      5. derive the probabilistic model and pickle it to ``model.pickle``;
      6. print the MAP score obtained on the test split.

    Progress lines print the seconds elapsed since the module-level
    ``start_time``.  Nothing is returned.

    NOTE(review): this file holds another ``def main_tree`` further down; if
    both live in the same module, the later definition rebinds the name.
    """
    import json  # only referenced by the disabled JSON dump noted in step 4
    import pickle

    def _elapsed():
        # Seconds since the module-level start_time.
        return time.time() - start_time

    def _dump_pickle(obj, path):
        # Serialise obj to path with the highest pickle protocol available.
        with open(path, 'wb') as sink:
            pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

    # 1. data load and shuffle
    # gene_queries() yields three values here; only the third one (the query
    # data) is used by the rest of this function.
    _value_vocab, _type_vocab, queries = gene_queries()
    print("1 data load", _elapsed())
    random.shuffle(queries)
    cut = int(len(queries) * 0.7)
    training_data, test_data = queries[:cut], queries[cut:]

    # 2. make the instruction set
    mv_instructions = [
        "mvpar",
        "mvLeftSibl",
        "mvRightSibl",
        "mvFirstChild",
        "mvLastChild",
        "mvPrevDFS",
        "mvNextDFS",
        "mvPrevLeft",
        "mvNextLeft",
        "mvPrevNodeValue",
        "mvPrevNodeType",
        "mvPrevNodeContext",
    ]
    write_instruction = [["wrVal"], ["wrType"]]
    feature_num = 3
    print("feature_num", feature_num)
    instructions = gene_feature(mv_instructions, write_instruction, feature_num)
    print("2 get feature", _elapsed())
    # NOTE: shuffling of the instruction set (random.shuffle(instructions))
    # is currently disabled.
    print("data set size", len(training_data), len(test_data))

    # 3. create the tree
    my_tree = create_tree(training_data, instructions)
    print("3 get myTree", _elapsed())

    # 4. save the tree (a JSON dump to 'myTree.json' is disabled; pickle only)
    _dump_pickle(my_tree, 'myTree2.pickle')
    print("4 save tree", _elapsed())

    # 5. get the probabilistic model
    model_tree = traverse2model(my_tree)
    print("5 model", _elapsed())
    _dump_pickle(model_tree, 'model.pickle')

    # 6. evaluate
    # NOTE(review): `eval` is presumably a project function that shadows the
    # builtin of the same name -- confirm against the rest of the file.
    MAP = eval(model_tree, test_data)
    print("MAP:", MAP)

    minutes, seconds = divmod(_elapsed(), 60)
    hours, minutes = divmod(minutes, 60)
    print("number of classifier", length[0])
    print("6 evaluate time spend%02d:%02d:%02d: " % (hours, minutes, seconds))
def main_tree():
    """Run the tree pipeline with separate evaluation and test sets.

    Steps, in order:
      1. load the queries (first 300000), shuffle them, keep the first
         200000 and split those 70/30 into training/evaluation data; load
         the test queries via ``gene_queries(Test_flag=True)`` and keep the
         first 100000;
      2. generate the instruction set from the move/write instructions;
      3. build the tree from the training data;
      5. derive the probabilistic model with a ``HashingVectorizer`` and
         pickle it to ``model.pickle``;
      6. print the MAP score on the evaluation data;
      7. print the MAP score on the test data.

    Step 4 (saving the tree) performs no write in this variant.  Progress
    lines print the seconds elapsed since the module-level ``start_time``.
    Nothing is returned.

    NOTE(review): this file holds another ``def main_tree`` above; if both
    live in the same module, this later definition is the one bound to the
    name.
    """
    import json  # imported by the original step 4; not otherwise used here
    import pickle

    def _elapsed():
        # Seconds since the module-level start_time.
        return time.time() - start_time

    # 1. data load and shuffle
    queries = gene_queries()[:300000]
    print("1 data load", _elapsed())
    random.shuffle(queries)
    train_pool = queries[:200000]
    # Test data comes from a separate load rather than from
    # queries[200000:300000] (that alternative is disabled).
    test_data = gene_queries(Test_flag=True)[:100000]
    cut = int(len(train_pool) * 0.7)
    training_data, eval_data = train_pool[:cut], train_pool[cut:]
    # A disabled earlier variant shuffled the test queries as well and split
    # all 300000 loaded queries 70/30 into training/evaluation data.
    print("data set size", len(training_data), len(eval_data), len(test_data))

    # 2. make the instruction set
    mv_instructions = [
        "mvpar",
        "mvLeftSibl",
        "mvRightSibl",
        "mvFirstChild",
        "mvLastChild",
        "mvPrevDFS",
        "mvNextDFS",
        "mvPrevLeft",
        "mvNextLeft",
        "mvPrevNodeValue",
        "mvPrevNodeType",
        "mvPrevNodeContext",
    ]
    write_instruction = [["wrVal"], ["wrType"]]
    feature_num = 5
    print("feature_num", feature_num)
    instructions = gene_feature(mv_instructions, write_instruction, feature_num)
    print("2 get feature", _elapsed())
    # NOTE: shuffling of the instruction set (random.shuffle(instructions))
    # is currently disabled.

    # 3. create the tree
    my_tree = create_tree(training_data, instructions)
    print("3 get myTree", _elapsed())

    # 4. save tree -- nothing is written for the tree in this variant.

    # 5. get the probabilistic model
    # NOTE(review): `non_negative` is not accepted by newer scikit-learn
    # releases (superseded by `alternate_sign`) -- confirm the installed
    # version before upgrading.
    vectorizer = HashingVectorizer(
        n_features=20,
        non_negative=True,
    )
    model_tree = traverse2model(my_tree, instructions, vectorizer)
    print("5 model", _elapsed())
    with open('model.pickle', 'wb') as model_file:
        pickle.dump(model_tree, model_file, protocol=pickle.HIGHEST_PROTOCOL)

    # 6. evaluate on the held-out part of the training pool
    # NOTE(review): `eval` is presumably a project function that shadows the
    # builtin of the same name -- confirm against the rest of the file.
    MAP = eval(model_tree, eval_data, vectorizer)
    print("eval MAP:", MAP)

    # 7. test on the separately loaded test data
    MAP_test = eval(model_tree, test_data, vectorizer)
    print("test MAP:", MAP_test)

    minutes, seconds = divmod(_elapsed(), 60)
    hours, minutes = divmod(minutes, 60)
    print("number of classifier", length[0])
    print("6 evaluate time spend%02d:%02d:%02d: " % (hours, minutes, seconds))