def prune_tree(tree_node, valid_set):
    """Prune `tree_node` in place using reduced-error pruning on a validation set.

    Each candidate node returned by `find_potential_prunes` is temporarily
    collapsed into a leaf (trying each child's value); the collapse is kept
    only when validation accuracy does not drop, otherwise the node is
    restored to its original contents.

    Args:
        tree_node: root node dict of the decision tree (mutated in place).
        valid_set: validation DataFrame that still contains the label
            column named by `setup.label`.

    Returns:
        The (possibly pruned) root node.
    """
    valid_answers = valid_set[setup.label]
    valid_answers.reset_index(drop=True, inplace=True)
    valid_set = valid_set.drop([setup.label], axis=1)

    prune_list = find_potential_prunes(tree_node, [])

    def _score(tree):
        # Accuracy of `tree` over every validation row.
        predictions = [predict.predict_label(tree, row)
                       for _, row in valid_set.iterrows()]
        return assess.set_score(predictions, valid_answers)

    previous_score = _score(tree_node)

    for prune in prune_list:
        backup = copy.deepcopy(prune)

        # Collapse this node into a leaf; try each child's value in turn.
        prune['attr'] = setup.label
        prune['left'] = None
        prune['right'] = None
        prune['leaf'] = True

        prune['value'] = backup['left']['value']
        left_score = _score(tree_node)

        prune['value'] = backup['right']['value']
        right_score = _score(tree_node)

        if left_score >= previous_score or right_score >= previous_score:
            # Keep the prune; the node currently holds the right child's
            # value, so only switch when the left value scored better.
            if left_score > right_score:
                prune['value'] = backup['left']['value']
            # BUG FIX: keep the baseline in sync with the now-pruned tree
            # so later candidates are judged against current accuracy,
            # not the original unpruned score.
            previous_score = max(left_score, right_score)
        else:
            # BUG FIX: the original did `prune = temp_node`, which only
            # rebinds the loop variable and leaves the actual tree node
            # collapsed as a leaf. Restore the node's contents in place.
            prune.clear()
            prune.update(backup)

    return tree_node
def evaluate_unpruned(train_df, n_partitions):
    """Run n-fold cross-validation on unpruned trees and print the averages.

    For each fold, one contiguous slice is held out for testing, a tree is
    built on the remainder, and its accuracy on the held-out slice is
    accumulated. Average depth and average score are printed at the end.

    Args:
        train_df: full labelled DataFrame; the label column is named by
            `setup.label`.
        n_partitions: number of cross-validation folds.
    """
    print('\n')
    # BUG FIX: fold size was hard-coded to len(train_df) / 10, which is
    # wrong whenever n_partitions != 10; derive it from n_partitions.
    part_len = len(train_df) // n_partitions

    depths_total = 0
    scores_total = 0
    for i in range(n_partitions):
        # Test fold = rows [test_begin, test_end); train on everything else.
        test_begin = i * part_len
        test_end = test_begin + part_len
        test_part = train_df.iloc[test_begin:test_end, :]
        train_part = train_df.drop(train_df.index[test_begin:test_end])
        train_part.reset_index(drop=True, inplace=True)

        # Extract the ground-truth labels before stripping the label column.
        answers = test_part[setup.label]
        answers.reset_index(drop=True, inplace=True)
        answers = answers.tolist()
        test_part = test_part.drop([setup.label], axis=1)

        # decision_tree_learning returns a tuple; index 0 is the tree root,
        # index 1 is its depth (used below for the average-depth report).
        tree_info = build_tree.decision_tree_learning(train_part, 0)
        depths_total += tree_info[1]

        prediction_list = [predict.predict_label(tree_info[0], row)
                           for _, row in test_part.iterrows()]
        score = assess.set_score(prediction_list, answers)
        scores_total += score

    print(colored(('\n' * 5) + 'the average depth is '
                  + str(depths_total / n_partitions), 'red'))
    print(colored('the average score for unpruned tree is '
                  + str(scores_total / n_partitions) + '%', 'red'))
def getName():
    """Flask handler: tag each word of the posted text as a name or not.

    Expects a JSON body with a 'text' field; returns JSON of the form
    {"data": [{"word": ..., "isName": ...}, ...]} pairing each
    space-separated word with the model's prediction.
    """
    # BUG FIX: the original bound the request text to `input`, shadowing
    # the builtin; renamed to `text`.
    text = json.loads(request.data)['text']
    tkn_txt = preprocess.tokenize_text(text)
    # Drop empty tokens. The original used `while "" in tkn_txt:
    # tkn_txt.remove("")`, which is accidentally O(n^2); a filter
    # comprehension removes the same elements in one pass.
    tkn_txt = [tok for tok in tkn_txt if tok != ""]
    X = [preprocess.extract_features(preprocess.pos_tagger([tkn_txt])[0])]
    pred = predict.predict_label(X)
    inputArray = text.split(' ')
    respObj = {
        "data": [{
            'word': w,
            'isName': p
        } for w, p in zip(inputArray, pred[0])]
    }
    return jsonify(respObj)
def getImage():
    """Flask handler: classify an image uploaded in the 'img' file field.

    Returns JSON of the form {'Label': <label or None>, 'status': 1|0,
    'msg': <'success' | error description>}.
    """
    try:
        if 'img' in request.files:
            fileObj = request.files['img']
            in_memory_file = io.BytesIO()
            fileObj.save(in_memory_file)
            # BUG FIX: np.fromstring is deprecated for raw binary data and
            # emits a DeprecationWarning; np.frombuffer is the documented
            # drop-in replacement for interpreting bytes as uint8.
            imgObj = np.frombuffer(in_memory_file.getvalue(), dtype=np.uint8)
            color_image_flag = 1  # equivalent to cv2.IMREAD_COLOR
            imgArr = cv2.imdecode(imgObj, color_image_flag)
            imgLabel = str((predict.predict_label(imgArr))[0])
            # BUG FIX: the success message contained literal curly quotes
            # ('‘success’'), presumably a word-processor paste artifact.
            ret = {'Label': imgLabel, 'status': 1, 'msg': 'success'}
            return jsonify(ret)
        else:
            ret = {'Label': None, 'status': 0, 'msg': 'wrong header name'}
            return jsonify(ret)
    except Exception as e:
        # Broad catch is deliberate at this request boundary: report the
        # failure to the client as JSON instead of crashing the route.
        err = str(e)
        ret = {'Label': None, 'status': 0, 'msg': err}
        return jsonify(ret)
def evaluate_pruned(train_df, n_partitions):
    """Cross-validate pruned decision trees and print average depth/score.

    Each iteration holds out one test fold, carves a validation fold out of
    the slice immediately preceding it (wrapping to the tail of the
    DataFrame on the first iteration), builds a tree on the remainder,
    prunes it against the validation fold, and scores it on the test fold.
    """
    print('\n')
    # NOTE(review): fold size is hard-coded to a tenth of the data even
    # though n_partitions is a parameter — looks wrong when
    # n_partitions != 10; confirm intent.
    part_len = int(len(train_df) / 10)
    # print('the train dataframe is:')
    # print (train_df)
    # print ('and is ' + str(len(train_df)) + " long")
    depths_total = 0
    scores_total = 0
    for i in range(n_partitions):
        # Test fold = rows [test_begin, test_end).
        test_begin = i * part_len
        test_end = test_begin + part_len
        test_part = train_df.iloc[test_begin:test_end, :]
        train_part = train_df.drop(train_df.index[test_begin:test_end])
        # Validation fold = the part_len rows just before the test fold;
        # on the first iteration (test_begin == 0) it wraps to the last
        # part_len rows of the DataFrame.
        if ((test_begin - part_len) < 0):
            valid_part = train_df.iloc[-part_len:, :]
        else:
            valid_part = train_df.iloc[test_begin - part_len:test_begin, :]
        # NOTE(review): when test_begin == 0 this slice is
        # index[-part_len:0], which is EMPTY, so the wrapped-around
        # validation rows are NOT removed from the training fold on the
        # first iteration — verify this is intended.
        train_part = train_part.drop(train_part.index[test_begin - part_len:test_begin])
        train_part.reset_index(drop=True, inplace=True)
        tree_info = tuple()
        # decision_tree_learning returns a tuple: index 0 is the tree
        # root, index 1 is its depth (accumulated for the average below).
        tree_info = build_tree.decision_tree_learning(train_part, 0)
        depths_total += tree_info[1]
        tree_info = list(tree_info)
        # Prune the freshly built tree against the validation fold.
        tree_info[0] = pruning.prune_tree(tree_info[0], valid_part)
        # print('current tree:')
        # print(tree_info[0])
        # print('tree_info[1]:')
        # print(tree_info[1])
        ########################################################################################################################################### VALIDATION
        # valid_answers = valid_part[setup.label]
        # valid_answers.reset_index(drop = True , inplace = True)
        # valid_part = valid_part.drop([setup.label], axis = 1)
        # valid_prediction_list = []
        # for index, row in valid_part.iterrows():
        #     predicted_label = predict.predict_label ( tree_info[0], row)
        #     valid_prediction_list.append(predicted_label)
        # score = assess.set_score (test_prediction_list, valid_answers)
        # scores_total += score
        ########################################################################################################################################### TESTING
        # Score the pruned tree on the held-out test fold.
        test_answers = test_part[setup.label]
        test_answers.reset_index(drop=True, inplace=True)
        test_answers = test_answers.tolist()
        test_part = test_part.drop([setup.label], axis=1)
        test_prediction_list = []
        for index, row in test_part.iterrows():
            predicted_label = predict.predict_label(tree_info[0], row)
            test_prediction_list.append(predicted_label)
        score = assess.set_score(test_prediction_list, test_answers)
        scores_total += score
        #print ( 'score for test_data with index ' + str(test_begin) + ' to ' + str(test_end) + ' is ' + str (score) + '%')
    print(
        colored(('\n' * 5) + 'the average depth is ' + str(depths_total / n_partitions), 'red'))
    print(
        colored(
            'the average score for pruned tree is ' + str(scores_total / n_partitions) + '%', 'red'))
# --- top-level bootstrapping driver script ---
# Configure the bootstrapper's filter thresholds, then train on the
# unlabeled corpus in batches of `batch_size`.
boot.first_filter_threshold = 1
boot.second_filter_threshold = 1.7
for i in range(len(whole_unlabeled_data) // batch_size + 1):
    # `int(i)` is redundant (i is already an int) but kept as-is.
    unlabeled_data = whole_unlabeled_data[int(i) * batch_size:(i + 1) * batch_size]
    train_with_hierarchy(boot, unlabeled_data)
train_label = []
# Persist the best-scoring bootstrapped sentences and, in parallel,
# their labels (also collected into train_label for the labeling pass).
with open('./bootstrapped_data/data.txt', 'w') as f:
    for sentence in boot.best_score_X:
        f.write(sentence + '\n')
with open('./bootstrapped_data/label.txt', 'w') as f:
    for label in boot.best_score_Y:
        f.write(boot.inv_target_dict[int(label)] + '\n')
        train_label.append(boot.inv_target_dict[int(label)])
# Drop the large corpus before loading the next data set to reduce peak memory.
del whole_unlabeled_data
unlabeled_data_for_labeling = pre.load_unlabeled_data(
    '../data/unlabeled_data/donga_pos_unlabeled_data.txt', dup=True, shuffle=False)
# Remove any stale results file before predict.predict_label writes new output.
if os.path.isfile('./results/DailyLife_MachineLabeled_SG_SVM.txt'):
    os.remove('./results/DailyLife_MachineLabeled_SG_SVM.txt')
# Label the corpus in 50,000-sentence chunks.
for i in range(len(unlabeled_data_for_labeling) // 50000 + 1):
    predict.predict_label(
        boot.best_score_X, train_label,
        unlabeled_data_for_labeling[i * 50000:(i + 1) * 50000])