def proxyVote(b, e):
    """Register the first producer as a proxy and point a range of voters at it.

    Calls ``vote`` for the one-element index range covering ``firstProducer``,
    registers that account with ``cleos system regproxy``, waits one second,
    and then issues ``cleos system voteproducer proxy`` for every account whose
    index lies in ``range(b, e)``.

    Args:
        b: Index of the first delegating account (inclusive).
        e: Index one past the last delegating account (exclusive).
    """
    vote(firstProducer, firstProducer + 1)
    proxy_name = accounts[firstProducer]['name']

    # Every command shares the configured cleos prefix.
    cleos_prefix = config['cleos']['path']
    retry(cleos_prefix + 'system regproxy ' + proxy_name)
    sleep(1.0)

    for account_idx in range(b, e):
        voter_name = accounts[account_idx]['name']
        command = cleos_prefix + 'system voteproducer proxy ' + voter_name + ' ' + proxy_name
        retry(command)
def predict(self, other_data, _intv=None):
    '''
    Predict a label for every requested test point by voting over distances.

    @param other_data should be a RDD object; it does NOT have a class label
           for each sample. Each item is stored as (index, features).
    @param _intv iterable of test indices to predict; use it when splitting
           one test set into multiple chunks. When omitted, every index in
           range(other_data.count()) is predicted.
    @return a plain Python list (not an RDD) of (test_index, prediction)
            tuples, in the order given by _intv.
    '''
    from utils import cdist, vote

    # `is None` instead of `== None`: an array-like _intv would otherwise be
    # compared element-wise.
    if _intv is None:
        _intv = range(other_data.count())

    # Create pair: each test point is associated with a subgroup of train data
    pairs = other_data.cartesian(self.data_feature)

    predictions = []
    for test_idx in _intv:
        dist_label_tuple_list = []
        # Get the pairs belonging to this test index; collect to loop locally.
        # Explicit indexing replaces the tuple-parameter lambda, which is a
        # syntax error on Python 3. collect() runs immediately, so closing
        # over the loop variable is safe.
        idx_pairs = pairs.filter(
            lambda pair: pair[0][0] == test_idx).collect()
        for testpoint, trainsubgroup in idx_pairs:
            # Index of the train subgroup this pair refers to
            train_subgroup_idx = trainsubgroup[0]
            # Label entry of that subgroup (first match in self.data_label)
            C = self.data_label.filter(
                lambda labelled: labelled[0] == train_subgroup_idx).collect()[0]
            dist_label_tuple_list.extend(
                cdist(testpoint, trainsubgroup, C, self.k))
        predictions.append((test_idx, vote(dist_label_tuple_list, self.k)))
        # Drop the collected pairs before the next test index is processed.
        del idx_pairs
    return predictions
def train():
    """Fit a SUOD outlier ensemble on generated data and log it with MLflow.

    Builds an ensemble of LOF, PCA and KNN detectors, fits it, combines the
    per-detector predictions with ``vote``, scores the voted labels against
    the expected labels with ROC AUC, and records both the metric and the
    model in an MLflow run.
    """
    # The ground-truth labels below are derived from these same sizes so the
    # two can no longer drift apart.
    n_normal, n_anomaly = 1000, 10
    dataset = get_data(n_normal, n_anomaly, 100)
    contamination = 0.01
    with mlflow.start_run():
        neighbor_counts = (5, 15, 25)
        base_estimators = (
            [LOF(n_neighbors=n, contamination=contamination) for n in neighbor_counts]
            + [PCA(contamination=contamination)]
            + [KNN(n_neighbors=n, contamination=contamination) for n in neighbor_counts]
        )
        model = SUOD(base_estimators=base_estimators, n_jobs=6,
                     rp_flag_global=True, bps_flag=True,
                     approx_flag_global=False, contamination=contamination)
        model.fit(dataset)
        model.approximate(dataset)
        predicted_labels = model.predict(dataset)
        voted_labels = vote(predicted_labels)
        # NOTE(review): assumes get_data returns the normal samples first,
        # followed by the anomalies - confirm against get_data.
        true_labels = [0] * n_normal + [1] * n_anomaly
        # roc_auc_score expects (y_true, y_score); the arguments were swapped,
        # which scores the ground truth against the predictions instead.
        auc_score = roc_auc_score(true_labels, voted_labels)
        print("The resulted area under the ROC curve score is {}".format(auc_score))
        mlflow.log_metric("auc_score", auc_score)
        mlflow.sklearn.log_model(model, "anomaly_model", conda_env="conda.yaml")
def main():
    """Send generated test data to the local model server and print the vote.

    Generates data with ``get_data`` using the sizes in ``args``, posts it as
    split-oriented JSON to the ``/invocations`` endpoint on localhost:1234,
    and prints the result of ``vote`` over the returned predictions.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    test_data = get_data(args.n_normal, args.n_anomaly, 100)
    http_data = test_data.to_json(orient="split")
    headers = {
        'Content-Type': 'application/json',
    }
    response = requests.post("http://localhost:1234/invocations",
                             headers=headers, data=http_data)
    # Fail here on a 4xx/5xx answer instead of feeding an error payload into
    # the voting step.
    response.raise_for_status()
    predictions = np.array(response.json())
    voted_labels = vote(predictions)
    # The original message ended in a stray "th".
    print("The predictions of the anomaly ensemble model through voting is:")
    print(voted_labels)
def test_combo(self, combo, mode='test'):
    """Evaluate the voted prediction of a combination of models.

    Args:
        combo: Iterable of model names; each is passed to
            ``self.get_tgt_from_model``.
        mode: ``'test'`` to score against ``self.test_gold``, ``'dev'`` to
            score against ``self.dev_gold``.

    Returns:
        Whatever ``utils.evaluate`` returns for the gold data and the voted
        predictions.
    """
    assert mode in ['dev', 'test']
    # The original also picked an input path per mode but never used it.
    gold_data = self.test_gold if mode == 'test' else self.dev_gold

    # NOTE(review): get_tgt_from_model is not told which split to use -
    # confirm that its predictions match the selected gold data.
    preds = [self.get_tgt_from_model(model_name) for model_name in combo]
    voted = utils.vote(preds)
    return utils.evaluate(gold_data, voted)
def predict(self, other_data, _intv=None):
    '''
    Predict a label for every requested test point by voting over distances.

    @param other_data should be a RDD object; it does NOT have a class label
           for each sample. Each item is stored as (index, features).
    @param _intv iterable of test indices to predict; use it when splitting
           one test set into multiple chunks. When omitted, every index in
           range(other_data.count()) is predicted.
    @return a plain Python list (not an RDD) of (test_index, prediction)
            tuples, in the order given by _intv.
    '''
    from utils import cdist, vote

    # `is None` instead of `== None`: an array-like _intv would otherwise be
    # compared element-wise.
    if _intv is None:
        _intv = range(other_data.count())

    # Create pair: each test point is associated with a subgroup of train data
    pairs = other_data.cartesian(self.data_feature)

    predictions = []
    for test_idx in _intv:
        dist_label_tuple_list = []
        # Get the pairs belonging to this test index; collect to loop locally.
        # Explicit indexing replaces the tuple-parameter lambda, which is a
        # syntax error on Python 3. collect() runs immediately, so closing
        # over the loop variable is safe.
        idx_pairs = pairs.filter(
            lambda pair: pair[0][0] == test_idx).collect()
        for testpoint, trainsubgroup in idx_pairs:
            # Index of the train subgroup this pair refers to
            train_subgroup_idx = trainsubgroup[0]
            # Label entry of that subgroup (first match in self.data_label)
            C = self.data_label.filter(
                lambda labelled: labelled[0] == train_subgroup_idx).collect()[0]
            dist_label_tuple_list.extend(
                cdist(testpoint, trainsubgroup, C, self.k))
        predictions.append((test_idx, vote(dist_label_tuple_list, self.k)))
        # Drop the collected pairs before the next test index is processed.
        del idx_pairs
    return predictions
# Fit one SVM per candidate kernel and record its train/test accuracy.
for k in range(len(kernel)):
    svm.append(SVC(kernel=kernel[k]))
    # Fit once and score the same fitted model on both splits; the original
    # refitted the identical model on the identical data before each score.
    svm[k].fit(Xtr, ytr)
    score_train[k] = svm[k].score(Xtr, ytr)
    score_test[k] = svm[k].score(Xte, yte)
    print("{} frames, training accuracy: {} @ {} kernel".format(nf, score_train[k], kernel[k]))
    print("{} frames, testing accuracy: {} @ {} kernel".format(nf, score_test[k], kernel[k]))

# Keep the kernel with the highest test accuracy for this configuration.
ind_max = np.argmax(score_test)
best_kernel[i] = kernel[ind_max]
best_predictor.append(svm[ind_max])
high_score_train[i] = score_train[ind_max]
high_score_test[i] = score_test[ind_max]

# Predictions on all frames
classes_test = svm[ind_max].predict(Xte)

# Vote prediction for trials, even weight
pred_even[i] = utils.vote(classes_test, nf, vote_opt="even")
assert pred_even[i].shape == test_labels.shape
# Calculate even prediction accuracy
acc_even[i] = np.sum(pred_even[i] == test_labels, dtype=np.float32) / num_examples_test

# Vote prediction for trials, discount weight
pred_disc[i] = utils.vote(classes_test, nf, vote_opt="disc")
# Calculate discounted prediction accuracy
acc_disc[i] = np.sum(pred_disc[i] == test_labels, dtype=np.float32) / num_examples_test

# Vote prediction for trials, logarithmic weight
pred_logr[i] = utils.vote(classes_test, nf, vote_opt="logr")
# Calculate logarithm prediction accuracy. Summing as float32 matches the two
# accuracies above: it avoids integer division on Python 2 and keeps the three
# values comparable in the argmax below.
acc_logr[i] = np.sum(pred_logr[i] == test_labels, dtype=np.float32) / num_examples_test

# Find best predictor: position of the highest accuracy across the three
# voting schemes.
pred_accs = np.array([acc_even, acc_disc, acc_logr])
ind = np.unravel_index(np.argmax(pred_accs, axis=None), pred_accs.shape)
indte = range(Xte.shape[0])  # index of test examples
predictions = classifier.predict(input_fn=test_input_fn)
clste = np.zeros(yte.shape).astype(int)
correct_sum = 0
# zip against indte bounds the loop by the number of test examples, whatever
# the prediction iterable yields beyond that.
for pred, ind in zip(predictions, indte):
    clste[ind] = pred["classes"]
    probability = pred["probabilities"][clste[ind]]
    print("Prediction is {} {:.1f}%, expected {}".format(
        clste[ind], 100 * probability, yte[ind]))
    if clste[ind] == yte[ind]:
        correct_sum += 1

# Convert before dividing: float(a / b) truncates to 0.0 or 1.0 under Python 2
# integer division.
acc_te = float(correct_sum) / len(indte)
# Sanity check against the test score recorded earlier for this configuration.
assert abs(acc_te - high_score_test[i]) < 1e-4

# Vote prediction for trials, even weight
pred_even[i] = utils.vote(clste, nf, vote_opt="even")
assert pred_even[i].shape == test_labels.shape
# Calculate even prediction accuracy
acc_even[i] = np.sum(pred_even[i] == test_labels, dtype=np.float32) / num_examples_test

# Vote prediction for trials, discount weight
pred_disc[i] = utils.vote(clste, nf, vote_opt="disc")
# Calculate discounted prediction accuracy
acc_disc[i] = np.sum(pred_disc[i] == test_labels, dtype=np.float32) / num_examples_test

# Vote prediction for trials, logarithmic weight
pred_logr[i] = utils.vote(clste, nf, vote_opt="logr")
# Calculate logarithm prediction accuracy. Summing as float32 matches the two
# accuracies above and avoids integer division on Python 2.
acc_logr[i] = np.sum(pred_logr[i] == test_labels, dtype=np.float32) / num_examples_test

# Times up
end_t = time.time()
# NOTE(review): this line held whitespace-collapsed code. The statements up to
# `return` are the tail of a function whose header lies outside this view; the
# relative indentation below is reconstructed from statement order only and
# must be confirmed against the original file.
            # Class index with the highest logit for each item of the batch.
            label = logits.argmax(dim=1).tolist()
            for item in label:
                result_list.append(item)
        fold_result.append(result_list)
        fold_logits_result.append(result_logits_list)
        # torch.cuda.empty_cache()
    return fold_result, fold_logits_result


# Script entry point: preprocess, optionally train, optionally run inference
# on the test set and write the voted results to CSV.
if __name__ == "__main__":
    print('正在预处理...')
    fold_all, test_bert_list = utils.main()
    if do_trian:
        train(fold_all)
    # Test
    if do_test:
        test_dataset = layers.Test_Dataset(test_bert_list)
        test_dataloader = DataLoader(dataset=test_dataset,
                                     batch_size=BATCH_SIZE,
                                     shuffle=False)
        fold_list, fold_logits_list = test(
            test_dataloader)  # fold_logits_list:(5, 5000, 2)
        # Two ensembling outputs: a vote over per-fold labels and a vote over
        # per-fold logits, each written to its own CSV.
        vote_result = utils.vote(fold_list)
        vote_result_list, fold_logits_sum_list = utils.vote_logits(
            fold_logits_list)
        utils.write_csv(vote_result)
        utils.write_csv2(vote_result_list, fold_logits_sum_list)
        print('测试结果保存成功,请提交')
def predict(self):
    """Restore the saved model and evaluate it on the configured dataset.

    Opens a TensorFlow session with GPU memory growth enabled, restores the
    trainable variables from ``self.para.modelLoadPath`` and dispatches on
    ``self.para.dataset`` ('SICK', 'WikiQA' or 'LBA'). Results are printed;
    nothing is returned. Any other dataset name only restores the model.

    The print calls use the single-argument parenthesised form, which emits
    the same text under the Python 2 print statement this file relies on.
    """
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    try:
        sess.run(tf.group(tf.global_variables_initializer(),
                          tf.local_variables_initializer()))
        variable_to_save = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        saver = tf.train.Saver(variable_to_save)
        saver.restore(sess, self.para.modelLoadPath)

        if self.para.dataset == 'SICK':
            self._predict_sick(sess)
        elif self.para.dataset == 'WikiQA':
            self._predict_wikiqa(sess)
        elif self.para.dataset == 'LBA':
            self._predict_lba(sess)
    finally:
        # The session was previously never closed, leaking its resources.
        sess.close()


def _predict_sick(self, sess):
    """Print loss, MSE and correlation metrics on the SICK test set."""
    s1, s2, score, slen1, slen2, idx = self.datas.getTestSet()
    sc = np.reshape(score, (-1, 1))
    feedDatas = [s1, s2, sc, slen1, slen2]
    fetches = [self.tensorDict[name] for name in (
        'loss', 'prob', 'y', 'prob_mse', 'mse', 'pearson_r',
        'sent1_annotation', 'sent2_annotation')]
    _loss, _prob, _y, _prob_mse, _mse, _pr, _a1, _a2 = sess.run(
        fetches, feed_dict=dict(zip(self.placehodlers, feedDatas)))

    print('=======================test phase========================')
    print('test set loss: \t' + str(_loss))
    print('test set prob_MSE: \t' + str(_prob_mse))
    print('test set score_MSE: \t' + str(_mse))
    print('test set pearson_r: \t' + str(_pr))
    print('test set spearman_rho: \t' + str(utils.spearman_rho(_y, sc)))
    utils.analysisBatchMatrixDependency(_a1, slen1)  # measure of redundancy of annotation matrix

    if self.para.modelType == 6:  # inspect the annotation matrix
        for i in range(10):
            # randrange excludes len(s1); randint(0, len(s1)) could return
            # len(s1) itself and raise IndexError below.
            iid = random.randrange(len(s1))
            sent1 = self.datas.displaySent(s1[iid], slen1[iid])
            sent2 = self.datas.displaySent(s2[iid], slen2[iid])
            annotation1 = np.squeeze(np.transpose(_a1[iid, :slen1[iid], :]))
            annotation2 = np.squeeze(np.transpose(_a2[iid, :slen2[iid], :]))
            utils.displayAttentionMat(sent1, annotation1, sent2, annotation2)


def _predict_wikiqa(self, sess):
    """Print loss, MRR and MAP on the WikiQA test set, then show some questions."""
    s1, s2, score, slen1, slen2, idx = self.datas.getTestSet()
    sc = np.reshape(score, (-1, 1))
    feedDatas = [s1, s2, sc, slen1, slen2]
    _loss, _prob_pos, _annotation = sess.run(
        [self.tensorDict['loss'],
         self.tensorDict['prob_of_positive'],
         self.tensorDict['sent2_annotation']],
        feed_dict=dict(zip(self.placehodlers, feedDatas)))
    MRR, MAP = self.datas.evaluateOn(_prob_pos, 'test')
    print('test set loss: \t%f' % _loss)
    print('test set MRR: \t%f' % MRR)
    print('test set MAP: \t%f' % MAP)
    utils.analysisBatchMatrixDependency(_annotation, slen2)

    # randomly inspect some questions
    # NOTE(review): ids are drawn from the hard-coded range 1..100 - confirm
    # that range is valid for displayQuestion.
    for i in range(10):
        iid = random.randint(1, 100)
        self.datas.displayQuestion(_prob_pos, iid, dataset='test')
        # raw_input is the Python 2 builtin, as in the original.
        raw_input('Press Enter to continue...')


def _predict_lba(self, sess):
    """Classify every LBA evaluation sample by voting and print the confusion matrix."""
    batch_size = 500
    #self.datas.inspectSentByLabel('test', 'Q') # debug use
    s1, s2, score, slen1, slen2, evalSet_label, ref_label, L = \
        self.datas.getEvalSet('both', label_set='all')
    sc = np.reshape(score, (-1, 1))
    labelMap = self.datas.digitLabel
    # M[i][j]: the number of samples predicted to be category j while true
    # label is i; original labels are sorted by their lexicographical order
    M = np.zeros((len(labelMap), len(labelMap)), dtype=int)

    num_samples = len(evalSet_label)
    # At least 1: the original divisor len / 100 is 0 for fewer than 100
    # samples under integer division, making the modulo below raise
    # ZeroDivisionError.
    progress_step = max(1, num_samples // 100)
    # Ceiling division avoids running an empty trailing batch when L is a
    # multiple of the batch size; max(1, ...) keeps one batch when L is 0,
    # as before. L is used as a slice bound, so it is treated as an integer.
    num_batches = max(1, (L + batch_size - 1) // batch_size)

    for k in range(num_samples):
        lo, hi = k * L, (k + 1) * L
        feedDatas = [s1[lo:hi], s2[lo:hi], sc[lo:hi], slen1[lo:hi], slen2[lo:hi]]
        prob_list = []
        # for each sample in the evaluation set, we have L sentence pairs.
        # we split these L records in small batches to process since L is too large.
        for batch_idx in range(num_batches):
            start = batch_idx * batch_size
            batch_datas = [fd[start:start + batch_size] for fd in feedDatas]
            _, _prob_pos = sess.run(
                [self.tensorDict['loss'], self.tensorDict['prob_of_positive']],
                feed_dict=dict(zip(self.placehodlers, batch_datas)))
            prob_list.append(_prob_pos)
        _prob_pos = np.concatenate(prob_list, axis=0)

        pred, rk = utils.vote(ref_label[lo:hi], _prob_pos, top_k=15)
        true_label = labelMap[evalSet_label[k]]
        pred_label = labelMap[pred]
        M[true_label][pred_label] += 1
        if k % progress_step == 0:
            print('progress: %f' % (1.0 * k / num_samples))
            print(M)

    print('confusion matrix:\t ')
    print(np.array2string(M))
    print('over all accuracy: %f' % (1.0 * np.sum(np.diag(M)) / np.sum(M)))
def select_new_data(self):
    """Label the next batch of extra data by voting and create a new data version.

    Steps:
      1. Fetch ``cfg['num_input_data']`` extra instances into a scratch file.
      2. Run every model in ``self.best_combo`` on that file and collect the
         predicted targets.
      3. Register a new data-pool entry ``d<n>``.
      4. Score each instance, either by the confidence read from the last
         model's output (``cfg['self_training']``) or by the agreement among
         the models' predictions.
      5. Copy ``d0`` to the new file and append, in descending score order,
         up to ``cfg['num_output_data']`` instances whose score exceeds
         ``cfg['min_agreement']``.

    Returns:
        The data-pool key of the new version, e.g. ``'d3'``.
    """
    self.logger.log('general', 'vote and select')

    # write next batch of extra data into a file
    input_path = f'{self.exp_dir}/tmp/extra_data.txt'
    self.data_iterator.get_next(input_path, self.cfg['num_input_data'])
    src = self.task.get_src_from_file(input_path)

    preds = []
    for model_name in self.best_combo:
        tool = model_name.split('.')[2]
        output_path = f'{self.exp_dir}/tmp/dev.{model_name}.txt'
        self.predict(tool, self.model_pool[model_name], input_path, output_path)
        preds.append(self.task.get_tgt_from_file(output_path))

    n = len(self.data_pool)
    new_key = f'd{n}'
    self.data_pool[new_key] = f'{self.exp_dir}/data/d{n}.txt'

    # get agreement rate for each sentence
    if self.cfg.get('self_training', False):
        # simulate single-model self training: use the newest model's confidence
        model_name = list(self.model_pool.keys())[-1]
        output_path = f'{self.exp_dir}/tmp/dev.{model_name}.txt'
        agreements = self.task.get_confidence_from_file(output_path)
    else:
        agreements = [utils.get_agreement(inst_preds) for inst_preds in zip(*preds)]

    # majority vote for each sentence (a bit redundant, but fine)
    voted = utils.vote(preds)

    for tag, items in (('src', src), ('voted', voted),
                       ('preds', preds), ('agreement', agreements)):
        print(tag, len(items))

    # select the instances with high agreement and add as new training data
    # on top of d0
    new_path = self.data_pool[new_key]
    copyfile(self.data_pool['d0'], new_path)

    num_selected = 0
    with open(new_path, 'a') as out:
        ranked = sorted(zip(agreements, src, voted), reverse=True)
        for agree, s, t in ranked[:self.cfg['num_output_data']]:
            if not agree > self.cfg['min_agreement']:
                continue
            num_selected += 1
            out.write(f'{s}\t{t}\n')
            # remember in extra data
            self.data_iterator.selected.add(s)

    self.logger.log(
        'general',
        f"selected {num_selected} / {self.cfg['num_input_data']} instances"
    )
    return new_key
import toml
import random
from utils import vote, sleep, listProducers

# Settings are read from config.toml in the current working directory.
config = toml.load('./config.toml')

if __name__ == '__main__':
    # Vote for the index range [0, num_voters), pause for a second, then
    # list the producers.
    first_voter = 0
    num_voters = config['general']['num_voters']
    vote(first_voter, first_voter + num_voters)
    sleep(1)
    listProducers()
for fold_idx in range(opt['fold']): print('%dth fold:' % fold_idx) # 准备路径 output = os.path.join(opt['out_dir'], str(fold_idx)) train_file = os.path.join(opt['label_dir'], 'train_%d.csv' % fold_idx) valid_file = os.path.join(opt['label_dir'], 'valid_%d.csv' % fold_idx) # 训练 acc, model = train(opt['train_dir'], train_file, valid_file, output, opt['train_bs'], opt['num_workers'], opt['num_epochs'], opt['max_N'], opt['lr_list'], opt['augment'], opt) acc_list.append(acc) # 保存测试结果 result, _ = final_test(model, opt['test_dir'], opt['test_file']) result_list.append(result) write_csv(os.path.join(output, 'result.csv'), result) print('') # 打印在验证集上的平均准确率 print('total accuracy:', sum(acc_list) / opt['fold']) #投票 final_result = vote(result_list) # 保存投票后的结果 write_csv(os.path.join(opt['out_dir'], 'final_result.csv'), final_result)