def start(self):
    """Main training loop."""
    for i in range(CFG.num_iterations):
        print("Iteration", i + 1)

        training_data = []  # list to store self play states, pis and vs

        for j in range(CFG.num_games):
            print("Start Training Self-Play Game", j + 1)
            game = self.game.clone()  # Create a fresh clone for each game.
            self.play_game(game, training_data)

        # Save the current neural network model.
        self.net.save_model()

        # Load the recently saved model into the evaluator network.
        self.eval_net.load_model()

        # Train the network using self play values.
        self.net.train(training_data)

        # Initialize MonteCarloTreeSearch objects for both networks.
        current_mcts = MonteCarloTreeSearch(self.net)
        eval_mcts = MonteCarloTreeSearch(self.eval_net)

        evaluator = Evaluate(current_mcts=current_mcts, eval_mcts=eval_mcts,
                             game=self.game)
        wins, losses = evaluator.evaluate()

        print("wins:", wins)
        print("losses:", losses)

        num_games = wins + losses
        win_rate = 0 if num_games == 0 else wins / num_games
        print("win rate:", win_rate)

        if win_rate > CFG.eval_win_rate:
            # Save current model as the best model.
            print("New model saved as best model.")
            self.net.save_model("best_model")
        else:
            print("New model discarded and previous model loaded.")
            # Discard current model and use previous best model.
            self.net.load_model()
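# A minimal, self-contained sketch (an assumption for illustration, not the
# original class) of the duel contract the loop above relies on: evaluate()
# returns (wins, losses) for the current network over a fixed number of games.
# The game itself is stubbed with a coin flip so the control flow runs in
# isolation; num_games stands in for something like CFG.num_eval_games.
import random

class EvaluateSketch:
    def __init__(self, current_mcts=None, eval_mcts=None, game=None, num_games=20):
        self.current_mcts = current_mcts  # search backed by the new network
        self.eval_mcts = eval_mcts        # search backed by the previous best
        self.game = game
        self.num_games = num_games

    def _play_single_game(self):
        # Stub: +1 current net wins, -1 previous best wins, 0 draw.
        return random.choice([1, -1, 0])

    def evaluate(self):
        wins = losses = 0
        for _ in range(self.num_games):
            result = self._play_single_game()
            if result > 0:
                wins += 1
            elif result < 0:
                losses += 1
        return wins, losses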
def evaluateVSM(targeEventFile, collFolder, k, relevTh, vsmClassifierFileName, topK):
    '''
    docs = []
    try:
        classifierFile = open(vsmClassifierFileName, "rb")
        classifier = pickle.load(classifierFile)
        classifierFile.close()
    except:
        f = open(targeEventFile, 'r')
        for url in f:
            url = url.strip()
            d = Document(url)
            if d:
                docs.append(d)
        f.close()
        docsTF = []
        for d in docs:
            wordsFreq = getFreq(d.getWords())
            docsTF.append(wordsFreq)
        classifier = VSMClassifier(docsTF, relevTh)
    evalres = []
    for j in range(k):
        fn = collFolder + str(j) + '.txt'
        f = codecs.open(fn, encoding='utf-8')
        ftext = f.read()
        r = classifier.calculate_score(ftext)[0]
        evalres.append(r)
        f.close()
    '''
    evaluator = Evaluate()
    evaluator.buildVSMClassifier(targeEventFile, vsmClassifierFileName, relevTh, topK)
    collFiles = []
    for j in range(k):
        fn = collFolder + str(j) + '.txt'
        f = codecs.open(fn, encoding='utf-8')
        ftext = f.read()
        o = myObj()
        o.text = ftext
        collFiles.append(o)
    res = evaluator.evaluateFC(collFiles)
    #f = open(collFolder+'evaluationRes_VSM.txt','w')
    #f.write('\n'.join([str(r) for r in res]))
    #f.close()
    #print sum(res)
    return res
def _mock_evaluate(self):
    """
    Create a mock `Evaluate` class, with all methods that access the
    influxdb database stubbed out.

    Returns:
        Evaluate: An instance of the Evaluate class.
    """
    evaluate = Evaluate(self.PIT, self.TP, self.VERSION, self._get_tmp_dir(),
                        False, False)
    # Stub out the database access with the mock implementation (the original
    # also assigned a bare MagicMock first, which this line overwrote).
    evaluate.query_influx = InfluxDBMock.query_influx
    return evaluate
def main(args):
    if args.save_path is not None and not os.path.exists(args.save_path):
        os.makedirs(args.save_path)

    summary_writer = tf.summary.FileWriter(os.path.join(args.save_path, 'log'))
    global_steps_counter = itertools.count()  # thread-safe
    global_net = Net(S_DIM, A_DIM, 'global', args)
    num_workers = args.threads
    workers = []

    # create workers
    for i in range(1, num_workers + 1):
        # Only the first worker writes summaries (the original compared
        # against 0, which this loop never produces).
        worker_summary_writer = summary_writer if i == 1 else None
        worker = Worker(i, make_env(args), global_steps_counter,
                        worker_summary_writer, args)
        workers.append(worker)

    saver = tf.train.Saver(max_to_keep=5)

    with tf.Session() as sess:
        coord = tf.train.Coordinator()
        if args.model_path is not None:
            print('Loading model...\n')
            ckpt = tf.train.get_checkpoint_state(args.model_path)
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print('Initializing a new model...\n')
            sess.run(tf.global_variables_initializer())
        print_params_nums()

        # Start a work process for each worker in a separate thread.
        worker_threads = []
        for worker in workers:
            # Bind the worker explicitly instead of closing over the loop
            # variable in a lambda.
            t = threading.Thread(target=worker.run, args=(sess, coord, saver))
            t.start()
            time.sleep(0.5)
            worker_threads.append(t)

        if args.eval_every > 0:
            evaluator = Evaluate(
                global_net, summary_writer, global_steps_counter, args)
            evaluate_thread = threading.Thread(
                target=evaluator.run, args=(sess, coord))
            evaluate_thread.start()

        coord.join(worker_threads)
def evaluateClassifier(classifierFile, cf, k):
    evaluator = Evaluate()
    evaluator.buildClassifier("posFile", "negFolder", classifierFile)
    collFiles = []
    for j in range(k):
        fn = cf + str(j) + '.txt'
        f = codecs.open(fn, encoding='utf-8')
        ftext = f.read()
        o = myObj()
        o.text = ftext
        collFiles.append(o)
    res = evaluator.evaluateFC(collFiles)
    f = open(cf + 'evaluationRes_Classf.txt', 'w')
    f.write('\n'.join([str(r) for r in res]))
    f.close()
    print sum(res)
def __call__(self):
    all_counts = defaultdict(dict)
    gold = sorted(Reader(open(self.gold)))
    for path in self.systems:
        system = sorted(Reader(open(path)))
        for match, per_doc, overall in Evaluate.count_all(system, gold):
            all_counts[match][path] = (per_doc, overall)

    results = [{'sys1': sys1,
                'sys2': sys2,
                'match': match,
                'stats': self.significance(match_counts[sys1],
                                           match_counts[sys2])}
               for sys1, sys2 in itertools.combinations(self.systems, 2)
               for match, match_counts in sorted(all_counts.iteritems(),
                                                 key=lambda (k, v): MATCHES.index(k))]
    return self.fmt(results, self.metrics)
def compute_scores(raw_data_dir=FLAGS.raw_data, data_dir=FLAGS.data_dir,
                   dataset=FLAGS.dataset, save_recommendation=FLAGS.saverec,
                   train_dir=FLAGS.train_dir, test=FLAGS.test):
    from evaluate import Evaluation as Evaluate
    evaluation = Evaluate(raw_data_dir, test=test)
    R = recommend(evaluation.get_uids(), data_dir=data_dir)
    evaluation.eval_on(R)
    scores_self, scores_ex = evaluation.get_scores()
    mylog("====evaluation scores (NDCG, RECALL, PRECISION, MAP) @ 2,5,10,20,30====")
    mylog("METRIC_FORMAT (self): {}".format(scores_self))
    mylog("METRIC_FORMAT (ex ): {}".format(scores_ex))
    if save_recommendation:
        name_inds = os.path.join(train_dir, "indices.npy")
        np.save(name_inds, R)  # the original saved an undefined `rec`
def walk_proximity(self, trained=True, num_walks=100, walk_length=40, workers=5):
    if trained:
        return np.loadtxt(self.walk_structure_embedding)
    walk_structure = utils.walk_proximity(self.graph.adj, num_walks,
                                          walk_length, workers=workers)
    print('Random walks finished...')
    loss = Evaluate(10).loss()
    auto_encoder = SparseAE(self.args, walk_structure, loss,
                            self.walk_structure_embedding)
    embedding = auto_encoder.train(parallel=False)
    return embedding
def main():
    url = "https://race.netkeiba.com/?pid=race_old&id=n201908050411"
    html = requests.get(url)
    soup = BeautifulSoup(html.content, 'lxml')
    race_name, distance = Get_Race_Info(soup)
    print(race_name)
    link_list, horse_list = Get_Link_List(soup)
    #print(link_list)
    for link_url, horse_name in zip(link_list, horse_list):
        df = Scraping(link_url)
        print(horse_name)
        #print(df)
        ave_list = Evaluate(df, distance)
        print(ave_list)
def main():
    prog = "python -m allennlp.run"
    subcommand_overrides = {}  # pylint: disable=dangerous-default-value
    parser = argparse.ArgumentParser(description="Run AllenNLP",
                                     usage='%(prog)s', prog=prog)
    print(parser)
    subparsers = parser.add_subparsers(title='Commands', metavar='')

    subcommands = {
        # Default commands
        "train": Train(),
        "evaluate": Evaluate(),
        "evaluate_mlqa": Evaluate_MLQA(),
        "make-vocab": MakeVocab(),
        "fine-tune": FineTune(),
        # Superseded by overrides
        **subcommand_overrides
    }

    for name, subcommand in subcommands.items():
        subparser = subcommand.add_subparser(name, subparsers)
        subparser.add_argument('--include-package',
                               type=str,
                               action='append',
                               default=[],
                               help='additional packages to include')

    args = parser.parse_args()

    # If a subparser is triggered, it adds its work as `args.func`.
    # So if no such attribute has been added, no subparser was triggered,
    # so give the user some help.
    if 'func' in dir(args):
        # Import any additional modules needed (to register custom classes).
        for package_name in args.include_package:
            import_submodules(package_name)
        args.func(args)
    else:
        parser.print_help()
def main(out_dir, input_file, input_plus, input_minus, fa_file, keep_temp,
         window, name, model, rst, threshold, penality, DB_file):
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    out_dir = out_dir + '/' + name

    #### Generate sliding windows
    Generate_windows(out_dir, input_file, input_plus, input_minus, fa_file,
                     keep_temp, window, name)

    data_dir = out_dir + '/data'
    data_files = glob.glob(data_dir + "/*")
    for data in data_files:
        if 'wig' in data:
            continue
        baseName = data.split('/')[-1]
        Evaluate(model, out_dir, rst, window, baseName, keep_temp)
        Scan_Forward(baseName, threshold, penality, out_dir)
        Scan_Backward(baseName, threshold, penality, out_dir)
        if keep_temp != 'yes':
            predict_file = out_dir + '/predict/' + baseName + '.txt'
            os.system('rm %s' % predict_file)
        Postprocess(DB_file, baseName, threshold, penality, out_dir)
        if keep_temp != 'yes':
            forward_file = out_dir + "/maxSum/%s.forward.%d.%d.txt" % (
                baseName, threshold, penality)
            backward_file = out_dir + "/maxSum/%s.backward.%d.%d.txt" % (
                baseName, threshold, penality)
            os.system('rm %s %s' % (forward_file, backward_file))

    out_file = '%s/%s.predicted.txt' % (out_dir, name)
    ww = open(out_file, 'w')
    ww.write('predicted_pasid\tdb_diff\tdb_pasid\tscore\n')
    ww.close()
    os.system('cat %s/maxSum/*bidirection* >>%s' % (out_dir, out_file))
    if keep_temp != 'yes':
        os.system('rm -rf %s/data %s/predict %s/maxSum' %
                  (out_dir, out_dir, out_dir))
    print("Job Done!")
def _train(self, fold_n, x_trn, y_trn, x_val, y_val, num_class=2):
    # Initialize the model.
    model, emb = self.create_model(num_class)
    opt = Adam(0.01)
    model.compile(optimizer=opt, loss=categorical_crossentropy)
    patient, best_score = 0, 0
    best_embedding = None
    for epoch in range(2000):
        generator = utils.batch_iter(x_trn, self.batch_size)
        for index in generator:
            if self.types == 'classes':
                model.train_on_batch([x_trn[index]],
                                     np.eye(num_class)[y_trn[index]])
            if self.types == 'link':
                vi, vj = x_trn[index][:, 0], x_trn[index][:, 1]
                model.train_on_batch(
                    [vi, vj],
                    np.eye(num_class)[y_trn[index].reshape(-1).astype(int)])
        if self.types == 'classes':
            y_val_pred = np.argmax(model.predict([x_val]), -1)
            micro, macro = Evaluate.f1(y_val, y_val_pred)
            print('fold_{}:,{},{}'.format(fold_n, micro, macro))
            score = micro + macro
        if self.types == 'link':
            y_val_pred = np.argmax(
                model.predict([x_val[:, 0], x_val[:, 1]]), -1)
            score = roc_auc_score(y_val, y_val_pred)
            print('fold_{}:,{},{}'.format(fold_n, score, best_score))
        if score > best_score:
            patient = 0
            best_score = score
            best_embedding = emb.get_weights()[0]
        patient += 1
        if patient >= 50:
            break
    return best_embedding
def __call__(self, number_of_iterations=2, learning_rate=0.005,
             embedding_size=300, hidden_size=100, batch_size=100):
    print("Starting 'Image Retrieval' in 'GRU' mode with '" +
          self.difficulty + "' data")

    self.model_full_path = self.model_path + "/" + self.model_name + "_" + \
        self.timestamp + "_" + str(learning_rate) + "_" + str(embedding_size) + ".pty"
    self.output_file_name = self.output_path + "/" + self.model_name + "_" + \
        self.timestamp + "_" + str(learning_rate) + "_" + str(embedding_size) + ".csv"

    self.number_of_iterations = number_of_iterations
    self.learning_rate = learning_rate
    self.embedding_size = embedding_size
    self.hidden_size = hidden_size
    self.batch_size = batch_size

    self.model = GRU(self.nwords, self.embedding_size, self.image_feature_size,
                     self.output_vector_size, self.hidden_size, self.batch_size)
    self.criterion = nn.CrossEntropyLoss()
    self.evaluate = Evaluate(self.model, self.img_features, self.minibatch,
                             self.preprocess, self.image_feature_size,
                             self.output_vector_size)
    print(self.model)
    self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

    self.train_loss_values = []

    self.magic()

    self.save_model()
    self.save_data()
def test(RL):
    env = envR(show=False)
    path, cost, density, num_find_target, opt_cost = [], [], [], 0, []
    evaluate = Evaluate(rows=10, cols=10)
    train = False
    succ = 0
    print("****************************************************")
    for episode in range(100):
        pre_maps = env.reset()
        step = 0
        evaluate.set_start(start_pos=env.agent)
        evaluate.set_goals(real_pos=env.maze.food_pos[0],
                           fake_pos=env.maze.food_pos[1])
        # print("****************************************************")
        # print("EPISODE ", episode)
        # start_test = time.time()
        for step in range(100):
            action = RL.choose_action(str(pre_maps), train)
            reward, done, action_ = env.step(action)
            path.append(action_)
            step += 1
            if done:
                succ += 1
                cost, density, num_find_target, opt_cost = evaluation(
                    evaluate, cost, density, num_find_target, opt_cost, path)
                path = []
                break
            pre_maps = env.get_maps()
        print('This is ', episode, 'cost:', step, 'succ', succ)
    print('average cost:', np.mean(cost),
          ' average density:', np.mean(density),
          ' deceptive extent:', num_find_target / succ)
    print('optimal cost:', np.mean(opt_cost))
    print()
class Train():
    def __init__(self, difficulty):
        self.data_path = "../data"
        self.model_path = "../models"
        self.output_path = "../outputs"
        self.difficulty = difficulty
        self.timestamp = str(int(time.time()))
        self.model_name = "regression_" + self.difficulty
        self.data = Data(difficulty=self.difficulty, data_path=self.data_path)
        (self.img_features, self.w2i, self.i2w, self.nwords,
         self.UNK, self.PAD) = self.data()
        self.train = list(self.data.get_train_data())
        self.dev = list(self.data.get_validation_data())
        self.test = list(self.data.get_test_data())
        self.image_feature_size = 2048
        self.output_vector_size = 10

    def __call__(self, number_of_iterations=2, learning_rate=0.005,
                 embedding_size=300):
        print("Starting 'Image Retrieval' in 'Regression' mode with '" +
              self.difficulty + "' data")

        self.model_full_path = self.model_path + "/" + self.model_name + "_" + \
            self.timestamp + "_" + str(learning_rate) + "_" + str(embedding_size) + ".pty"
        self.output_file_name = self.output_path + "/" + self.model_name + "_" + \
            self.timestamp + "_" + str(learning_rate) + "_" + str(embedding_size) + ".csv"

        self.number_of_iterations = number_of_iterations
        self.learning_rate = learning_rate
        self.embedding_size = embedding_size

        self.model = Regression(self.nwords, self.embedding_size,
                                self.image_feature_size, self.output_vector_size)
        self.criterion = nn.MSELoss()
        self.evaluate = Evaluate(self.model, self.img_features, self.minibatch,
                                 self.preprocess, self.image_feature_size)
        print(self.model)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

        self.train_loss_values = []
        self.dev_loss_values = []
        self.test_loss_values = []

        self.magic()

        self.save_model()
        self.save_data()

    def minibatch(self, data, batch_size=50):
        for i in range(0, len(data), batch_size):
            yield data[i:i + batch_size]

    def preprocess(self, batch):
        """Helper function for functional batches"""
        correct_indexes = [observation[2] for observation in batch]
        img_ids = [observation[1] for observation in batch]
        text_features = [observation[0] for observation in batch]

        # Add padding up to the max sentence length in the batch
        max_length = max(map(len, text_features))
        text_features = [txt + [self.PAD] * (max_length - len(txt))
                         for txt in text_features]

        # return in "stacked" format
        return text_features, img_ids, correct_indexes

    def magic(self):
        for ITER in range(self.number_of_iterations):
            random.shuffle(self.train)
            train_loss = 0.0
            start = time.time()

            for iteration, batch in enumerate(self.minibatch(self.train)):
                # Outputs matrices of batch size
                text_features, h5_ids, correct_index = self.preprocess(batch)
                lookup_text_tensor = Variable(torch.LongTensor([text_features])).squeeze()

                target = np.empty([len(batch), self.image_feature_size])
                for obs, img_ids in enumerate(h5_ids):
                    target[obs] = self.img_features[img_ids[correct_index[obs]]]
                target = Variable(torch.from_numpy(target).type(torch.FloatTensor))

                # Run model and calculate loss
                prediction = self.model(lookup_text_tensor)
                loss = self.criterion(prediction, target)
                train_loss += loss.data[0]

                self.optimizer.zero_grad()
                self.model.zero_grad()
                loss.backward()
                self.optimizer.step()

                # if iteration % verbosity_interval == 0:
                #     print("ITERATION %r: %r: train loss/sent=%.4f, time=%.2fs" %
                #           (ITER + 1, iteration, train_loss / (iteration + 1), time.time() - start))

            print("ITERATION %r: train loss/sent=%.4f, time=%.2fs" %
                  (ITER + 1, train_loss / len(self.train), time.time() - start))
            # print("Score on training", evaluate(train))
            # print("Score on development", evaluate(dev))
            self.train_loss_values.append(train_loss / len(self.train))
            self.dev_loss_values.append(self.evaluate.calculate_loss(self.dev))
            self.test_loss_values.append(self.evaluate.calculate_loss(self.test))

    def save_model(self):
        # Save model
        torch.save(self.model, self.model_full_path)
        print("Saved model has test score", self.evaluate(self.test))

    def plot(self):
        plt.plot(self.train_loss_values, label="Train loss")
        plt.plot(self.dev_loss_values, label="Validation loss")
        plt.plot(self.test_loss_values, label="Test loss")
        plt.legend(loc='best')
        plt.xlabel("Epochs")
        plt.ylabel("Loss")
        plt.title(self.model_name +
                  " - has loss with lr = %.4f, embedding size = %r" %
                  (self.learning_rate, self.embedding_size))
        plt.show()

    def save_data(self):
        file = open(self.output_file_name, "w")
        file.write(", ".join(map(str, self.train_loss_values)))
        file.write("\n")
        file.write(", ".join(map(str, self.dev_loss_values)))
        file.write("\n")
        file.write(", ".join(map(str, self.test_loss_values)))
        file.write("\n")
        file.write(str(self.evaluate(self.dev)))
        file.write("\n")
        file.close()
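# A runnable illustration of the padding step in preprocess() above. The batch
# layout (text ids, image ids, correct index) follows the tuples described in
# the class; PAD = 0 is an assumption standing in for self.PAD.
batch = [([5, 3, 9], ['img_a', 'img_b'], 0),
         ([7, 2], ['img_c', 'img_d'], 1)]
text_features = [observation[0] for observation in batch]
PAD = 0  # assumption: self.PAD resolves to an integer id like this
max_length = max(map(len, text_features))
padded = [txt + [PAD] * (max_length - len(txt)) for txt in text_features]
print(padded)  # [[5, 3, 9], [7, 2, 0]] -- every row now has equal length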
class DataHandler:
    # Class variable: shared by every instance of the class (the calculator).
    # Object composition (the instructor's encapsulation boundary).
    evaluator = Evaluate()

    # class method: usable like a global function
    @classmethod  # defining it as an instance method would also work
    def GetRawdataInDic(cls, filename):
        rawdata = {}
        with open(filename, 'rb') as f:
            while 1:
                try:
                    data = pickle.load(f)
                except EOFError:
                    break
                rawdata.update(data)
        return rawdata

    def __init__(self, filename, clsname):
        self.rawdata = DataHandler.GetRawdataInDic(filename)
        self.clsname = clsname
        # Store for computed values: compute on demand, but if a value has
        # already been computed, return it without recomputing.
        self.cache = {}

    def get_scores(self):  # uses the cache
        if 'scores' not in self.cache:
            self.cache['scores'] = list(self.rawdata.values())
        return self.cache.get('scores')

    def get_average(self):  # uses the cache
        if 'average' not in self.cache:
            self.cache['average'] = self.evaluator.average(self.get_scores())
        return self.cache.get('average')

    def get_variance(self):  # uses the cache
        if 'variance' not in self.cache:
            vari = round(
                self.evaluator.variance(self.get_scores(), self.get_average()), 1)
            self.cache['variance'] = vari
        return self.cache.get('variance')

    def get_standard_deviation(self):
        if 'standard_deviation' not in self.cache:
            std_dev = round(math.sqrt(self.get_variance()), 1)
            self.cache['standard_deviation'] = std_dev
        return self.cache.get('standard_deviation')

    def GetEvaluation(self):
        print('*' * 50)
        print("Score analysis for class %s" % self.clsname)
        print("Class {0} has an average of {1}, a variance of {2}, "
              "and therefore a standard deviation of {3}".format(
                  self.clsname, self.get_average(), self.get_variance(),
                  self.get_standard_deviation()))
        print('*' * 50)
        print("Overall assessment for class %s" % self.clsname)
        print('*' * 50)
        self.evaluateClass()

    def evaluateClass(self):
        avrg = self.get_average()
        std_dev = self.get_standard_deviation()
        if avrg < 50 and std_dev > 20:
            print("Scores are very low and the gap between students is very large.")
        elif avrg > 50 and std_dev > 20:
            print("Scores are above average, but the gap between students is large. Attention needed!")
        elif avrg < 50 and std_dev < 20:
            print("The gap between students is small, but scores are very low. Attention needed!")
        elif avrg > 50 and std_dev < 20:
            print("Scores are above average and the gap between students is small.")

    # def who_is_highest(self):
    #     h_score = max(list(self.rawdata.values()))
    #     for k, v in self.rawdata.items():
    #         if v == h_score:
    #             return k
    # def get_highest_score(self):
    #     return max(list(self.rawdata.values()))

    # instructor's code
    def who_ist_highest(self):
        if 'highest' not in self.cache:
            self.cache['highest'] = reduce(
                lambda a, b: a if self.rawdata.get(a) > self.rawdata.get(b) else b,
                self.rawdata.keys())
        return self.cache.get('highest')

    def get_highest_score(self):
        return self.rawdata[self.who_ist_highest()]

    def who_is_lowest(self):
        if 'lowest' not in self.cache:
            self.cache['lowest'] = reduce(
                lambda a, b: a if self.rawdata.get(a) < self.rawdata.get(b) else b,
                self.rawdata.keys())
        return self.cache.get('lowest')

    def get_lowest_score(self):
        return self.rawdata[self.who_is_lowest()]
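# A minimal usage sketch for the DataHandler above (not from the original
# source): it assumes 'class_2_3.p' is a pickle file of {name: score} dicts
# and that Evaluate provides average() and variance() as used in the class.
if __name__ == '__main__':
    handler = DataHandler('class_2_3.p', '2-3')  # hypothetical file and class name
    handler.GetEvaluation()  # prints the per-class report
    print(handler.who_ist_highest(), handler.get_highest_score())
    print(handler.who_is_lowest(), handler.get_lowest_score())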
if fit:
    print("Fit tokenizer...")
    tokenizer = text.Tokenizer(nb_words=vocab_size)
    tokenizer.fit_on_texts(text_generator())
    print("Save tokenizer...")
    f = open(tokenizer_fname, "wb")
    cPickle.dump(tokenizer, f, protocol=cPickle.HIGHEST_PROTOCOL)
    f.close()
else:
    print('Load tokenizer...')
    f = open(tokenizer_fname, "rb")
    tokenizer = cPickle.load(f)
    f.close()

evaluator = Evaluate(tokenizer, words, context, average_scores)
sampling_table = sequence.make_sampling_table(vocab_size)

for e in range(nb_epoch):
    print('-' * 40)
    print('Epoch', e)
    print('-' * 40)

    progbar = Progbar(tokenizer.document_count)
    samples_seen = 0
    losses = []
    batch_loss = []

    for i, seq in enumerate(tokenizer.texts_to_sequences_generator(text_generator())):
        # get skipgram couples for one text in the dataset
        couples, labels = skipgrams_l2c_fast(seq, vocab_size,
                                             num_senses=num_senses,
                                             window_size=4,
                                             negative_samples=1.,
                                             sampling_table=sampling_table)
        if couples:
#posFiles = ['pos-FSU.txt','pos-Hagupit.txt','pos-LAFire.txt','pos-AirAsia.txt']
posFiles = ['pos-FSU.txt','pos-Hagupit.txt','pos-AirAsia.txt','pos-sydneyseige.txt','pos-Charlie.txt']
#negFolder = 'neg'
negFiles = ['neg-FSU.txt','neg-Hagupit.txt','neg-AirAsia.txt','neg-sydneyseige.txt','neg-Charlie.txt']
'''
seedsFiles=['seedsURLs_z_501.txt','seedsURLs_z_540.txt']
#posFiles = ['pos-FSU.txt','pos-Hagupit.txt','pos-AirAsia.txt']
#negFiles = ['neg-FSU.txt','neg-Hagupit.txt','neg-AirAsia.txt']
posFiles = ['pos-Charlie.txt','pos-sydneyseige.txt']
negFiles = ['neg-Charlie.txt','neg-sydneyseige.txt']
'''
evaluator = Evaluate()
#for i in range(3):
noK = 10
th = 0.75
i = 3
posFile = posFiles[i]
negFile = negFiles[i]
#modelFile = modelFile +"-"+str(i)+".txt"
#classifierFileName = 'classifier'+posFile.split(".")[0].split('-')[1]+".p"
vsmClassifierFileName = 'classifierVSM-' + posFile.split(".")[0].split('-')[1] + ".p"
#evaluator.buildClassifier(posFile,negFolder,classifierFileName)
#evaluator.buildClassifier(posFile,negFile,classifierFileName)
evaluator.buildVSMClassifier(posFile, vsmClassifierFileName, th, noK)
v = 0
def doEvaluate():
    eva = Evaluate()
    eva.eval()
def main():
    args = parse_args()
    model_dir = args.model_dir

    """LOAD CONFIG FILE"""
    config_files = glob.glob(os.path.join(model_dir, '*.ini'))
    assert len(config_files) == 1, 'Put only one config file in the directory'
    config_file = config_files[0]
    config = configparser.ConfigParser()
    config.read(config_file)

    """LOGGER"""
    logger = getLogger(__name__)
    logger.setLevel(logging.INFO)

    formatter = logging.Formatter('[%(asctime)s] %(message)s')

    sh = logging.StreamHandler()
    sh.setLevel(logging.INFO)
    sh.setFormatter(formatter)
    logger.addHandler(sh)

    log_file = model_dir + 'log.txt'
    fh = logging.FileHandler(log_file)
    fh.setLevel(logging.INFO)
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    logger.info('[Training start] logging to {}'.format(log_file))

    """PARAMETER"""
    embed_size = int(config['Parameter']['embed_size'])
    hidden_size = int(config['Parameter']['hidden_size'])
    dropout_ratio = float(config['Parameter']['dropout'])
    weight_decay = float(config['Parameter']['weight_decay'])
    gradclip = float(config['Parameter']['gradclip'])
    vocab_type = config['Parameter']['vocab_type']
    vocab_size = int(config['Parameter']['vocab_size'])

    """TRAINING DETAIL"""
    gpu_id = args.gpu
    n_epoch = args.epoch
    batch_size = args.batch
    interval = args.interval

    """DATASET"""
    train_src_file = config['Dataset']['train_src_file']
    train_trg_file = config['Dataset']['train_trg_file']
    valid_src_file = config['Dataset']['valid_src_file']
    valid_trg_file = config['Dataset']['valid_trg_file']
    test_src_file = config['Dataset']['test_src_file']
    correct_txt_file = config['Dataset']['correct_txt_file']

    train_data_size = dataset.data_size(train_trg_file)
    valid_data_size = dataset.data_size(valid_trg_file)
    logger.info('train size: {0}, valid size: {1}'.format(train_data_size, valid_data_size))

    if vocab_type == 'normal':
        init_vocab = {'<unk>': 0, '<s>': 1, '</s>': 2, '<eod>': 3}
        vocab = dataset.VocabNormal()
        vocab.make_vocab(train_src_file, train_trg_file, init_vocab, vocab_size, freq=0)
        dataset.save_pickle(model_dir + 'src_vocab.pkl', vocab.src_vocab)
        dataset.save_pickle(model_dir + 'trg_vocab.pkl', vocab.trg_vocab)
        sos = vocab.src_vocab['<s>']
        eos = vocab.src_vocab['</s>']
        eod = vocab.src_vocab['<eod>']
    elif vocab_type == 'subword':
        vocab = dataset.VocabSubword()
        if os.path.isfile(model_dir + 'src_vocab.sub.model') and \
                os.path.isfile(model_dir + 'trg_vocab.sub.model'):
            vocab.load_vocab(model_dir + 'src_vocab.sub.model',
                             model_dir + 'trg_vocab.sub.model')
        else:
            vocab.make_vocab(train_trg_file + '.sub', train_trg_file + '.sub',
                             model_dir, vocab_size)
        sos = vocab.src_vocab.PieceToId('<s>')
        eos = vocab.src_vocab.PieceToId('</s>')
        eod = vocab.src_vocab.PieceToId('<eod>')

    src_vocab_size = len(vocab.src_vocab)
    trg_vocab_size = len(vocab.trg_vocab)
    logger.info('src_vocab size: {}, trg_vocab size: {}'.format(src_vocab_size, trg_vocab_size))

    train_iter = iterator.Iterator(train_src_file, train_trg_file, batch_size, sort=True, shuffle=True)
    # train_iter = iterator.Iterator(train_src_file, train_trg_file, batch_size, sort=False, shuffle=False)
    valid_iter = iterator.Iterator(valid_src_file, valid_trg_file, batch_size, sort=False, shuffle=False)

    evaluater = Evaluate(correct_txt_file)
    test_iter = iterator.Iterator(test_src_file, test_src_file, batch_size, sort=False, shuffle=False)

    """MODEL"""
    model = HiSeq2SeqModel(
        WordEnc(src_vocab_size, embed_size, hidden_size, dropout_ratio),
        WordDec(trg_vocab_size, embed_size, hidden_size, dropout_ratio),
        SentEnc(hidden_size, dropout_ratio),
        SentDec(hidden_size, dropout_ratio),
        sos, eos, eod)

    """OPTIMIZER"""
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(gradclip))
    optimizer.add_hook(chainer.optimizer.WeightDecay(weight_decay))

    """GPU"""
    if gpu_id >= 0:
        logger.info('Use GPU')
        chainer.cuda.get_device_from_id(gpu_id).use()
        model.to_gpu()

    """TRAIN"""
    sum_loss = 0
    loss_dic = {}
    for epoch in range(1, n_epoch + 1):
        for i, batch in enumerate(train_iter.generate(), start=1):
            # print(batch); exit()  # debug leftover: would abort the run
            batch = vocab.convert2label(batch)
            data = converter.convert(batch, gpu_id)
            loss = optimizer.target(*data)
            sum_loss += loss.data
            optimizer.target.cleargrads()
            loss.backward()
            optimizer.update()

            if i % interval == 0:
                logger.info('E{} ## iteration:{}, loss:{}'.format(epoch, i, sum_loss))
                sum_loss = 0
        chainer.serializers.save_npz(model_dir + 'model_epoch_{}.npz'.format(epoch), model)
        # chainer.serializers.save_npz(model_dir + 'optimizer_epoch{0}.npz'.format(epoch), optimizer)

        """EVALUATE"""
        valid_loss = 0
        for batch in valid_iter.generate():
            batch = vocab.convert2label(batch)
            data = converter.convert(batch, gpu_id)
            with chainer.no_backprop_mode(), chainer.using_config('train', False):
                valid_loss += optimizer.target(*data).data
        logger.info('E{} ## val loss:{}'.format(epoch, valid_loss))
        loss_dic[epoch] = valid_loss

        """TEST"""
        output = []
        for batch in test_iter.generate():
            # batch: a tuple of (list of articles, list of abstracts_sos, list of abstracts_eos)
            batch = vocab.convert2label(batch)
            data = converter.convert(batch, gpu_id)
            """
            out: [(sent, attn), (sent, attn), ...]  <- batch size
            sent: list of decoded sentences
            attn: list of the attention weights from decoding each sentence
            """
            with chainer.no_backprop_mode(), chainer.using_config('train', False):
                out = model.generate(data[0], data[3])
            output.extend(out)

        res_decode = []
        res_attn = []
        for o in output:
            sent, attn = o
            sentence = dataset.to_list(sent)
            sentence = dataset.eod_truncate(sentence, eod)
            sent_num = len(sentence)
            sentence = [dataset.eos_truncate(s, eos) for s in sentence]
            sentence = [vocab.label2word(s) for s in sentence]
            sentence = dataset.join_sentences(sentence)
            res_decode.append(sentence)
            attn = np.sum(np.array(attn[:sent_num]), axis=0) / sent_num
            res_attn.append(attn)

        rank_list = evaluater.rank(res_attn)
        single = evaluater.single(rank_list)
        multiple = evaluater.multiple(rank_list)
        logger.info('E{} ## precision'.format(epoch))
        logger.info('single: {} | {}'.format(single[0], single[1]))
        logger.info('multi : {} | {}'.format(multiple[0], multiple[1]))

        with open(model_dir + 'model_epoch_{}.hypo'.format(epoch), 'w') as f:
            [f.write(r + '\n') for r in res_decode]
        with open(model_dir + 'model_epoch_{}.attn'.format(epoch), 'w') as f:
            [f.write('{}\n'.format(r)) for r in res_attn]
        with open(model_dir + 'model_epoch_{}.prec'.format(epoch), 'w') as f:
            f.write('single\n')
            f.write(single[0] + '\n')
            f.write(single[1] + '\n')
            f.write('multiple\n')
            f.write(multiple[0] + '\n')
            f.write(multiple[1] + '\n')

    """MODEL SAVE"""
    best_epoch = min(loss_dic, key=(lambda x: loss_dic[x]))
    logger.info('best_epoch:{0}'.format(best_epoch))
    chainer.serializers.save_npz(model_dir + 'best_model.npz', model)
def test(test_filename, time_now, title, **kwargs):
    # Hyperparameter settings
    network_name = get_value_or_default(kwargs, 'network', default='LSTM')
    affect = get_value_or_default(kwargs, 'affect', default=30)
    filename = test_filename
    column = get_value_or_default(kwargs, 'column', default='ClPr')
    index_col = 'TrdDt'
    batch_size = 1
    plot_name = get_value_or_default(kwargs, 'plot_name', default=['fig1', ])

    # Load the data
    data = Action.generate_df(filename, column, index_col, affect)
    data_loader = DataLoader(data['dataset'], batch_size=batch_size, shuffle=False)

    net = torch.load('save/{}.pt'.format(network_name))
    predict = list()
    for tx, ty in data_loader:
        output = net(tx.reshape(1, batch_size, affect))
        output = output.reshape(1).detach()
        predict.append(float(output) * data['std'] + data['mean'])

    plt1 = Plot(1, time_now, network_name)
    plt1.plot(data['index'], data['real_data'][affect:], 'real data')
    plt1.plot(data['index'], predict, 'predict data')
    plt1.title(title, zh=True)
    plt1.xylabel('Datetime', 'price')
    plt1.save(plot_name[0])
    # Plot.show()
    Plot.cla()

    evaluator = Evaluate(title, data['real_data'][affect:], predict)
    logger = Logger('test.log')
    basic_info = 'tested {}.'.format(network_name)
    logger.set_log(basic_info,
                   t=time_now,
                   filename=filename,
                   column=column,
                   affect_days=affect,
                   network=net,
                   plot_name=plot_name,
                   MSELoss=evaluator.MSELoss(),
                   DA=evaluator.DA(),
                   Theil=evaluator.Theil_U(),
                   L1Loss=evaluator.L1Loss(),
                   Customize=evaluator.customize(),
                   title=title,
                   MAPE=evaluator.MAPE(),
                   R=evaluator.R())

    f_out = open('log/{}.txt'.format(title), 'w')
    print('{} = {}'.format('time', time_now),
          '{} = {}'.format('MSELoss', evaluator.MSELoss()),
          '{} = {}'.format('DA', evaluator.DA()),
          '{} = {}'.format('Theil_U', evaluator.Theil_U()),
          '{} = {}'.format('L1Loss', evaluator.L1Loss()),
          '{} = {}'.format('MAPE', evaluator.MAPE()),
          '{} = {}'.format('R', evaluator.R()),
          file=f_out, sep='\n')
    f_out.close()
    return evaluator
def evaluate():
    eval = Evaluate(self.arch, self.params, self.train_dir)
#!/usr/bin/env python

from evaluate import Evaluate

if __name__ == '__main__':
    ev = Evaluate(timesteps=2000)

    exp = './'
    filename = exp + '/output_truth.txt'
    truth = ev.get_data(filename)

    filename = exp + '/output_bckgd.txt'
    analy = ev.get_data(filename)

    # print('file :', filename)
    # print('truth : ', truth.shape, ' analysis ', analy.shape)

    ev.plot_state(truth, analy)
def main(train_file_to_use, test_file_to_use, test_type, features_combination_list, lamda, comp):
    # for perm in itertools.combinations(features_combination_list_sub, 4):
    #     features_combination_list.append(list(perm))

    # start all combinations of features
    for features_combination in features_combination_list:
        print('{}: Start creating MEMM for features : {}'
              .format(time.asctime(time.localtime(time.time())), features_combination))
        logging.info('{}: Start creating MEMM for features : {}'
                     .format(time.asctime(time.localtime(time.time())), features_combination))
        train_start_time = time.time()
        memm_class = MEMM(directory, train_file_to_use, features_combination)
        logging.info('{}: Finish MEMM for features : {}'
                     .format(time.asctime(time.localtime(time.time())), features_combination))
        print('{}: Finish MEMM for features : {}'
              .format(time.asctime(time.localtime(time.time())), features_combination))

        print('{}: Start gradient for features : {} and lambda: {}'
              .format(time.asctime(time.localtime(time.time())), features_combination, lamda))
        logging.info('{}: Start gradient for features : {} and lambda: {}'
                     .format(time.asctime(time.localtime(time.time())), features_combination, lamda))
        gradient_class = Gradient(model=memm_class, lambda_value=lamda)
        gradient_result = gradient_class.gradient_descent()
        train_run_time = (time.time() - train_start_time) / 60.0
        print('{}: Finish gradient for features : {} and lambda: {}. run time: {}'
              .format(time.asctime(time.localtime(time.time())), features_combination, lamda, train_run_time))
        logging.info('{}: Finish gradient for features : {} and lambda: {}. run time: {}'
                     .format(time.asctime(time.localtime(time.time())), features_combination, lamda, train_run_time))

        weights = gradient_result.x
        # np.savetxt(gradient_file, weights, delimiter=",")

        viterbi_start_time = time.time()
        print('{}: Start viterbi'.format(time.asctime(time.localtime(time.time()))))
        viterbi_class = viterbi(memm_class, data_file=test_file_to_use, w=weights)
        viterbi_result = viterbi_class.viterbi_all_data
        viterbi_run_time = (time.time() - viterbi_start_time) / 60.0
        print('{}: Finish viterbi. run time: {}'
              .format(time.asctime(time.localtime(time.time())), viterbi_run_time))
        logging.info('{}: Finish viterbi. run time: {}'
                     .format(time.asctime(time.localtime(time.time())), viterbi_run_time))

        write_file_name = datetime.now().strftime(
            directory + 'file_results/result_MEMM_most_common_tags_' + test_type + '%d_%m_%Y_%H_%M.wtag')
        confusion_file_name = datetime.now().strftime(
            directory + 'confusion_files/CM_MEMM_most_common_tags_' + test_type + '%d_%m_%Y_%H_%M.xls')
        evaluate_class = Evaluate(memm_class, test_file_to_use, viterbi_result,
                                  write_file_name, confusion_file_name, comp=comp)
        if not comp:
            word_results_dictionary = evaluate_class.run()
        if comp:
            evaluate_class.write_result_doc()
        logging.info('{}: The model hyper parameters: \n lambda:{} \n test file: {} \n train file: {}'
                     .format(time.asctime(time.localtime(time.time())), lamda, test_file_to_use, train_file_to_use))
        logging.info('{}: Related results files are: \n {} \n {}'
                     .format(time.asctime(time.localtime(time.time())), write_file_name, confusion_file_name))
        # print(word_results_dictionary)
        summary_file_name = '{0}analysis/summary_{1}_{2.day}_{2.month}_{2.year}_{2.hour}_{2.minute}.csv' \
            .format(directory, test_type, datetime.now())
        evaluate_class.create_summary_file(lamda, features_combination, test_file_to_use,
                                           train_file_to_use, summary_file_name,
                                           gradient_class.file_name, comp)
        logging.info('{}: Following Evaluation results for features {}'
                     .format(time.asctime(time.localtime(time.time())), features_combination))
        if not comp:
            logging.info('{}: Evaluation results are: \n {} \n'
                         .format(time.asctime(time.localtime(time.time())), word_results_dictionary))
        logging.info('-----------------------------------------------------------------------------------')
def load_model():
    s = Summarizer()
    e = Evaluate()
    return s, e
"""
Created on Tue Jun 16 17:57:09 2015

@author: Paco
"""
from utils import Utils
from evaluate import Evaluate
from metrics import Metrics

# Load data
u = Utils()
train_hard = u.load_matrix('data/data_train_difficile.mat')

# generate pairs
pairs_idx, pairs_label = u.generate_pairs(train_hard['label'], 1000, 0.1)

# Calculate distance
m = Metrics()
dist = m.braycurtis_dist(train_hard['X'], pairs_idx)

# Evaluate model
e = Evaluate()
e.evaluation(pairs_label, dist)

# display results
e.display_roc()
e.hard_score()

# Evaluate test dataset and save it
test_hard = u.load_matrix('data/data_test_difficile.mat')
dist_test = m.braycurtis_dist(test_hard['X'], test_hard['pairs'])
u.save_test(dist_test, filetxt='soumission_dur.txt')
def trainer(epochs, model, optimizer, scheduler, train_dataloader, test_dataloader,
            batch_train, batch_test, device):
    max_grad_norm = 1.0
    train_loss_set = []

    for e in trange(epochs, desc="Epoch"):
        while gc.collect() > 0:
            pass

        # Training
        # Set our model to training mode (as opposed to evaluation mode)
        model.train()
        # if e > 8:
        #     model.freeze_bert()

        # Tracking variables
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0

        # Train the data for one epoch
        for step, batch in enumerate(train_dataloader):
            # Add batch to GPU
            batch = tuple(t.to(device) for t in batch)
            # Unpack the inputs from our dataloader
            b_input_ids, b_input_mask, b_adj, b_adj_mwe, b_labels, b_target_idx, _ = batch
            # Clear out the gradients (by default they accumulate)
            optimizer.zero_grad()

            # Forward pass
            ### For BERT + GCN and MWE
            loss = model(b_input_ids.to(device), adj=b_adj, adj_mwe=b_adj_mwe,
                         attention_mask=b_input_mask.to(device),
                         labels=b_labels, batch=batch_train,
                         target_token_idx=b_target_idx.to(device))

            train_loss_set.append(loss.item())
            # Backward pass
            loss.backward(retain_graph=True)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            # Update parameters and take a step using the computed gradient
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            # Update tracking variables
            tr_loss += loss.item()
            nb_tr_examples += b_input_ids.size(0)
            nb_tr_steps += 1

        print("Train loss: {}".format(tr_loss / nb_tr_steps))

        # Validation
        # Put model in evaluation mode to evaluate loss on the validation set
        model.eval()

        all_preds = torch.FloatTensor()
        all_labels = torch.LongTensor()
        test_indices = torch.LongTensor()

        # Evaluate data for one epoch
        for batch in test_dataloader:
            # Add batch to GPU
            batch = tuple(t.to(device) for t in batch)
            # Unpack the inputs from our dataloader
            b_input_ids, b_input_mask, b_adj, b_adj_mwe, b_labels, b_target_idx, test_idx = batch
            # Telling the model not to compute or store gradients, saving memory
            # and speeding up validation
            with torch.no_grad():
                # Forward pass, calculate logit predictions
                ### For BERT + GCN and MWE
                logits = model(b_input_ids.to(device), adj=b_adj, adj_mwe=b_adj_mwe,
                               attention_mask=b_input_mask.to(device),
                               batch=batch_test,
                               target_token_idx=b_target_idx.to(device))

            # Move logits and labels to CPU
            logits = logits.detach().cpu()
            label_ids = b_labels.cpu()
            test_idx = test_idx.cpu()

            all_preds = torch.cat([all_preds, logits])
            all_labels = torch.cat([all_labels, label_ids])
            test_indices = torch.cat([test_indices, test_idx])

        scores = Evaluate(all_preds, all_labels)
        print('scores.accuracy()={}\nscores.precision_recall_fscore()={}'.format(
            scores.accuracy(), scores.precision_recall_fscore()))

    return scores, all_preds, all_labels, test_indices
class Segmenter(object):
    def __init__(self, hdfs_client, flags):
        self.train_is_alive = False
        self.hdfs_client = hdfs_client
        self.flags = flags
        self.data_utils = DataUtils()

    def update_config(self):
        config_path = os.path.join(self.flags.raw_data_path, 'config.json')
        try:
            with open(config_path, encoding='utf-8', mode='r') as data_file:
                config_json = json.load(data_file)
            # Apply every option present in config.json; the original
            # elif-chain stopped after the first matching key.
            if 'use_lstm' in config_json:
                self.flags.use_lstm = config_json['use_lstm']
            if 'use_dynamic_rnn' in config_json:
                self.flags.use_dynamic_rnn = config_json['use_dynamic_rnn']
            if 'use_bidirectional_rnn' in config_json:
                self.flags.use_bidirectional_rnn = config_json['use_bidirectional_rnn']
            if 'vocab_drop_limit' in config_json:
                self.flags.vocab_drop_limit = config_json['vocab_drop_limit']
            if 'batch_size' in config_json:
                self.flags.batch_size = config_json['batch_size']
            if 'num_steps' in config_json:
                self.flags.num_steps = config_json['num_steps']
            if 'num_layer' in config_json:
                self.flags.num_layer = config_json['num_layer']
            if 'embedding_size' in config_json:
                self.flags.embedding_size = config_json['embedding_size']
            if 'learning_rate' in config_json:
                self.flags.learning_rate = config_json['learning_rate']
            if 'learning_rate_decay_factor' in config_json:
                self.flags.learning_rate_decay_factor = config_json['learning_rate_decay_factor']
            if 'keep_prob' in config_json:
                self.flags.keep_prob = config_json['keep_prob']
            if 'clip_norm' in config_json:
                self.flags.clip_norm = config_json['clip_norm']
        except:
            raise Exception('ERROR: config.json content invalid')

    def train(self):
        self.hdfs_client.hdfs_download(
            os.path.join(self.flags.input_path, 'train.txt'),
            os.path.join(self.flags.datasets_path, 'train.txt'))
        self.hdfs_client.hdfs_download(
            os.path.join(self.flags.input_path, 'test.txt'),
            os.path.join(self.flags.datasets_path, 'test.txt'))

        self.data_utils.label_segment_file(
            os.path.join(self.flags.datasets_path, 'train.txt'),
            os.path.join(self.flags.datasets_path, 'label_train.txt'))
        self.data_utils.label_segment_file(
            os.path.join(self.flags.datasets_path, 'test.txt'),
            os.path.join(self.flags.datasets_path, 'label_test.txt'))

        self.data_utils.split_label_file(
            os.path.join(self.flags.datasets_path, 'label_train.txt'),
            os.path.join(self.flags.datasets_path, 'split_train.txt'))
        self.data_utils.split_label_file(
            os.path.join(self.flags.datasets_path, 'label_test.txt'),
            os.path.join(self.flags.datasets_path, 'split_test.txt'))

        words_vocab, labels_vocab = self.data_utils.create_vocabulary(
            os.path.join(self.flags.datasets_path, 'split_train.txt'),
            self.flags.vocab_path, self.flags.vocab_drop_limit)
        train_word_ids_list, train_label_ids_list = self.data_utils.file_to_word_ids(
            os.path.join(self.flags.datasets_path, 'split_train.txt'),
            words_vocab, labels_vocab)
        test_word_ids_list, test_label_ids_list = self.data_utils.file_to_word_ids(
            os.path.join(self.flags.datasets_path, 'split_test.txt'),
            words_vocab, labels_vocab)

        tensorflow_utils = TensorflowUtils()
        tensorflow_utils.create_record(
            train_word_ids_list, train_label_ids_list,
            os.path.join(self.flags.tfrecords_path, 'train.tfrecords'))
        tensorflow_utils.create_record(
            test_word_ids_list, test_label_ids_list,
            os.path.join(self.flags.tfrecords_path, 'test.tfrecords'))

        self.hdfs_client.hdfs_upload(
            self.flags.vocab_path,
            os.path.join(self.flags.output_path,
                         os.path.basename(self.flags.vocab_path)))

        train = Train()
        train.train()

    def upload_tensorboard(self):
        hdfs_tensorboard_path = os.path.join(
            self.flags.output_path,
            os.path.basename(os.path.normpath(self.flags.tensorboard_path)))
        temp_hdfs_tensorboard_path = hdfs_tensorboard_path + '-temp'
        self.hdfs_client.hdfs_upload(self.flags.tensorboard_path,
                                     temp_hdfs_tensorboard_path)
        self.hdfs_client.hdfs_delete(hdfs_tensorboard_path)
        self.hdfs_client.hdfs_mv(temp_hdfs_tensorboard_path, hdfs_tensorboard_path)

    def log_monitor(self):
        while self.train_is_alive:
            time.sleep(120)
            self.upload_tensorboard()

    def upload_model(self):
        predict = Predict()
        predict.saved_model_pb()

        hdfs_checkpoint_path = os.path.join(
            self.flags.output_path,
            os.path.basename(os.path.normpath(self.flags.checkpoint_path)))
        hdfs_saved_model_path = os.path.join(
            self.flags.output_path,
            os.path.basename(os.path.normpath(self.flags.saved_model_path)))

        temp_hdfs_checkpoint_path = hdfs_checkpoint_path + '-temp'
        temp_hdfs_saved_model_path = hdfs_saved_model_path + '-temp'

        self.hdfs_client.hdfs_upload(self.flags.checkpoint_path,
                                     temp_hdfs_checkpoint_path)
        self.hdfs_client.hdfs_upload(self.flags.saved_model_path,
                                     temp_hdfs_saved_model_path)

        self.hdfs_client.hdfs_delete(hdfs_checkpoint_path)
        self.hdfs_client.hdfs_delete(hdfs_saved_model_path)

        self.hdfs_client.hdfs_mv(temp_hdfs_checkpoint_path, hdfs_checkpoint_path)
        self.hdfs_client.hdfs_mv(temp_hdfs_saved_model_path, hdfs_saved_model_path)

    def evaluate(self):
        shutil.rmtree(self.flags.vocab_path)
        shutil.rmtree(self.flags.checkpoint_path)

        self.hdfs_client.hdfs_download(
            os.path.join(self.flags.input_path,
                         os.path.basename(self.flags.vocab_path)),
            self.flags.vocab_path)
        self.hdfs_client.hdfs_download(
            os.path.join(self.flags.input_path, 'test.txt'),
            os.path.join(self.flags.datasets_path, 'test.txt'))
        hdfs_checkpoint_path = os.path.join(
            self.flags.input_path, os.path.basename(self.flags.checkpoint_path))
        self.hdfs_client.hdfs_download(hdfs_checkpoint_path,
                                       self.flags.checkpoint_path)

        self.data_utils.label_segment_file(
            os.path.join(self.flags.datasets_path, 'test.txt'),
            os.path.join(self.flags.datasets_path, 'label_test.txt'))
        self.data_utils.split_label_file(
            os.path.join(self.flags.datasets_path, 'label_test.txt'),
            os.path.join(self.flags.datasets_path, 'split_test.txt'))

        predict = Predict()
        predict.file_predict(
            os.path.join(self.flags.datasets_path, 'split_test.txt'),
            os.path.join(self.flags.datasets_path, 'test_predict.txt'))

        self.model_evaluate = Evaluate()
        self.model_evaluate.evaluate(
            os.path.join(self.flags.datasets_path, 'test_predict.txt'),
            os.path.join(self.flags.datasets_path, 'test_evaluate.txt'))

        self.hdfs_client.hdfs_delete(
            os.path.join(self.flags.output_path, 'test_evaluate.txt'))
        self.hdfs_client.hdfs_upload(
            os.path.join(self.flags.datasets_path, 'test_evaluate.txt'),
            os.path.join(self.flags.input_path, 'test_evaluate.txt'))
        p2 = os.path.join(path, "a-" + file)
        al = align.face_features(p, p2)
        ev = utils.parse_evaluate(al, args.parsing_checkpoint, cuda=cuda)
        p = os.path.join(path, "b-" + file)
        cv2.imwrite(p, ev)
        ev = 255 - utils.img_edge(ev)
        p = os.path.join(path, "c-" + file)
        cv2.imwrite(p, ev)
    elif args.phase == "dataset":
        dataset = FaceDataset(args, "test")
        dataset.pre_process(cuda)
    elif args.phase == "preview":
        log.info("preview picture")
        path = "../export/regular/model.jpg"
        img = cv2.imread(path)
        img2 = utils.parse_evaluate(img, args.parsing_checkpoint, cuda)
        img3 = utils.img_edge(img2)
        img3_ = ops.fill_grey(img3)
        img4 = align.face_features(path)
        log.info("{0} {1} {2} {3}".format(img.shape, img2.shape, img3_.shape, img4.shape))
        ops.merge_4image(img, img2, img3_, img4, show=True)
    elif args.phase == "evaluate":
        log.info("evaluation mode start")
        evl = Evaluate(args, cuda=cuda)
        img = cv2.imread(args.eval_image).astype(np.float32)
        x_ = evl.itr_train(img)
        evl.output(x_, img)
    else:
        log.error("not known phase %s", args.phase)
            inp[el] = self.means[0, el]

        # convert the input list to a numpy matrix and normalise it
        inp = np.matrix([inp])
        inp = (inp - self.means) / self.std

        # get a result and prediction using the logistic function
        result = (self.coef * inp.T)[0, 0] + self.bias
        prediction = 1.0 / (1 + np.exp(-result))
        assert prediction <= 1 and prediction >= 0
        return prediction


# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

if __name__ == "__main__":
    m = LogReg()
    predictions = m.make_prediction_set(
        2000, 2016, "2000-2016 (it9)", testStart=2015, slamsOnly=False,
        trainName='LogReg_final', inputProcessedFile="2000-2016 (it9)-processed")
    e = Evaluate(predictions)
    print('Final log reg model')
    e.display_summary()
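# A worked numeric check of the logistic step above, with made-up values for
# coef, bias, means and std (assumptions for illustration only): normalise the
# input, take the linear score, then squash it with the logistic function.
import numpy as np

coef = np.matrix([[0.8, -0.5]])
bias = 0.1
means = np.matrix([[10.0, 2.0]])
std = np.matrix([[2.0, 1.0]])

inp = (np.matrix([[12.0, 1.5]]) - means) / std   # normalised input: [1.0, -0.5]
result = (coef * inp.T)[0, 0] + bias             # linear score: 1.15
prediction = 1.0 / (1 + np.exp(-result))         # logistic function
print(prediction)  # ~0.76, always inside (0, 1)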
    ftext.close()
    f.close()
    furl.close()
    res = evaluator.evaluateFC(rp)
    writeEvaluation(res, evalFilename)
    print sum(res)
    print len(res)


if __name__ == "__main__":
    seedsFiles = ['Output-CharlestonShooting.txt','seeds-Sandra.txt','Output-tunisiaHotelAttack.txt','Output-samesexmarriage.txt','Output-fifaArrests.txt','Output-boatCapsized.txt','Output-nepalEarthquake.txt','seeds_459.txt','seeds_474.txt','seedsURLs_z_534.txt','seedsURLs_z_501.txt','seedsURLs_z_540.txt']
    posFiles = ['charlestonShootingPos.txt','evaluate-SandraBland.txt','pos-tunisiaHotelAttack.txt','pos-samesexmarriage.txt','Output-fifaArrests.txt','Output-boatCapsized.txt','Output-nepalEarthquake.txt','pos-FSU.txt','pos-Hagupit.txt','pos-AirAsia.txt','pos-sydneyseige.txt','pos-Charlie.txt']
    negFiles = ['charlestonShootingNeg.txt','neg-FSU.txt','neg-Hagupit.txt','neg-AirAsia.txt','neg-sydneyseige.txt','neg-Charlie.txt']
    modelFiles = ['Output-CharlestonShooting.txt','model-SandraBland.txt','model-tunisiaHotelAttack.txt','model-samesexmarriage.txt','model-CharlestonShooting.txt']
    evaluator = Evaluate()
    #for i in range(3):
    pagesLimit = 300

    # A second configuration block below overrides the lists above.
    seedsFiles = ['seeds-Sandra.txt','Output-tunisiaHotelAttack.txt','Output-samesexmarriage.txt','Output-CharlestonShooting.txt','Output-fifaArrests.txt','Output-boatCapsized.txt','Output-nepalEarthquake.txt','seeds_459.txt','seeds_474.txt','seedsURLs_z_534.txt','seedsURLs_z_501.txt','seedsURLs_z_540.txt']
    posFiles = ['evaluate-SandraBland.txt','pos-tunisiaHotelAttack.txt','pos-samesexmarriage.txt','pos-CharlestonShooting.txt','Output-fifaArrests.txt','Output-boatCapsized.txt','Output-nepalEarthquake.txt','pos-FSU.txt','pos-Hagupit.txt','pos-AirAsia.txt','pos-sydneyseige.txt','pos-Charlie.txt']
    negFiles = ['neg-FSU.txt','neg-Hagupit.txt','neg-AirAsia.txt','neg-sydneyseige.txt','neg-Charlie.txt']
    modelFiles = ['model-SandraBland.txt','model-tunisiaHotelAttack.txt','model-samesexmarriage.txt','model-CharlestonShooting.txt']
    evaluator = Evaluate()
    #for i in range(3):
    pagesLimit = 100
    noK = 5
    pageTh = 0.2
    urlsTh = 0
    i = 0
from board import Board, PieceStack, Turn, get_piece_text, EMPTY
from evaluate import Evaluate, WIN
from random import randint
from search import RootOfAlphaBetaSearch

piecestack = PieceStack()
turn = Turn()
board = Board()
evaluate = Evaluate()


def UserTurn(piecestack, board, piece):
    board.show()
    piecestack.show()
    piecestack.TakePiece(piece)
    print('Piece: {0}'.format(get_piece_text(piece)))
    while True:
        x, y = [int(i) - 1 for i in
                raw_input("Enter x y coordinates to place piece: ").split()]
        if board.pieces[x][y] is EMPTY:
            break
        else:
            print('Square is not empty. Try another one.')
def classifier_test():
    # settings
    dataset_para = '[email protected]@partition@selection'
    # choose which features to use
    feature_para = (1, 2, 3, 4)

    # file directories
    # feature_dir = dataset_dir + r'\feature1'

    # Preprocessing: extract the experiment subset from the raw yoochoose-data
    # (selected by the sessions used in the experiment).
    # Input 1 (experiment data): dataset_dir\train\session_item.txt, .\test\session_item.txt
    # Input 2 (yoochoose-data): yoochoose_data_dir\yoochoose-clicks.dat, .\yoochoose-buys.dat, .\yoochoose-test.dat
    # Output: dataset_dir\yoochoose-selected\yoochoose-clicks-selected.dat, .\yoochoose-buys-selected.dat, .\yoochoose-test-selected.dat
    dataset_dir = r'E:\ranking aggregation\dataset\yoochoose\Full' + '\\' + dataset_para
    yoochoose_data_dir = r'E:\recsyschallenge2015\mycode\yoochoose-data'
    # output directory
    yoochoose_selected_dir = dataset_dir + r'\yoochoose-selected'
    # create the output directory if it does not exist
    # if not os.path.exists(yoochoose_selected_dir):
    #     os.makedirs(yoochoose_selected_dir)
    # Preprocess2.extract_data(dataset_dir, yoochoose_data_dir, yoochoose_selected_dir)

    # Extract features.
    # Input: yoochoose selected data (and groundtruth)
    # Output: features
    feature_dir = r'E:\recsyschallenge2015\mycode\result-data'
    # feature_dir = dataset_dir + r'\feature1'
    # create the output directory if it does not exist
    if not os.path.exists(feature_dir):
        os.makedirs(feature_dir)
    print('feature_para:', feature_para)
    Feature4.go(dataset_dir, feature_dir, feature_para)

    # Read the features.
    X_train, y_train = Input2.read_train(feature_dir)
    X_test, y_test, test_dic_data, session_item_data, session_idx_dic = Input2.read_test(
        dataset_dir, feature_dir)
    groundtruth_path = dataset_dir + r'\test\session_item.txt'

    # Model: logistic regression.
    print('model: LogisticRegression')
    model = LogisticRegression()
    model.fit(X_train, y_train)
    # print(model)
    # make predictions
    y_predict = model.predict(X_test)
    # Evaluate the results.
    solution = Solution.generate(test_dic_data, y_predict)
    Evaluate.go(solution, groundtruth_path)

    # Model: Gaussian naive Bayes.
    print('model: GaussianNB')
    model = GaussianNB()
    model.fit(X_train, y_train)
    # print(model)
    # make predictions
    y_predict = model.predict(X_test)
    # Evaluate the results.
    solution = Solution.generate(test_dic_data, y_predict)
    Evaluate.go(solution, groundtruth_path)

    # Model: SVM.
    print('model: SVM')
    model = SVC()
    model.fit(X_train, y_train)
    # print(model)
    # make predictions
    y_predict = model.predict(X_test)
    # Evaluate the results.
    solution = Solution.generate(test_dic_data, y_predict)
    Evaluate.go(solution, groundtruth_path)
class DataHandler:
    # Class member: a single shared calculator.
    evaluator = Evaluate()

    # class method: usable like a global function
    @classmethod
    def GetRawdataInDic(cls, filename):
        rawdata = {}
        with open(filename, 'rb') as f:
            while 1:
                try:
                    data = pickle.load(f)
                except EOFError:
                    break
                rawdata.update(data)
        return rawdata

    def __init__(self, filename, clsname):
        self.rawdata = DataHandler.GetRawdataInDic(filename)
        self.clsname = clsname
        # Store for computed values: compute on demand, but if a value has
        # already been computed, return it without recomputing.
        self.cache = {}

    def get_scores(self):
        if 'scores' not in self.cache:
            self.cache['scores'] = list(self.rawdata.values())
        return self.cache.get('scores')

    # cache
    def get_average(self):
        if 'average' not in self.cache:
            self.cache['average'] = self.evaluator.average(self.get_scores())
        return self.cache.get('average')

    def get_variance(self):
        if 'variance' not in self.cache:  # was 'variace', which defeated the cache
            vari = round(
                self.evaluator.variance(self.get_scores(), self.get_average()))
            self.cache['variance'] = vari
        return self.cache.get('variance')

    def get_standard_deviation(self):
        if "standard_deviation" not in self.cache:
            std_dev = round(math.sqrt(self.get_variance()), 1)
            self.cache["standard_deviation"] = std_dev
        return self.cache.get("standard_deviation")

    def WhoIsHighest(self):
        if 'highest' not in self.cache:
            self.cache['highest'] = reduce(
                lambda a, b: a if self.rawdata.get(a) > self.rawdata.get(b) else b,
                self.rawdata.keys())
        return self.cache.get('highest')

    def GetHighestScore(self):
        return self.rawdata[self.WhoIsHighest()]

    def WhoIsLowest(self):
        if "lowest" not in self.cache:
            self.cache['lowest'] = reduce(
                lambda a, b: a if self.rawdata.get(a) < self.rawdata.get(b) else b,
                self.rawdata.keys())
        return self.cache.get('lowest')

    def GetLowestScore(self):
        return self.rawdata[self.WhoIsLowest()]

    def get_evaluation(self):
        print('*' * 50)
        print("Score analysis for class %s" % self.clsname)
        print("Class {0} has an average of {1}, a variance of {2}, "
              "and therefore a standard deviation of {3}".format(
                  self.clsname, self.get_average(), self.get_variance(),
                  self.get_standard_deviation()))
        print('*' * 50)
        print("Overall assessment for class %s" % self.clsname)
        print('*' * 50)
        self.evaluateclass()  # was self.evaluateClass(), a NameError

    def evaluateclass(self):
        avrg = self.get_average()
        std_dev = self.get_standard_deviation()
        if avrg < 50 and std_dev > 20:
            print("Scores are very low and the gap between students is very large.")
        elif avrg > 50 and std_dev > 20:
            print("Scores are above average, but the gap between students is large. Attention needed!")
        elif avrg < 50 and std_dev < 20:
            print("The gap between students is small, but scores are very low. Attention needed!")
        elif avrg > 50 and std_dev < 20:
            print("Scores are above average and the gap between students is small.")
def main():
    # action space
    actionSpace = [[10, 0], [7, 7], [0, 10], [-7, 7], [-10, 0], [-7, -7],
                   [0, -10], [7, -7]]
    numActionSpace = len(actionSpace)

    # state space
    numStateSpace = 4
    xBoundary = [0, 360]
    yBoundary = [0, 360]
    checkBoundaryAndAdjust = ag.CheckBoundaryAndAdjust(xBoundary, yBoundary)

    initSheepPositionMean = np.array([180, 180])
    initWolfPositionMean = np.array([180, 180])
    initSheepPositionNoise = np.array([120, 120])
    initWolfPositionNoise = np.array([60, 60])
    sheepPositionReset = ag.SheepPositionReset(initSheepPositionMean,
                                               initSheepPositionNoise,
                                               checkBoundaryAndAdjust)
    wolfPositionReset = ag.WolfPositionReset(initWolfPositionMean,
                                             initWolfPositionNoise,
                                             checkBoundaryAndAdjust)

    numOneAgentState = 2
    positionIndex = [0, 1]
    sheepPositionTransition = ag.SheepPositionTransition(numOneAgentState,
                                                         positionIndex,
                                                         checkBoundaryAndAdjust)
    wolfPositionTransition = ag.WolfPositionTransition(numOneAgentState,
                                                       positionIndex,
                                                       checkBoundaryAndAdjust)

    numAgent = 2
    sheepId = 0
    wolfId = 1
    transitionFunction = env.TransitionFunction(sheepId, wolfId,
                                                sheepPositionReset,
                                                wolfPositionReset,
                                                sheepPositionTransition,
                                                wolfPositionTransition)
    minDistance = 15
    isTerminal = env.IsTerminal(sheepId, wolfId, numOneAgentState,
                                positionIndex, minDistance)

    screen = pg.display.set_mode([xBoundary[1], yBoundary[1]])
    screenColor = [255, 255, 255]
    circleColorList = [[50, 255, 50], [50, 50, 50], [50, 50, 50], [50, 50, 50],
                       [50, 50, 50], [50, 50, 50], [50, 50, 50], [50, 50, 50],
                       [50, 50, 50]]
    circleSize = 8
    saveImage = False
    saveImageFile = 'image'
    render = env.Render(numAgent, numOneAgentState, positionIndex, screen,
                        screenColor, circleColorList, circleSize, saveImage,
                        saveImageFile)

    aliveBouns = -1
    deathPenalty = 20
    rewardDecay = 0.99
    rewardFunction = reward.TerminalPenalty(sheepId, wolfId, numOneAgentState,
                                            positionIndex, aliveBouns,
                                            deathPenalty, isTerminal)
    accumulateRewards = PG.AccumulateRewards(rewardDecay, rewardFunction)

    maxTimeStep = 150
    sampleTrajectory = PG.SampleTrajectory(maxTimeStep, transitionFunction,
                                           isTerminal)

    approximatePolicy = PG.ApproximatePolicy(actionSpace)
    trainPG = PG.TrainTensorflow(actionSpace)

    numTrajectory = 20
    maxEpisode = 1000

    # Generate models.
    learningRate = 1e-4
    hiddenNeuronNumbers = [128, 256, 512, 1024]
    hiddenDepths = [2, 4, 8]
    # hiddenNeuronNumbers = [128]
    # hiddenDepths = [2]
    generateModel = GeneratePolicyNet(numStateSpace, numActionSpace, learningRate)
    models = {(n, d): generateModel(d, round(n / d))
              for n, d in it.product(hiddenNeuronNumbers, hiddenDepths)}
    print("Models generated")

    # Train.
    policyGradient = PG.PolicyGradient(numTrajectory, maxEpisode, render)
    trainModel = lambda model: policyGradient(model, approximatePolicy,
                                              sampleTrajectory,
                                              accumulateRewards, trainPG)
    trainedModels = {key: trainModel(model) for key, model in models.items()}
    print("Finished training")

    # Evaluate.
    modelEvaluate = Evaluate(numTrajectory, approximatePolicy,
                             sampleTrajectory, rewardFunction)
    meanEpisodeRewards = {key: modelEvaluate(model)
                          for key, model in trainedModels.items()}
    print("Finished evaluating")
    # print(meanEpisodeRewards)

    # Visualize.
    independentVariableNames = ['NeuroTotalNumber', 'layerNumber']
    draw(meanEpisodeRewards, independentVariableNames)
    print("Finished visualizing", meanEpisodeRewards)
axs[0].grid()
axs[0].set_title('Loss')
axs[1].plot(history['Train_dice'], label='Train Dice')
axs[1].plot(history['Valid_dice'], label='Valid Dice')
axs[1].legend()
axs[1].grid()
axs[1].set_title('Dice')
plt.savefig('../output/loss_dice.png')

########################################################################
# Evaluate the network

# Get all predictions for the validation set: maybe a memory error here.
if args.load_mod:
    # Load the best model.
    net.load_state_dict(torch.load(MODEL_FILE))
eva = Evaluate(net, device, validloader, args, isTest=False)
eva.search_parameter()
dice, dicPred, dicSubmit = eva.predict_dataloader()
# eva.plot_sampled_predict()

# Evaluate the prediction.
sout = '\nFinal Dice {:.3f}\n'.format(dice) +\
    '==============Predict===============\n' + \
    analyze_labels(pd.DataFrame(dicPred))  # +\
    # '==============True===============\n' + \
    # analyze_labels(stat_df_valid)
# print(sout)
# print2file(sout, LOG_FILE)
# print2file(' '.join(str(key)+':'+str(val) for key, val in eva.dicPara.items()), LOG_FILE)

# load swa model
def playit(self):
    point = 0
    evaluation_ai = Evaluate(self.board, True)
    aiMoves = evaluation_ai.checkPossibleMoves()
    depth = 0
    print(aiMoves[0][0][0])
import numpy as np
import json

gen_data = False
plot_fig = True
basename = 'rmse'
jsonfile = basename + '.json'
print('gen_data/plot_fig/filename = ', gen_data, plot_fig, basename)

mem_list = [6, 10]
loc_list = np.arange(5, 60, 5).tolist()
wgt_list = np.arange(0, 10, 1).tolist()

if gen_data:
    ev = Evaluate()
    truth = ev.get_data('output_truth.txt')
    error = {}
    rmsedata = {}
    amedata = {}
    for mem in mem_list:
        rmseloc = {}
        ameloc = {}
        for loc in loc_list:
            rmse = []
            ame = []
            for wgt in wgt_list:
                filename = 'h3dw%2.2dl%2.2dm%2.2d/output_analy.txt' % (
                    wgt, loc, mem)  # analysis
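The snippet breaks off inside the wgt loop, so the actual error computation is not shown. As a hedged sketch, under the assumption that get_data returns numeric arrays of equal shape, the per-run errors could be computed with a helper like this (rmse_and_ame is a hypothetical name, not part of the original code):

import numpy as np

def rmse_and_ame(analy, truth):
    # Root-mean-square error and absolute mean error between an analysis
    # field and the truth, given as equal-shaped numeric arrays.
    diff = np.asarray(analy, dtype=float) - np.asarray(truth, dtype=float)
    return float(np.sqrt(np.mean(diff ** 2))), float(np.mean(np.abs(diff)))

# Hypothetical use inside the wgt loop above:
#   r, a = rmse_and_ame(ev.get_data(filename), truth)
#   rmse.append(r)
#   ame.append(a)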
optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr)
print(model)
print(model_fname)

# # train the model
# for param in model.parameters():
#     param.requires_grad = True
model.train()
train_loop()

# evaluate the model
if args.evaluate:
    if args.evaluate_on_cpu:
        device = "cpu"
        model = model.to(device)
    model.eval()
    if args.train:
        Evaluate(model, test_loader, outpath, args.target, device, args.n_epochs)
    elif args.load:
        Evaluate(model, test_loader, outpath, args.target, device, args.load_epoch)

## -----------------------------------------------------------
# # to retrieve a stored variable in a pkl file
# import pickle
# with open('../../test_tmp_delphes/experiments/PFNet7_gen_ntrain_2_nepochs_3_batch_size_3_lr_0.0001/confusion_matrix_plots/cmT_normed_epoch_0.pkl', 'rb') as f:  # Python 3: open(..., 'rb')
#     a = pickle.load(f)
from pprint import pprint

import numpy as np
from sklearn import linear_model
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC, LinearSVC
import xgboost as xgb

# DataManager, FeatureExtractor and Evaluate are project-local modules not shown here.


class StanceDetector:
    def __init__(self, n):
        self.data = DataManager('../data/train.csv', '../data/test.csv', n)
        self.fe = FeatureExtractor(self.data)
        self.eval = Evaluate()

    def buildBaseline(self, model):
        print('Training baseline', model)
        feats = ['words']
        y_attribute = 'stance'
        X, y = self.fe.getFeaturesMatrix('train', feats, y_attribute)
        X_test, y_true = self.fe.getFeaturesMatrix('test', feats, y_attribute)
        for mode in ['simple', 'tfidf']:
            if model == 'bayes':
                cl = MultinomialNB()
            elif model == 'svm':
                cl = LinearSVC()
            if mode == 'tfidf':
                cl = Pipeline([
                    ('tfidf', TfidfTransformer()),
                    ('clf', cl),
                ])
            clf = cl.fit(X, y)
            y_pred = clf.predict(X_test)
            print(mode, accuracy_score(y_true, y_pred))
            pprint(self.eval.computeFscores(self.data.testTweets,
                                            self.fe.labelenc.inverse_transform(y_pred)))

    def buildSimple(self, model):
        feats = ['topicVecs', 'words2vec']
        print(feats)
        y_attribute = 'stance'
        X, y = self.fe.getFeaturesMatrix('train', feats, y_attribute)
        X_test, y_true = self.fe.getFeaturesMatrix('test', feats, y_attribute)
        for mode in ['simple']:  # ,'tfidf']:
            if model == 'bayes':
                cl = MultinomialNB()
            elif model == 'svm':
                # cl = LinearSVC()
                cl = LinearSVC()
                cl = GridSearchCV(cl, self.getGridSearchParams())
            if mode == 'tfidf':
                cl = Pipeline([
                    ('tfidf', TfidfTransformer()),
                    ('clf', cl),
                ])
            clf = cl.fit(X, y)
            # print(cl.best_params_)
            y_pred = clf.predict(X_test)
            print(mode, accuracy_score(y_true, y_pred))
            pprint(self.eval.computeFscores(self.data.testTweets,
                                            self.fe.labelenc.inverse_transform(y_pred)))

    # "train" in the name means helper function.
    def trainSVC(self, feats, y_attribute, proba=False):
        X, y = self.fe.getFeaturesMatrix('train', feats, y_attribute)
        X_test, y_true = self.fe.getFeaturesMatrix('test', feats, y_attribute)
        clf = SVC(probability=proba)
        clf = clf.fit(X, y)
        if proba:
            y_proba = clf.predict_proba(X_test)
            return clf, y_proba
        else:
            y_pr = clf.predict(X_test)
            return clf, y_pr

    def trainLinearSVC(self, feats, y_attribute, dec=False):
        X, y = self.fe.getFeaturesMatrix('train', feats, y_attribute)
        X_test, y_true = self.fe.getFeaturesMatrix('test', feats, y_attribute)
        clf = LinearSVC()
        clf = clf.fit(X, y)
        if dec:
            y_pr = clf.decision_function(X_test)
            return clf, y_pr
        else:
            y_pr = clf.predict(X_test)
            return clf, y_pr

    # TODO: revisit
    # Check label transform encodings of NONE, FAVOR, AGAINST.
    # def buildTopicStanceSeparate(self):
    #     feats = ['words']
    #     y_attribute = 'stance'
    #     X_test, y_true = self.fe.getFeaturesMatrix('test', feats, y_attribute)
    #     # Builds two separate classifiers, one for topic and one for stance.
    #     topic_clf, y_topic_proba = self.trainLinearSVC(
    #         feats=['words', 'lexiconsbyword'], y_attribute='topic', dec=True)
    #     # WRONG
    #     # WRONG
    #     # WRONG
    #     boost_factors = np.ones_like(y_true)
    #     # multiply by NONE (0) = 0
    #     # multiply by FAVOR (1) = 1
    #     # multiply by AGAINST (2) = 2
    #     # Has index of class with max prob for each sample.
    #     topic_preds = np.argmax(y_topic_proba, axis=1)
    #     for ind, s in enumerate(y_topic_proba):
    #         prob = y_topic_proba[ind][topic_preds[ind]]
    #         if prob < 0.4:
    #             boost_factors[ind] = 0  # corresponds to NONE
    #     stance_clf, stance_pred = self.trainLinearSVC(
    #         feats=['words', 'lexiconsbyword', 'topic'], y_attribute='stance')
    #     # for i in range(0, len(stance_pred)):
    #     #     if boost_factors[i] == 2:
    #     #         stance_pred[i] = self.fe.labelenc.transform("NONE")
    #     # With numpy arrays now, the above is equivalent to the below, right?
    #     stance_pred = np.multiply(stance_pred, boost_factors)
    #     stance_pred_labels = self.fe.labelenc.inverse_transform(stance_pred)
    #     # print([(self.data.testLabels[i], stance_pred_labels[i]) for i in range(len(stance_pred))])
    #     score = accuracy_score(y_true, stance_pred)
    #     print(score)
    #     pprint(self.eval.computeFscores(self.data.testTweets, stance_pred_labels))

    def buildTopicOnlyMultiple(self):
        # One SVM for each topic.
        feats = ['words2vec']
        y_attribute = 'stance'
        clf_topic = {}
        for topic in list(self.fe.topicenc.classes_):
            X, y = self.fe.getFeaturesMatrix('train', feats, y_attribute, topic)
            Xt, yt = self.fe.getFeaturesMatrix('test', feats, y_attribute, topic)
            clf = LinearSVC()
            clf = clf.fit(X, y)
            clf_topic[topic] = clf
            print(topic, clf.score(Xt, yt))
        # Not useful: still less than a single SVM, but not as much as the average of the above.
        # X_whole, y_whole = self.fe.getFeaturesMatrix('train', feats, y_attribute)
        # Xt, yt = self.fe.getFeaturesMatrix('test', feats, y_attribute)
        # newX = []
        # newXt = []
        # for topic in clf_topic:
        #     newX.append(clf_topic[topic].transform(X_whole))
        #     newXt.append(clf_topic[topic].transform(Xt))
        # newX = np.concatenate(tuple(newX), axis=1)
        # newXt = np.concatenate(tuple(newXt), axis=1)
        # newclf = LinearSVC()
        # newclf = newclf.fit(newX, y_whole)
        # print(newclf.score(newXt, yt))

    def trainTopicSVM(self, topic):
        feats = ['words2vec', 'clusteredLexicons', 'topic1hot']
        y_attribute = 'stance'
        X, y = self.fe.getFeaturesTopicNontopic('train', feats, y_attribute, topic=topic)
        X_test, y_true = self.fe.getFeaturesTopicNontopic('test', feats, y_attribute, topic=topic)
        clf = LinearSVC()
        clf = GridSearchCV(clf, self.getGridSearchParams())
        clf = clf.fit(X, y)
        print(clf.best_params_)
        print(topic)  # , clf.score(X_test, y_true)
        return clf

    # WRITE
    # WRITE
    # WRITE
    def buildTopicWise(self):
        # Separate SVC for each topic; tests on that class only first, then on all.
        topic_clf = {}
        feats = ['words2vec', 'clusteredLexicons', 'topic1hot']
        y_attribute = 'stance'
        X, y = self.fe.getFeaturesMatrix('train', feats, y_attribute)
        X_test, y_true = self.fe.getFeaturesMatrix('test', feats, y_attribute)
        # X matrix for a new classifier which uses this as the train matrix;
        # has columns of each topic classifier's confidence function.
        # X_fx = []
        # X_ftestx = []
        preds = []
        for topic in list(self.fe.topicenc.classes_):
            topic_clf[topic] = self.trainTopicSVM(topic)
            preds.append(topic_clf[topic].predict(X_test))
            # X_fx.append(topic_clf[topic].decision_function(X))
            # X_ftestx.append(topic_clf[topic].decision_function(X_test))
        allpreds = np.vstack(tuple(preds))
        topic1hot, temp = self.fe.getFeaturesMatrix('test', ['topic1hot'], 'stance')
        # print(allpreds.shape, topic1hot.T.shape)
        allpreds[allpreds == 5] = 1
        final_pred = np.multiply(topic1hot.T, allpreds)
        prediction = np.sum(final_pred, axis=0).astype(int)
        # X_fx = np.concatenate(tuple(X_fx), axis=1)
        # X_ftestx = np.concatenate(tuple(X_ftestx), axis=1)
        # clf = LinearSVC().fit(X_fx, y)
        # y_pred = clf.predict(X_ftestx)
        print(accuracy_score(y_true, prediction))
        pprint(self.eval.computeFscores(self.data.testTweets,
                                        self.fe.labelenc.inverse_transform(prediction)))

    # GOOD: 66% accuracy; 1.2% increase after changing topic to one-hot.
    def buildSVMWord2Vec(self):
        feats = ['words2vec', 'topic1hot']
        y_attribute = 'stance'
        X, y = self.fe.getFeaturesMatrix('train', feats, y_attribute)
        Xt, yt = self.fe.getFeaturesMatrix('test', feats, y_attribute)
        clf = LinearSVC(C=0.01, penalty='l1', dual=False)
        clf = clf.fit(X, y)
        y_pred = clf.predict(Xt)
        print(clf.score(Xt, yt))
        pprint(self.eval.computeFscores(self.data.testTweets,
                                        self.fe.labelenc.inverse_transform(y_pred)))

    def buildSVMTrial(self):
        feats = ['topic1hot', 'words2vec']
        y_attribute = 'stance'
        X, y = self.fe.getFeaturesMatrix('train', feats, y_attribute)
        Xt, yt = self.fe.getFeaturesMatrix('test', feats, y_attribute)
        clf = LinearSVC(C=0.001)
        clf = clf.fit(X, y)
        y_pred = clf.predict(Xt)
        print(clf.score(Xt, yt))
        pprint(self.eval.computeFscores(self.data.testTweets,
                                        self.fe.labelenc.inverse_transform(y_pred)))

    def buildTrial(self):
        # feats = ['pos', 'words2vec', 'clusteredLexicons', 'topic1hot']  # 'givenSentiment','givenOpinion'
        feats = ['words2vec', 'pos', 'clusteredLexicons', 'top1grams', 'top2grams']
        y_attribute = 'stance'
        X, y = self.fe.getFeaturesMatrix('train', feats, y_attribute)
        Xt, yt = self.fe.getFeaturesMatrix('test', feats, y_attribute)
        # clf = DecisionTreeClassifier()
        # clf = LogisticRegression()
        clf = LinearSVC(C=1, class_weight='balanced', penalty='l1', dual=False)
        clf = clf.fit(X, y)
        y_pred = clf.predict(Xt)
        # print(y_pred)
        print(len(np.where(y_pred == 0)[0]), len(np.where(y_pred == 1)[0]),
              len(np.where(y_pred > 1)[0]))
        print(len(y_pred))
        print('training accuracy', clf.score(X, y))
        print(clf.score(Xt, yt))
        pprint(self.eval.computeFscores(self.data.testTweets,
                                        self.fe.labelenc.inverse_transform(y_pred)))

    def buildGithubSGDModel(self):
        # feats = ['words2vec', 'topic1hot', 'pos']
        y_attribute = 'stance'
        dataset = self.fe.getDataset('train')
        dataset2 = self.fe.getDataset('test')
        y_train = self.fe.getY('train', dataset, y_attribute)
        y_test = self.fe.getY('train', dataset2, y_attribute)
        tfidf = TfidfVectorizer(ngram_range=(1, 2), max_df=1.0, min_df=1, binary=True,
                                norm='l2', use_idf=True, smooth_idf=False,
                                sublinear_tf=True, encoding='latin1')
        X_train = tfidf.fit_transform(self.data.trainTweetsText)
        X_test = tfidf.transform(self.data.testTweetsText)
        tuned_parameters = {'alpha': [10 ** a for a in range(-12, 0)]}
        clf = GridSearchCV(
            SGDClassifier(loss='hinge', penalty='elasticnet', l1_ratio=0.75, n_iter=10,
                          shuffle=True, verbose=False, n_jobs=4, average=False),
            tuned_parameters, cv=10, scoring='f1_weighted')
        clf.fit(X_train, y_train)
        print(clf.best_params_)
        print("Grid scores on development set:")
        for params, mean_score, scores in clf.grid_scores_:
            print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() * 2, params))
        print(classification_report(y_test, clf.predict(X_test)))
        print(clf.score(X_test, y_test))

    def getGridSearchParams(self):
        param_grid = [
            {'C': [0.001, 0.01, 0.1, 1], 'dual': [False, True],
             'class_weight': ['balanced', None]}
        ]
        return param_grid

    def getGridSearchParamsForXGBoost(self):
        param_grid = [
            {'n_estimators': [10, 20, 30, 40, 50], 'max_depth': [1, 2, 3, 4, 5]}
        ]
        return param_grid

    def buildSVMWord2VecWithClusters(self):
        # feats = ['topic1hot']
        # feats = ['words2vec', 'top1grams', 'top2grams']
        # feats = ['words2vec', 'top1grams']
        # feats = ['words2vec', 'top2grams']
        feats = ['words2vec', 'clusteredLexicons', 'topic1hot', 'pos']
        # feats = ['words2vec', 'topic1hot', 'pos', 'clusteredLexicons', 'top2grams']
        # feats = ['clusteredLexicons']
        # feats = ['pos']
        y_attribute = 'stance'
        X, y = self.fe.getFeaturesMatrix('train', feats, y_attribute)
        print(X.shape)
        Xt, yt = self.fe.getFeaturesMatrix('test', feats, y_attribute)
        clf = LinearSVC(C=1, penalty='l1', dual=False)
        clf = clf.fit(X, y)
        y_pred = clf.predict(Xt)
        # f = open('pred', 'w')
        # for i in y_pred:
        #     # print(type(i))
        #     f.write('{0}'.format(i))
        # f.close()
        accuracy = clf.score(Xt, yt)
        # print(clf.score(Xt, yt))
        fscores = self.eval.computeFscores(self.data.testTweets,
                                           self.fe.labelenc.inverse_transform(y_pred))
        # print(type(fscores))
        # print(fscores)
        # pprint(self.eval.computeFscores(self.data.testTweets,
        #                                 self.fe.labelenc.inverse_transform(y_pred)))
        # print(accuracy, fscores['Macro'])
        return (accuracy, fscores['Macro'])

    def buildSVMWord2VecWithClustersGridSearch(self):
        feats = ['words2vec', 'topic1hot', 'pos', 'clusteredLexicons']
        y_attribute = 'stance'
        X, y = self.fe.getFeaturesMatrix('train', feats, y_attribute)
        Xt, yt = self.fe.getFeaturesMatrix('test', feats, y_attribute)
        svmclf = LinearSVC(C=0.01, penalty='l1', dual=False)
        clf = GridSearchCV(svmclf, self.getGridSearchParams())
        clf = clf.fit(X, y)
        print(clf.best_params_)
        y_pred = clf.predict(Xt)
        print(clf.score(Xt, yt))
        pprint(self.eval.computeFscores(self.data.testTweets,
                                        self.fe.labelenc.inverse_transform(y_pred)))

    def trainStanceNone(self, feats):
        # feats = ['words2vec', 'topic1hot', 'pos']
        X, y = self.fe.getFeaturesStanceNone('train', feats)
        Xt, yt = self.fe.getFeaturesStanceNone('test', feats)
        svmclf = LinearSVC()
        stance_none_clf = GridSearchCV(svmclf, self.getGridSearchParams()).fit(X, y)
        # print(stance_none_clf.score(Xt, yt))
        pred = stance_none_clf.predict(Xt)
        print(classification_report(yt, pred))
        return stance_none_clf

    def trainFavorAgainst(self, feats):
        # feats = ['words2vec', 'topic1hot', 'pos']
        X, y = self.fe.getFeaturesFavorAgainst('train', feats)
        Xt, yt = self.fe.getFeaturesFavorAgainst('test', feats)
        svmclf = LinearSVC()
        fav_agnst_clf = GridSearchCV(svmclf, self.getGridSearchParams()).fit(X, y)
        pred = fav_agnst_clf.predict(Xt)
        print(classification_report(yt, pred))
        # print(fav_agnst_clf.score(Xt, yt))
        return fav_agnst_clf

    def buildModel2(self):
        # One SVM for Stance/None and another for Favor/Against.
        feats = ['words2vec', 'topic1hot', 'pos']
        print(feats)
        stance_none_clf = self.trainStanceNone(feats)
        fav_agnst_clf = self.trainFavorAgainst(feats)
        X_test, y_true = self.fe.getFeaturesMatrix('test', feats, 'stance')
        st_pred = stance_none_clf.predict(X_test)
        favaga_pred = fav_agnst_clf.predict(X_test)
        for index, row in enumerate(st_pred):
            if row == 3:
                st_pred[index] = favaga_pred[index]
        print(classification_report(y_true, st_pred))
        print(accuracy_score(y_true, st_pred))
        # assert(stance_none_clf.classes_[1] == 3)  # stance (3)
        # # > 0 means this class (stance) will be predicted,
        # # < 0 means none is predicted.
        # confi = stance_none_clf.decision_function(X_test)
        # # Treat as confident about none if confi < -0.25.
        # y_pred = fav_agnst_clf.predict(X_test)
        # print(accuracy_score(y_true, y_pred))
        # pprint(self.eval.computeFscores(self.data.testTweets,
        #                                 self.fe.labelenc.inverse_transform(y_pred)))
        # threshold = -0.25
        # confi_high = np.where(confi < threshold)[0]
        # for loc in confi_high:
        #     y_pred[loc] = self.fe.labelenc.transform('NONE')
        # print('Boosted', accuracy_score(y_true, y_pred))
        # print(len(np.where(y_pred == 0)[0]), len(np.where(y_pred == 1)[0]),
        #       len(np.where(y_pred == 2)[0]))
        # pprint(self.eval.computeFscores(self.data.testTweets,
        #                                 self.fe.labelenc.inverse_transform(y_pred)))

    def get_proba_one(self, model, X):
        predicted = model.predict_proba(X)
        return predicted[:, 1]

    def runXGBoostModel(self, model, model_name, X, target, X_test, crossOn):
        print("Trying to fit model")
        print(X.shape, target.shape)
        model.fit(X, target)
        print("Successfully fit model")
        predicted = self.get_proba_one(model, X)
        predicted_test = self.get_proba_one(model, X_test)
        predicted_test = model.predict(X_test)
        print(predicted_test)
        return predicted_test

    def word2VecXGBoost(self):
        feats = ['words2vec', 'pos', 'clusteredLexicons', 'top1grams', 'top2grams', 'topic1hot']
        # feats = ['words2vec']
        # feats = ['clusteredLexicons']
        # feats = ['pos']
        y_attribute = 'stance'
        X, y = self.fe.getFeaturesMatrix('train', feats, y_attribute)
        print(X.shape)
        Xt, yt = self.fe.getFeaturesMatrix('test', feats, y_attribute)
        # clf = LinearSVC(C=0.01, penalty='l1', dual=False)
        # clf = clf.fit(X, y)
        # y_pred = clf.predict(Xt)
        # f = open('pred', 'w')
        # for i in y_pred:
        #     # print(type(i))
        #     f.write('{0}'.format(i))
        # f.close()
        # print(clf.score(Xt, yt))
        # pprint(self.eval.computeFscores(self.data.testTweets,
        #                                 self.fe.labelenc.inverse_transform(y_pred)))
        m2_xgb = xgb.XGBClassifier(n_estimators=10, nthread=-1, max_depth=2, seed=500)
        # m2_xgb = GridSearchCV(m2_xgb, self.getGridSearchParamsForXGBoost())
        print("Run Model")
        y_pred = self.runXGBoostModel(m2_xgb, "m2_xgb_OS_ENN", X, y, Xt, True)
        # print(type(yt))
        # print(type(y_pred))
        # print(len(yt))
        # print(len(y_pred))
        # print(yt.shape)
        # print(y_pred.shape)
        # print(yt)
        # print(y_pred)
        # print(m2_xgb)
        print(accuracy_score(yt, y_pred))

    def buildModel3(self):
        # feats = [['words2vec'], ['pos'], ['clusteredLexicons']]
        feats = [['words2vec'], ['pos'], ['clusteredLexicons']]
        y_attribute = 'stance'
        y_pred = []
        y_t = []
        for f in feats:
            X, y = self.fe.getFeaturesMatrix('train', f, y_attribute)
            Xt, yt = self.fe.getFeaturesMatrix('test', f, y_attribute)
            clf = SVC(C=1, probability=True)
            clf = clf.fit(X, y)
            train_transform = clf.predict_log_proba(X)
            test_transform = clf.predict_log_proba(Xt)
            # print('Train transform ', train_transform.shape)
            # print('Test transform ', test_transform.shape)
            y_pred.append(train_transform)
            y_t.append(test_transform)
        # Sum the per-feature log-probabilities rather than stacking them.
        # y_pred_h = np.hstack(tuple(y_pred))
        # y_t_h = np.hstack(tuple(y_t))
        x = 0
        for i in y_pred:
            x += i
        y_pred_h = x
        x = 0
        for i in y_t:
            x += i
        y_t_h = x
        # print(type(y_pred_h))
        # print(y_pred_h[0])
        # print(y_pred_h.shape)
        regr = linear_model.LogisticRegression()
        regr.fit(y_pred_h, y)
        final_pred = regr.predict(y_t_h)
        print(accuracy_score(final_pred, yt))
        pprint(self.eval.computeFscores(self.data.testTweets,
                                        self.fe.labelenc.inverse_transform(final_pred)))
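A minimal, hypothetical driver for the class above, assuming the '../data' CSVs exist and that n is whatever dimensionality FeatureExtractor expects (100 here is illustrative, not a value from the original code):

if __name__ == '__main__':
    sd = StanceDetector(100)   # hypothetical n
    sd.buildBaseline('svm')    # bag-of-words baseline
    accuracy, macro_f = sd.buildSVMWord2VecWithClusters()
    print(accuracy, macro_f)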