def translate_maxibatch(maxibatch, num_to_target, num_prev_translated, mask=0):
    """Translates an individual maxibatch.

    Args:
        maxibatch: a list of sentences.
        num_to_target: dictionary mapping target vocabulary IDs to strings.
        num_prev_translated: the number of previously translated sentences.
        mask: number of trailing tokens to strip from each best hypothesis
            (before re-appending the EOS symbol); 0 disables stripping.
    """
    # Sort the maxibatch by length and split into minibatches.
    try:
        minibatches, idxs = util.read_all_lines(config, maxibatch,
                                                minibatch_size)
    except exception.Error as x:
        logging.error(x.msg)
        sys.exit(1)

    # Translate the minibatches and store the resulting beam (i.e.
    # translations and scores) for each sentence.
    beams = []
    for x in minibatches:
        y_dummy = numpy.zeros(shape=(len(x), 1))
        x, x_mask, _, _ = util.prepare_data(x, y_dummy, config.factors,
                                            maxlen=None)
        sample = translate_batch(session, sampler, x, x_mask,
                                 max_translation_len, normalization_alpha)
        beams.extend(sample)
        num_translated = num_prev_translated + len(beams)
        logging.info('Translated {} sents'.format(num_translated))

    # Put beams into the same order as the input maxibatch.
    # FIX: numpy.object was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin `object` is the documented, equivalent replacement.
    tmp = numpy.array(beams, dtype=object)
    ordered_beams = tmp[idxs.argsort()]

    # Write the translations to the output file.
    for i, beam in enumerate(ordered_beams):
        if nbest:
            num = num_prev_translated + i
            for sent, cost in beam:
                translation = util.seq2words(sent, num_to_target)
                line = "{} ||| {} ||| {}\n".format(num, translation,
                                                   str(cost))
                output_file.write(line)
        else:
            best_hypo, cost = beam[0]
            # print(best_hypo)
            # Truncate the hypothesis at the first EOS (ID 0), if present.
            eos_idx = list(best_hypo).index(0) if 0 in best_hypo else len(
                best_hypo)
            best_hypo = best_hypo[:eos_idx]
            # Optionally drop the last `mask` tokens, then restore EOS.
            best_hypo = best_hypo[:len(best_hypo) -
                                  mask] if len(best_hypo) > mask else []
            best_hypo = list(best_hypo) + [0]
            # print(best_hypo)
            line = util.seq2words(best_hypo, num_to_target) + '\n'
            output_file.write(line)
def main():
    """Train the model on prepared data and persist weights + architecture."""
    # Load the training / validation splits.
    training_set, validation_set = util.prepare_data()

    # Build the network and print its layer summary.
    model = get_model()
    model.summary()

    # Batch generators for both splits.
    batch_size = 128
    train_generator = util.next_train_batch(training_set, batch_size)
    valid_generator = util.next_valid_batch(validation_set, batch_size)

    # Training hyper-parameters.
    epochs = 5
    samples_per_epoch = 20480
    validation_samples = 4096
    model.compile(optimizer=Adam(1e-2), loss="mse")
    model.fit_generator(train_generator,
                        samples_per_epoch=samples_per_epoch,
                        nb_epoch=epochs,
                        validation_data=valid_generator,
                        nb_val_samples=validation_samples,
                        verbose=1)

    # Persist the learned weights and the model architecture separately.
    model.save_weights('model.h5')
    with open('model.json', 'w') as f:
        f.write(model.to_json())
def _translate(self, process_id, input_item, get_sampler, sess):
    """
    Actual translation (model sampling).
    """
    # Unpack the attributes of the queued work item.
    beam_size = input_item.k
    batch = input_item.batch
    alpha = input_item.normalization_alpha
    #max_ratio = input_item.max_ratio

    # prepare_data needs a target-side placeholder even for inference.
    dummy_targets = numpy.zeros(shape=(len(batch), 1))
    x, x_mask, _, _ = util.prepare_data(batch, dummy_targets,
                                        self._options[0].factors,
                                        maxlen=None)
    return translate_utils.translate_batch(
        session=sess,
        sampler=get_sampler(beam_size),
        x=x,
        x_mask=x_mask,
        max_translation_len=self._options[0].translation_maxlen,
        normalization_alpha=alpha)
def minimal_steiner_vary_with_keywords_num(times=20):
    '''
    Measures how the minimal Steiner tree algorithm's number of returned
    result nodes and running time vary with the number of query keywords.
    :param times: number of random trials per keyword count.
    :return: None (results are dumped to a JSON file).
    '''
    graph, categories, category_list = prepare_data()
    count_options = [2, 3, 4, 5, 6]
    num_of_options = len(count_options)
    num_nodes = np.zeros((num_of_options, times))
    costs = np.zeros((num_of_options, times))
    # FIX: renamed `dict` -> `results` so the builtin `dict` is not shadowed.
    results = {}
    for (i, count) in enumerate(count_options):
        for j in range(times):
            # keywords = generateNeighborKeywords(graph, categories, count, i*10 + j )  #seed = j
            keywords = generateKeywords(categories, count, i * 100 + j)
            num_nodes[i, j], costs[i, j] = testWithMinimalSteinerTree(
                graph, categories, keywords)
            # Progress indicator: one '>' per trial, newline every 10.
            if (j + 1) % 10 == 0:
                print('>')
            else:
                print('>', end='')
        print('==================%dkeyword finshed===============' % count)
    results['num_nodes'] = num_nodes.tolist()
    results['costs'] = costs.tolist()
    with open(
            '../outputs/minmal_steiner_vary_with_random_neighbor_keywords.json',
            'w') as f:
        json.dump(results, f)
def calc_loss_per_sentence(config, sess, text_iterator, model,
                           normalization_alpha=0):
    """Computes the model loss for every sentence pair in a corpus.

    Args:
        config: model config (only `factors` is read here).
        sess: TensorFlow session.
        text_iterator: iterator yielding (source, target) minibatches.
        model: model object exposing get_loss() and feed placeholders.
        normalization_alpha: length-normalization exponent; 0 disables it.

    Returns:
        A list with one (possibly length-normalized) loss per sentence.
    """
    losses = []
    loss_per_sentence = model.get_loss()
    for x_v, y_v in text_iterator:
        # Fail fast if the corpus factor count disagrees with the config.
        if len(x_v[0][0]) != config.factors:
            logging.error('Mismatch between number of factors in settings ({0}), and number in validation corpus ({1})\n'.format(config.factors, len(x_v[0][0])))
            sys.exit(1)
        x, x_mask, y, y_mask = util.prepare_data(x_v, y_v, config.factors,
                                                 maxlen=None)
        feeds = {model.inputs.x: x,
                 model.inputs.x_mask: x_mask,
                 model.inputs.y: y,
                 model.inputs.y_mask: y_mask,
                 model.inputs.training: False}
        loss_per_sentence_out = sess.run(loss_per_sentence, feed_dict=feeds)

        # normalize scores according to output length
        if normalization_alpha:
            # FIX: the original referenced the undefined name `y_v_mask_in`
            # (NameError); the target mask returned by prepare_data is
            # `y_mask`, whose columns are per-sentence token masks.
            adjusted_lengths = numpy.array(
                [numpy.count_nonzero(s) ** normalization_alpha
                 for s in y_mask.T])
            loss_per_sentence_out /= adjusted_lengths

        losses += list(loss_per_sentence_out)
        logging.info("Seen {0}".format(len(losses)))
    return losses
def main():
    """Fits a linear model to synthetic data by hand-rolled gradient descent."""
    torch.manual_seed(0)

    # Generate synthetic data from the true weight vector.
    w_true = torch.tensor([1, 2, 3], dtype=torch.float)
    N = 100
    X, y = prepare_data(N, w_true)

    # Initialise the weights; requires_grad=True keeps the computation graph
    # so loss.backward() can populate w.grad.
    w = torch.randn(w_true.size(0), requires_grad=True)

    # Training hyper-parameters.
    learning_rate = 0.1
    num_epochs = 20
    loss_list = []
    for epoch in range(1, num_epochs + 1):
        w.grad = None
        y_pred = torch.mv(X, w)
        loss = torch.mean((y_pred - y)**2)
        loss.backward()
        # FIX: append the Python float, not the tensor — appending the tensor
        # keeps every epoch's autograd graph alive (memory leak) and hands
        # plot_loss tensors instead of numbers.
        loss_list.append(loss.item())
        # print(w.grad)
        # Manual SGD step; writing through .data keeps w a leaf tensor.
        w.data = w - learning_rate * w.grad.data
        print(
            f'Epoch{epoch}: loss={loss.item():.4f} w={w.data} dL/dw={w.grad.data}'
        )
    plot_loss(loss_list)
def compare_vary_with_keywords_num(times=50):
    '''
    Compares the minimal / random / greedy Steiner tree algorithms on the
    number of returned result nodes and the running time as the number of
    query keywords varies.
    :param times: number of random trials per keyword count.
    :return: None (per-algorithm averages are printed).
    '''
    graph, categories, category_list = prepare_data()
    count_options = [2, 3, 4, 5, 6]
    num_of_options = len(count_options)
    # First axis indexes the algorithm: 0=minimal, 1=random, 2=greedy.
    num_nodes = np.zeros((3, num_of_options, times))
    costs = np.zeros((3, num_of_options, times))
    # FIX: removed the unused `dict = {}` local, which also shadowed the
    # builtin `dict`.
    for (i, count) in enumerate(count_options):
        for j in range(times):
            keywords = generateKeywords(category_list, count, i * 10 + j)  #seed = j
            num_nodes[0, i, j], costs[0, i, j] = testWithMinimalSteinerTree(
                graph, categories, keywords)
            num_nodes[1, i, j], costs[1, i, j] = testWithRandomSteinerTree(
                graph, categories, keywords)
            num_nodes[2, i, j], costs[2, i, j] = testWithGreedySteinerTree(
                graph, categories, keywords)
            # Progress indicator: one '>' per trial, newline every 10.
            if (j + 1) % 10 == 0:
                print('>')
            else:
                print('>', end='')
        print('==================%dkeyword finshed===============' % count)
    print(np.average(num_nodes, axis=2))
    print(np.average(costs, axis=2))
def main():
    """Fits a bias-free linear model to synthetic data with SGD + MSE."""
    torch.manual_seed(0)

    # Generate synthetic data from the true weight vector.
    w_true = torch.tensor([1, 2, 3], dtype=torch.float)
    N = 100
    X, y = prepare_data(N, w_true)

    # Model, loss and optimizer; nn.Linear initialises its own weights.
    model = nn.Linear(in_features=3, out_features=1, bias=False)
    criterion = nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    # print(list(model.parameters()))

    num_epochs = 10
    history = []
    for epoch in range(1, num_epochs + 1):
        optimizer.zero_grad()
        predictions = model(X)
        loss = criterion(predictions.view_as(y), y)
        loss.backward()
        history.append(loss.item())
        optimizer.step()

    # plt.plot(y)
    # plt.plot(model(X).detach().numpy())
    plot_loss(history)
def calc_cross_entropy_per_sentence(session, model, config, text_iterator,
                                    normalization_alpha=0.0):
    """Calculates cross entropy values for a parallel corpus.

    By default (when normalization_alpha is 0.0), the sentence-level cross
    entropy is calculated. If normalization_alpha is 1.0 then the per-token
    cross entropy is calculated. Other values of normalization_alpha may be
    useful if the cross entropy value will be used as a score for selecting
    between translation candidates (e.g. in reranking an n-nbest list).
    Using a different (empirically determined) alpha value can help correct
    a model bias toward too-short / too-long sentences.

    TODO Support for multiple GPUs

    Args:
        session: TensorFlow session.
        model: a RNNModel object.
        config: model config.
        text_iterator: TextIterator.
        normalization_alpha: length normalization hyperparameter.

    Returns:
        A pair of lists. The first contains the (possibly normalized) cross
        entropy value for each sentence pair. The second contains the
        target-side token count for each pair (including the terminating
        <EOS> symbol).
    """
    ce_vals, token_counts = [], []
    for src_batch, tgt_batch in text_iterator:
        # Fail fast on a factor-count mismatch between config and corpus.
        if len(src_batch[0][0]) != config.factors:
            logging.error('Mismatch between number of factors in settings '
                          '({0}) and number present in data ({1})'.format(
                              config.factors, len(src_batch[0][0])))
            sys.exit(1)
        x, x_mask, y, y_mask = util.prepare_data(src_batch, tgt_batch,
                                                 config.factors, maxlen=None)

        # Run the minibatch through the model to get the sentence-level
        # cross entropy values.
        feeds = {model.inputs.x: x,
                 model.inputs.x_mask: x_mask,
                 model.inputs.y: y,
                 model.inputs.y_mask: y_mask,
                 model.inputs.training: False}
        batch_ce_vals = session.run(model.loss_per_sentence, feed_dict=feeds)

        # Optionally, do length normalization.
        batch_token_counts = [numpy.count_nonzero(col) for col in y_mask.T]
        if normalization_alpha:
            scale = numpy.array(
                [n ** normalization_alpha for n in batch_token_counts])
            batch_ce_vals /= scale

        ce_vals.extend(batch_ce_vals)
        token_counts.extend(batch_token_counts)
        logging.info("Seen {}".format(len(ce_vals)))

    assert len(ce_vals) == len(token_counts)
    return ce_vals, token_counts
def calc_cross_entropy_per_sentence(session, model, config, text_iterator,
                                    normalization_alpha=0.0):
    """Calculates cross entropy values for a parallel corpus.

    By default (when normalization_alpha is 0.0), the sentence-level cross
    entropy is calculated. If normalization_alpha is 1.0 then the per-token
    cross entropy is calculated. Other values of normalization_alpha may be
    useful if the cross entropy value will be used as a score for selecting
    between translation candidates (e.g. in reranking an n-nbest list).
    Using a different (empirically determined) alpha value can help correct
    a model bias toward too-short / too-long sentences.

    TODO Support for multiple GPUs

    Args:
        session: TensorFlow session.
        model: a RNNModel object.
        config: model config.
        text_iterator: TextIterator.
        normalization_alpha: length normalization hyperparameter.

    Returns:
        A pair of lists. The first contains the (possibly normalized) cross
        entropy value for each sentence pair. The second contains the
        target-side token count for each pair (including the terminating
        <EOS> symbol).
    """
    ce_vals = []
    token_counts = []
    for xx, yy in text_iterator:
        n_factors = len(xx[0][0])
        # Abort if the data does not carry the configured number of factors.
        if n_factors != config.factors:
            logging.error('Mismatch between number of factors in settings '
                          '({0}) and number present in data ({1})'.format(
                              config.factors, n_factors))
            sys.exit(1)
        x, x_mask, y, y_mask = util.prepare_data(xx, yy, config.factors,
                                                 maxlen=None)

        # Run the minibatch through the model to get the sentence-level
        # cross entropy values.
        feeds = {model.inputs.x: x,
                 model.inputs.x_mask: x_mask,
                 model.inputs.y: y,
                 model.inputs.y_mask: y_mask,
                 model.inputs.training: False}
        batch_ce_vals = session.run(model.loss_per_sentence, feed_dict=feeds)

        # Optionally, do length normalization. Each mask column holds one
        # sentence, so its non-zero count is that sentence's token count.
        batch_token_counts = [numpy.count_nonzero(s) for s in y_mask.T]
        if normalization_alpha:
            batch_ce_vals = batch_ce_vals / numpy.array(
                [n ** normalization_alpha for n in batch_token_counts])

        ce_vals += list(batch_ce_vals)
        token_counts += batch_token_counts
        logging.info("Seen {}".format(len(ce_vals)))

    assert len(ce_vals) == len(token_counts)
    return ce_vals, token_counts
def reuse_encoders():
    """Trains and evaluates each registered autoencoder, then exits."""
    # FIX: removed the unused `counter` local and the unused loop value `f`
    # (only the key is needed).
    train_x, train_y, test_x, test_y = util.prepare_data()
    for k in encoder_funcs:
        print('Start reusing the ' + k + ' autoencoder!')
        # train the model
        train_and_evaluate(k, train_x, train_y, test_x, test_y)
    # NOTE(review): the flattened source does not show whether sys.exit(0)
    # sat inside or after the loop; it is placed after the loop here so all
    # encoders are processed — confirm against the original file.
    sys.exit(0)
def translate_maxibatch(maxibatch, model_set, num_to_target,
                        num_prev_translated):
    """Translates an individual maxibatch.

    Args:
        maxibatch: a list of sentences.
        model_set: an InferenceModelSet object.
        num_to_target: dictionary mapping target vocabulary IDs to strings.
        num_prev_translated: the number of previously translated sentences.
    """
    # Sort the maxibatch by length and split into minibatches.
    try:
        pre_minibatches, minibatches, idxs = util.read_all_lines(
            configs[0], maxibatch, minibatch_size)
    except exception.Error as x:
        logging.error(x.msg)
        sys.exit(1)

    # Translate the minibatches and store the resulting beam (i.e.
    # translations and scores) for each sentence.
    beams = []
    for px, x in zip(pre_minibatches, minibatches):
        y_dummy = numpy.zeros(shape=(len(x), 1))
        px, x, x_mask, _, _ = util.prepare_data(x, y_dummy,
                                                configs[0].factors, px,
                                                maxlen=None)
        sample = model_set.decode(session=session,
                                  px=px,
                                  x=x,
                                  x_mask=x_mask,
                                  beam_size=beam_size,
                                  normalization_alpha=normalization_alpha)
        beams.extend(sample)
        num_translated = num_prev_translated + len(beams)
        logging.info('Translated {} sents'.format(num_translated))

    # Put beams into the same order as the input maxibatch.
    # FIX: numpy.object was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin `object` is the documented, equivalent replacement.
    tmp = numpy.array(beams, dtype=object)
    ordered_beams = tmp[idxs.argsort()]

    # Write the translations to the output file.
    for i, beam in enumerate(ordered_beams):
        if nbest:
            num = num_prev_translated + i
            for sent, cost in beam:
                translation = util.seq2words(sent, num_to_target)
                line = "{} ||| {} ||| {}\n".format(num, translation,
                                                   str(cost))
                output_file.write(line)
        else:
            best_hypo, cost = beam[0]
            line = util.seq2words(best_hypo, num_to_target) + '\n'
            output_file.write(line)
def forward(self, data):
    """Runs one forward pass over a batch and returns the training loss."""
    batch = prepare_data(data)
    full_p_states, p_mask, full_q_states, q_mask = self.encode(batch)
    logits1, logits2, has_log = self.decode(full_p_states, p_mask,
                                            full_q_states, q_mask)
    loss = compute_loss(logits1, logits2, has_log,
                        batch['y1'], batch['y2'], batch['has_ans'])
    # Track the running training loss as a plain float.
    self.train_loss.update(loss.data.item())
    # Drop the large intermediate tensors eagerly before returning.
    del full_p_states, p_mask, full_q_states, q_mask
    del logits1, logits2, has_log
    return loss
def SelfEvaluate(self, batches, eval_file=None, answer_file=None,
                 drop_file=None, dev=None):
    # Evaluates the model on `batches`, writes the mapped answers to
    # `answer_file`, and returns the (exact-match, F1) scores.
    #   batches: iterable of raw evaluation batches.
    #   eval_file: path to a JSON file with per-question metadata
    #       (rebound below to the loaded dict).
    #   answer_file: path the predicted answers are dumped to.
    #   drop_file: path to a JSON file whose 'drop_ids' are blanked out.
    #   dev: path to the reference JSON passed to evaluate()
    #       (rebound below to the loaded dict).
    print('Starting evaluation')
    with open(eval_file, 'r', encoding='utf-8') as f:
        eval_file = json.load(f)
    with open(dev, 'r', encoding='utf-8') as f:
        dev = json.load(f)
    answer_dict = {}
    mapped_dict = {}
    for batch in batches:
        data = prepare_data(batch)
        full_p_states, p_mask, full_q_states, q_mask = self.encode(data)
        logits1, logits2, ans_log = self.decode(full_p_states, p_mask,
                                                full_q_states, q_mask)
        y1, y2, has_ans = get_predictions(logits1, logits2, ans_log)
        qa_id = data['id']
        answer_dict_, mapped_dict_ = convert_tokens(
            eval_file, qa_id, y1, y2, has_ans)
        answer_dict.update(answer_dict_)
        mapped_dict.update(mapped_dict_)
        # Free the per-batch tensors eagerly to limit peak memory.
        del full_p_states, p_mask, full_q_states, q_mask, y1, y2, \
            answer_dict_, mapped_dict_, has_ans, ans_log, logits1, logits2
    # Questions listed in the drop file get an empty answer.
    with open(drop_file, 'r', encoding='utf-8') as f:
        drop = json.load(f)
    for i in drop['drop_ids']:
        uuid = eval_file[str(i)]["uuid"]
        answer_dict[str(i)] = ''
        mapped_dict[uuid] = ''
    with open(answer_file, 'w', encoding='utf-8') as f:
        json.dump(mapped_dict, f)
    metrics = evaluate(dev, mapped_dict)
    # sub_path = join('./result/', "submit.csv")
    # #log.info('Writing submission file to {}...'.format(sub_path))
    # with open(sub_path, 'w') as csv_fh:
    #     csv_writer = csv.writer(csv_fh, delimiter=',')
    #     csv_writer.writerow(['Id', 'Predicted'])
    #     for uuid in sorted(mapped_dict):
    #         csv_writer.writerow([uuid, mapped_dict[uuid]])
    print("EM: {}, F1: {}, Has answer: {}, No answer: {}".format(
        metrics['exact'], metrics['f1'], metrics['HasAns_f1'],
        metrics['NoAns_f1']))
    return metrics['exact'], metrics['f1']
def translate_maxibatch(maxibatch, model_set, num_to_target,
                        num_prev_translated):
    """Translates an individual maxibatch.

    Args:
        maxibatch: a list of sentences.
        model_set: an InferenceModelSet object.
        num_to_target: dictionary mapping target vocabulary IDs to strings.
        num_prev_translated: the number of previously translated sentences.
    """
    # Sort the maxibatch by length and split into minibatches.
    try:
        minibatches, idxs = util.read_all_lines(configs[0], maxibatch,
                                                minibatch_size)
    except exception.Error as x:
        logging.error(x.msg)
        sys.exit(1)

    # Translate the minibatches and store the resulting beam (i.e.
    # translations and scores) for each sentence.
    beams = []
    for x in minibatches:
        y_dummy = numpy.zeros(shape=(len(x), 1))
        x, x_mask, _, _ = util.prepare_data(x, y_dummy, configs[0].factors,
                                            maxlen=None)
        sample = model_set.beam_search(
            session=session,
            x=x,
            x_mask=x_mask,
            beam_size=beam_size,
            normalization_alpha=normalization_alpha)
        beams.extend(sample)
        num_translated = num_prev_translated + len(beams)
        logging.info('Translated {} sents'.format(num_translated))

    # Put beams into the same order as the input maxibatch.
    # FIX: numpy.object was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin `object` is the documented, equivalent replacement.
    tmp = numpy.array(beams, dtype=object)
    ordered_beams = tmp[idxs.argsort()]

    # Write the translations to the output file.
    for i, beam in enumerate(ordered_beams):
        if nbest:
            num = num_prev_translated + i
            for sent, cost in beam:
                translation = util.seq2words(sent, num_to_target)
                line = "{} ||| {} ||| {}\n".format(num, translation,
                                                   str(cost))
                output_file.write(line)
        else:
            best_hypo, cost = beam[0]
            line = util.seq2words(best_hypo, num_to_target) + '\n'
            output_file.write(line)
def InitNet(self, netParam):
    """Builds the train/test network scheme, logs it, and prepares data."""
    netscheme = empty()
    netscheme.train_net, netscheme.test_net = net.NetSolve(netParam)
    # Log both constructed networks.
    for label, built_net in (("Training Network:", netscheme.train_net),
                             ("Testing Network:", netscheme.test_net)):
        self.logger.info(label + "\n" + str(built_net))
    self.cnf.netscheme = netscheme
    self.ff = util.prepare_cnf(self.cnf, self.logger)
    self.inputs, self.targets, self.test = util.prepare_data(
        self.cnf, self.logger)
def translate_worker(in_queue, out_queue, model, sess, config):
    """Worker loop: pulls (idx, batch) jobs, beam-searches them, pushes results.

    A `None` job is the shutdown sentinel. On failure the job is re-queued
    so it can be retried (possibly by another worker).
    """
    while True:
        job = in_queue.get()
        if job is None:
            break
        idx, x = job
        y_dummy = numpy.zeros(shape=(len(x), 1))
        x, x_mask, _, _ = util.prepare_data(x, y_dummy, config.factors,
                                            maxlen=None)
        try:
            samples = model.beam_search(sess, x, x_mask, config.beam_size)
            out_queue.put((idx, samples))
        except Exception:
            # FIX: the original bare `except:` also swallowed
            # KeyboardInterrupt/SystemExit and hid the error entirely while
            # re-queueing forever; catch Exception and log before retrying.
            logging.exception('beam search failed; re-queueing job %s', idx)
            in_queue.put(job)
def _translate(self, process_id, input_item, models, sess):
    """
    Actual translation (model sampling).
    """
    # Unpack the attributes of the queued work item.
    beam_size = input_item.k
    batch = input_item.batch
    #max_ratio = input_item.max_ratio

    # prepare_data needs a target-side placeholder even for inference.
    dummy_targets = numpy.zeros(shape=(len(batch), 1))
    x, x_mask, _, _ = prepare_data(batch, dummy_targets, maxlen=None)
    return inference.beam_search(models, sess, x, x_mask, beam_size)
def _translate(self, process_id, input_item, ensemble, sess):
    """
    Actual translation (model sampling).
    """
    # Unpack the attributes of the queued work item.
    beam_size = input_item.k
    batch = input_item.batch
    #max_ratio = input_item.max_ratio

    # prepare_data needs a target-side placeholder even for inference.
    dummy_targets = numpy.zeros(shape=(len(batch), 1))
    x, x_mask, _, _ = util.prepare_data(batch, dummy_targets,
                                        self._options[0].factors,
                                        maxlen=None)
    return ensemble.beam_search(sess, x, x_mask, beam_size)
def compare_head_tail_keyword_in_mashup(sample_count=50):
    '''
    Uses the categories of the APIs invoked by each mashup (one category per
    API) as query keywords and measures the number of returned nodes and the
    elapsed time. For each group of mashup API categories, two query forms
    are issued: for keywords ABCD we would query both ABCD and the head/tail
    pair AD. Because the API graph is built from mashups, the number of
    returned nodes should be at most the number of APIs the mashup invokes.
    :param sample_count: number of mashup samples per keyword count.
    :return: None (results are dumped to a JSON file).
    '''
    sample_dict = mashup.sample_categories_of_mashup(sample_count)
    graph, categories, _ = prepare_data()
    count_options = [2, 3, 4, 5, 6]
    num_of_options = len(count_options)
    # First axis indexes the algorithm: 0=minimal, 1=random, 2=greedy.
    num_nodes = np.zeros((3, num_of_options, sample_count))
    costs = np.zeros((3, num_of_options, sample_count))
    for (i, count) in enumerate(count_options):
        keywords_array = sample_dict[count]
        for j in range(sample_count):
            # Head/tail pair: first and last category of this sample.
            keywords = [keywords_array[j][0], keywords_array[j][-1]]
            num_nodes[0, i, j], costs[0, i, j] = testWithMinimalSteinerTree(
                graph, categories, keywords)
            num_nodes[1, i, j], costs[1, i, j] = testWithRandomSteinerTree(
                graph, categories, keywords)
            num_nodes[2, i, j], costs[2, i, j] = testWithGreedySteinerTree(
                graph, categories, keywords)
            # Progress indicator: one '>' per trial, newline every 10.
            if (j + 1) % 10 == 0:
                print('>')
            else:
                print('>', end='')
        print('==================%dkeyword finshed===============' % count)
    # FIX: renamed `dict` -> `results` so the builtin `dict` is not shadowed.
    results = {}
    results['num_nodes'] = num_nodes.tolist()
    results['costs'] = costs.tolist()
    with open('../outputs/compare_head_tail_keywords_in_mashup.json',
              'w') as f:
        json.dump(results, f)
from util import prepare_data, submit
from models.lgb import get_lgb_predictions
from models.nn import get_nn_predictions

if __name__ == "__main__":
    # Prepare the galactic / extragalactic splits and metadata.
    (train_gal, train_exgal, test_gal, test_exgal,
     gal_class_list, exgal_class_list, test_df) = prepare_data()

    # First-level model: LightGBM out-of-fold and test predictions.
    lgb_oof_gal, lgb_oof_exgal, lgb_test_gal, lgb_test_exgal = \
        get_lgb_predictions(train_gal, train_exgal, test_gal, test_exgal)

    # Append each LGB class probability as a feature column for stacking.
    lgb_gal_preds = []
    for i in range(lgb_oof_gal.shape[1]):
        column = "lgb_pred" + str(i)
        lgb_gal_preds.append(column)
        train_gal[column] = lgb_oof_gal[:, i]
        test_gal[column] = lgb_test_gal[:, i]

    lgb_exgal_preds = []
    for i in range(lgb_oof_exgal.shape[1]):
        column = "lgb_pred" + str(i)
        lgb_exgal_preds.append(column)
        train_exgal[column] = lgb_oof_exgal[:, i]
        test_exgal[column] = lgb_test_exgal[:, i]

    # Second-level model: neural-network stacker over the enriched features.
    oof_preds_gal, oof_preds_exgal, test_preds_gal, test_preds_exgal = \
        get_nn_predictions(train_gal, train_exgal, test_gal, test_exgal)

    submit(test_df, test_preds_gal, test_preds_exgal, gal_class_list,
           exgal_class_list, "submissions/stacking.csv")
def train(config, sess):
    # Trains a StandardModel on the corpus described by `config`, with
    # periodic display, sampling, beam-search output, validation, and
    # checkpointing. Training state (uidx/eidx/history) lives in `progress`
    # so runs can be resumed.
    #   config: training configuration namespace.
    #   sess: TensorFlow session the graph is built and run in.
    assert (config.prior_model != None and (tf.train.checkpoint_exists(os.path.abspath(config.prior_model))) or (config.map_decay_c==0.0)), \
        "MAP training requires a prior model file: Use command-line option --prior_model"

    logging.info('Building model...')
    model = StandardModel(config)

    if config.optimizer == 'adam':
        optimizer = tf.train.AdamOptimizer(learning_rate=config.learning_rate)
    else:
        logging.error('No valid optimizer defined: {}'.format(
            config.optimizer))
        sys.exit(1)

    # 'time' holds the global update counter; it is restored from the
    # saved progress below.
    init = tf.zeros_initializer(dtype=tf.int32)
    global_step = tf.get_variable('time', [], initializer=init,
                                  trainable=False)

    if config.summaryFreq:
        summary_dir = (config.summary_dir if config.summary_dir is not None
                       else os.path.abspath(os.path.dirname(config.saveto)))
        writer = tf.summary.FileWriter(summary_dir, sess.graph)
    else:
        writer = None

    updater = ModelUpdater(config, model, optimizer, global_step, writer)

    saver, progress = init_or_restore_variables(config, sess, train=True)

    global_step.load(progress.uidx, sess)

    #save model options
    config_as_dict = OrderedDict(sorted(vars(config).items()))
    json.dump(config_as_dict, open('%s.json' % config.saveto, 'wb'),
              indent=2)

    text_iterator, valid_text_iterator = load_data(config)
    _, _, num_to_source, num_to_target = load_dictionaries(config)
    total_loss = 0.
    n_sents, n_words = 0, 0
    last_time = time.time()
    logging.info("Initial uidx={}".format(progress.uidx))
    for progress.eidx in xrange(progress.eidx, config.max_epochs):
        logging.info('Starting epoch {0}'.format(progress.eidx))
        for source_sents, target_sents in text_iterator:
            # Debug dump of the raw minibatch.
            print("")
            print("")
            print("")
            print("########## Source Sents ############")
            print(source_sents)
            print("")
            print("")
            print("")
            print("########## Target Sents ############")
            print(target_sents)
            if len(source_sents[0][0]) != config.factors:
                logging.error(
                    'Mismatch between number of factors in settings ({0}), and number in training corpus ({1})\n'
                    .format(config.factors, len(source_sents[0][0])))
                sys.exit(1)
            x_in, x_mask_in, y_in, y_mask_in = util.prepare_data(
                source_sents, target_sents, config.factors, maxlen=None)
            # prepare_data returns None when no sentence fits the length cap.
            if x_in is None:
                logging.info(
                    'Minibatch with zero sample under length {0}'.format(
                        config.maxlen))
                continue
            write_summary_for_this_batch = config.summaryFreq and (
                (progress.uidx % config.summaryFreq == 0) or
                (config.finish_after
                 and progress.uidx % config.finish_after == 0))
            (factors, seqLen, batch_size) = x_in.shape
            loss = updater.update(sess, x_in, x_mask_in, y_in, y_mask_in,
                                  write_summary_for_this_batch)
            total_loss += loss
            n_sents += batch_size
            n_words += int(numpy.sum(y_mask_in))
            progress.uidx += 1

            # Periodic progress display; counters reset after each report.
            if config.dispFreq and progress.uidx % config.dispFreq == 0:
                duration = time.time() - last_time
                disp_time = datetime.now().strftime('[%Y-%m-%d %H:%M:%S]')
                logging.info(
                    '{0} Epoch: {1} Update: {2} Loss/word: {3} Words/sec: {4} Sents/sec: {5}'
                    .format(disp_time, progress.eidx, progress.uidx,
                            total_loss / n_words, n_words / duration,
                            n_sents / duration))
                last_time = time.time()
                total_loss = 0.
                n_sents = 0
                n_words = 0

            # Periodically greedy-sample translations for up to 10 sentences
            # of the current minibatch.
            if config.sampleFreq and progress.uidx % config.sampleFreq == 0:
                x_small, x_mask_small, y_small = x_in[:, :, :
                                                      10], x_mask_in[:, :
                                                                     10], y_in[:, :
                                                                               10]
                samples = model.sample(sess, x_small, x_mask_small)
                assert len(samples) == len(x_small.T) == len(
                    y_small.T), (len(samples), x_small.shape, y_small.shape)
                for xx, yy, ss in zip(x_small.T, y_small.T, samples):
                    source = util.factoredseq2words(xx, num_to_source)
                    target = util.seq2words(yy, num_to_target)
                    sample = util.seq2words(ss, num_to_target)
                    logging.info('SOURCE: {}'.format(source))
                    logging.info('TARGET: {}'.format(target))
                    logging.info('SAMPLE: {}'.format(sample))

            # Periodically beam-search the same prefix of the minibatch.
            if config.beamFreq and progress.uidx % config.beamFreq == 0:
                x_small, x_mask_small, y_small = x_in[:, :, :
                                                      10], x_mask_in[:, :
                                                                     10], y_in[:, :
                                                                               10]
                samples = model.beam_search(sess, x_small, x_mask_small,
                                            config.beam_size)
                # samples is a list with shape batch x beam x len
                assert len(samples) == len(x_small.T) == len(
                    y_small.T), (len(samples), x_small.shape, y_small.shape)
                for xx, yy, ss in zip(x_small.T, y_small.T, samples):
                    source = util.factoredseq2words(xx, num_to_source)
                    target = util.seq2words(yy, num_to_target)
                    logging.info('SOURCE: {}'.format(source))
                    logging.info('TARGET: {}'.format(target))
                    for i, (sample_seq, cost) in enumerate(ss):
                        sample = util.seq2words(sample_seq, num_to_target)
                        msg = 'SAMPLE {}: {} Cost/Len/Avg {}/{}/{}'.format(
                            i, sample, cost, len(sample), cost / len(sample))
                        logging.info(msg)

            # Periodic validation with best-model checkpointing and
            # patience-based early stopping.
            if config.validFreq and progress.uidx % config.validFreq == 0:
                costs = validate(config, sess, valid_text_iterator, model)
                # validation loss is mean of normalized sentence log probs
                valid_loss = sum(costs) / len(costs)
                if (len(progress.history_errs) == 0 or
                        valid_loss < min(progress.history_errs)):
                    progress.history_errs.append(valid_loss)
                    progress.bad_counter = 0
                    saver.save(sess, save_path=config.saveto)
                    progress_path = '{0}.progress.json'.format(config.saveto)
                    progress.save_to_json(progress_path)
                else:
                    progress.history_errs.append(valid_loss)
                    progress.bad_counter += 1
                    if progress.bad_counter > config.patience:
                        logging.info('Early Stop!')
                        progress.estop = True
                        break
                # Optional external validation script (e.g. BLEU); the best
                # score gets its own checkpoint.
                if config.valid_script is not None:
                    score = validate_with_script(sess, model, config,
                                                 valid_text_iterator)
                    need_to_save = (
                        score is not None and
                        (len(progress.valid_script_scores) == 0 or
                         score > max(progress.valid_script_scores)))
                    if score is None:
                        score = 0.0  # ensure a valid value is written
                    progress.valid_script_scores.append(score)
                    if need_to_save:
                        save_path = config.saveto + ".best-valid-script"
                        saver.save(sess, save_path=save_path)
                        progress_path = '{}.progress.json'.format(save_path)
                        progress.save_to_json(progress_path)

            # Periodic unconditional checkpoint, tagged with the update count.
            if config.saveFreq and progress.uidx % config.saveFreq == 0:
                saver.save(sess, save_path=config.saveto,
                           global_step=progress.uidx)
                progress_path = '{0}-{1}.progress.json'.format(
                    config.saveto, progress.uidx)
                progress.save_to_json(progress_path)

            # Hard stop after a fixed number of updates.
            if config.finish_after and progress.uidx % config.finish_after == 0:
                logging.info("Maximum number of updates reached")
                saver.save(sess, save_path=config.saveto,
                           global_step=progress.uidx)
                progress.estop = True
                progress_path = '{0}-{1}.progress.json'.format(
                    config.saveto, progress.uidx)
                progress.save_to_json(progress_path)
                break
        if progress.estop:
            break
def player_list():
    """Returns a JSON string containing the list of player info records."""
    raw_entries = prepare_data()
    return json.dumps({'players': get_player_info(raw_entries)})
def train(config, sess):
    """Train an NMT model (optionally with MRT loss) inside `sess`.

    Builds one model replica per available GPU, then loops over the training
    corpus updating the model, periodically logging samples/beam output,
    validating (with early stopping), and checkpointing.

    Args:
        config: namespace of training options (model type, schedules, freqs...).
        sess: an active TensorFlow session.
    """
    assert (config.prior_model != None and (tf.train.checkpoint_exists(os.path.abspath(config.prior_model))) or (config.map_decay_c==0.0)), \
    "MAP training requires a prior model file: Use command-line option --prior_model"

    # Construct the graph, with one model replica per GPU
    num_gpus = len(tf_utils.get_available_gpus())
    num_replicas = max(1, num_gpus)

    # MRT needs all candidate translations of a sentence on one device, so
    # device/batch splitting options are heavily restricted in this mode.
    if config.loss_function == 'MRT':
        assert config.gradient_aggregation_steps == 1
        assert config.max_sentences_per_device == 0, "MRT mode does not support sentence-based split"
        if config.max_tokens_per_device != 0:
            assert (config.samplesN * config.maxlen <= config.max_tokens_per_device), "need to make sure candidates of a sentence could be " \
                "feed into the model"
        else:
            assert num_replicas == 1, "MRT mode does not support sentence-based split"
            assert (config.samplesN * config.maxlen <= config.token_batch_size), "need to make sure candidates of a sentence could be " \
                "feed into the model"

    logging.info('Building model...')
    replicas = []
    for i in range(num_replicas):
        device_type = "GPU" if num_gpus > 0 else "CPU"
        device_spec = tf.DeviceSpec(device_type=device_type, device_index=i)
        with tf.device(device_spec):
            # reuse=(i>0): replicas after the first share the same variables.
            with tf.variable_scope(tf.get_variable_scope(), reuse=(i>0)):
                if config.model_type == "transformer":
                    model = TransformerModel(config)
                else:
                    model = rnn_model.RNNModel(config)
                replicas.append(model)

    # Non-trainable step counter; restored from `progress.uidx` below.
    init = tf.zeros_initializer(dtype=tf.int32)
    global_step = tf.get_variable('time', [], initializer=init,
                                  trainable=False)

    if config.learning_schedule == "constant":
        schedule = learning_schedule.ConstantSchedule(config.learning_rate)
    elif config.learning_schedule == "transformer":
        schedule = learning_schedule.TransformerSchedule(
            global_step=global_step, dim=config.state_size,
            warmup_steps=config.warmup_steps)
    elif config.learning_schedule == "warmup-plateau-decay":
        schedule = learning_schedule.WarmupPlateauDecaySchedule(
            global_step=global_step, peak_learning_rate=config.learning_rate,
            warmup_steps=config.warmup_steps,
            plateau_steps=config.plateau_steps)
    else:
        logging.error('Learning schedule type is not valid: {}'.format(
            config.learning_schedule))
        sys.exit(1)

    if config.optimizer == 'adam':
        optimizer = tf.train.AdamOptimizer(
            learning_rate=schedule.learning_rate,
            beta1=config.adam_beta1,
            beta2=config.adam_beta2,
            epsilon=config.adam_epsilon)
    else:
        logging.error('No valid optimizer defined: {}'.format(config.optimizer))
        sys.exit(1)

    if config.summary_freq:
        summary_dir = (config.summary_dir if config.summary_dir is not None
                       else os.path.abspath(os.path.dirname(config.saveto)))
        writer = tf.summary.FileWriter(summary_dir, sess.graph)
    else:
        writer = None

    updater = ModelUpdater(config, num_gpus, replicas, optimizer, global_step,
                           writer)

    if config.exponential_smoothing > 0.0:
        smoothing = ExponentialSmoothing(config.exponential_smoothing)

    saver, progress = model_loader.init_or_restore_variables(
        config, sess, train=True)

    global_step.load(progress.uidx, sess)

    # Sampling / beam search only ever use the first replica (no multi-GPU
    # inference support).
    if config.sample_freq:
        random_sampler = RandomSampler(
            models=[replicas[0]], configs=[config], beam_size=1)

    if config.beam_freq or config.valid_script is not None:
        beam_search_sampler = BeamSearchSampler(
            models=[replicas[0]], configs=[config],
            beam_size=config.beam_size)

    # save model options
    write_config_to_json_file(config, config.saveto)

    text_iterator, valid_text_iterator = load_data(config)
    _, _, num_to_source, num_to_target = util.load_dictionaries(config)
    total_loss = 0.
    n_sents, n_words = 0, 0
    last_time = time.time()
    logging.info("Initial uidx={}".format(progress.uidx))
    # set epoch = 1 if print per-token-probability
    if config.print_per_token_pro:
        config.max_epochs = progress.eidx+1
    for progress.eidx in range(progress.eidx, config.max_epochs):
        logging.info('Starting epoch {0}'.format(progress.eidx))
        for source_sents, target_sents in text_iterator:
            if len(source_sents[0][0]) != config.factors:
                logging.error('Mismatch between number of factors in settings ({0}), and number in training corpus ({1})\n'.format(config.factors, len(source_sents[0][0])))
                sys.exit(1)
            x_in, x_mask_in, y_in, y_mask_in = util.prepare_data(
                source_sents, target_sents, config.factors, maxlen=None)
            if x_in is None:
                logging.info('Minibatch with zero sample under length {0}'.format(config.maxlen))
                continue
            write_summary_for_this_batch = config.summary_freq and ((progress.uidx % config.summary_freq == 0) or (config.finish_after and progress.uidx % config.finish_after == 0))
            (factors, seqLen, batch_size) = x_in.shape

            # `output` is the batch loss, or per-token probabilities when
            # config.print_per_token_pro is set.
            output = updater.update(
                sess, x_in, x_mask_in, y_in, y_mask_in, num_to_target,
                write_summary_for_this_batch)

            if config.print_per_token_pro == False:
                total_loss += output
            else:
                # write per-token probability into the file
                f = open(config.print_per_token_pro, 'a')
                for pro in output:
                    pro = str(pro) + '\n'
                    f.write(pro)
                f.close()

            n_sents += batch_size
            n_words += int(numpy.sum(y_mask_in))
            progress.uidx += 1

            # Update the smoothed version of the model variables.
            # To reduce the performance overhead, we only do this once every
            # N steps (the smoothing factor is adjusted accordingly).
            if config.exponential_smoothing > 0.0 and progress.uidx % smoothing.update_frequency == 0:
                sess.run(fetches=smoothing.update_ops)

            if config.disp_freq and progress.uidx % config.disp_freq == 0:
                duration = time.time() - last_time
                disp_time = datetime.now().strftime('[%Y-%m-%d %H:%M:%S]')
                logging.info('{0} Epoch: {1} Update: {2} Loss/word: {3} Words/sec: {4} Sents/sec: {5}'.format(disp_time, progress.eidx, progress.uidx, total_loss/n_words, n_words/duration, n_sents/duration))
                last_time = time.time()
                # Reset the running statistics for the next display window.
                total_loss = 0.
                n_sents = 0
                n_words = 0

            # Periodically greedy-sample translations for the first 10
            # sentences of the current batch, for qualitative monitoring.
            if config.sample_freq and progress.uidx % config.sample_freq == 0:
                x_small = x_in[:, :, :10]
                x_mask_small = x_mask_in[:, :10]
                y_small = y_in[:, :10]
                samples = translate_utils.translate_batch(
                    sess, random_sampler, x_small, x_mask_small,
                    config.translation_maxlen, 0.0)
                assert len(samples) == len(x_small.T) == len(y_small.T), \
                    (len(samples), x_small.shape, y_small.shape)
                for xx, yy, ss in zip(x_small.T, y_small.T, samples):
                    source = util.factoredseq2words(xx, num_to_source)
                    target = util.seq2words(yy, num_to_target)
                    sample = util.seq2words(ss[0][0], num_to_target)
                    logging.info('SOURCE: {}'.format(source))
                    logging.info('TARGET: {}'.format(target))
                    logging.info('SAMPLE: {}'.format(sample))

            # Same monitoring, but with full beam search (all hypotheses).
            if config.beam_freq and progress.uidx % config.beam_freq == 0:
                x_small = x_in[:, :, :10]
                x_mask_small = x_mask_in[:, :10]
                y_small = y_in[:,:10]
                samples = translate_utils.translate_batch(
                    sess, beam_search_sampler, x_small, x_mask_small,
                    config.translation_maxlen, config.normalization_alpha)
                assert len(samples) == len(x_small.T) == len(y_small.T), \
                    (len(samples), x_small.shape, y_small.shape)
                for xx, yy, ss in zip(x_small.T, y_small.T, samples):
                    source = util.factoredseq2words(xx, num_to_source)
                    target = util.seq2words(yy, num_to_target)
                    logging.info('SOURCE: {}'.format(source))
                    logging.info('TARGET: {}'.format(target))
                    for i, (sample_seq, cost) in enumerate(ss):
                        sample = util.seq2words(sample_seq, num_to_target)
                        msg = 'SAMPLE {}: {} Cost/Len/Avg {}/{}/{}'.format(
                            i, sample, cost, len(sample), cost/len(sample))
                        logging.info(msg)

            if config.valid_freq and progress.uidx % config.valid_freq == 0:
                # When smoothing is on, validate with the smoothed weights by
                # swapping them in, then swapping the raw weights back.
                if config.exponential_smoothing > 0.0:
                    sess.run(fetches=smoothing.swap_ops)
                    valid_ce = validate(sess, replicas[0], config,
                                        valid_text_iterator)
                    sess.run(fetches=smoothing.swap_ops)
                else:
                    valid_ce = validate(sess, replicas[0], config,
                                        valid_text_iterator)
                if (len(progress.history_errs) == 0 or
                        valid_ce < min(progress.history_errs)):
                    # New best cross-entropy: save and reset patience counter.
                    progress.history_errs.append(valid_ce)
                    progress.bad_counter = 0
                    save_non_checkpoint(sess, saver, config.saveto)
                    progress_path = '{0}.progress.json'.format(config.saveto)
                    progress.save_to_json(progress_path)
                else:
                    progress.history_errs.append(valid_ce)
                    progress.bad_counter += 1
                    if progress.bad_counter > config.patience:
                        logging.info('Early Stop!')
                        progress.estop = True
                        break
                # Optional external validation (e.g. BLEU script); higher is
                # better, unlike cross-entropy above.
                if config.valid_script is not None:
                    if config.exponential_smoothing > 0.0:
                        sess.run(fetches=smoothing.swap_ops)
                        score = validate_with_script(sess, beam_search_sampler)
                        sess.run(fetches=smoothing.swap_ops)
                    else:
                        score = validate_with_script(sess, beam_search_sampler)
                    need_to_save = (
                        score is not None and
                        (len(progress.valid_script_scores) == 0 or
                         score > max(progress.valid_script_scores)))
                    if score is None:
                        score = 0.0  # ensure a valid value is written
                    progress.valid_script_scores.append(score)
                    if need_to_save:
                        progress.bad_counter = 0
                        save_path = config.saveto + ".best-valid-script"
                        save_non_checkpoint(sess, saver, save_path)
                        write_config_to_json_file(config, save_path)
                        progress_path = '{}.progress.json'.format(save_path)
                        progress.save_to_json(progress_path)

            if config.save_freq and progress.uidx % config.save_freq == 0:
                saver.save(sess, save_path=config.saveto,
                           global_step=progress.uidx)
                write_config_to_json_file(config, "%s-%s" % (config.saveto, progress.uidx))
                progress_path = '{0}-{1}.progress.json'.format(config.saveto, progress.uidx)
                progress.save_to_json(progress_path)

            if config.finish_after and progress.uidx % config.finish_after == 0:
                logging.info("Maximum number of updates reached")
                saver.save(sess, save_path=config.saveto,
                           global_step=progress.uidx)
                write_config_to_json_file(config, "%s-%s" % (config.saveto, progress.uidx))
                progress.estop=True
                progress_path = '{0}-{1}.progress.json'.format(config.saveto, progress.uidx)
                progress.save_to_json(progress_path)
                break
        if progress.estop:
            break
# using gredient to update model parameters optimizer.step() iters += 1 if iters % 300 == 0: for test_val, test_target in test_loader: test_outputs = Mymodel(test_val) loss2 = loss_function(test_outputs, test_target) print('Iteration: {}. TrainLoss: {}. TestLoss: {}'.format( iters, loss.item(), loss2.item())) torch.save( Mymodel.state_dict(), 'Trained_model/trained_model_' + str(iters) + '.pkl') plt.plot(hisloss) plt.xlabel('Iteration') plt.ylabel('Training loss') plt.title('Traing process') plt.grid(True) plt.savefig('Trained_model/loss.png') return Mymodel def prediction(Mymodel, seq): return if __name__ == "__main__": data = get_ts_dxy(1) train, test = prepare_data(data, 8) Mymodel = training_model(train, test, num_epochs=10)
def train(in_args):
    """Starts the training process of the neural network.

    Trains (or resumes from a checkpoint) an image classifier, printing
    running train/test metrics every 5 steps, saves a checkpoint to
    `in_args.save_dir`, then reports final validation loss/accuracy.

    Args:
        in_args: parsed command-line arguments (data_directory, checkpoint,
            gpu, arch, hidden_units, learning_rate, epochs, save_dir).
    """
    # Preparing data
    train_loader, test_loader, validation_loader, number_of_classes, data_labels = util.prepare_data(
        in_args.data_directory)

    # Preparing Model, Criterion and Optimizer — resume if a checkpoint was
    # given, otherwise build a fresh model.
    if (in_args.checkpoint):
        model, data_labels, criterion, optimizer = load_checkpoint(
            in_args.checkpoint, in_args.gpu, True)
    else:
        model = create_model(gpu=in_args.gpu,
                             arch=in_args.arch,
                             hidden_layer_size=in_args.hidden_units,
                             output_size=number_of_classes)
        criterion = nn.NLLLoss()
        optimizer = optim.Adam(model.classifier.parameters(),
                               lr=in_args.learning_rate)
    if (model):
        device = 'cuda' if in_args.gpu else 'cpu'
        running_loss = 0
        for epoch in range(in_args.epochs):
            steps = 0
            for inputs, labels in train_loader:
                steps += 1
                inputs, labels = inputs.to(device), labels.to(device)
                optimizer.zero_grad()
                output = model(inputs)
                loss = criterion(output, labels)
                loss.backward()
                optimizer.step()
                running_loss += loss.item()
                # Every 5 steps, evaluate on the test set and report.
                if steps % 5 == 0:
                    test_loss = 0
                    accuracy = 0
                    model.eval()
                    with torch.no_grad():
                        for inputs, labels in test_loader:
                            inputs, labels = inputs.to(device), labels.to(
                                device)
                            output = model(inputs)
                            batch_loss = criterion(output, labels)
                            test_loss += batch_loss.item()
                            # Model outputs log-probabilities (NLLLoss), so
                            # exponentiate to recover probabilities.
                            probability = torch.exp(output)
                            top_p, top_class = probability.topk(1, dim=1)
                            equals = top_class == labels.view(*top_class.shape)
                            accuracy += torch.mean(
                                equals.type(torch.FloatTensor)).item()
                    print("---------")
                    print("Epoch {}/{}. Step {}.".format(
                        (epoch + 1), in_args.epochs, steps))
                    # running_loss covers exactly the last 5 steps (reset below).
                    print("Train loss: {}".format("%.3f" % (running_loss / 5)))
                    print("Test loss: {}".format(
                        "%.3f" % (test_loss / len(test_loader))))
                    print("Test accuracy: {}".format(
                        "%.3f" % (accuracy / len(test_loader))))
                    print("---------")
                    running_loss = 0
                    model.train()  # back to training mode after evaluation
        save_directory = in_args.save_dir
        print(
            "Network training finished.\nPlease wait while progress is being saved to file {}."
            .format(save_directory))
        # Persist everything needed to resume training or run inference.
        checkpoint = {
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'hidden_layer_size': in_args.hidden_units,
            'output_size': number_of_classes,
            'architecture': in_args.arch,
            'data_labels': data_labels
        }
        torch.save(checkpoint, save_directory)
        print(
            "The neural network will perform a validation test. Please wait.")
        # Final held-out validation pass.
        validation_loss = 0
        accuracy = 0
        model.eval()
        with torch.no_grad():
            for inputs, labels in validation_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                output = model(inputs)
                batch_loss = criterion(output, labels)
                validation_loss += batch_loss.item()
                probability = torch.exp(output)
                top_p, top_class = probability.topk(1, dim=1)
                equals = top_class == labels.view(*top_class.shape)
                accuracy += torch.mean(equals.type(torch.FloatTensor)).item()
        print("Validation loss: {}".format(
            "%.3f" % (validation_loss / len(validation_loader))))
        print("Validation accuracy: {}".format(
            "%.3f" % (accuracy / len(validation_loader))))
        print("Above results show expected performance during inference.")
def train(config, sess):
    """Train an NMT model inside `sess`.

    Builds one model replica per available GPU and runs the main training
    loop: per-batch updates, periodic sampling/beam-search logging,
    validation with early stopping, and periodic checkpointing.

    Args:
        config: namespace of training options.
        sess: an active TensorFlow session.
    """
    assert (config.prior_model != None and (tf.train.checkpoint_exists(os.path.abspath(config.prior_model))) or (config.map_decay_c==0.0)), \
    "MAP training requires a prior model file: Use command-line option --prior_model"

    # Construct the graph, with one model replica per GPU
    num_gpus = len(util.get_available_gpus())
    num_replicas = max(1, num_gpus)

    logging.info('Building model...')
    replicas = []
    for i in range(num_replicas):
        device_type = "GPU" if num_gpus > 0 else "CPU"
        device_spec = tf.DeviceSpec(device_type=device_type, device_index=i)
        with tf.device(device_spec):
            # reuse=(i>0): all replicas share one set of variables.
            with tf.variable_scope(tf.get_variable_scope(), reuse=(i>0)):
                if config.model_type == "transformer":
                    model = TransformerModel(config)
                else:
                    model = rnn_model.RNNModel(config)
                replicas.append(model)

    # Non-trainable step counter; restored from `progress.uidx` below.
    init = tf.zeros_initializer(dtype=tf.int32)
    global_step = tf.get_variable('time', [], initializer=init,
                                  trainable=False)

    if config.learning_schedule == "constant":
        schedule = ConstantSchedule(config.learning_rate)
    elif config.learning_schedule == "transformer":
        schedule = TransformerSchedule(global_step=global_step,
                                       dim=config.state_size,
                                       warmup_steps=config.warmup_steps)
    else:
        logging.error('Learning schedule type is not valid: {}'.format(
            config.learning_schedule))
        sys.exit(1)

    if config.optimizer == 'adam':
        optimizer = tf.train.AdamOptimizer(
            learning_rate=schedule.learning_rate,
            beta1=config.adam_beta1,
            beta2=config.adam_beta2,
            epsilon=config.adam_epsilon)
    else:
        logging.error('No valid optimizer defined: {}'.format(config.optimizer))
        sys.exit(1)

    if config.summary_freq:
        summary_dir = (config.summary_dir if config.summary_dir is not None
                       else os.path.abspath(os.path.dirname(config.saveto)))
        writer = tf.summary.FileWriter(summary_dir, sess.graph)
    else:
        writer = None

    updater = ModelUpdater(config, num_gpus, replicas, optimizer, global_step,
                           writer)

    saver, progress = model_loader.init_or_restore_variables(
        config, sess, train=True)

    global_step.load(progress.uidx, sess)

    # Use an InferenceModelSet to abstract over model types for sampling and
    # beam search. Multi-GPU sampling and beam search are not currently
    # supported, so we just use the first replica.
    model_set = inference.InferenceModelSet([replicas[0]], [config])

    # save model options
    write_config_to_json_file(config, config.saveto)

    text_iterator, valid_text_iterator = load_data(config)
    _, _, num_to_source, num_to_target = util.load_dictionaries(config)
    total_loss = 0.
    n_sents, n_words = 0, 0
    last_time = time.time()
    logging.info("Initial uidx={}".format(progress.uidx))
    for progress.eidx in range(progress.eidx, config.max_epochs):
        logging.info('Starting epoch {0}'.format(progress.eidx))
        for source_sents, target_sents in text_iterator:
            if len(source_sents[0][0]) != config.factors:
                logging.error('Mismatch between number of factors in settings ({0}), and number in training corpus ({1})\n'.format(config.factors, len(source_sents[0][0])))
                sys.exit(1)
            x_in, x_mask_in, y_in, y_mask_in = util.prepare_data(
                source_sents, target_sents, config.factors, maxlen=None)
            if x_in is None:
                logging.info('Minibatch with zero sample under length {0}'.format(config.maxlen))
                continue
            write_summary_for_this_batch = config.summary_freq and ((progress.uidx % config.summary_freq == 0) or (config.finish_after and progress.uidx % config.finish_after == 0))
            (factors, seqLen, batch_size) = x_in.shape
            loss = updater.update(sess, x_in, x_mask_in, y_in, y_mask_in,
                                  write_summary_for_this_batch)
            total_loss += loss
            n_sents += batch_size
            n_words += int(numpy.sum(y_mask_in))
            progress.uidx += 1

            if config.disp_freq and progress.uidx % config.disp_freq == 0:
                duration = time.time() - last_time
                disp_time = datetime.now().strftime('[%Y-%m-%d %H:%M:%S]')
                logging.info('{0} Epoch: {1} Update: {2} Loss/word: {3} Words/sec: {4} Sents/sec: {5}'.format(disp_time, progress.eidx, progress.uidx, total_loss/n_words, n_words/duration, n_sents/duration))
                last_time = time.time()
                # Reset the running statistics for the next display window.
                total_loss = 0.
                n_sents = 0
                n_words = 0

            # Periodically sample translations for the first 10 sentences of
            # the current batch, for qualitative monitoring.
            if config.sample_freq and progress.uidx % config.sample_freq == 0:
                x_small, x_mask_small, y_small = x_in[:, :, :10], x_mask_in[:, :10], y_in[:, :10]
                samples = model_set.sample(sess, x_small, x_mask_small)
                assert len(samples) == len(x_small.T) == len(y_small.T), (len(samples), x_small.shape, y_small.shape)
                for xx, yy, ss in zip(x_small.T, y_small.T, samples):
                    source = util.factoredseq2words(xx, num_to_source)
                    target = util.seq2words(yy, num_to_target)
                    sample = util.seq2words(ss, num_to_target)
                    logging.info('SOURCE: {}'.format(source))
                    logging.info('TARGET: {}'.format(target))
                    logging.info('SAMPLE: {}'.format(sample))

            # Same monitoring with beam search (all hypotheses and costs).
            if config.beam_freq and progress.uidx % config.beam_freq == 0:
                x_small, x_mask_small, y_small = x_in[:, :, :10], x_mask_in[:, :10], y_in[:,:10]
                samples = model_set.beam_search(sess, x_small, x_mask_small,
                                                config.beam_size,
                                                normalization_alpha=config.normalization_alpha)
                # samples is a list with shape batch x beam x len
                assert len(samples) == len(x_small.T) == len(y_small.T), (len(samples), x_small.shape, y_small.shape)
                for xx, yy, ss in zip(x_small.T, y_small.T, samples):
                    source = util.factoredseq2words(xx, num_to_source)
                    target = util.seq2words(yy, num_to_target)
                    logging.info('SOURCE: {}'.format(source))
                    logging.info('TARGET: {}'.format(target))
                    for i, (sample_seq, cost) in enumerate(ss):
                        sample = util.seq2words(sample_seq, num_to_target)
                        msg = 'SAMPLE {}: {} Cost/Len/Avg {}/{}/{}'.format(
                            i, sample, cost, len(sample), cost/len(sample))
                        logging.info(msg)

            if config.valid_freq and progress.uidx % config.valid_freq == 0:
                valid_ce = validate(sess, replicas[0], config,
                                    valid_text_iterator)
                if (len(progress.history_errs) == 0 or
                        valid_ce < min(progress.history_errs)):
                    # New best cross-entropy: save and reset patience counter.
                    progress.history_errs.append(valid_ce)
                    progress.bad_counter = 0
                    save_non_checkpoint(sess, saver, config.saveto)
                    progress_path = '{0}.progress.json'.format(config.saveto)
                    progress.save_to_json(progress_path)
                else:
                    progress.history_errs.append(valid_ce)
                    progress.bad_counter += 1
                    if progress.bad_counter > config.patience:
                        logging.info('Early Stop!')
                        progress.estop = True
                        break
                # Optional external validation (e.g. BLEU); higher is better.
                if config.valid_script is not None:
                    score = validate_with_script(sess, replicas[0], config)
                    need_to_save = (score is not None and
                        (len(progress.valid_script_scores) == 0 or
                         score > max(progress.valid_script_scores)))
                    if score is None:
                        score = 0.0  # ensure a valid value is written
                    progress.valid_script_scores.append(score)
                    if need_to_save:
                        progress.bad_counter = 0
                        save_path = config.saveto + ".best-valid-script"
                        save_non_checkpoint(sess, saver, save_path)
                        write_config_to_json_file(config, save_path)
                        progress_path = '{}.progress.json'.format(save_path)
                        progress.save_to_json(progress_path)

            if config.save_freq and progress.uidx % config.save_freq == 0:
                saver.save(sess, save_path=config.saveto,
                           global_step=progress.uidx)
                write_config_to_json_file(config, "%s-%s" % (config.saveto, progress.uidx))
                progress_path = '{0}-{1}.progress.json'.format(config.saveto, progress.uidx)
                progress.save_to_json(progress_path)

            if config.finish_after and progress.uidx % config.finish_after == 0:
                logging.info("Maximum number of updates reached")
                saver.save(sess, save_path=config.saveto,
                           global_step=progress.uidx)
                write_config_to_json_file(config, "%s-%s" % (config.saveto, progress.uidx))
                progress.estop=True
                progress_path = '{0}-{1}.progress.json'.format(config.saveto, progress.uidx)
                progress.save_to_json(progress_path)
                break
        if progress.estop:
            break
def training_2d(train, test):
    """Train the 2D-conv (MFCC) audio model with stratified K-fold CV.

    For each fold, trains a fresh model with checkpointing/early stopping,
    predicts on the test set, then combines per-fold test predictions by
    geometric mean and writes a top-3-label submission CSV. In non-DEBUG mode
    it also shells out to the kaggle CLI to submit the file.

    Args:
        train: DataFrame with a `label_idx` column describing training clips.
        test: DataFrame describing test clips (unused directly; prediction
            uses the prepared `X_test` features).

    Side effects: writes model weights, per-fold .npy predictions, TensorBoard
    logs, and `../result/2d_conv.csv`.
    """
    config = Config(sampling_rate=44100, audio_duration=2, n_folds=n_folds,
                    learning_rate=0.001, use_mfcc=True, n_mfcc=40)
    if DEBUG:
        # One epoch only, for a fast smoke-test run.
        config = Config(sampling_rate=44100, audio_duration=2,
                        n_folds=n_folds, max_epochs=1, use_mfcc=True,
                        n_mfcc=40)
    X_train = prepare_data(train, config, '../data/audio_train/')
    X_test = prepare_data(test, config, '../data/audio_test/')
    y_train = to_categorical(train.label_idx, num_classes=config.n_classes)

    # Normalise both sets with *training* statistics only (no test leakage).
    mean = np.mean(X_train, axis=0)
    std = np.std(X_train, axis=0)
    X_train = (X_train - mean)/std
    X_test = (X_test - mean)/std

    PREDICTION_FOLDER = "predictions_2d_conv"
    if not os.path.exists(PREDICTION_FOLDER):
        os.mkdir(PREDICTION_FOLDER)
    # Start each run with fresh TensorBoard logs.
    if os.path.exists('logs/' + PREDICTION_FOLDER):
        shutil.rmtree('logs/' + PREDICTION_FOLDER)

    # NOTE: old (pre-0.18) sklearn API where StratifiedKFold takes the labels
    # directly and is itself iterable.
    skf = StratifiedKFold(train.label_idx, n_folds=config.n_folds)

    for i, (train_split, val_split) in enumerate(skf):
        K.clear_session()  # free graph/memory from the previous fold
        X, y, X_val, y_val = X_train[train_split], y_train[train_split], X_train[val_split], y_train[val_split]
        checkpoint = ModelCheckpoint('../model/best2d_%d.h5'%i,
                                     monitor='val_acc', verbose=1,
                                     save_best_only=True)
        early = EarlyStopping(monitor="val_loss", mode="min", patience=5)
        tb = TensorBoard(log_dir='./logs/' + PREDICTION_FOLDER + '/fold_%i'%i,
                         write_graph=True)
        callbacks_list = [checkpoint, early, tb]
        print("#"*50)
        print("Fold: ", i)
        model = get_2d_conv_model_advance(config)
        history = model.fit(X, y, validation_data=(X_val, y_val),
                            callbacks=callbacks_list, batch_size=64,
                            epochs=config.max_epochs)
        # Restore the best (checkpointed) weights before predicting.
        model.load_weights('../model/best2d_%d.h5'%i)

        # Save test predictions
        predictions = model.predict(X_test, batch_size=64, verbose=1)
        np.save(PREDICTION_FOLDER + "/test_predictions_%d.npy"%i, predictions)
        K.clear_session()

    # Ensemble the folds with a geometric mean of their predictions.
    pred_list = []
    for i in range(n_folds):
        pred_list.append(np.load("predictions_2d_conv/test_predictions_%d.npy"%i))
    prediction = np.ones_like(pred_list[0])
    for pred in pred_list:
        prediction = prediction*pred
    prediction = prediction**(1./len(pred_list))

    # Make a submission file with the top-3 labels per clip.
    top_3 = np.array(LABELS)[np.argsort(-prediction, axis=1)[:, :3]]
    predicted_labels = [' '.join(list(x)) for x in top_3]
    test = pd.read_csv('../data/sample_submission.csv')
    test['label'] = predicted_labels
    test[['fname', 'label']].to_csv("../result/2d_conv.csv", index=False)
    if not DEBUG:
        # Bug fix: the original called .format(competition, file_7z) on a
        # string with no placeholders; both names are undefined here, so the
        # submit branch raised NameError before ever reaching os.system.
        command = '/home/kownse/anaconda3/bin/kaggle competitions submit -c freesound-audio-tagging -f ../result/2d_conv.csv -m "submit"'
        os.system(command)
def train(config, sess):
    """Train an NMT model inside `sess`, with optional embedding pre-training.

    When `config.pretrain` is set, first trains source embeddings against
    pre-loaded (GloVe-style) vectors and dumps them (plus a t-SNE projection)
    under `pre_emb/`. Then runs the main training loop: per-batch updates,
    periodic sampling/beam-search logging, validation with early stopping,
    and periodic checkpointing.

    Args:
        config: namespace of training options.
        sess: an active TensorFlow session.
    """
    ####################################################
    assert (config.prior_model != None and (tf.train.checkpoint_exists(os.path.abspath(config.prior_model))) or (config.map_decay_c==0.0)), \
    "MAP training requires a prior model file: Use command-line option --prior_model"

    # Construct the graph, with one model replica per GPU
    num_gpus = len(util.get_available_gpus())
    num_replicas = max(1, num_gpus)

    logging.info('Building model...')
    replicas = []
    for i in range(num_replicas):
        device_type = "GPU" if num_gpus > 0 else "CPU"
        device_spec = tf.DeviceSpec(device_type=device_type, device_index=i)
        with tf.device(device_spec):
            # reuse=(i > 0): all replicas share one set of variables.
            with tf.variable_scope(tf.get_variable_scope(), reuse=(i > 0)):
                if config.model_type == "transformer":
                    model = TransformerModel(config)
                else:
                    model = rnn_model.RNNModel(config)
                replicas.append(model)

    # Non-trainable step counter; restored from `progress.uidx` below.
    init = tf.zeros_initializer(dtype=tf.int32)
    global_step = tf.get_variable('time', [], initializer=init,
                                  trainable=False)

    if config.learning_schedule == "constant":
        schedule = ConstantSchedule(config.learning_rate)
    elif config.learning_schedule == "transformer":
        schedule = TransformerSchedule(global_step=global_step,
                                       dim=config.state_size,
                                       warmup_steps=config.warmup_steps)
    else:
        logging.error('Learning schedule type is not valid: {}'.format(
            config.learning_schedule))
        sys.exit(1)

    if config.optimizer == 'adam':
        optimizer = tf.train.AdamOptimizer(
            learning_rate=schedule.learning_rate,
            beta1=config.adam_beta1,
            beta2=config.adam_beta2,
            epsilon=config.adam_epsilon)
    else:
        logging.error('No valid optimizer defined: {}'.format(
            config.optimizer))
        sys.exit(1)

    if config.summary_freq:
        summary_dir = (config.summary_dir if config.summary_dir is not None
                       else os.path.abspath(os.path.dirname(config.saveto)))
        writer = tf.summary.FileWriter(summary_dir, sess.graph)
    else:
        writer = None

    updater = ModelUpdater(config, num_gpus, replicas, optimizer,
                           global_step, writer)

    saver, progress = model_loader.init_or_restore_variables(config, sess,
                                                             train=True)

    ############################################################
    # add: pretrain — optional embedding pre-training phase
    if config.pretrain:
        logging.info("Start pre-training")
        # pre-training hyper-parameters (translated from: 预训练网络参数)
        pre_batch_size = 1000
        epochs = 20
        pre_learning_rate = 0.001
        pre_optimizer = tf.train.GradientDescentOptimizer(
            pre_learning_rate).minimize(replicas[0].loss_pre_train)
        # load pre-training data and dictionaries (加载预训练数据及相关字典)
        gvocab, gvectors = util.pre_load_data(config.pretrain_vocab,
                                              config.pretrain_vectors)
        pre_vocab_list = list(gvocab.keys())
        # oversampling: repeat each word by its corpus frequency (过采样)
        # NOTE(review): hard-coded absolute path — machine-specific; confirm.
        pre_train_list = []
        with open('/media/ntfs-3/EXP/MULTI/mix/zh-en/data3/glove/vocab.txt',
                  'r', encoding='utf-8') as f:
            for line in f:
                k, v = line.strip().split()
                pre_train_list.extend([k] * int(v))
        utf8_dict = json.load(
            open(config.source_dicts[0], 'r', encoding='utf-8'))
        embedding_list = []
        # start training (开始训练)
        for i in range(epochs):
            logging.info("epoch:{}".format(i))
            # Last epoch iterates the full vocabulary once, unshuffled, so the
            # collected embeddings line up with pre_vocab_list.
            if i == epochs - 1:
                source_x, source_y, _vocab = util.get_data(pre_vocab_list,
                                                           pre_batch_size,
                                                           gvocab,
                                                           gvectors,
                                                           utf8_dict,
                                                           shuffle=False)
            else:
                source_x, source_y, _vocab = util.get_data(pre_train_list,
                                                           pre_batch_size,
                                                           gvocab,
                                                           gvectors,
                                                           utf8_dict,
                                                           shuffle=True)
            for idx, [s_x, s_y] in enumerate(zip(source_x, source_y)):
                assert len(s_x) == len(s_y), "{}, {}".format(
                    len(s_x), len(s_y))
                sx, sy = util.pre_prepare_data(s_x, s_y)
                feed_dict = {}
                feed_dict[replicas[0].pre_inputs.x] = sx
                feed_dict[replicas[0].pre_inputs.y] = sy
                _, loss, embedding = sess.run([
                    pre_optimizer, replicas[0].loss_pre_train,
                    replicas[0].pre_embedding
                ],
                                              feed_dict=feed_dict)
                if idx % 100 == 0:
                    logging.info("loss:{}".format(loss))
                if i == epochs - 1:
                    embedding_list.append(embedding)
        assert _vocab == pre_vocab_list
        # Concatenate the last-epoch embedding batches and persist them.
        emb = embedding_list[0]
        for e in embedding_list[1:]:
            emb = numpy.concatenate((emb, e))
        numpy.save("pre_emb/pre_emb.npy", emb)
        with open("pre_emb/vocab", "w", encoding="utf-8") as f:
            f.write("\n".join(pre_vocab_list))
        # t-SNE visualisation (tsne可视化)
        tsne = util.get_tsne(emb, "pre_emb/tsne.npy")
        gtsne = numpy.load(config.pretrain_tsne)
        #util.plot_tsne(_vocab, tsne, gvocab, gtsne, top=20)
        #exit(0)
    ##################################################################################

    global_step.load(progress.uidx, sess)

    # Use an InferenceModelSet to abstract over model types for sampling and
    # beam search. Multi-GPU sampling and beam search are not currently
    # supported, so we just use the first replica.
    model_set = inference.InferenceModelSet([replicas[0]], [config])

    # save model options
    write_config_to_json_file(config, config.saveto)

    text_iterator, valid_text_iterator = load_data(config)
    _, _, num_to_source, num_to_target = util.load_dictionaries(config)
    total_loss = 0.
    n_sents, n_words = 0, 0
    last_time = time.time()
    logging.info("Initial uidx={}".format(progress.uidx))
    for progress.eidx in range(progress.eidx, config.max_epochs):
        logging.info('Starting epoch {0}'.format(progress.eidx))
        # Iterator yields an extra pre-source stream alongside src/tgt.
        for pre_source_sents, source_sents, target_sents in text_iterator:
            #if len(source_sents[0][0]) != config.factors:
            #logging.error('Mismatch between number of factors in settings ({0}), and number in training corpus ({1})\n'.format(config.factors, len(source_sents[0][0])))
            #sys.exit(1)
            px_in, x_in, x_mask_in, y_in, y_mask_in = util.prepare_data(
                source_sents, target_sents, config.factors, pre_source_sents,
                maxlen=None)
            if x_in is None:
                logging.info(
                    'Minibatch with zero sample under length {0}'.format(
                        config.maxlen))
                continue
            write_summary_for_this_batch = config.summary_freq and (
                (progress.uidx % config.summary_freq == 0) or
                (config.finish_after and
                 progress.uidx % config.finish_after == 0))
            # x_in is 4-D here (extra uLen axis), unlike the plain trainer.
            (factors, seqLen, uLen, batch_size) = x_in.shape
            loss = updater.update(sess, px_in, x_in, x_mask_in, y_in,
                                  y_mask_in, write_summary_for_this_batch)
            total_loss += loss
            n_sents += batch_size
            n_words += int(numpy.sum(y_mask_in))
            progress.uidx += 1

            if config.disp_freq and progress.uidx % config.disp_freq == 0:
                duration = time.time() - last_time
                disp_time = datetime.now().strftime('[%Y-%m-%d %H:%M:%S]')
                logging.info(
                    '{0} Epoch: {1} Update: {2} Loss/word: {3} Words/sec: {4} Sents/sec: {5}'
                    .format(disp_time, progress.eidx, progress.uidx,
                            total_loss / n_words, n_words / duration,
                            n_sents / duration))
                last_time = time.time()
                # Reset the running statistics for the next display window.
                total_loss = 0.
                n_sents = 0
                n_words = 0

            # Periodically sample translations for the first 10 sentences of
            # the current batch, for qualitative monitoring.
            if config.sample_freq and progress.uidx % config.sample_freq == 0:
                x_small, x_mask_small, y_small = x_in[:, :, :, :10], x_mask_in[:, :, :10], y_in[:, :10]
                samples = model_set.sample(sess, x_small, x_mask_small)
                assert len(samples) == len(x_small.T) == len(
                    y_small.T), (len(samples), x_small.shape, y_small.shape)
                for xx, yy, ss in zip(x_small.T, y_small.T, samples):
                    #source = util.factoredseq2words(xx, num_to_source)
                    target = util.seq2words(yy, num_to_target)
                    sample = util.seq2words(ss, num_to_target)
                    #logging.info('SOURCE: {}'.format(source))
                    #logging.info('SOURCE: {}'.format(xx))
                    logging.info('TARGET: {}'.format(target))
                    logging.info('SAMPLE: {}'.format(sample))

            # Same monitoring with beam search (all hypotheses and costs).
            if config.beam_freq and progress.uidx % config.beam_freq == 0:
                x_small, x_mask_small, y_small = x_in[:, :, :, :10], x_mask_in[:, :, :10], y_in[:, :10]
                samples = model_set.beam_search(
                    sess, x_small, x_mask_small, config.beam_size,
                    normalization_alpha=config.normalization_alpha)
                # samples is a list with shape batch x beam x len
                assert len(samples) == len(x_small.T) == len(
                    y_small.T), (len(samples), x_small.shape, y_small.shape)
                for xx, yy, ss in zip(x_small.T, y_small.T, samples):
                    #source = util.factoredseq2words(xx, num_to_source)
                    target = util.seq2words(yy, num_to_target)
                    #logging.info('SOURCE: {}'.format(source))
                    logging.info('TARGET: {}'.format(target))
                    for i, (sample_seq, cost) in enumerate(ss):
                        sample = util.seq2words(sample_seq, num_to_target)
                        msg = 'SAMPLE {}: {} Cost/Len/Avg {}/{}/{}'.format(
                            i, sample, cost, len(sample), cost / len(sample))
                        logging.info(msg)

            if config.valid_freq and progress.uidx % config.valid_freq == 0:
                valid_ce = validate(sess, replicas[0], config,
                                    valid_text_iterator)
                if (len(progress.history_errs) == 0 or
                        valid_ce < min(progress.history_errs)):
                    # New best cross-entropy: save and reset patience counter.
                    progress.history_errs.append(valid_ce)
                    progress.bad_counter = 0
                    save_non_checkpoint(sess, saver, config.saveto)
                    progress_path = '{0}.progress.json'.format(config.saveto)
                    progress.save_to_json(progress_path)
                else:
                    progress.history_errs.append(valid_ce)
                    progress.bad_counter += 1
                    if progress.bad_counter > config.patience:
                        logging.info('Early Stop!')
                        progress.estop = True
                        break
                # Optional external validation (e.g. BLEU); higher is better.
                if config.valid_script is not None:
                    score = validate_with_script(sess, replicas[0], config)
                    need_to_save = (
                        score is not None and
                        (len(progress.valid_script_scores) == 0 or
                         score > max(progress.valid_script_scores)))
                    if score is None:
                        score = 0.0  # ensure a valid value is written
                    progress.valid_script_scores.append(score)
                    if need_to_save:
                        progress.bad_counter = 0
                        save_path = config.saveto + ".best-valid-script"
                        save_non_checkpoint(sess, saver, save_path)
                        write_config_to_json_file(config, save_path)
                        progress_path = '{}.progress.json'.format(save_path)
                        progress.save_to_json(progress_path)

            if config.save_freq and progress.uidx % config.save_freq == 0:
                saver.save(sess, save_path=config.saveto,
                           global_step=progress.uidx)
                write_config_to_json_file(
                    config, "%s-%s" % (config.saveto, progress.uidx))
                progress_path = '{0}-{1}.progress.json'.format(
                    config.saveto, progress.uidx)
                progress.save_to_json(progress_path)

            if config.finish_after and progress.uidx % config.finish_after == 0:
                logging.info("Maximum number of updates reached")
                saver.save(sess, save_path=config.saveto,
                           global_step=progress.uidx)
                write_config_to_json_file(
                    config, "%s-%s" % (config.saveto, progress.uidx))
                progress.estop = True
                progress_path = '{0}-{1}.progress.json'.format(
                    config.saveto, progress.uidx)
                progress.save_to_json(progress_path)
                break
        if progress.estop:
            break
def train(config, sess):
    """Train a translation model (Transformer or RNN) under `config`.

    Builds one model replica per available GPU (or a single CPU replica),
    then runs the update loop over the training corpus, periodically
    displaying progress, sampling/beam-searching example translations,
    validating (with optional early stopping), and saving checkpoints.

    Args:
        config: namespace of training options (vocab sizes, freqs, paths, ...).
        sess: an open TensorFlow session used for all graph execution.
    """
    # MAP-style regularization toward a prior model requires that the prior
    # checkpoint actually exists (unless map_decay_c is disabled, i.e. 0.0).
    assert (config.prior_model != None and (tf.train.checkpoint_exists(os.path.abspath(config.prior_model))) or (config.map_decay_c==0.0)), \
        "MAP training requires a prior model file: Use command-line option --prior_model"

    # Construct the graph, with one model replica per GPU
    num_gpus = len(util.get_available_gpus())
    num_replicas = max(1, num_gpus)  # fall back to a single CPU replica

    logging.info('Building model...')
    replicas = []
    for i in range(num_replicas):
        device_type = "GPU" if num_gpus > 0 else "CPU"
        device_spec = tf.DeviceSpec(device_type=device_type, device_index=i)
        with tf.device(device_spec):
            # Replicas after the first reuse the variables of the first one.
            with tf.variable_scope(tf.get_variable_scope(), reuse=(i > 0)):
                if config.model_type == "transformer":
                    model = TransformerModel(config)
                else:
                    model = rnn_model.RNNModel(config)
                replicas.append(model)

    # Non-trainable step counter; restored from saved progress below.
    init = tf.zeros_initializer(dtype=tf.int32)
    global_step = tf.get_variable('time', [], initializer=init,
                                  trainable=False)

    if config.learning_schedule == "constant":
        schedule = ConstantSchedule(config.learning_rate)
    elif config.learning_schedule == "transformer":
        # Inverse-sqrt schedule with linear warmup, as in "Attention Is All
        # You Need".
        schedule = TransformerSchedule(global_step=global_step,
                                       dim=config.state_size,
                                       warmup_steps=config.warmup_steps)
    else:
        logging.error('Learning schedule type is not valid: {}'.format(
            config.learning_schedule))
        sys.exit(1)

    if config.optimizer == 'adam':
        optimizer = tf.train.AdamOptimizer(
            learning_rate=schedule.learning_rate,
            beta1=config.adam_beta1,
            beta2=config.adam_beta2,
            epsilon=config.adam_epsilon)
    else:
        logging.error('No valid optimizer defined: {}'.format(
            config.optimizer))
        sys.exit(1)

    # TensorBoard summaries are only written when summary_freq is set.
    if config.summary_freq:
        summary_dir = (config.summary_dir if config.summary_dir is not None
                       else os.path.abspath(os.path.dirname(config.saveto)))
        writer = tf.summary.FileWriter(summary_dir, sess.graph)
    else:
        writer = None

    updater = ModelUpdater(config, num_gpus, replicas, optimizer,
                           global_step, writer)

    # Restores variables and training progress if a checkpoint exists.
    saver, progress = model_loader.init_or_restore_variables(config, sess,
                                                             train=True)

    global_step.load(progress.uidx, sess)

    # Use an InferenceModelSet to abstract over model types for sampling and
    # beam search. Multi-GPU sampling and beam search are not currently
    # supported, so we just use the first replica.
    model_set = inference.InferenceModelSet([replicas[0]], [config])

    #save model options
    write_config_to_json_file(config, config.saveto)

    text_iterator, valid_text_iterator = load_data(config)
    _, _, num_to_source, num_to_target = util.load_dictionaries(config)

    # Running totals for the periodic progress display; reset every disp_freq.
    total_loss = 0.
    n_sents, n_words = 0, 0
    last_time = time.time()

    logging.info("Initial uidx={}".format(progress.uidx))
    # progress.eidx/uidx persist across restarts via the progress JSON file.
    for progress.eidx in range(progress.eidx, config.max_epochs):
        logging.info('Starting epoch {0}'.format(progress.eidx))
        for source_sents, target_sents in text_iterator:
            # Sanity check: each source token must carry config.factors factors.
            if len(source_sents[0][0]) != config.factors:
                logging.error(
                    'Mismatch between number of factors in settings ({0}), and number in training corpus ({1})\n'
                    .format(config.factors, len(source_sents[0][0])))
                sys.exit(1)
            x_in, x_mask_in, y_in, y_mask_in = util.prepare_data(
                source_sents, target_sents, config.factors, maxlen=None)
            if x_in is None:
                # Entire minibatch was filtered out by the length limit.
                logging.info(
                    'Minibatch with zero sample under length {0}'.format(
                        config.maxlen))
                continue
            write_summary_for_this_batch = config.summary_freq and (
                (progress.uidx % config.summary_freq == 0) or
                (config.finish_after and
                 progress.uidx % config.finish_after == 0))
            (factors, seqLen, batch_size) = x_in.shape
            loss = updater.update(sess, x_in, x_mask_in, y_in, y_mask_in,
                                  write_summary_for_this_batch)
            total_loss += loss
            n_sents += batch_size
            n_words += int(numpy.sum(y_mask_in))
            progress.uidx += 1

            # Periodic progress display (loss per word and throughput).
            if config.disp_freq and progress.uidx % config.disp_freq == 0:
                duration = time.time() - last_time
                disp_time = datetime.now().strftime('[%Y-%m-%d %H:%M:%S]')
                logging.info(
                    '{0} Epoch: {1} Update: {2} Loss/word: {3} Words/sec: {4} Sents/sec: {5}'
                    .format(disp_time, progress.eidx, progress.uidx,
                            total_loss / n_words, n_words / duration,
                            n_sents / duration))
                last_time = time.time()
                total_loss = 0.
                n_sents = 0
                n_words = 0

            # Periodically greedy-sample translations for the first 10
            # sentences of the current batch and log them.
            if config.sample_freq and progress.uidx % config.sample_freq == 0:
                x_small, x_mask_small, y_small = x_in[:, :, :
                                                      10], x_mask_in[:, :
                                                                     10], y_in[:, :
                                                                               10]
                samples = model_set.sample(sess, x_small, x_mask_small)
                assert len(samples) == len(x_small.T) == len(
                    y_small.T), (len(samples), x_small.shape, y_small.shape)
                for xx, yy, ss in zip(x_small.T, y_small.T, samples):
                    source = util.factoredseq2words(xx, num_to_source)
                    target = util.seq2words(yy, num_to_target)
                    sample = util.seq2words(ss, num_to_target)
                    logging.info('SOURCE: {}'.format(source))
                    logging.info('TARGET: {}'.format(target))
                    logging.info('SAMPLE: {}'.format(sample))

            # Periodically beam-search the same 10-sentence slice and log
            # every hypothesis in each beam with its cost.
            if config.beam_freq and progress.uidx % config.beam_freq == 0:
                x_small, x_mask_small, y_small = x_in[:, :, :
                                                      10], x_mask_in[:, :
                                                                     10], y_in[:, :
                                                                               10]
                samples = model_set.beam_search(
                    sess, x_small, x_mask_small, config.beam_size,
                    normalization_alpha=config.normalization_alpha)
                # samples is a list with shape batch x beam x len
                assert len(samples) == len(x_small.T) == len(
                    y_small.T), (len(samples), x_small.shape, y_small.shape)
                for xx, yy, ss in zip(x_small.T, y_small.T, samples):
                    source = util.factoredseq2words(xx, num_to_source)
                    target = util.seq2words(yy, num_to_target)
                    logging.info('SOURCE: {}'.format(source))
                    logging.info('TARGET: {}'.format(target))
                    for i, (sample_seq, cost) in enumerate(ss):
                        sample = util.seq2words(sample_seq, num_to_target)
                        msg = 'SAMPLE {}: {} Cost/Len/Avg {}/{}/{}'.format(
                            i, sample, cost, len(sample), cost / len(sample))
                        logging.info(msg)

            # Validation: cross-entropy drives checkpointing of the best model
            # and the early-stopping counter.
            if config.valid_freq and progress.uidx % config.valid_freq == 0:
                valid_ce = validate(sess, replicas[0], config,
                                    valid_text_iterator)
                if (len(progress.history_errs) == 0 or
                    valid_ce < min(progress.history_errs)):
                    # New best cross-entropy: save and reset patience counter.
                    progress.history_errs.append(valid_ce)
                    progress.bad_counter = 0
                    save_non_checkpoint(sess, saver, config.saveto)
                    progress_path = '{0}.progress.json'.format(config.saveto)
                    progress.save_to_json(progress_path)
                else:
                    progress.history_errs.append(valid_ce)
                    progress.bad_counter += 1
                    if progress.bad_counter > config.patience:
                        logging.info('Early Stop!')
                        progress.estop = True
                        break
                # Optional external validation script (e.g. BLEU); higher is
                # better, and a new best triggers a separate checkpoint.
                if config.valid_script is not None:
                    score = validate_with_script(sess, replicas[0], config)
                    need_to_save = (
                        score is not None and
                        (len(progress.valid_script_scores) == 0 or
                         score > max(progress.valid_script_scores)))
                    if score is None:
                        score = 0.0  # ensure a valid value is written
                    progress.valid_script_scores.append(score)
                    if need_to_save:
                        # NOTE(review): bad_counter is NOT reset here, so only
                        # cross-entropy improvements delay early stopping —
                        # confirm this is intended.
                        save_path = config.saveto + ".best-valid-script"
                        save_non_checkpoint(sess, saver, save_path)
                        write_config_to_json_file(config, save_path)
                        progress_path = '{}.progress.json'.format(save_path)
                        progress.save_to_json(progress_path)

            # Periodic numbered checkpoint (config + progress saved alongside).
            if config.save_freq and progress.uidx % config.save_freq == 0:
                saver.save(sess, save_path=config.saveto,
                           global_step=progress.uidx)
                write_config_to_json_file(
                    config, "%s-%s" % (config.saveto, progress.uidx))
                progress_path = '{0}-{1}.progress.json'.format(
                    config.saveto, progress.uidx)
                progress.save_to_json(progress_path)

            # Hard stop after a fixed number of updates, with a final save.
            if config.finish_after and progress.uidx % config.finish_after == 0:
                logging.info("Maximum number of updates reached")
                saver.save(sess, save_path=config.saveto,
                           global_step=progress.uidx)
                write_config_to_json_file(
                    config, "%s-%s" % (config.saveto, progress.uidx))
                progress.estop = True
                progress_path = '{0}-{1}.progress.json'.format(
                    config.saveto, progress.uidx)
                progress.save_to_json(progress_path)
                break
        if progress.estop:
            break
def max_sim(simulation, arguments, group):
    """Run `simulation` NUM_SIM times and return the merged-curve statistic.

    Args:
        simulation: callable executed once per run with `*arguments`.
        arguments: positional arguments forwarded to each simulation run.
        group: grouping key passed through to prepare_data.

    Returns:
        The final element of the 6-tuple produced by merge_curves.
    """
    runs = [simulation(*arguments) for _ in range(NUM_SIM)]
    curves = prepare_data(runs, group)
    # merge_curves yields a 6-tuple; only the last component is of interest.
    _a, _b, _c, _d, _e, peak = merge_curves(curves, len(runs), 1)
    return peak
# Load the raw dataset and the feature-matrix builder from the project utils.
from util import get_data
from util import prepare_data

df = get_data()

# Column names grouped by dtype (kept for later analysis steps).
cat_df_list = list(df.select_dtypes(include=['object']))
num_df_list = list(df.select_dtypes(include=['float64', 'int64']))

# Candidate cluster counts. Two earlier candidate lists ([2..6] and [4..6])
# were dead assignments immediately overwritten, so only the final value
# is kept.
range_n_clusters = [2, 3]

from sklearn.preprocessing import StandardScaler

# Target and features; prepare_data builds the model matrix from the raw
# frame, and the target column is dropped from the features afterwards.
y = df["readmitted"]
X = prepare_data(df)
X.drop("readmitted", inplace=True, axis=1)

# Standardize features. Use the single scaler instance (the original code
# created `scaler` and then fit a second, throwaway StandardScaler).
scaler = StandardScaler()
X = scaler.fit_transform(X)

from imblearn.under_sampling import (RandomUnderSampler, ClusterCentroids,
                                     TomekLinks, NeighbourhoodCleaningRule,
                                     NearMiss)

# Undersample the majority class to balance the target.
sampler = NearMiss(n_jobs=2)
# NOTE(review): fit_sample is the legacy imblearn API; newer versions use
# fit_resample — confirm against the pinned imblearn version.
X_rs, y_rs = sampler.fit_sample(X, y)

from sklearn.manifold import TSNE

# Bug fix: TSNE's default method='barnes_hut' only supports
# n_components < 4, so TSNE(n_components=7) raised ValueError at fit time.
# The 'exact' method supports arbitrary output dimensionality.
transformer = TSNE(n_components=7, method='exact')
X_std = transformer.fit_transform(X_rs)
import numpy as np
import helpers
import config as cfg
import matplotlib.pyplot as plt

# NOTE(review): K, os, tf and util are used below but are not imported in
# this chunk — presumably imported earlier in the file; verify.
K.tensorflow_backend._get_available_gpus()

# this needs to get generalized
# Read the CamVid class dictionary: names plus their RGB label colours.
class_names_list, label_values = helpers.get_label_info(
    os.path.join("CamVid", "class_dict.csv"))
num_classes = len(label_values)

# Load the data
print("Loading the data ...")
# prepare_data returns parallel lists of image/label paths for the
# train/val/test splits.
train_input_names, train_output_names, val_input_names, val_output_names, test_input_names, test_output_names = util.prepare_data(
    cfg.DATASET_DIR)

# Accumulators for the loaded arrays (consumed later in the file).
input_data = []
output_labels = []
val_data = []
val_labels = []

# Load every training image and scale pixel values to [0, 1].
for img_name in train_input_names:
    input_image = util.load_image(img_name)
    # NOTE(review): tf.device has no effect on plain NumPy ops — presumably
    # a leftover; confirm before removing.
    with tf.device('/cpu:0'):
        input_image = np.float32(input_image) / 255.0
        #input_data.append(np.expand_dims(input_image, axis=0))
        input_data.append(input_image)
    print(img_name)