def __init__(self, source_vocab_file, target_vocab_file, sample_file, config, logger):
    self.logger = logger
    self.config = config
    self.sample_file = sample_file
    self.word2id_src, self.id2word_src = load_vocabulary(source_vocab_file)
    self.word2id_tar, self.id2word_tar = load_vocabulary(target_vocab_file)
    self.tf_record_file = os.path.join(self.config.tokenized_data_dir, 'sample.tf_record')
    self.pad_id_src = self.word2id_src['<pad>']
    self.unk_id_src = self.word2id_src['<unk>']
    self.pad_id_tar = self.word2id_tar['<pad>']
    self.unk_id_tar = self.word2id_tar['<unk>']
def __load_embeddings(self):
    '''Loads the embeddings with the associated vocabulary and saves them
    for later usage in the DataLoader and while training/testing.'''
    vocabulary = None
    embeddings = None
    vocabulary = utils.load_vocabulary(self.cfg.get('vocabulary'))
    if self.cfg.get('w2v_embeddings'):
        embeddings = utils.load_w2v_embeddings(self.cfg.get('w2v_embeddings'))
    elif self.cfg.get('ft_embeddings'):
        embeddings = utils.load_ft_embeddings(self.cfg.get('ft_embeddings'))
    else:
        embeddings = np.random.uniform(
            -1.0, 1.0,
            size=(len(vocabulary), self.cfg.get('max_random_embeddings_size')))
    # Prepare the vocabulary and embeddings (e.g. add embedding for unknown words)
    embeddings, vocabulary = utils.prepare_embeddings_and_vocabulary(
        embeddings, vocabulary)
    self.cfg.set('vocabulary_dict', vocabulary)
    self.cfg.set('embeddings_matrix', embeddings)
    # reverse the vocabulary for the idx -> text usages
    self.rev_vocabulary = utils.reverse_vocabulary(vocabulary)
def __init__(self, vocab_file, sample_file, config, logger):
    self.logger = logger
    self.config = config
    self.sample_file = sample_file
    self.word2id, self.id2word = load_vocabulary(vocab_file)
    self.tf_record_file = os.path.join(self.config.tokenized_data_dir, 'sample.tf_record')
    self.pad_id = self.word2id['<pad>']
    self.unk_id = self.word2id['<unk>']
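# The two DataLoader constructors above assume a load_vocabulary helper that turns a
# vocabulary file into a (word2id, id2word) pair already containing '<pad>' and '<unk>'
# entries. A minimal sketch under that assumption (hypothetical, not the project's
# actual implementation), with ids given by line order in a one-token-per-line file:
def load_vocabulary(vocab_file):
    word2id, id2word = {}, {}
    with open(vocab_file, 'r', encoding='utf-8') as f:
        for idx, line in enumerate(f):
            token = line.rstrip('\n')
            if not token:
                continue
            word2id[token] = idx
            id2word[idx] = token
    return word2id, id2word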
def __init__(self, parameters):
    """
    Constructor. It loads the hyperparameters of the network.

    Parameters
    ----------
    parameters: str
        File containing the parameters of the network
    """
    self.parameters = self.load_parameters(parameters)
    self.session_conf = tf.ConfigProto(
        allow_soft_placement=self.parameters['allow_soft_placement'],
        log_device_placement=self.parameters['log_device_placement'])
    self.vocabulary_dict = load_vocabulary(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     self.parameters['vocabulary']))
    self.inverse_vocabulary_dict = load_vocabulary(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     self.parameters['inverse_vocabulary']))
def main(_): vocab, dictionary = load_vocabulary(os.path.join(FLAGS.vocab_dir, 'sentence_vocab')) tags_list, tags_dict = load_vocabulary(os.path.join(FLAGS.vocab_dir, 'tag_vocab')) intent_list, intent_dict = load_vocabulary(os.path.join(FLAGS.vocab_dir, 'intent_vocab')) all_sentence = prepare_test_data(FLAGS.test_data_file, dictionary) model = RNNModel(hidden_size=FLAGS.hidden_size, embed_size=FLAGS.embedding_size, source_vocab_size=len(vocab), tag_vocab_size=len(tags_list), intent_vocab_size=len(intent_list)) all_tags = [] all_intent = [] with tf.Session(graph=model.graph) as sess: # saver = tf.train.import_meta_graph('{}.meta'.format(FLAGS.checkpoint_file)) model.saver.restore(sess, FLAGS.checkpoint_file) # graph = tf.get_default_graph() # input_x = graph.get_tensor_by_name('input_x:0') # input_len = graph.get_tensor_by_name('input_len:0') # keep_prob = graph.get_tensor_by_name('keep_prob:0') # output_tag = graph.get_tensor_by_name('output_tag:0') # output_intent = graph.get_tensor_by_name('output_intent:0') for sentence in all_sentence: predict_tags, predict_intent = sess.run([model.output_tag, model.output_intent], feed_dict={ model.input_x: [sentence], model.input_len: [len(sentence)], model.keep_prob: 1.0 }) all_tags.append(predict_tags[0]) all_intent.append(predict_intent[0]) all_tags = [['O'] + [tags_list[i] for i in tags] for tags in all_tags] all_intent = [intent_list[i] for i in all_intent] with open(FLAGS.output_tag_file, 'w') as f: f.write('\n'.join([' '.join(tags) for tags in all_tags])) with open(FLAGS.output_intent_file, 'w') as f: f.write('\n'.join(all_intent))
def translate_to_origin(model, text, language):
    text_matrix = utils.text_to_one_hot_matrix(text, language)
    probas = model.predict(text_matrix)
    probas = probas[0]
    indices = []
    for proba in probas:
        # pick the index of the most probable character
        indices.append(int(np.argmax(proba)))
    char_to_index, index_to_char, vocab_size, trans_to_index, _, trans_vocab_size = \
        utils.load_vocabulary(language)
    translated_text_list = [index_to_char[i] for i in indices]
    translated_text = ''.join(translated_text_list)
    return translated_text, translated_text_list
class Hyperparamters:
    # Train parameters
    num_train_epochs = 20
    print_step = 1
    batch_size = 8
    summary_step = 10
    num_saved_per_epoch = 3
    max_to_keep = 100
    logdir = 'logdir/model_01'
    file_save_model = 'model/model_01'
    # Predict model file
    file_model = 'model/saved_01'
    # Train/Test data
    data_dir = os.path.join(pwd, 'data')
    train_data = 'train_onehot.csv'
    test_data = 'test_onehot.csv'
    # Load vocabulary dict
    dict_id2label, dict_label2id = load_vocabulary(
        os.path.join(pwd, 'data', 'vocabulary_label.txt'))
    label_vocabulary = list(dict_id2label.values())
    # Optimization parameters
    warmup_proportion = 0.1
    use_tpu = None
    do_lower_case = True
    learning_rate = 5e-5
    # TextCNN parameters
    # num_filters = 128
    # filter_sizes = [2,3,4,5,6,7]
    # embedding_size = 384
    # keep_prob = 0.5
    # Sequence and Label
    sequence_length = 60
    num_labels = len(list(dict_id2label))
    # ALBERT
    model = 'albert_small_zh_google'
    bert_path = os.path.join(pwd, model)
    vocab_file = os.path.join(pwd, model, 'vocab_chinese.txt')
    init_checkpoint = os.path.join(pwd, model, 'albert_model.ckpt')
    saved_model_path = os.path.join(pwd, 'model')
def main():
    args = cmdparser()
    config = get_config(args.config)
    if args.preprocess:
        utils.preprocess(config['raw_path'], config['train_path'], config['dev_path'],
                         config['label_path'], config['stop_word_path'],
                         config['vocabulary_path'])
    labels = utils.load_labels(config['label_path'])
    vocabulary = utils.load_vocabulary(config['vocabulary_path'])
    stop_words = utils.load_stop_words(config['stop_word_path'])
    if args.dev:
        train(config, vocabulary, labels, stop_words, save_path='', mode='dev')
    elif args.train:
        if int(config['ensemble_size']) == 1:
            train(config, vocabulary, labels, stop_words,
                  save_path=config['model_path'], mode='train')
        else:
            for i in range(int(config['ensemble_size'])):
                train(config, vocabulary, labels, stop_words,
                      save_path=config[f'model_path_{i+1}'], mode='train')
    elif args.test:
        if int(config['ensemble_size']) == 1:
            test(config, vocabulary, labels, stop_words, save_path=[config['model_path']])
        else:
            test_paths = [config[f'model_path_{i+1}']
                          for i in range(int(config['ensemble_size']))]
            test(config, vocabulary, labels, stop_words, save_path=test_paths)
def main(): parser = argparse.ArgumentParser() parser.add_argument('--hdim', default=512, type=int) parser.add_argument('--seq_len', default=40, type=int) parser.add_argument('--model', default=None) parser.add_argument('--depth', default=1, type=int) parser.add_argument('--translit_path', default=None) parser.add_argument('--language', default=None) args = parser.parse_args() print("Loading Files") (char_to_index, index_to_char, vocab_size, trans_to_index, index_to_trans, trans_vocab_size) = utils.load_vocabulary(language=args.language) (test_text, trans, long_letter_reverse_mapping) = utils.load_language_data( language=args.language, is_train=False) print("Building network ...") (output_layer, predict) = utils.define_model(args.hdim, args.depth, trans_vocab_size=trans_vocab_size, vocab_size=vocab_size, is_train=False) if args.model: f = np.load(args.model) param_values = [np.float32(f[i]) for i in range(len(f))] lasagne.layers.set_all_param_values(output_layer, param_values) print("Testing ...") if args.translit_path: data = codecs.open(args.translit_path, 'r', encoding='utf-8').read() translate_romanized(predict, data, args.seq_len, trans, trans_vocab_size, trans_to_index, index_to_char, long_letter_reverse_mapping) else: test(predict, test_text, args.language, args.model, args.seq_len, long_letter_reverse_mapping, trans, trans_to_index, char_to_index, index_to_trans, index_to_char)
from tqdm import tqdm
from random_word import RandomWords

from utils import load_vocabulary, guess_word

vocabularies = {
    "small": load_vocabulary("american-english-small"),
    "normal": load_vocabulary("american-english"),
    "large": load_vocabulary("american-english-large"),
    "insane": load_vocabulary("american-english-insane"),
    "english_normal": load_vocabulary("english_words.txt"),
    "english_large": load_vocabulary("english_words_complete.txt"),
}

for key1, voc1 in vocabularies.items():
    for key2, voc2 in vocabularies.items():
        if key2 == key1:
            continue
        diff1 = set(voc1).difference(set(voc2))
        diff2 = set(voc2).difference(set(voc1))
        sum_ = set(voc1).union(set(voc2))
        print()
        print(key1, " >>> ", len(voc1))
        print(key2, " >>> ", len(voc2))
        print(key1 + " - " + key2, " >>> ", len(diff1))
        print(key2 + " - " + key1, " >>> ", len(diff2))
        print(key1 + " + " + key2, " >>> ", len(sum_))

# Stop here; the original used an undefined name (`a = b`) to abort the script,
# replaced with an explicit exit.
raise SystemExit

for key, vocab in vocabularies.items():
help="entailment/neutral/contradiction") # data parser.add_argument("--word_emb_dim", type=int, default=300, help="word embedding dimension") """ CONFIGURATIONS """ config = parser.parse_args() """ DATA """ train, dev, test = ed.import_datasets(config.nli_path) word_vectors = ut.load_vocabulary(config.vocabulary_path) for sentence_type in ['premise', 'hypothesis']: for data_type in ['train', 'dev']: eval(data_type)[sentence_type] = np.array( [['<s>'] + [word for word in sent.split() if word in word_vectors] + ['</s>'] for sent in eval(data_type)[sentence_type]]) """ MODEL """ # model configurations config_nli_model = { 'word_emb_dim': config.word_emb_dim, 'bilstm_dim': config.bilstm_dim, 'lstm_layers': config.lstm_layers,
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.RawTextHelpFormatter, description=''' Train you model specifying parameters Use-cases: python train.py --depth=10 --seq_len=30 --data_size=5_000 --languages=hy-en,hy-ru,ru-en python train.py --depth=10 --seq_len=30 --data_size=5_000 --languages=hy,ru-en,en ''') parser.add_argument('--hdim', default=512, type=int, help='Dimension of hidden layers') parser.add_argument('--depth', default=2, type=int, help='Depth of network.') parser.add_argument('--batch_size', default=32, type=int, help='Batch size for learning.') parser.add_argument('--seq_len', default=100, type=int, help='Sequences size for splitting text for training.') parser.add_argument('--languages', default=None, required=True, help='Specify language to train.') # parser.add_argument('--grad_clip', default=100, type=int, help='') # parser.add_argument('--lr', default=0.01, type=float, help='') parser.add_argument('--epoch', default=10, type=int, help='Epochs of train.') # parser.add_argument('--model', default=None, help='') parser.add_argument('--prefix', default='m', help='Used for model name prefix.') # parser.add_argument('--start_from', default=0, type=float, help='') parser.add_argument( '--model_path', type=str, help= 'Specify model path to save, or will we saved under languages/<lang>models/model_name_prefix***' ) parser.add_argument( '--validate', type=bool, default=True, help='Evaluate percentage of validation data. Default:True') parser.add_argument( '--data_size', type=int, default=5_000_000, help='Split date size in chars: Set 0 to train all data.') args = parser.parse_args() languages = utils.parse_languages(args.languages) print("Languages to train: " + str(languages)) list_languages = [] for key, value in languages.items(): list_languages.append(key) if len(value) == 0: dirs = glob.glob('data_preprocessed/' + key + "/mapping_to_*") print(dirs) for dir in dirs: p = re.compile("mapping_to_(.*)") result = p.search(dir) list_languages.append(result.group(1)) else: list_languages.extend(value) list_languages = list(dict.fromkeys(list_languages)) list_languages.sort() print(list_languages) print("Loading Files") char_to_index, index_to_char, vocab_size, trans_to_index, index_to_trans, trans_vocab_size = \ utils.load_vocabulary(list_languages) print("vocab size: ", vocab_size) print("trans vocab size: ", trans_vocab_size) train_text, train_translated_text = utils.load_preprocessed_data( languages, args.data_size, 'train') print("Train text size:", len(train_text)) print("Train translated text size:", len(train_translated_text)) print(char_to_index) print('а' in char_to_index) print(ord('а')) x_train = utils.generator_biniries(train_text, args.seq_len, char_to_index) return # shuffle train data train_text = train_text.split('։') random.shuffle(train_text) train_text = '։'.join(train_text) if args.data_size != 0: val_size = round(args.data_size / 0.7 * 0.3) print("Data splitted, train:", args.data_size, ", val:", val_size) train_text = train_text[:args.data_size] # 226_849_593 val_text = val_text[:val_size] #34722649 import utilsk print("Building Network ...") model = utilsk.define_model(args.hdim, args.depth, trans_vocab_size, vocab_size, is_train=True) print(model.summary()) print("Preparing data ...") before_fit_time = datetime.now() (x_train, y_train) = utils.data_generator(train_text, args.seq_len, trans, trans_to_index, char_to_index, is_train=True) print("Training ...") history = model.fit(x_train, y_train, validation_split=0.1, epochs=args.epoch, 
batch_size=args.batch_size) loss = history.history["loss"][-1] print(history.history) # save model model_file_path = utils.get_model_file_path(args, before_fit_time, loss) model.save_weights(model_file_path) print('Model saved:', model_file_path) print("Validate exact....") if args.validate: (x_test, y_test) = utils.data_generator(val_text, args.seq_len, trans, trans_to_index, char_to_index, is_train=True) score = model.evaluate(x_test, y_test, verbose=1) print("Evaluated on validation data", score) else: print("Validation disabled.") utils.save_acc_loss_results(args, history) utils.write_results_file(args, history, train_text, val_text)
def evaluate(config, model, data_iter, test=False): model.eval() loss_intent_total = 0 loss_slot_total = 0 predict_slot_all = np.array([], dtype=int) predict_intent_all = np.array([], dtype=int) labels_slot_all = np.array([], dtype=int) labels_intent_all = np.array([], dtype=int) with torch.no_grad(): i = 0 for texts, labels, slot in data_iter: # print(i) if texts[0].shape[0] == 0 or labels.shape[0] == 0: continue outputs = model(texts) slot_outputs = outputs[0] intent_outputs = outputs[1] slot = slot.view(-1) # loss_intent = F.multi_margin_loss(intent_outputs, labels) intent_1 = torch.max(intent_outputs, dim=-1, keepdim=False)[0].cuda() loss_intent = F.cross_entropy(intent_1, labels) loss_slot = F.cross_entropy(slot_outputs, slot) loss_slot_total += loss_slot loss_intent_total += loss_intent labels = labels.data.cpu().numpy() slot = slot.data.cpu().numpy() predict_intent = torch.max(intent_1.data, 1)[1].cpu() predict_slot = torch.max(slot_outputs.data, 1)[1].cpu() labels_intent_all = np.append(labels_intent_all, labels) labels_slot_all = np.append(labels_slot_all, slot) predict_intent_all = np.append(predict_intent_all, predict_intent) predict_slot_all = np.append(predict_slot_all, predict_slot) i += 1 acc_intent = metrics.accuracy_score(labels_intent_all, predict_intent_all) new_labels_slot_all = [] new_predict_slot_all = [] for a, b in zip(labels_slot_all, predict_slot_all): if a == b and a == 72: continue else: new_labels_slot_all.append(a) new_predict_slot_all.append(b) new_labels_slot_all = np.array(new_labels_slot_all) new_predict_slot_all = np.array(new_predict_slot_all) acc_slot = metrics.accuracy_score(new_labels_slot_all, new_predict_slot_all) if test: import os from utils import load_vocabulary # slot_vocab = load_vocabulary(os.path.join(config.vocab_path, 'test_slot_vocab')) # slot_vocab['rev'] = slot_vocab['rev'][0:72] intent_vocab = load_vocabulary(os.path.join(config.vocab_path, 'intent_vocab')) report_intent = metrics.classification_report(labels_intent_all, predict_intent_all, target_names=intent_vocab['rev'], digits=4) # report_slot = metrics.classification_report(new_labels_slot_all, new_predict_slot_all, # target_names=slot_vocab['rev'], digits=4) # print(report_slot) confusion_intent = metrics.confusion_matrix(labels_intent_all, predict_intent_all) confusion_slot = metrics.confusion_matrix(new_labels_slot_all, new_predict_slot_all) return acc_intent, loss_intent_total / len(data_iter), report_intent, confusion_intent, loss_slot_total / len( data_iter), acc_slot, confusion_slot return acc_intent, loss_intent_total / len(data_iter), acc_slot, loss_slot_total / len(data_iter)
def main(): parser = argparse.ArgumentParser(description='Attention-based NMT') parser.add_argument('SOURCE_VOCAB', help='source vocabulary file') parser.add_argument('TARGET_VOCAB', help='target vocabulary file') parser.add_argument('model_npz', help='model file') parser.add_argument('--validation-source', help='source sentence list for validation') parser.add_argument('--validation-target', help='target sentence list for validation') parser.add_argument('--batchsize', '-b', type=int, default=128, help='number of sentence pairs in each mini-batch') parser.add_argument('--epoch', '-e', type=int, default=20, help='number of sweeps over the dataset to train') parser.add_argument('--gpu', '-g', type=int, default=-1, help='GPU ID (negative value indicates CPU)') parser.add_argument('--resume', '-r', default='', help='resume the training from snapshot') parser.add_argument('--encoder-unit', type=int, default=128, help='number of units') parser.add_argument('--encoder-layer', type=int, default=3, help='number of layers') parser.add_argument('--encoder-dropout', type=int, default=0.1, help='number of layers') parser.add_argument('--decoder-unit', type=int, default=128, help='number of units') parser.add_argument('--attention-unit', type=int, default=128, help='number of units') parser.add_argument('--maxout-unit', type=int, default=128, help='number of units') parser.add_argument('--min-source-sentence', type=int, default=1, help='minimium length of source sentence') parser.add_argument('--max-source-sentence', type=int, default=50, help='maximum length of source sentence') parser.add_argument('--log-interval', type=int, default=200, help='number of iteration to show log') parser.add_argument('--validation-interval', type=int, default=4000, help='number of iteration to evlauate the model ' 'with validation dataset') parser.add_argument('--out', '-o', default='result', help='directory to output the result') parser.add_argument('--debug', action='store_true', help='use a small part of training data') args = parser.parse_args() source_ids = load_vocabulary(args.SOURCE_VOCAB) target_ids = load_vocabulary(args.TARGET_VOCAB) target_words = {i: w for w, i in target_ids.items()} source_words = {i: w for w, i in source_ids.items()} model = Seq2seq(len(source_ids), len(target_ids), args.encoder_layer, args.encoder_unit, args.encoder_dropout, args.decoder_unit, args.attention_unit, args.maxout_unit) chainer.serializers.load_npz(args.model_npz, model) if args.gpu >= 0: chainer.cuda.get_device(args.gpu).use() model.to_gpu(args.gpu) m = MeCab('-Owakati') while True: line = input('> ') words = m.parse(line).split() words.append('<EOS>') x = np.zeros((1, len(words)), dtype=np.int32) for i in range(len(words)): x[0, i] = source_ids.get(words[i], UNK) result = model.translate(x) o_words = [] for i in range(len(result[0])): o_words.append(target_words.get(result[0][i], '<unk>')) if o_words[-1] == '<EOS>': o_words.pop() break print(" ".join(o_words))
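# The Chainer NMT scripts (this one and the training script further below) use a
# different load_vocabulary contract: a single {word: id} dict that the caller inverts
# itself via {i: w for w, i in ids.items()}. A hypothetical sketch under that
# assumption (one token per line, line order as id; reserved entries such as the
# UNK/EOS symbols are expected to already be listed in the vocabulary file):
def load_vocabulary(path):
    with open(path, 'r', encoding='utf-8') as f:
        words = [line.strip() for line in f if line.strip()]
    return {word: i for i, word in enumerate(words)}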
img_embedding_size = 4096

paths = {
    "ckpt": "./ckpt/mae.ckpt.batch2500",
    "test_data": "./data/test",
    "vocab_word": "./data/vocab_word.txt",
    "vocab_attr": "./data/vocab_attr.txt",
    "vocab_value": "./data/vocab_value.txt",
    "image_vector": "./data/image_fc_vectors.npy"
}

use_image = False

print("load data...")

w2i_word, i2w_word = load_vocabulary(paths["vocab_word"])
w2i_attr, i2w_attr = load_vocabulary(paths["vocab_attr"])
w2i_value, i2w_value = load_vocabulary(paths["vocab_value"])

data_processor = DataProcessor(
    paths["test_data"] + "/input.seq",
    paths["test_data"] + "/input.imageindex",
    paths["test_data"] + "/input.attr",
    paths["test_data"] + "/output.value",
    w2i_word,
    w2i_attr,
    w2i_value,
    shuffling=False
)

if use_image:
    outputs = np.array(outputs, dtype=np.int16).reshape((len(inputs), len(punctuations)))
    f = h5py.File(output_path + '.h5', "w")
    dset = f.create_dataset('inputs', data=inputs, dtype='i8')
    dset = f.create_dataset('outputs', data=outputs, dtype='i8')
    data = {"vocabulary": vocabulary, "punctuations": punctuations, "total_size": len(inputs)}
    with open(output_path + '.pkl', 'wb') as output_file:
        cPickle.dump(data, output_file, protocol=cPickle.HIGHEST_PROTOCOL)

PHASE1_TRAIN_PATH = "../data/train1"
PHASE1_DEV_PATH = "../data/dev1"
PUNCTUATIONS = {" ": 0, ".PERIOD": 1, ",COMMA": 2}
VOCABULARY_FILE = "../raw_data/vocab"
TRAIN_DATA = "../raw_data/train.txt"
DEV_DATA = "../raw_data/dev.txt"

if not os.path.exists("../data"):
    os.makedirs("../data")

print("Converting data...")
vocabulary = utils.load_vocabulary(VOCABULARY_FILE)
convert_files([TRAIN_DATA], vocabulary, PUNCTUATIONS, PHASE1_TRAIN_PATH)
convert_files([DEV_DATA], vocabulary, PUNCTUATIONS, PHASE1_DEV_PATH)
def main(_): vocab = load_vocabulary(FLAGS.data_dir) data_reader = DataReader(FLAGS.data_dir) model = Model(total_users=data_reader.total_users, total_items=data_reader.total_items, global_rating=data_reader.global_rating, num_factors=FLAGS.num_factors, img_dims=[196, 512], vocab_size=len(vocab), word_dim=FLAGS.word_dim, lstm_dim=FLAGS.lstm_dim, max_length=FLAGS.max_length, dropout_rate=FLAGS.dropout_rate) update_rating, update_review, global_step = train_fn(model) saver = tf.compat.v1.train.Saver(max_to_keep=10) log_file = open('log.txt', 'w') test_step = 0 config = tf.ConfigProto(allow_soft_placement=FLAGS.allow_soft_placement) config.gpu_options.allow_growth = True with tf.Session(config=config) as sess: sess.run(tf.global_variables_initializer()) for epoch in range(1, FLAGS.num_epochs + 1): log_info(log_file, "\nEpoch: {}/{}".format(epoch, FLAGS.num_epochs)) count = 0 sum_rating_loss = 0 sum_review_loss = 0 # Training for users, items, ratings in data_reader.read_train_set(FLAGS.batch_size, rating_only=True): count += 1 fd = model.feed_dict(users=users, items=items, ratings=ratings, is_training=True) _step, _, _rating_loss = sess.run([global_step, update_rating, model.rating_loss], feed_dict=fd) sum_rating_loss += _rating_loss review_users, review_items, _, photo_ids, reviews = get_review_data(users, items, ratings, data_reader.train_review) img_idx = [data_reader.train_id2idx[photo_id] for photo_id in photo_ids] images = data_reader.train_img_features[img_idx] fd = model.feed_dict(users=review_users, items=review_items, images=images, reviews=reviews, is_training=True) _, _review_loss = sess.run([update_review, model.review_loss], feed_dict=fd) sum_review_loss += _review_loss if _step % FLAGS.display_step == 0: data_reader.iter.set_postfix(rating_loss=(sum_rating_loss / count), review_loss=(sum_review_loss / count)) # Testing review_gen_corpus = defaultdict(list) review_ref_corpus = defaultdict(list) photo_bleu_scores = defaultdict(list) photo_rouge_scores = defaultdict(list) review_bleu_scores = defaultdict(list) review_rouge_scores = defaultdict(list) sess.run(model.init_metrics) for users, items, ratings in data_reader.read_test_set(FLAGS.batch_size, rating_only=True): test_step += 1 fd = model.feed_dict(users, items, ratings) sess.run(model.update_metrics, feed_dict=fd) review_users, review_items, review_ratings, photo_ids, reviews = get_review_data(users, items, ratings, data_reader.test_review) img_idx = [data_reader.test_id2idx[photo_id] for photo_id in photo_ids] images = data_reader.test_img_features[img_idx] fd = model.feed_dict(users=review_users, items=review_items, images=images) _reviews, _alphas, _betas = sess.run([model.sampled_reviews, model.alphas, model.betas], feed_dict=fd) gen_reviews = decode_reviews(_reviews, vocab) ref_reviews = [decode_reviews(batch_review_normalize(ref), vocab) for ref in reviews] for user, item, gen, refs in zip(review_users, review_items, gen_reviews, ref_reviews): review_gen_corpus[(user, item)].append(gen) review_ref_corpus[(user, item)] += refs bleu_scores = compute_bleu([refs], [gen], max_order=4, smooth=True) for order, score in bleu_scores.items(): photo_bleu_scores[order].append(score) rouge_scores = rouge([gen], refs) for metric, score in rouge_scores.items(): photo_rouge_scores[metric].append(score) _mae, _rmse = sess.run([model.mae, model.rmse]) log_info(log_file, '\nRating prediction results: MAE={:.3f}, RMSE={:.3f}'.format(_mae, _rmse)) log_info(log_file, '\nReview generation results:') log_info(log_file, '- Photo level: 
BLEU-scores = {:.2f}, {:.2f}, {:.2f}, {:.2f}'.format( np.array(photo_bleu_scores[1]).mean() * 100, np.array(photo_bleu_scores[2]).mean() * 100, np.array(photo_bleu_scores[3]).mean() * 100, np.array(photo_bleu_scores[4]).mean() * 100)) for user_item, gen_reviews in review_gen_corpus.items(): references = [list(ref) for ref in set(tuple(ref) for ref in review_ref_corpus[user_item])] user_item_bleu_scores = defaultdict(list) for gen in gen_reviews: bleu_scores = compute_bleu([references], [gen], max_order=4, smooth=True) for order, score in bleu_scores.items(): user_item_bleu_scores[order].append(score) for order, scores in user_item_bleu_scores.items(): review_bleu_scores[order].append(np.array(scores).mean()) user_item_rouge_scores = defaultdict(list) for gen in gen_reviews: rouge_scores = rouge([gen], references) for metric, score in rouge_scores.items(): user_item_rouge_scores[metric].append(score) for metric, scores in user_item_rouge_scores.items(): review_rouge_scores[metric].append(np.array(scores).mean()) log_info(log_file, '- Review level: BLEU-scores = {:.2f}, {:.2f}, {:.2f}, {:.2f}'.format( np.array(review_bleu_scores[1]).mean() * 100, np.array(review_bleu_scores[2]).mean() * 100, np.array(review_bleu_scores[3]).mean() * 100, np.array(review_bleu_scores[4]).mean() * 100)) for metric in ['rouge_1', 'rouge_2', 'rouge_l']: log_info(log_file, '- Photo level: {} = {:.2f}, {:.2f}, {:.2f}'.format( metric, np.array(photo_rouge_scores['{}/p_score'.format(metric)]).mean() * 100, np.array(photo_rouge_scores['{}/r_score'.format(metric)]).mean() * 100, np.array(photo_rouge_scores['{}/f_score'.format(metric)]).mean() * 100)) log_info(log_file, '- Review level: {} = {:.2f}, {:.2f}, {:.2f}'.format( metric, np.array(review_rouge_scores['{}/p_score'.format(metric)]).mean() * 100, np.array(review_rouge_scores['{}/r_score'.format(metric)]).mean() * 100, np.array(review_rouge_scores['{}/f_score'.format(metric)]).mean() * 100)) save_path = saver.save(sess, f"tmp/model{epoch}.ckpt") log_info(log_file, '')
def main(_): vocab = load_vocabulary(FLAGS.data_dir) if FLAGS.generating: data_reader = DataReader(FLAGS.data_dir, n_reviews=5, generating=True) else: data_reader = DataReader(FLAGS.data_dir) model = Model(total_users=data_reader.total_users, total_items=data_reader.total_items, global_rating=data_reader.global_rating, num_factors=FLAGS.num_factors, img_dims=[196, 512], vocab_size=len(vocab), word_dim=FLAGS.word_dim, lstm_dim=FLAGS.lstm_dim, max_length=FLAGS.max_length, dropout_rate=FLAGS.dropout_rate) saver = tf.compat.v1.train.Saver(max_to_keep=10) log_file = open('log.txt', 'w') test_step = 0 config = tf.ConfigProto(allow_soft_placement=FLAGS.allow_soft_placement) config.gpu_options.allow_growth = True with tf.Session(config=config) as sess: saver.restore(sess, FLAGS.ckpt_dir) print('Model succesfully restored') # Testing review_gen_corpus = defaultdict(list) review_ref_corpus = defaultdict(list) photo_bleu_scores = defaultdict(list) photo_rouge_scores = defaultdict(list) review_bleu_scores = defaultdict(list) review_rouge_scores = defaultdict(list) sess.run(model.init_metrics) for users, items, ratings in data_reader.read_real_test_set(FLAGS.batch_size, rating_only=True): test_step += 1 fd = model.feed_dict(users, items, ratings) sess.run(model.update_metrics, feed_dict=fd) review_users, review_items, review_ratings, photo_ids, reviews = get_review_data(users, items, ratings, data_reader.real_test_review) img_idx = [data_reader.real_test_id2idx[photo_id] for photo_id in photo_ids] images = data_reader.real_test_img_features[img_idx] fd = model.feed_dict(users=review_users, items=review_items, images=images) _reviews, _alphas, _betas = sess.run([model.sampled_reviews, model.alphas, model.betas], feed_dict=fd) gen_reviews = decode_reviews(_reviews, vocab) ref_reviews = [decode_reviews(batch_review_normalize(ref), vocab) for ref in reviews] if FLAGS.generating: for gen, ref in zip(gen_reviews, ref_reviews): gen_str = "GENERATED:\n"+" ".join(gen) ref_str = "REFERENCE:\n"+" ".join([" ".join(sentence) for sentence in ref])+"\n" log_info(log_file,gen_str) log_info(log_file,ref_str) for user, item, gen, refs in zip(review_users, review_items, gen_reviews, ref_reviews): review_gen_corpus[(user, item)].append(gen) review_ref_corpus[(user, item)] += refs bleu_scores = compute_bleu([refs], [gen], max_order=4, smooth=True) for order, score in bleu_scores.items(): photo_bleu_scores[order].append(score) rouge_scores = rouge([gen], refs) for metric, score in rouge_scores.items(): photo_rouge_scores[metric].append(score) _mae, _rmse = sess.run([model.mae, model.rmse]) log_info(log_file, '\nRating prediction results: MAE={:.3f}, RMSE={:.3f}'.format(_mae, _rmse)) log_info(log_file, '\nReview generation results:') log_info(log_file, '- Photo level: BLEU-scores = {:.2f}, {:.2f}, {:.2f}, {:.2f}'.format( np.array(photo_bleu_scores[1]).mean() * 100, np.array(photo_bleu_scores[2]).mean() * 100, np.array(photo_bleu_scores[3]).mean() * 100, np.array(photo_bleu_scores[4]).mean() * 100)) for user_item, gen_reviews in review_gen_corpus.items(): references = [list(ref) for ref in set(tuple(ref) for ref in review_ref_corpus[user_item])] user_item_bleu_scores = defaultdict(list) for gen in gen_reviews: bleu_scores = compute_bleu([references], [gen], max_order=4, smooth=True) for order, score in bleu_scores.items(): user_item_bleu_scores[order].append(score) for order, scores in user_item_bleu_scores.items(): review_bleu_scores[order].append(np.array(scores).mean()) user_item_rouge_scores = defaultdict(list) for gen in 
gen_reviews: rouge_scores = rouge([gen], references) for metric, score in rouge_scores.items(): user_item_rouge_scores[metric].append(score) for metric, scores in user_item_rouge_scores.items(): review_rouge_scores[metric].append(np.array(scores).mean()) log_info(log_file, '- Review level: BLEU-scores = {:.2f}, {:.2f}, {:.2f}, {:.2f}'.format( np.array(review_bleu_scores[1]).mean() * 100, np.array(review_bleu_scores[2]).mean() * 100, np.array(review_bleu_scores[3]).mean() * 100, np.array(review_bleu_scores[4]).mean() * 100)) for metric in ['rouge_1', 'rouge_2', 'rouge_l']: log_info(log_file, '- Photo level: {} = {:.2f}, {:.2f}, {:.2f}'.format( metric, np.array(photo_rouge_scores['{}/p_score'.format(metric)]).mean() * 100, np.array(photo_rouge_scores['{}/r_score'.format(metric)]).mean() * 100, np.array(photo_rouge_scores['{}/f_score'.format(metric)]).mean() * 100)) log_info(log_file, '- Review level: {} = {:.2f}, {:.2f}, {:.2f}'.format( metric, np.array(review_rouge_scores['{}/p_score'.format(metric)]).mean() * 100, np.array(review_rouge_scores['{}/r_score'.format(metric)]).mean() * 100, np.array(review_rouge_scores['{}/f_score'.format(metric)]).mean() * 100))
def convert_file(file_path, vocab_file, punct_file, output_path): punctuations = {" ":0, ".":1, ",":2} punctuations = utils.load_punctuations(punct_file) vocabulary = utils.load_vocabulary(vocab_file) punctuation = " " time_steps = 1 #to be used in future experiments filename = 'database' # output file name f = h5py.File(os.path.join(output_path, filename+'.h5'), "w") input_dset = f.create_dataset('inputs', (100, time_steps,len(vocabulary)), dtype='i8', maxshape=(None, time_steps, len(vocabulary))) output_dset = f.create_dataset('outputs', (100, len(punctuations)), dtype='i8', maxshape=(None, len(punctuations))) data_counter = 0 with open(file_path, 'r') as corpus: for line in corpus: array = np.zeros(shape=(1, len(vocabulary)), dtype=np.int8) array[0,utils.input_word_index(vocabulary, "<START>")] = 1 input_dset[data_counter] = array array = np.zeros(shape=(1, len(punctuations)), dtype=np.int8) array[0,utils.punctuation_index(punctuations, " ")] = 1 output_dset[data_counter] = array data_counter += 1 if data_counter == input_dset.shape[0]: input_dset.resize(input_dset.shape[0]+1000, axis=0) output_dset.resize(output_dset.shape[0]+1000, axis=0) for token in line.split(): if token in punctuations: punctuation = token continue else: array = np.zeros(shape=(1, len(vocabulary)), dtype=np.int8) array[0,utils.input_word_index(vocabulary, token)] = 1 input_dset[data_counter] = array array = np.zeros(shape=(1, len(punctuations)), dtype=np.int8) array[0,utils.punctuation_index(punctuations, punctuation)] = 1 output_dset[data_counter] = array punctuation = " " data_counter += 1 if data_counter == input_dset.shape[0]: input_dset.resize(input_dset.shape[0]+1000, axis=0) output_dset.resize(output_dset.shape[0]+1000, axis=0) array = np.zeros(shape=(1, len(vocabulary)), dtype=np.int8) array[0,utils.input_word_index(vocabulary, "<END>")] = 1 input_dset[data_counter] = array array = np.zeros(shape=(1, len(punctuations)), dtype=np.int8) array[0,utils.punctuation_index(punctuations, punctuation)] = 1 output_dset[data_counter] = array data_counter += 1 if data_counter == input_dset.shape[0]: input_dset.resize(input_dset.shape[0]+1000, axis=0) output_dset.resize(output_dset.shape[0]+1000, axis=0) input_dset.resize(data_counter, axis=0) output_dset.resize(data_counter, axis=0) data = {"vocabulary": vocabulary, "punctuations": punctuations, "total_size": data_counter} with open(os.path.join(output_path, filename+'.pkl'), 'wb') as output_file: cPickle.dump(data, output_file, protocol=cPickle.HIGHEST_PROTOCOL) print("Done!")
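# convert_file above writes a database.h5 file holding the one-hot 'inputs'/'outputs'
# datasets plus a database.pkl with the vocabulary, punctuations and total_size. A
# minimal read-back sketch assuming exactly that layout (hypothetical helper, not part
# of the original code):
import os
import pickle
import h5py

def load_converted(output_path, filename='database'):
    with open(os.path.join(output_path, filename + '.pkl'), 'rb') as f:
        meta = pickle.load(f)  # {'vocabulary', 'punctuations', 'total_size'}
    with h5py.File(os.path.join(output_path, filename + '.h5'), 'r') as h5:
        inputs = h5['inputs'][:meta['total_size']]
        outputs = h5['outputs'][:meta['total_size']]
    return inputs, outputs, meta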
def main(): parser = argparse.ArgumentParser(description='CKBC') parser.add_argument('TRAIN', help='training dataset') parser.add_argument('CONCEPT_VOCAB', help='concept vocabulary') parser.add_argument('RELATION_VOCAB', help='relation vocabulary') parser.add_argument('--validation1', help='validation dataset (1)') parser.add_argument('--validation2', help='validation dataset (2)') parser.add_argument('--test', help='test dataset') parser.add_argument('--batchsize', '-b', type=int, default=128, help='number of sentence pairs in each mini-batch') parser.add_argument('--epoch', '-e', type=int, default=20, help='number of sweeps over the dataset to train') parser.add_argument('--gpu', '-g', type=int, default=-1, help='GPU ID (negative value indicates CPU)') parser.add_argument('--concept-unit', type=int, default=256, help='number of concept units') parser.add_argument('--relation-unit', type=int, default=256, help='number of relation units') parser.add_argument('--dropout', type=int, default=0.1, help='number of layers') parser.add_argument('--log-interval', type=int, default=200, help='number of iteration to show log') parser.add_argument('--embedding', default='', help='path to pretrained word embedding') parser.add_argument('--finetune-embedding', action='store_true', help='finetune pretrained embedding') parser.add_argument('--validation-interval', type=int, default=4000, help='number of iteration to evlauate the model ' 'with validation dataset') parser.add_argument('--out', '-o', default='result', help='directory to output the result') parser.add_argument('--debug', action='store_true', help='use a small part of training data') args = parser.parse_args() concept_ids = load_vocabulary(args.CONCEPT_VOCAB) relation_ids = load_vocabulary(args.RELATION_VOCAB) train_facts = load_data( concept_ids, relation_ids, args.TRAIN, debug=args.debug ) train_data = [(h, r, t, y) for h, r, t, y in six.moves.zip(*train_facts)] train_head_unk = calculate_unknown_ratio( [h for h, _, _, _ in train_data] ) train_relation_unk = calculate_unknown_ratio( [r for _, r, _, _ in train_data] ) train_tail_unk = calculate_unknown_ratio( [t for _, _, t, _ in train_data] ) embedding = load_embedding(args.embedding, concept_ids) \ if args.embedding else None n_embed = embedding.shape[1] \ if embedding is not None else args.concept_unit print('Concept vocabulary size: %d' % len(concept_ids)) print('Relation vocabulary size: %d' % len(relation_ids)) print('Train data size: %d' % len(train_data)) print('Train head unknown: %.2f' % train_head_unk) print('Train relation unknown: %.2f' % train_relation_unk) print('Train tail unknown: %.2f' % train_tail_unk) if args.embedding: print('Pretrained word embedding: %s' % args.embedding) print('Fine-tune word embedding: %s' % args.finetune_embedding) model = BilinearAVG( len(concept_ids), len(relation_ids), n_embed, args.relation_unit, args.dropout, embedding=embedding ) if args.gpu >= 0: chainer.cuda.get_device(args.gpu).use() model.to_gpu(args.gpu) optimizer = chainer.optimizers.Adam() optimizer.setup(model) if args.embedding != '' and not args.finetune_embedding: print('Freezing word embeddings...') model.concept_encoder.disable_update() train_iter = chainer.iterators.SerialIterator(train_data, args.batchsize) updater = training.StandardUpdater( train_iter, optimizer, converter=fact_pad_concat_convert, device=args.gpu ) trainer = training.Trainer(updater, (args.epoch, 'epoch')) trainer.extend( extensions.LogReport(trigger=(args.log_interval, 'iteration')) ) trainer.extend( 
extensions.PrintReport( ['epoch', 'iteration', 'main/loss', 'validation/main/loss', 'validation/main/accuracy', 'validation/main/threshold', 'elapsed_time'] ), trigger=(args.log_interval, 'iteration') ) if args.validation1 and args.validation2: test_facts = load_data( concept_ids, relation_ids, args.validation1 ) test_data1 = [(h, r, t, y) for h, r, t, y in six.moves.zip(*test_facts)] test_head_unk = calculate_unknown_ratio( [h for h, _, _, _ in test_data1] ) test_relation_unk = calculate_unknown_ratio( [r for _, r, _, _ in test_data1] ) test_tail_unk = calculate_unknown_ratio( [t for _, _, t, _ in test_data1] ) print('Validation data: %d' % len(test_data1)) print('Validation head unknown: %.2f' % test_head_unk) print('Validation relation unknown: %.2f' % test_relation_unk) print('Validation tail unknown: %.2f' % test_tail_unk) test_facts = load_data( concept_ids, relation_ids, args.validation2 ) test_data2 = [(h, r, t, y) for h, r, t, y in six.moves.zip(*test_facts)] test_head_unk = calculate_unknown_ratio( [h for h, _, _, _ in test_data2] ) test_relation_unk = calculate_unknown_ratio( [r for _, r, _, _ in test_data2] ) test_tail_unk = calculate_unknown_ratio( [t for _, _, t, _ in test_data2] ) print('Validation data: %d' % len(test_data2)) print('Validation head unknown: %.2f' % test_head_unk) print('Validation relation unknown: %.2f' % test_relation_unk) print('Validation tail unknown: %.2f' % test_tail_unk) trainer.extend( CalculateAccuracy( model, test_data1, test_data2, device=args.gpu, key_accuracy='validation/main/accuracy', key_threshold='validation/main/threshold' ), trigger=(args.validation_interval, 'iteration') ) print('start training') trainer.run()
# Create date : 2020/12/31 17:07
# IDE         : pycharm
# =====================================
import tensorflow as tf
import os

from model_lstm_crf import MyModel
from utils import DataProcessor_LSTM as DataProcessor
from utils import load_vocabulary
from utils import extract_kvpairs_in_bio
from utils import cal_f1_score

os.environ['CUDA_VISIBLE_DEVICES'] = '3'

lstm_crf_ckpt = "models/"
base_dir = "./data/ner_data"

w2i_char, i2w_char = load_vocabulary(os.path.join(base_dir, "vocab.txt"))
w2i_bio, i2w_bio = load_vocabulary(os.path.join(base_dir, "vocab_bio.txt"))

data_processor = DataProcessor(
    os.path.join(base_dir, 'valid.txt'),
    os.path.join(base_dir, "valid_bio.txt"),
    w2i_char,
    w2i_bio,
    shuffling=True
)

model = MyModel(embedding_dim=300,
                hidden_dim=300,
                vocab_size_char=len(w2i_char),
                vocab_size_bio=len(w2i_bio),
elif config.dataset == 'atis':
    print('use atis dataset')

model_name = 'capsule'

from utils import build_dataset, build_iterator, get_time_dif, load_vocabulary, build_vocab

torch.manual_seed(1)
torch.cuda.manual_seed_all(1)
torch.backends.cudnn.deterministic = True

start_time = time.time()
print('loading data...')
build_vocab(config.input_file, os.path.join(config.vocab_path, 'in_vocab'))
build_vocab(config.slot_file, os.path.join(config.vocab_path, 'slot_vocab'))
build_vocab(config.intent_file, os.path.join(config.vocab_path, 'intent_vocab'), pad=False, unk=False)
in_vocab = load_vocabulary(os.path.join(config.vocab_path, 'in_vocab'))
slot_vocab = load_vocabulary(os.path.join(config.vocab_path, 'slot_vocab'))
intent_vocab = load_vocabulary(os.path.join(config.vocab_path, 'intent_vocab'))
train_data, dev_data, test_data = build_dataset(in_vocab['vocab'], slot_vocab['vocab'], intent_vocab['vocab'])
train_iter = build_iterator(train_data)
dev_iter = build_iterator(dev_data)
test_iter = build_iterator(test_data)
time_dif = get_time_dif(start_time)
print('time usage:', time_dif)

config.n_vocab = len(in_vocab['vocab'])
x = import_module(model_name)
model = x.Model(config).to(torch.device('cuda'))
init_network(model)
if os.path.exists(log_file_path):
    os.remove(log_file_path)

logger = logging.getLogger()
logger.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s | %(message)s", "%Y-%m-%d %H:%M:%S")
chlr = logging.StreamHandler()
chlr.setFormatter(formatter)
fhlr = logging.FileHandler(log_file_path)
fhlr.setFormatter(formatter)
logger.addHandler(chlr)
logger.addHandler(fhlr)

logger.info("loading vocab...")

w2i_char, i2w_char = load_vocabulary("data/vocab_char.txt")
w2i_bio, i2w_bio = load_vocabulary("data/vocab_bio.txt")
w2i_attr, i2w_attr = load_vocabulary("data/vocab_attr.txt")

logger.info("loading data...")

data_processor_train = DataProcessor("data/train/input.seq.char",
                                     "data/train/output.seq.bio",
                                     "data/train/output.seq.attr",
                                     w2i_char,
                                     w2i_bio,
                                     w2i_attr,
                                     shuffling=True)

data_processor_valid = DataProcessor("data/test/input.seq.char",
                                     "data/test/output.seq.bio",
                                     "data/test/output.seq.attr",
                                     w2i_char,
"img_block_num": 49, # # num of regional image features (7×7=49) "attn_size": 200, # hidden dim in attention "batch_size": 128, # batch size "dropout_prob": 0 # probability of dropout layers } paths = { "ckpt": "./ckpt/model.ckpt", "vocab": "./vocab", "embedded": "./data/embedded", "train_data": "./data/train", "valid_data": "./data/valid", "test_data": "./data/test" } w2i_word, i2w_word = load_vocabulary(paths["vocab"] + "/vocab.word") w2i_bio, i2w_bio = load_vocabulary(paths["vocab"] + "/vocab.bio") w2i_label, i2w_label = load_vocabulary(paths["vocab"] + "/vocab.label") # embedding_container: restore all vectors encoded by pre-trained bert and resnet embedding_container = EmbeddingContainer( paths["embedded"] + "/sids_of_txts", # indexes to find text encoded vector paths["embedded"] + "/txts.embedded.npy", # text encoded by pre-trained bert, shape=[N, max_len_of_word_seqs, dim_of_bert_output] paths["embedded"] + "/txts.embeddedG.npy", # vectors of [CLS] encoded by a pre-trained bert, shape=[N, dim_of_bert_output] paths["embedded"] + "/cids_of_imgs", # indexes to find image encoded vector paths["embedded"] + "/imgs.embedded.npy", # image encoded by pre-trained resnet, shape=[N, image_region_num, dim_of_resnet_output]
# set logging
log_file_path = "./ckpt/run.log"
if os.path.exists(log_file_path):
    os.remove(log_file_path)

logger = logging.getLogger()
logger.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s | %(message)s", "%Y-%m-%d %H:%M:%S")
chlr = logging.StreamHandler()
chlr.setFormatter(formatter)
fhlr = logging.FileHandler(log_file_path)
fhlr.setFormatter(formatter)
logger.addHandler(chlr)
logger.addHandler(fhlr)

logger.info("loading vocab...")

w2i_char, i2w_char = load_vocabulary("./data/vocab_char.txt")
w2i_bio, i2w_bio = load_vocabulary("./data/vocab_bioattr.txt")

logger.info("loading data...")

data_processor_train = DataProcessor("./data/train/input.seq.char",
                                     "./data/train/output.seq.bioattr",
                                     w2i_char,
                                     w2i_bio,
                                     shuffling=True)

data_processor_valid = DataProcessor("./data/test/input.seq.char",
                                     "./data/test/output.seq.bioattr",
                                     w2i_char,
                                     w2i_bio,
                                     shuffling=True)
def main(): parser = argparse.ArgumentParser(description='Attention-based NMT') parser.add_argument('SOURCE', help='source sentence list') parser.add_argument('TARGET', help='target sentence list') parser.add_argument('SOURCE_VOCAB', help='source vocabulary file') parser.add_argument('TARGET_VOCAB', help='target vocabulary file') parser.add_argument('--validation-source', help='source sentence list for validation') parser.add_argument('--validation-target', help='target sentence list for validation') parser.add_argument('--batchsize', '-b', type=int, default=128, help='number of sentence pairs in each mini-batch') parser.add_argument('--epoch', '-e', type=int, default=20, help='number of sweeps over the dataset to train') parser.add_argument('--gpu', '-g', type=int, default=-1, help='GPU ID (negative value indicates CPU)') parser.add_argument('--resume', '-r', default='', help='resume the training from snapshot') parser.add_argument('--encoder-unit', type=int, default=128, help='number of units') parser.add_argument('--encoder-layer', type=int, default=3, help='number of layers') parser.add_argument('--encoder-dropout', type=int, default=0.1, help='number of layers') parser.add_argument('--decoder-unit', type=int, default=128, help='number of units') parser.add_argument('--attention-unit', type=int, default=128, help='number of units') parser.add_argument('--maxout-unit', type=int, default=128, help='number of units') parser.add_argument('--min-source-sentence', type=int, default=1, help='minimium length of source sentence') parser.add_argument('--max-source-sentence', type=int, default=50, help='maximum length of source sentence') parser.add_argument('--log-interval', type=int, default=200, help='number of iteration to show log') parser.add_argument('--validation-interval', type=int, default=4000, help='number of iteration to evlauate the model ' 'with validation dataset') parser.add_argument('--out', '-o', default='result', help='directory to output the result') parser.add_argument('--debug', action='store_true', help='use a small part of training data') args = parser.parse_args() source_ids = load_vocabulary(args.SOURCE_VOCAB) target_ids = load_vocabulary(args.TARGET_VOCAB) train_source = load_data(source_ids, args.SOURCE, debug=args.debug) train_target = load_data(target_ids, args.TARGET, debug=args.debug) assert len(train_source) == len(train_target) train_data = [(s, t) for s, t in six.moves.zip(train_source, train_target) if args.min_source_sentence <= len(s) <= args.max_source_sentence and args.min_source_sentence <= len(t) <= args.max_source_sentence] train_source_unk = calculate_unknown_ratio( [s for s, _ in train_data] ) train_target_unk = calculate_unknown_ratio( [t for _, t in train_data] ) print('Source vocabulary size: {}'.format(len(source_ids))) print('Target vocabulary size: {}'.format(len(target_ids))) print('Train data size: {}'.format(len(train_data))) print('Train source unknown: {0:.2f}'.format(train_source_unk)) print('Train target unknown: {0:.2f}'.format(train_target_unk)) target_words = {i: w for w, i in target_ids.items()} source_words = {i: w for w, i in source_ids.items()} model = Seq2seq(len(source_ids), len(target_ids), args.encoder_layer, args.encoder_unit, args.encoder_dropout, args.decoder_unit, args.attention_unit, args.maxout_unit) if args.gpu >= 0: chainer.cuda.get_device(args.gpu).use() model.to_gpu(args.gpu) optimizer = chainer.optimizers.Adam() optimizer.setup(model) train_iter = chainer.iterators.SerialIterator(train_data, args.batchsize) updater = 
training.StandardUpdater( train_iter, optimizer, converter=seq2seq_pad_concat_convert, device=args.gpu ) trainer = training.Trainer(updater, (args.epoch, 'epoch')) trainer.extend( extensions.LogReport(trigger=(args.log_interval, 'iteration')) ) trainer.extend( extensions.PrintReport( ['epoch', 'iteration', 'main/loss', 'validation/main/loss', 'main/perp', 'validation/main/perp', 'validation/main/bleu', 'elapsed_time'] ), trigger=(args.log_interval, 'iteration') ) if args.validation_source and args.validation_target: test_source = load_data(source_ids, args.validation_source) test_target = load_data(target_ids, args.validation_target) assert len(test_source) == len(test_target) test_data = list(six.moves.zip(test_source, test_target)) test_data = [(s, t) for s, t in test_data if 0 < len(s) and 0 < len(t)] test_source_unk = calculate_unknown_ratio( [s for s, _ in test_data] ) test_target_unk = calculate_unknown_ratio( [t for _, t in test_data] ) print('Validation data: {}'.format(len(test_data))) print('Validation source unknown: {0:.2f}'.format(test_source_unk)) print('Validation target unknown: {0:.2f}'.format(test_target_unk)) @chainer.training.make_extension() def translate(_): source, target = seq2seq_pad_concat_convert( [test_data[numpy.random.choice(len(test_data))]], args.gpu ) result = model.translate(source)[0].reshape(1, -1) source, target, result = source[0], target[0], result[0] source_sentence = ' '.join([source_words[int(x)] for x in source]) target_sentence = ' '.join([target_words[int(y)] for y in target]) result_sentence = ' '.join([target_words[int(y)] for y in result]) print('# source : ' + source_sentence) print('# result : ' + result_sentence) print('# expect : ' + target_sentence) trainer.extend( translate, trigger=(args.validation_interval, 'iteration') ) trainer.extend( CalculateBleu( model, test_data, device=args.gpu, key='validation/main/bleu' ), trigger=(args.validation_interval, 'iteration') ) print('start training') trainer.run() chainer.serializers.save_npz('%s/model.npz' % args.out, model)
else: print("use own dataset: ", arg.dataset) full_train_path = os.path.join("./data", arg.dataset, arg.train_data_path) full_test_path = os.path.join('./data', arg.dataset, arg.test_data_path) full_valid_path = os.path.join('./data', arg.dataset, arg.valid_data_path) create_vocabulary(os.path.join(full_train_path, arg.input_file), os.path.join(arg.vocab_path, "in_vocab")) create_vocabulary(os.path.join(full_train_path, arg.slot_file), os.path.join(arg.vocab_path, "slot_vocab")) create_vocabulary(os.path.join(full_train_path, arg.intent_file), os.path.join(arg.vocab_path, "intent_vocab")) # {word 2 id, words list} in_vocab = load_vocabulary(os.path.join(arg.vocab_path, "in_vocab")) slot_vocab = load_vocabulary(os.path.join(arg.vocab_path, "slot_vocab")) intent_vocab = load_vocabulary(os.path.join(arg.vocab_path, "intent_vocab")) def create_model(input_data, input_size, sequence_length, slot_size, intent_size, layer_size=128, is_training=True): """ input_data: 输入数据[batch, len] input_size: 输入数据中单词的个数 sequence_length: 数据的长度[batch]
# set logging
log_file_path = "./ckpt/run.log"
if os.path.exists(log_file_path):
    os.remove(log_file_path)

logger = logging.getLogger()
logger.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s | %(message)s", "%Y-%m-%d %H:%M:%S")
chlr = logging.StreamHandler()
chlr.setFormatter(formatter)
fhlr = logging.FileHandler(log_file_path)
fhlr.setFormatter(formatter)
logger.addHandler(chlr)
logger.addHandler(fhlr)

logger.info("loading vocab...")

w2i_char, i2w_char = load_vocabulary("./data/vocab_char.txt")
w2i_word, i2w_word = load_vocabulary("./data/vocab_word.txt")
w2i_bio, i2w_bio = load_vocabulary("./data/vocab_bio.txt")
w2i_attr, i2w_attr = load_vocabulary("./data/vocab_attr.txt")

logger.info("loading data...")

data_processor_train = DataProcessor("./data/train/input.seq.char",
                                     "./data/train/input.seq.word",
                                     "./data/train/output.seq.bio",
                                     "./data/train/output.seq.attr",
                                     w2i_char,
                                     w2i_word,
                                     w2i_bio,
                                     w2i_attr,
                                     shuffling=True)
def main(): parser = argparse.ArgumentParser() parser.add_argument('--hdim', default=512, type=int) parser.add_argument('--grad_clip', default=100, type=int) parser.add_argument('--lr', default=0.01, type=float) parser.add_argument('--batch_size', default=50, type=int) parser.add_argument('--num_epochs', default=50, type=int) parser.add_argument('--seq_len', default=60, type=int) parser.add_argument('--depth', default=1, type=int) parser.add_argument('--model', default=None) parser.add_argument('--model_name_prefix', default='model') parser.add_argument('--language', default='hy-AM') parser.add_argument('--start_from', default=0, type=float) args = parser.parse_args() print("Loading Files") (char_to_index, index_to_char, vocab_size, trans_to_index, index_to_trans, trans_vocab_size) = utils.load_vocabulary(language = args.language) (train_text, val_text, trans) = utils.load_language_data(language = args.language) data_size = len(train_text) print("Building Network ...") (output_layer, train, cost) = utils.define_model(args.hdim, args.depth, args.lr, args.grad_clip, trans_vocab_size, vocab_size, is_train = True) if args.model: f = np.load('languages/' + args.language + '/models/' + args.model) param_values = [np.float32(f[i]) for i in range(len(f))] lasagne.layers.set_all_param_values(output_layer, param_values) print("Training ...") step_cnt = 0 date_at_beginning = datetime.now() last_time = date_at_beginning for epoch in range(args.num_epochs): train_text = train_text.split(u'։') random.shuffle(train_text) train_text = u'։'.join(train_text) avg_cost = 0.0 count = 0 num_of_samples = 0 num_of_chars = 0 for (x, y) in utils.data_generator(train_text, args.seq_len, args.batch_size, trans, trans_to_index, char_to_index, is_train = True): sample_cost = train(x, np.reshape(y,(-1,vocab_size))) sample_cost = float(sample_cost) count += 1 num_of_samples += x.shape[0] num_of_chars += x.shape[0] * x.shape[1] time_now = datetime.now() if (time_now - last_time).total_seconds() > 60 * 1: # 10 minutes print('Computing validation loss...') val_cost = 0.0 val_count = 0.0 for ((x_val, y_val, indices, delimiters), non_valids_list) in utils.data_generator(val_text, args.seq_len, args.batch_size, trans, trans_to_index, char_to_index, is_train = False): val_cost += x_val.shape[0] *cost(x_val,np.reshape(y_val,(-1,vocab_size))) val_count += x_val.shape[0] print('Validation loss is {}'.format(val_cost/val_count)) file_name = 'languages/{}/models/{}.hdim{}.depth{}.seq_len{}.bs{}.time{:4f}.epoch{}.loss{:.4f}'.format(args.language, args.model_name_prefix, args.hdim, args.depth, args.seq_len, args.batch_size, (time_now - date_at_beginning).total_seconds()/60, epoch, val_cost/val_count) print("saving to -> " + file_name) np.save(file_name, lasagne.layers.get_all_param_values(output_layer)) last_time = datetime.now() print("On step #{} loss is {:.4f}, samples passed {}, chars_passed {}, {:.4f}% of an epoch {} time passed {:4f}"\ .format(count, sample_cost, num_of_samples, num_of_chars, 100.0*num_of_chars/len(train_text), epoch, (time_now - date_at_beginning).total_seconds()/60.0)) avg_cost += sample_cost
                    default=0, help='1 for test, or for training')
parser.add_argument('--seed', type=int, default=1111, help='random seed')
parser.add_argument('--resume', default='insurance/V2/checkpoints/model_best.tar',
                    type=str, metavar='PATH', help='path to saved params')
args = parser.parse_args()

# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)

PAD = '<PAD>'
id_to_word, label_to_ans, label_to_ans_text = load_vocabulary(
    'insuranceQA/V2/vocabulary',
    'insuranceQA/V2/InsuranceQA.label2answer.token.encoded')
w2i = {w: i for i, w in enumerate(id_to_word.values(), 1)}
w2i[PAD] = 0
vocab_size = len(w2i)
print('vocab_size:', vocab_size)

train_data = load_data(
    'insuranceQA/V2/InsuranceQA.question.anslabel.token.500.pool.solr.train.encoded',
    id_to_word, label_to_ans_text)
test_data = load_data2(
    'insuranceQA/V2/InsuranceQA.question.anslabel.token.500.pool.solr.test.encoded',
    id_to_word, label_to_ans_text)
print('n_train:', len(train_data))
print('n_test:', len(test_data))
def main(): parser = argparse.ArgumentParser() parser.add_argument('--hdim', default=512, type=int) parser.add_argument('--grad_clip', default=100, type=int) parser.add_argument('--lr', default=0.01, type=float) parser.add_argument('--batch_size', default=50, type=int) parser.add_argument('--num_epochs', default=10, type=int) parser.add_argument('--seq_len', default=60, type=int) parser.add_argument('--depth', default=1, type=int) parser.add_argument('--model', default=None) parser.add_argument('--model_name_prefix', default='model') parser.add_argument('--language', default='hy-AM') parser.add_argument('--start_from', default=0, type=float) args = parser.parse_args() print("Loading Files") (char_to_index, index_to_char, vocab_size, trans_to_index, index_to_trans, trans_vocab_size) = utils.load_vocabulary(language=args.language) (train_text, val_text, trans) = utils.load_language_data(language=args.language) data_size = len(train_text) print("Building Network ...") (output_layer, train, cost) = utils.define_model(args.hdim, args.depth, args.lr, args.grad_clip, trans_vocab_size, vocab_size, is_train=True) if args.model: f = np.load('languages/' + args.language + '/models/' + args.model) param_values = [np.float32(f[i]) for i in range(len(f))] lasagne.layers.set_all_param_values(output_layer, param_values) print("Training ...") p = int(len(train_text) * args.start_from) + 1 step_cnt = 0 avg_cost = 0 it = 0 while it < args.num_epochs: avg_cost = 0 date_at_beginning = datetime.now() non_native_skipped = 0 for _ in range(PRINT_FREQ): x, y, p, turned, non_native_sequences = utils.gen_data( p, args.seq_len, args.batch_size, train_text, trans, trans_to_index, char_to_index) if turned: it += 1 avg_cost += train(x, np.reshape(y, (-1, vocab_size))) non_native_skipped += non_native_sequences date_after = datetime.now() print("Epoch {} average loss = {} Time {} sec. Nonnatives skipped {}". format(1.0 * it + 1.0 * p / data_size, avg_cost / PRINT_FREQ, (date_after - date_at_beginning).total_seconds(), non_native_skipped)) step_cnt += 1 if True: #step_cnt * args.batch_size > 100000: print('computing validation loss...') val_turned = False val_p = 0 val_steps = 0. val_cost = 0. while not val_turned: x, y, val_p, val_turned, non_native = utils.gen_data( val_p, args.seq_len, args.batch_size, val_text, trans, trans_to_index, char_to_index) val_steps += 1 val_cost += cost(x, np.reshape(y, (-1, vocab_size))) print('validation loss is ' + str(val_cost / val_steps)) file_name = 'languages/' + args.language + '/models/' + args.model_name_prefix + '.hdim' + str( args.hdim) + '.depth' + str(args.depth) + '.seq_len' + str( args.seq_len) + '.bs' + str( args.batch_size) + '.epoch' + str( 1.0 * it + 1.0 * p / data_size) + '.loss' + str( avg_cost / PRINT_FREQ) + '.npz' print("saving to -> " + file_name) np.save(file_name, lasagne.layers.get_all_param_values(output_layer)) step_cnt = 0
np.random.seed(conf.RANDOM_SEED)
t0 = time()

### convert data ###
if not os.path.exists(
        "/Users/mayili/Documents/intern/NLP/punctuation/punctuator-master/data"):
    print("Converting data...\n")
    os.makedirs(
        "/Users/mayili/Documents/intern/NLP/punctuation/punctuator-master/data")
    vocabulary = utils.load_vocabulary(conf.VOCABULARY_FILE)
    converter.convert_files(conf.PHASE1["TRAIN_DATA"], vocabulary, conf.PUNCTUATIONS,
                            conf.BATCH_SIZE, False, PHASE1_TRAIN_PATH)
    converter.convert_files(conf.PHASE1["DEV_DATA"], vocabulary, conf.PUNCTUATIONS,
                            conf.BATCH_SIZE, False, PHASE1_DEV_PATH)
# =============================================================================
# if conf.PHASE2["TRAIN_DATA"] and conf.PHASE2["DEV_DATA"]:
#     converter.convert_files(conf.PHASE2["TRAIN_DATA"], vocabulary, conf.PUNCTUATIONS, conf.BATCH_SIZE, conf.PHASE2["USE_PAUSES"], PHASE2_TRAIN_PATH)
#     converter.convert_files(conf.PHASE2["DEV_DATA"], vocabulary, conf.PUNCTUATIONS, conf.BATCH_SIZE, conf.PHASE2["USE_PAUSES"], PHASE2_DEV_PATH)
# =============================================================================

### train model ###
print("Training model...\n")