def train_main(args):
    """
    trains model specified in args. main method for train subcommand.
    """
    # load text
    text = load_text(args.text_path)
    if args.test_path:
        test_text = load_text(args.test_path)
    else:
        test_text = None

    # load or build model
    if args.restore:
        load_path = args.checkpoint_path if args.restore is True else args.restore
        model = load_model(load_path)
        logger.info("model restored: %s.", load_path)
    else:
        model = build_model(batch_size=args.batch_size,
                            seq_len=args.seq_len,
                            vocab_size=get_VOCAB_SIZE(),
                            embedding_size=args.embedding_size,
                            rnn_size=args.rnn_size,
                            num_layers=args.num_layers,
                            drop_rate=args.drop_rate,
                            learning_rate=args.learning_rate,
                            clip_norm=args.clip_norm)

    # make and clear checkpoint directory
    log_dir = make_dirs(args.checkpoint_path, empty=True)
    model.save(args.checkpoint_path)
    logger.info("model saved: %s.", args.checkpoint_path)

    # callbacks
    callbacks = [
        ModelCheckpoint(args.checkpoint_path, verbose=1, save_best_only=False),
        TensorBoard(log_dir, write_graph=True, embeddings_freq=1,
                    embeddings_metadata={
                        "embedding_1": os.path.abspath(
                            os.path.join("data", "id2char.tsv"))
                    }),
        LoggerCallback(text, test_text, model, args.checkpoint_path)
    ]

    # training start
    num_batches = (len(text) - 1) // (args.batch_size * args.seq_len)
    model.reset_states()
    model.fit_generator(batch_generator(encode_text(text, get_CHAR2ID()),
                                        args.batch_size, args.seq_len,
                                        one_hot_labels=True),
                        num_batches, args.num_epochs, callbacks=callbacks)
    return model
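# A minimal sketch of the batch_generator referenced above (its definition is
# not shown here). It assumes the usual stateful-RNN layout: row i of each
# yielded batch is the contiguous continuation of row i of the previous batch,
# which is why train_main() only resets states once before fitting. The name
# batch_generator_sketch and the explicit vocab_size parameter are assumptions.
import numpy as np

def batch_generator_sketch(encoded, batch_size, seq_len, vocab_size, one_hot_labels=True):
    num_batches = (len(encoded) - 1) // (batch_size * seq_len)
    rounded = num_batches * batch_size * seq_len
    x = np.asarray(encoded[:rounded]).reshape(batch_size, -1)       # inputs
    y = np.asarray(encoded[1:rounded + 1]).reshape(batch_size, -1)  # targets, shifted by one char
    while True:  # Keras generators are expected to loop forever
        for i in range(num_batches):
            xs = x[:, i * seq_len:(i + 1) * seq_len]
            ys = y[:, i * seq_len:(i + 1) * seq_len]
            if one_hot_labels:
                ys = np.eye(vocab_size)[ys]  # (batch, seq_len, vocab)
            yield xs, ys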
def preprocess():
    """ Prepare data """
    data = FLAGS.dataset
    text = load_text(data)
    vocab = sorted(set(text))

    # Creating a mapping from unique characters to indices
    char2idx = {u: i for i, u in enumerate(vocab)}
    idx2char = np.array(vocab)
    text_as_int = np.array([char2idx[c] for c in text])

    # The maximum length sentence we want for a single input in characters
    seq_length = 150
    examples_per_epoch = len(text) // (seq_length + 1)

    # Create training examples / targets
    char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

    # The batch method lets us easily convert these individual characters
    # to sequences of the desired size.
    sequences = char_dataset.batch(seq_length + 1,
                                   drop_remainder=FLAGS.drop_remainder)

    # For each sequence, duplicate and shift it to form the input and target
    # text by using the map method to apply a simple function to each batch:
    dataset = sequences.map(split_input_target)

    # Shuffle the data and pack it into batches.
    dataset = dataset.shuffle(FLAGS.buffer_size).batch(
        FLAGS.batch_size, drop_remainder=FLAGS.drop_remainder)
    return vocab, dataset
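# A minimal sketch of the split_input_target helper mapped over `sequences`
# above (its definition is not shown in this snippet), assuming the standard
# one-character shift between input and target.
def split_input_target(chunk):
    input_text = chunk[:-1]   # every character except the last
    target_text = chunk[1:]   # every character except the first
    return input_text, target_text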
def test_main(args):
    test_text = load_text(args.test_path)
    model = retrieve_model(args)
    model.reset_states()
    bpc = calculate_bpc(model, test_text)
    print(bpc)
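# A hypothetical sketch of the calculate_bpc helper used above (its real
# definition is not shown), assuming bits-per-character is the mean
# per-character cross-entropy converted from nats to bits, and reusing
# batch_generator, encode_text and get_CHAR2ID from the training code. It
# assumes the model was compiled with loss only (no extra metrics).
import numpy as np

def calculate_bpc_sketch(model, test_text, batch_size, seq_len):
    num_batches = (len(test_text) - 1) // (batch_size * seq_len)
    loss = model.evaluate_generator(
        batch_generator(encode_text(test_text, get_CHAR2ID()),
                        batch_size, seq_len, one_hot_labels=True),
        steps=num_batches)
    # Keras cross-entropy is in nats; divide by ln(2) to get bits per character
    return loss / np.log(2)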
def generate_RI_text_fast(N, RI_letters, cluster_sz, ordered, text_name, alph=alphabet):
    text_vector = np.zeros((1, N))
    text = utils.load_text(text_name)
    cluster = ''
    vector = np.ones((1, N))
    for char_num in range(len(text)):
        cluster = cluster + text[char_num]
        if len(cluster) < cluster_sz:
            continue
        elif len(cluster) > cluster_sz:
            # rotate out the oldest letter, then rotate in the newest
            prev_letter = cluster[0]
            prev_letter_idx = alphabet.find(prev_letter)
            inverse = np.roll(RI_letters[prev_letter_idx, :], cluster_sz - 1)
            vector = np.multiply(vector, inverse)
            vector = np.roll(vector, 1)
            letter = text[char_num]
            letter_idx = alphabet.find(letter)
            vector = np.multiply(vector, RI_letters[letter_idx, :])
            cluster = cluster[1:]
        else:
            # (len(cluster) == cluster_sz), happens once
            letters = list(cluster)
            for letter in letters:
                vector = np.roll(vector, 1)
                letter_idx = alphabet.find(letter)
                vector = np.multiply(vector, RI_letters[letter_idx, :])
        text_vector += vector
    return text_vector
def __init__(self, master, prev_sc, main_bg):
    # 1. Initialising GUI components
    self.update_screen(master, main_bg)
    self.update_variables(prev_sc)
    self.start_log = "---------------------------------\n" + \
                     "| LOG STAGE 2 START SCREEN |\n" + \
                     "---------------------------------"
    self.start_txt = "| Start Action Button Pressed |"
    self.exit_txt = "| Exit Button Pressed |"
    print(self.start_log)

    # 2. Setting the screen components
    self.title = tkinter.Label(master, bg='white', fg='black', text='FASE 2',
                               font=Font(family='Helvetica', size=30, weight='bold'))
    self.title.place(x=self.sw/2, y=2*self.sh/10, anchor='center')

    # a. Start button ('JOGAR' is Portuguese for 'PLAY')
    self.start_button = Button(master, anchor='center', compound='center',
                               text='JOGAR',
                               font=Font(family='Helvetica', size=28, weight='bold'),
                               bg="#%02x%02x%02x" % (30, 30, 30), fg='white',
                               command=self.start_button_click,
                               highlightthickness=0, bd=0, padx=0, pady=0,
                               height=2, width=13)
    self.start_button.place(x=self.sw/2, y=8*self.sh/10, anchor='center')

    # b. Stage 2 text
    text = utils.load_text(2)
    self.text_display = scrolledtext.ScrolledText(
        master, fg='black', font=Font(family='Helvetica', size=18),
        bg="#%02x%02x%02x" % (255, 255, 255), insertbackground='black',
        highlightcolor="#%02x%02x%02x" % (180, 180, 180),
        highlightbackground="#%02x%02x%02x" % (50, 50, 50),
        bd=0, width=47, height=10, padx=10, pady=10, wrap='word')
    self.text_display.insert('insert', text)
    self.text_display.configure(state='disabled')
    self.text_display.place(x=self.sw/2, y=self.sh/2, anchor='center')
def bio2typing(res_dir, test_fids, tag=0):
    res = non_integrated_results(res_dir, test_fids)
    merged_entities = extract_entities(res, test_fids)

    ner_typing_root = Path(NER_TYPING_ROOT)
    ner_typing_root.mkdir(parents=True, exist_ok=True)
    pkl_save(merged_entities, ner_typing_root / f"merged_entities_{tag}.pkl")

    # accumulate typed entities across all test files
    fm, ls, ob = [], [], []
    for test_fid in test_fids:
        pre_txt = load_text(
            Path(PREPROCESSED_TEXT_DIR) / f"{test_fid}.preprocessed.txt").split("\n")
        sents = pkl_load(Path(TEXT_AS_SENT_DIR) / f"{test_fid}.sents.pkl")
        sent_bound = creat_sent_altered_boundary(sents)
        ens = merged_entities[test_fid]
        for en_idx, en in enumerate(ens):
            # e.g. ('son', 'FAMILYMEMBER', (334, 337), (342, 345))
            en_span = en[-1]
            en_type = en[1].lower()
            sidx = get_sent_idx(en_span, sent_bound)
            en_loc_sent = sents[sidx]
            pure_text = pre_txt[sidx]
            tagged_sent = insert_token_and_creat_text_for_testing(en_loc_sent, en_span)
            if valida_by_sent(tagged_sent):
                print(test_fid, en, tagged_sent)
            if en_type == "familymember":
                fm.append([f"{test_fid}@{en_idx}", f"{test_fid}@{en_idx}",
                           pure_text, tagged_sent])
            elif en_type == "observation":
                ob.append([f"{test_fid}@{en_idx}", pure_text, tagged_sent])
            elif en_type == "livingstatus":
                ls.append([f"{test_fid}@{en_idx}", pure_text, tagged_sent])
            else:
                raise RuntimeError(f"{en_type} is not recognized for {en}")

    # fms, fmr share the same dir
    pfm = Path(FMS_TEST.format(tag))
    pfm.mkdir(exist_ok=True, parents=True)
    to_tsv(fm, pfm / "test.tsv")

    pfo = Path(OBN_TEST.format(tag))
    pfo.mkdir(exist_ok=True, parents=True)
    to_tsv(ob, pfo / "test.tsv")

    pfl = Path(LSS_TEST.format(tag))
    pfl.mkdir(exist_ok=True, parents=True)
    to_tsv(ls, pfl / "test.tsv")

    pkl_save(fm, ner_typing_root / f"fm_{tag}.pkl")
    pkl_save(ob, ner_typing_root / f"ob_{tag}.pkl")
    pkl_save(ls, ner_typing_root / f"ls_{tag}.pkl")
def bio2relation():
    TAG = "pred"
    sdiff = []
    relation_types = []
    pred_relations_plan1 = []
    pred_relations_plan2 = []
    mapping = []

    typed_entities = pkl_load(Path(NER_TYPING_ROOT) / "typed_entities_ens.pkl")
    for doc_id, ens in typed_entities.items():
        pre_txt = load_text(
            Path(PREPROCESSED_TEXT_DIR) / f"{doc_id}.preprocessed.txt").split("\n")
        sents = pkl_load(Path(TEXT_AS_SENT_DIR) / f"{doc_id}.sents.pkl")
        sent_bound = creat_sent_altered_boundary(sents)
        enids = range(len(ens))
        all_pairs = []
        for e1, e2 in permutations(enids, 2):
            all_pairs.append((e1, e2))
        for each in all_pairs:
            eid1, eid2 = each
            # e.g. (('son', 'FAMILYMEMBER', (334, 337), (342, 345)), ['NA', 'Son'], 'FAMILYMEMBER')
            en1 = ens[eid1]
            en2 = ens[eid2]
            # keep only (FAMILYMEMBER, non-FAMILYMEMBER) pairs
            if en1[-1].upper() != "FAMILYMEMBER" or en2[-1].upper() == "FAMILYMEMBER":
                continue
            sie1 = get_sent_idx(en1[0][3], sent_bound)
            sie2 = get_sent_idx(en2[0][3], sent_bound)
            if abs(sie1 - sie2) > GLOBAL_CUTOFF:
                continue
            bert_rels = insert_tags_for_relation(sents[sie1], sents[sie2],
                                                 en1[0], en2[0])
            tagged_s1, tagged_s2, pure_text1, pure_text2 = bert_rels
            pred_relations_plan1.append([TAG, tagged_s1, tagged_s2,
                                         pure_text1, pure_text2,
                                         f"{abs(sie1 - sie2)}", str()])
            tp = generate_bert_relation_without_extra_sentence(
                sents[sie1], sents[sie2], en1[0], en2[0], sents, sie1, sie2)
            pred_relations_plan2.append([TAG, tp, f"{abs(sie1 - sie2)}"])
            mapping.append((doc_id, en1, en2))

    prel = Path(REL_TEST)
    prel.mkdir(parents=True, exist_ok=True)
    pkl_save(mapping, prel / "relation_mappings.tsv")
    to_tsv(pred_relations_plan2, prel / "test.tsv")

    prel = Path(REL_TESTa)
    prel.mkdir(parents=True, exist_ok=True)
    pkl_save(mapping, prel / "relation_mappings.tsv")
    to_tsv(pred_relations_plan1, prel / "test.tsv")
def train(self, training_directory):
    from utils import load_text

    text = load_text(training_directory)
    sentences = self._preprocess(text)
    # what if the same word appears twice in one sentence? do we discount that?
    for sentence in sentences:
        for word in sentence:
            self._update_mapping(word, sentence)
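# A hypothetical sketch of the _update_mapping method called above (its real
# definition is not shown), assuming it accumulates sentence-level
# co-occurrence counts. Iterating over set(sentence) instead would answer the
# comment's question by counting a repeated word only once per sentence.
from collections import Counter, defaultdict

def _update_mapping(self, word, sentence):
    if not hasattr(self, "_mapping"):
        self._mapping = defaultdict(Counter)  # word -> counts of co-occurring words
    self._mapping[word].update(w for w in sentence if w != word)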
def match(self, x):
    input_text = [
        load_text(i, self.max_sentence_len, self.input2idx, self.choice)
        for i in x
    ]
    input_text = np.asarray(input_text)
    res = self.model.predict(input_text)
    res = concat(res)
    res = self.decode(res, True)
    return res
def non_integrated_results(root, test_fids, original_offset_only=False):
    p = Path(root)
    ll = len(test_fids)
    result = dict()
    for fid in p.glob("*.txt"):
        fid_stem = fid.stem.split(".")[0]
        assert fid_stem in test_fids, f"{fid.stem} is not a test fid"
        ll -= 1
        cont = load_text(fid)
        sents = text2sents(cont.strip(), original_offset_only)
        result[fid_stem] = sents
    assert ll == 0, f"missing {ll} prediction files"
    return result
def generate_RI_text_history(N, RI_letters, text_name, alph=alphabet):
    # generate RI vector for "text_name"; assumes text_name has .txt
    text_vector = np.zeros((1, N))
    history_vector = np.zeros((1, N))
    text = utils.load_text(text_name)
    for char_num in range(len(text)):
        char = text[char_num]
        letter_idx = alphabet.find(char)
        # exponentially decayed history: h_t = 0.75 * h_{t-1} + r_{c_t}
        history_vector = 0.75 * history_vector + RI_letters[letter_idx, :]
        text_vector += history_vector
    return text_vector
def generate_text_vector(N, RI_letters, cluster_sz, text_name):
    text_vector = np.zeros((1, N))
    text = utils.load_text(text_name)
    for char_idx in range(len(text) - cluster_sz + 1):
        sidx = char_idx
        eidx = char_idx + cluster_sz
        cluster = text[sidx:eidx]
        vector = np.ones((1, N))
        for letter in cluster:
            letter_idx = alphabet.find(letter)
            vector = np.roll(vector, 1)
            vector = np.multiply(vector, RI_letters[letter_idx, :])
        text_vector += vector
    return text_vector / (len(text) - cluster_sz + 1)
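# A hypothetical usage example: text vectors built this way are typically
# compared with cosine similarity (e.g. for language identification). The
# file names below are placeholders, not files from this project, and
# RI_letters is assumed to be the letter-vector matrix used throughout.
import numpy as np

v_known = generate_text_vector(10000, RI_letters, 3, "known_language_sample.txt")
v_query = generate_text_vector(10000, RI_letters, 3, "unknown_sample.txt")
cosine = np.dot(v_known, v_query.T) / (np.linalg.norm(v_known) * np.linalg.norm(v_query))
print(cosine[0, 0])  # closer to 1.0 means more similar n-gram statistics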
def generate_vocab_lang_vectors(N, RI_letters, cluster_sz, ordered, text_name,
                                min_, max_, alph=alphabet):
    text_vector = np.zeros((1, N))
    vocab_vec = np.zeros((1, N))
    text = utils.load_text(text_name)
    cluster = ''
    vector = np.ones((1, N))
    for char_num in range(len(text)):
        cluster = cluster + text[char_num]
        if len(cluster) < cluster_sz:
            continue
        elif len(cluster) > cluster_sz:
            prev_letter = cluster[0]
            prev_letter_idx = alphabet.find(prev_letter)
            inverse = np.roll(RI_letters[prev_letter_idx, :], cluster_sz - 1)
            vector = np.multiply(vector, inverse)
            vector = np.roll(vector, 1)
            letter = text[char_num]
            letter_idx = alphabet.find(letter)
            vector = np.multiply(vector, RI_letters[letter_idx, :])
            cluster = cluster[1:]
        else:
            # (len(cluster) == cluster_sz), happens once
            letters = list(cluster)
            for letter in letters:
                vector = np.roll(vector, 1)
                letter_idx = alphabet.find(letter)
                vector = np.multiply(vector, RI_letters[letter_idx, :])
        if np.dot(vocab_vec, vector.T) < -10000:
            text_vector += vector
        else:
            vocab_vec += vector
            if np.dot(vocab_vec, vector.T) > min_:
                while True:
                    if np.dot(vocab_vec, vector.T) < -10000:
                        break
                    else:
                        vocab_vec -= vector
    return text_vector, vocab_vec
def main(args):
    # --------------------------------------------------------
    # Data
    logger.info('-' * 100)
    logger.info('Load data files')
    train_exs = utils.load_data(args, args.train_file, skip_no_answer=True)
    logger.info('Num train examples = %d' % len(train_exs))
    dev_exs = utils.load_data(args, args.dev_file)
    logger.info('Num dev examples = %d' % len(dev_exs))

    # If we are doing official evals then we need to:
    # 1) Load the original text to retrieve spans from offsets.
    # 2) Load the (multiple) text answers for each question.
    if args.official_eval:
        dev_texts = utils.load_text(args.dev_json)
        dev_offsets = {ex['id']: ex['offsets'] for ex in dev_exs}
        dev_answers = utils.load_answers(args.dev_json)

    # --------------------------------------------------------
    # Model
    logger.info('-' * 100)
    start_epoch = 0
    logger.info('Training model from scratch...')
    model = init_from_scratch(args, train_exs, dev_exs)

    # Set up partial tuning of embeddings
    if args.tune_partial > 0:
        logger.info('-' * 100)
        logger.info('Counting %d most frequent question words' % args.tune_partial)
        top_words = utils.top_question_words(args, train_exs, model.word_dict)
        for word in top_words[:5]:
            logger.info(word)
        logger.info('...')
        for word in top_words[-6:-1]:
            logger.info(word)
        model.tune_embeddings([w[0] for w in top_words])

    # Set up optimizer
    model.init_optimizer()

    # Use the GPU?
    if args.cuda:
        model.cuda()

    # Use multiple GPUs?
    if args.parallel:
        model.parallelize()

    # --------------------------------------------------------------------------
    # DATA ITERATORS
    # Two datasets: train and dev. If we sort by length it's faster.
    logger.info('-' * 100)
    logger.info('Make data loaders')
    train_dataset = data.ReaderDataset(train_exs, model, single_answer=True)
    if args.sort_by_len:
        # shuffle=True reshuffles the batch order every epoch
        train_sampler = data.SortedBatchSampler(train_dataset.lengths(),
                                                args.batch_size,
                                                shuffle=True)
    else:
        train_sampler = torch.utils.data.sampler.RandomSampler(train_dataset)
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,      # how many samples per batch
        sampler=train_sampler,           # sampling strategy; overrides shuffle
        num_workers=args.data_workers,   # number of subprocesses used for loading
        collate_fn=vector.batchify,      # merges samples into a mini-batch
        pin_memory=args.cuda,            # copy tensors into CUDA pinned memory before returning
    )
    dev_dataset = data.ReaderDataset(dev_exs, model, single_answer=False)
    if args.sort_by_len:
        dev_sampler = data.SortedBatchSampler(dev_dataset.lengths(),
                                              args.test_batch_size,
                                              shuffle=False)
    else:
        dev_sampler = torch.utils.data.sampler.SequentialSampler(dev_dataset)
    dev_loader = torch.utils.data.DataLoader(
        dev_dataset,
        batch_size=args.test_batch_size,
        sampler=dev_sampler,
        num_workers=args.data_workers,
        collate_fn=vector.batchify,
        pin_memory=args.cuda,
    )

    # -------------------------------------------------------------------------
    # PRINT CONFIG
    logger.info('-' * 100)
    logger.info('CONFIG:\n%s' % json.dumps(vars(args), indent=4, sort_keys=True))

    # --------------------------------------------------------------------------
    # TRAIN/VALID LOOP
    logger.info('-' * 100)
    logger.info('Starting training...')
    stats = {'timer': utils.Timer(), 'epoch': 0, 'best_valid': 0}
    for epoch in range(start_epoch, args.num_epochs):
        stats['epoch'] = epoch

        # train
        train(args, train_loader, model, stats)

        # Validate unofficial (train)
        validate_unofficial(args, train_loader, model, stats, mode='train')

        # Validate unofficial (dev)
        result = validate_unofficial(args, dev_loader, model, stats, mode='dev')

        # Validate official
        if args.official_eval:
            result = validate_official(args, dev_loader, model, stats,
                                       dev_offsets, dev_texts, dev_answers)

        # Save best valid
        if result[args.valid_metric] > stats['best_valid']:
            logger.info('Best valid: %s = %.2f (epoch %d, %d updates)' %
                        (args.valid_metric, result[args.valid_metric],
                         stats['epoch'], model.updates))
            model.save(args.model_file)
            stats['best_valid'] = result[args.valid_metric]
from tensorflow.keras import callbacks

from utils import get_batch_generator, get_data_generator
from utils import load_text, tokenize, prepare_word_tokens
from onehot import OneHotEncoder
from model import s2s_model

HIDDEN_SIZE = 512
ERR_RATE = 0.2
EPOCHS = 15
BATCH_SIZE = 256
DATA_DIR = './data'
R_DROPUT = 0.2

if __name__ == '__main__':
    train_text = load_text(DATA_DIR)
    val_text = load_text(DATA_DIR, 'val')

    train_word_set = list(filter(None, set(tokenize(train_text))))
    val_word_set = list(filter(None, set(tokenize(val_text))))

    train_max_word_len = max([len(token) for token in train_word_set]) + 2
    val_max_word_len = max([len(token) for token in val_word_set]) + 2

    train_encoder_tokens, train_decoder_tokens, train_target_tokens = prepare_word_tokens(
        train_word_set, train_max_word_len, error_rate=ERR_RATE)
    val_encoder_tokens, val_decoder_tokens, val_target_tokens = prepare_word_tokens(
        val_word_set, val_max_word_len, error_rate=ERR_RATE)

    input_charset = set(' '.join(train_encoder_tokens))
    target_charset = set(' '.join(train_decoder_tokens))
def random_walk(markov, sentence_length):
    # NOTE: the original snippet starts mid-function; this header and the two
    # initialization lines below are reconstructed from the call site
    # random_walk(markov, 10) further down.
    word = stochastic_sample(markov, '<START>')
    sentence = [word]
    i = 1
    while i != sentence_length:
        next_word = stochastic_sample(markov, word)
        if next_word == '<STOP>' or next_word == '<START>':
            next_word = stochastic_sample(markov, '<START>')
        sentence.append(next_word)
        word = next_word
        i += 1
    while i == sentence_length:
        next_word = stochastic_sample(markov, word)
        if next_word == '<START>':
            next_word = stochastic_sample(markov, '<START>')
        if next_word == '<STOP>':
            break
        sentence.append(next_word)
        word = next_word
    return " ".join(sentence)


if __name__ == "__main__":
    file = "corpus_data/cleaned/SS_TOKEN_complete.txt"
    corpus = load_text(file)
    markov = markov_histo(corpus)
    for i in range(10):
        walk = random_walk(markov, 10)
        print(walk)
def main(args):
    # --------------------------------------------------------------------------
    # DATA
    logger.info('-' * 100)
    logger.info('Load data files')
    train_exs = utils.load_data(args, args.train_file, skip_no_answer=True)
    logger.info('Num train examples = %d' % len(train_exs))
    dev_exs = utils.load_data(args, args.dev_file)
    logger.info('Num dev examples = %d' % len(dev_exs))

    # If we are doing official evals then we need to:
    # 1) Load the original text to retrieve spans from offsets.
    # 2) Load the (multiple) text answers for each question.
    if args.official_eval:
        dev_texts = utils.load_text(args.dev_json)
        dev_offsets = {ex['id']: ex['offsets'] for ex in dev_exs}
        dev_answers = utils.load_answers(args.dev_json)

    # --------------------------------------------------------------------------
    # MODEL
    logger.info('-' * 100)
    start_epoch = 0
    if args.checkpoint and os.path.isfile(args.model_file + '.checkpoint'):
        # Just resume training, no modifications.
        logger.info('Found a checkpoint...')
        checkpoint_file = args.model_file + '.checkpoint'
        model, start_epoch = ParagraphRanker.load_checkpoint(checkpoint_file, args)
    else:
        # Training starts fresh. But the model state is either pretrained or
        # newly (randomly) initialized.
        if args.pretrained:
            logger.info('Using pretrained model...')
            model = ParagraphRanker.load(args.pretrained, args)
            if args.expand_dictionary:
                logger.info('Expanding dictionary for new data...')
                # Add words in training + dev examples
                words = utils.load_words(args, train_exs + dev_exs)
                added = model.expand_dictionary(words)
                # Load pretrained embeddings for added words
                if args.embedding_file:
                    model.load_embeddings(added, args.embedding_file, args.fasttext)
        else:
            logger.info('Training model from scratch...')
            model = init_from_scratch(args, train_exs, dev_exs)

        # Set up partial tuning of embeddings
        if args.tune_partial > 0:
            logger.info('-' * 100)
            logger.info('Counting %d most frequent question words' % args.tune_partial)
            top_words = utils.top_question_words(
                args, train_exs, model.word_dict
            )
            for word in top_words[:5]:
                logger.info(word)
            logger.info('...')
            for word in top_words[-6:-1]:
                logger.info(word)
            model.tune_embeddings([w[0] for w in top_words])

        # Set up optimizer
        model.init_optimizer()

    # Use the GPU?
    if args.cuda:
        model.cuda()

    # Use multiple GPUs?
    if args.parallel:
        model.parallelize()

    # --------------------------------------------------------------------------
    # DATA ITERATORS
    # Two datasets: train and dev. If we sort by length it's faster.
    logger.info('-' * 100)
    logger.info('Make data loaders')
    train_dataset = data.RankerDataset(train_exs, model,
                                       args.neg_size, args.allowed_size)
    if args.sort_by_len:
        train_sampler = data.RankerBatchSampler(train_dataset.lengths(),
                                                args.batch_size,
                                                shuffle=True)
    else:
        train_sampler = torch.utils.data.sampler.RandomSampler(train_dataset)
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        sampler=train_sampler,
        num_workers=args.data_workers,
        collate_fn=vector.ranker_train_batchify,
        pin_memory=args.cuda,
    )
    dev_dataset = data.RankerDataset(dev_exs, model, neg_size=1, allowed_size=1000)
    if args.sort_by_len:
        dev_sampler = data.RankerBatchSampler(dev_dataset.lengths(),
                                              args.test_batch_size,
                                              shuffle=False)
    else:
        dev_sampler = torch.utils.data.sampler.SequentialSampler(dev_dataset)
    dev_loader = torch.utils.data.DataLoader(
        dev_dataset,
        batch_size=args.test_batch_size,
        sampler=dev_sampler,
        num_workers=args.data_workers,
        collate_fn=vector.ranker_dev_batchify,
        pin_memory=args.cuda,
    )

    # -------------------------------------------------------------------------
    # PRINT CONFIG
    logger.info('-' * 100)
    logger.info('CONFIG:\n%s' % json.dumps(vars(args), indent=4, sort_keys=True))

    # --------------------------------------------------------------------------
    # TRAIN/VALID LOOP
    logger.info('-' * 100)
    logger.info('Starting training...')
    stats = {'timer': utils.Timer(), 'epoch': 0, 'best_valid': 0}
    for epoch in range(start_epoch, args.num_epochs):
        stats['epoch'] = epoch

        # Train
        train(args, train_loader, model, stats)

        # Filtering by questions
        # pre_selected_docs = filter_docs(args, dev_loader)

        # Encode documents for dev
        docs, qs = encode_docs_qs(args, dev_loader, model, stats, mode='dev')

        # Rank encoded documents
        result = rank_docs(args, docs, qs, stats, mode='dev')

        # Save best valid
        if result[args.valid_metric] > stats['best_valid']:
            logger.info('Best valid: %s = %.3f (epoch %d, %d updates)' %
                        (args.valid_metric, result[args.valid_metric],
                         stats['epoch'], model.updates))
            model.save(args.model_file)
            stats['best_valid'] = result[args.valid_metric]

    # Ranker final evaluation
    docs, qs = encode_docs_qs(args, dev_loader, model, stats, mode='dev')
    result = rank_docs(args, docs, qs, stats, mode='dev')
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer  # assumed import: Tokenizer is used below
import pickle
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D, CuDNNLSTM
from keras.layers import Lambda, Add
from keras.models import Model
import keras
from sklearn.model_selection import train_test_split
from LossHistory import LossHistory
from keras.utils import np_utils
import random

embeddings_index = load_embedding()
use_text_length = 20000
japanese_text = load_text(use_text_length)
split_japanese_text = mecab_to_text(japanese_text)
dictionary = make_word_dictionary(split_japanese_text, lower_bound=10)
dictionary = clear_dictionary(dictionary, embeddings_index)

# all_embs = np.stack(embeddings_index.values())
# emb_mean, emb_std = all_embs.mean(), all_embs.std()

## Tokenize the sentences
tokenizer = Tokenizer(char_level=False)
tokenizer.fit_on_texts(dictionary)

"""
with open("..\result\tokenizer.pkl", 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
"""
from tensorflow.keras.models import load_model
from utils import load_text, tokenize, get_padded_token, load_s2s_model, decode_sequences, prepare_word_tokens
from model import s2s_model
from onehot import OneHotEncoder  # assumed import: OneHotEncoder is used below, as in the sibling scripts

HIDDEN_SIZE = 512
ERR_RATE = 0.8
BATCH_SIZE = 256
DATA_DIR = './data'

if __name__ == '__main__':
    # Prepare model
    encoder, decoder = load_s2s_model(
        'test-no_reverse-hs-512_err-0.8_bs-256_e-30_drop-0.2.h5', HIDDEN_SIZE)

    text = load_text(DATA_DIR)
    word_set = list(filter(None, set(tokenize(text))))
    max_word_len = max([len(token) for token in word_set]) + 2

    train_enc_tokens, train_dec_tokens, _ = prepare_word_tokens(
        word_set, max_word_len, ERR_RATE)

    enc_charset = set(' '.join(train_enc_tokens))
    dec_charset = set(' '.join(train_dec_tokens))

    enc_oh = OneHotEncoder(enc_charset)
    dec_oh = OneHotEncoder(dec_charset)

    # Input decoding loop
    while True:
        sentence = input('\nEnter sentence to decode:\n')
        tokens = list(filter(None, tokenize(sentence)))
        nb_of_tokens = len(tokens)
def main(args):
    # --------------------------------------------------------------------------
    # DATA
    logger.info('-' * 100)
    logger.info('Load data files')
    train_exs = utils.load_data(args, args.train_file, skip_no_answer=True)
    logger.info('Num train examples = %d' % len(train_exs))
    dev_exs = utils.load_data(args, args.dev_file)
    logger.info('Num dev examples = %d' % len(dev_exs))

    # If we are doing official evals then we need to:
    # 1) Load the original text to retrieve spans from offsets.
    # 2) Load the (multiple) text answers for each question.
    if args.official_eval:
        dev_texts = utils.load_text(args.dev_json)
        dev_offsets = {ex['id']: ex['offsets'] for ex in dev_exs}
        dev_answers = utils.load_answers(args.dev_json)

    # --------------------------------------------------------------------------
    # MODEL
    logger.info('-' * 100)
    start_epoch = 0
    if args.checkpoint and os.path.isfile(args.model_file + '.checkpoint'):
        # Just resume training, no modifications.
        logger.info('Found a checkpoint...')
        checkpoint_file = args.model_file + '.checkpoint'
        model, start_epoch = DocReader.load_checkpoint(checkpoint_file, args)
    else:
        # Training starts fresh. But the model state is either pretrained or
        # newly (randomly) initialized.
        if args.pretrained:
            logger.info('Using pretrained model...')
            model = DocReader.load(args.pretrained, args)
            if args.expand_dictionary:
                logger.info('Expanding dictionary for new data...')
                # Add words in training + dev examples
                words = utils.load_words(args, train_exs + dev_exs)
                added_words = model.expand_dictionary(words)
                # Load pretrained embeddings for added words
                if args.embedding_file:
                    model.load_embeddings(added_words, args.embedding_file)

                logger.info('Expanding char dictionary for new data...')
                # Add chars in training + dev examples
                chars = utils.load_chars(args, train_exs + dev_exs)
                added_chars = model.expand_char_dictionary(chars)
                # Load pretrained embeddings for added chars
                if args.char_embedding_file:
                    model.load_char_embeddings(added_chars, args.char_embedding_file)
        else:
            logger.info('Training model from scratch...')
            model = init_from_scratch(args, train_exs, dev_exs)

        # Set up partial tuning of embeddings
        if args.tune_partial > 0:
            logger.info('-' * 100)
            logger.info('Counting %d most frequent question words' % args.tune_partial)
            top_words = utils.top_question_words(
                args, train_exs, model.word_dict
            )
            for word in top_words[:5]:
                logger.info(word)
            logger.info('...')
            for word in top_words[-6:-1]:
                logger.info(word)
            model.tune_embeddings([w[0] for w in top_words])

        # Set up optimizer
        model.init_optimizer()

    # Use the GPU?
    if args.cuda:
        model.cuda()

    # Use multiple GPUs?
    if args.parallel:
        model.parallelize()

    # --------------------------------------------------------------------------
    # DATA ITERATORS
    # Two datasets: train and dev. If we sort by length it's faster.
    logger.info('-' * 100)
    logger.info('Make data loaders')
    train_dataset = data.ReaderDataset(train_exs, model, single_answer=True)
    if args.sort_by_len:
        train_sampler = data.SortedBatchSampler(train_dataset.lengths(),
                                                args.batch_size,
                                                shuffle=True)
    else:
        train_sampler = torch.utils.data.sampler.RandomSampler(train_dataset)
    # if args.use_sentence_selector:
    #     train_batcher = vector.sentence_batchifier(model, single_answer=True)
    #     batching_function = train_batcher.batchify
    # else:
    batching_function = vector.batchify
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        sampler=train_sampler,
        num_workers=args.data_workers,
        collate_fn=batching_function,
        pin_memory=args.cuda,
    )
    dev_dataset = data.ReaderDataset(dev_exs, model, single_answer=False)
    if args.sort_by_len:
        dev_sampler = data.SortedBatchSampler(dev_dataset.lengths(),
                                              args.test_batch_size,
                                              shuffle=False)
    else:
        dev_sampler = torch.utils.data.sampler.SequentialSampler(dev_dataset)
    # if args.use_sentence_selector:
    #     dev_batcher = vector.sentence_batchifier(model, single_answer=False)
    #     batching_function = dev_batcher.batchify
    # else:
    batching_function = vector.batchify
    dev_loader = torch.utils.data.DataLoader(
        dev_dataset,
        batch_size=args.test_batch_size,
        sampler=dev_sampler,
        num_workers=args.data_workers,
        collate_fn=batching_function,
        pin_memory=args.cuda,
    )

    # -------------------------------------------------------------------------
    # PRINT CONFIG
    logger.info('-' * 100)
    logger.info('CONFIG:\n%s' % json.dumps(vars(args), indent=4, sort_keys=True))

    # --------------------------------------------------------------------------
    # TRAIN/VALID LOOP
    logger.info('-' * 100)
    logger.info('Starting training...')
    stats = {'timer': utils.Timer(), 'epoch': 0, 'best_valid': 0}

    # --------------------------------------------------------------------------
    # QUICKLY VALIDATE ON PRETRAINED MODEL
    if args.global_mode == "test":
        result1 = validate_unofficial(args, dev_loader, model, stats, mode='dev')
        result2 = validate_official(args, dev_loader, model, stats,
                                    dev_offsets, dev_texts, dev_answers)
        print(result2[args.valid_metric])
        print(result1["exact_match"])
        validate_adversarial(args, model, stats, mode="dev")
        exit(0)

    for epoch in range(start_epoch, args.num_epochs):
        stats['epoch'] = epoch

        # Train
        train(args, train_loader, model, stats)

        # Validate unofficial (train)
        validate_unofficial(args, train_loader, model, stats, mode='train')

        # Validate unofficial (dev)
        result = validate_unofficial(args, dev_loader, model, stats, mode='dev')

        # Validate official
        if args.official_eval:
            result = validate_official(args, dev_loader, model, stats,
                                       dev_offsets, dev_texts, dev_answers)

        # Save best valid
        if args.valid_metric is None or args.valid_metric == 'None':
            model.save(args.model_file)
        elif result[args.valid_metric] > stats['best_valid']:
            logger.info('Best valid: %s = %.2f (epoch %d, %d updates)' %
                        (args.valid_metric, result[args.valid_metric],
                         stats['epoch'], model.updates))
            model.save(args.model_file)
            stats['best_valid'] = result[args.valid_metric]
def validate_adversarial(args, model, global_stats, mode="dev"):
    # Create a dataloader for each adversarial dev set, load their jsons,
    # and run the official-style evaluation on each.
    for idx, dataset_file in enumerate(args.adv_dev_json):
        predictions = {}
        logger.info("Validating Adversarial Dataset %s" % dataset_file)
        exs = utils.load_data(args, args.adv_dev_file[idx])
        logger.info('Num dev examples = %d' % len(exs))

        # Create dataloader
        dev_dataset = data.ReaderDataset(exs, model, single_answer=False)
        if args.sort_by_len:
            dev_sampler = data.SortedBatchSampler(dev_dataset.lengths(),
                                                  args.test_batch_size,
                                                  shuffle=False)
        else:
            dev_sampler = torch.utils.data.sampler.SequentialSampler(dev_dataset)
        # if args.use_sentence_selector:
        #     batching_function = vector.batchify_sentences
        # else:
        batching_function = vector.batchify
        dev_loader = torch.utils.data.DataLoader(
            dev_dataset,
            batch_size=args.test_batch_size,
            sampler=dev_sampler,
            num_workers=args.data_workers,
            collate_fn=batching_function,
            pin_memory=args.cuda,
        )

        texts = utils.load_text(dataset_file)
        offsets = {ex['id']: ex['offsets'] for ex in exs}
        answers = utils.load_answers(dataset_file)

        eval_time = utils.Timer()
        f1 = utils.AverageMeter()
        exact_match = utils.AverageMeter()

        examples = 0
        bad_examples = 0
        for ex in dev_loader:
            ex_id, batch_size = ex[-1], ex[0].size(0)
            chosen_offset = ex[-2]
            pred_s, pred_e, _ = model.predict(ex)

            for i in range(batch_size):
                if pred_s[i][0] >= len(offsets[ex_id[i]]) or \
                        pred_e[i][0] >= len(offsets[ex_id[i]]):
                    bad_examples += 1
                    continue
                if args.use_sentence_selector:
                    s_offset = chosen_offset[i][pred_s[i][0]][0]
                    e_offset = chosen_offset[i][pred_e[i][0]][1]
                else:
                    s_offset = offsets[ex_id[i]][pred_s[i][0]][0]
                    e_offset = offsets[ex_id[i]][pred_e[i][0]][1]
                prediction = texts[ex_id[i]][s_offset:e_offset]
                predictions[ex_id[i]] = prediction

                ground_truths = answers[ex_id[i]]
                exact_match.update(utils.metric_max_over_ground_truths(
                    utils.exact_match_score, prediction, ground_truths))
                f1.update(utils.metric_max_over_ground_truths(
                    utils.f1_score, prediction, ground_truths))

            examples += batch_size

        logger.info('dev valid official for dev file %s : Epoch = %d | EM = %.2f | ' %
                    (dataset_file, global_stats['epoch'], exact_match.avg * 100) +
                    'F1 = %.2f | examples = %d | valid time = %.2f (s)' %
                    (f1.avg * 100, examples, eval_time.time()))

        orig_f1_score = 0.0
        orig_exact_match_score = 0.0
        adv_f1_scores = {}           # Map from original ID to F1 score
        adv_exact_match_scores = {}  # Map from original ID to exact match score
        adv_ids = {}
        all_ids = set()              # Set of all original IDs
        f1 = exact_match = 0

        dataset = json.load(open(dataset_file))['data']
        for article in dataset:
            for paragraph in article['paragraphs']:
                for qa in paragraph['qas']:
                    orig_id = qa['id'].split('-')[0]
                    all_ids.add(orig_id)
                    if qa['id'] not in predictions:
                        message = 'Unanswered question ' + qa['id'] + \
                                  ' will receive score 0.'
                        # logger.info(message)
                        continue
                    ground_truths = list(map(lambda x: x['text'], qa['answers']))
                    prediction = predictions[qa['id']]
                    cur_exact_match = utils.metric_max_over_ground_truths(
                        utils.exact_match_score, prediction, ground_truths)
                    cur_f1 = utils.metric_max_over_ground_truths(
                        utils.f1_score, prediction, ground_truths)
                    if orig_id == qa['id']:
                        # This is an original example
                        orig_f1_score += cur_f1
                        orig_exact_match_score += cur_exact_match
                        if orig_id not in adv_f1_scores:
                            # Haven't seen adversarial example yet, so use original for adversary
                            adv_ids[orig_id] = orig_id
                            adv_f1_scores[orig_id] = cur_f1
                            adv_exact_match_scores[orig_id] = cur_exact_match
                    else:
                        # This is an adversarial example
                        if (orig_id not in adv_f1_scores or
                                adv_ids[orig_id] == orig_id or
                                adv_f1_scores[orig_id] > cur_f1):
                            # Always override if the adversary is currently using orig_id
                            adv_ids[orig_id] = qa['id']
                            adv_f1_scores[orig_id] = cur_f1
                            adv_exact_match_scores[orig_id] = cur_exact_match

        orig_f1 = 100.0 * orig_f1_score / len(all_ids)
        orig_exact_match = 100.0 * orig_exact_match_score / len(all_ids)
        adv_exact_match = 100.0 * sum(adv_exact_match_scores.values()) / len(all_ids)
        adv_f1 = 100.0 * sum(adv_f1_scores.values()) / len(all_ids)
        logger.info("For the file %s Original Exact Match : %.4f ; Original F1 : %.4f | " %
                    (dataset_file, orig_exact_match, orig_f1) +
                    "Adversarial Exact Match : %.4f ; Adversarial F1 : %.4f " %
                    (adv_exact_match, adv_f1))
def main(args):
    # --------------------------------------------------------------------------
    # DATA
    logger.info('-' * 100)
    logger.info('Load data files')
    train_exs = utils.load_data(args, args.train_file, skip_no_answer=True)
    logger.info('Num train examples = %d' % len(train_exs))
    dev_exs = utils.load_data(args, args.dev_file)
    logger.info('Num dev examples = %d' % len(dev_exs))

    # If we are doing official evals then we need to:
    # 1) Load the original text to retrieve spans from offsets.
    # 2) Load the (multiple) text answers for each question.
    if args.official_eval:
        dev_texts = utils.load_text(args.dev_json)
        dev_offsets = {ex['id']: ex['offsets'] for ex in dev_exs}
        dev_answers = utils.load_answers(args.dev_json)
    else:
        dev_texts = None
        dev_offsets = None
        dev_answers = None

    # --------------------------------------------------------------------------
    # MODEL
    logger.info('-' * 100)
    start_epoch = 0
    if args.checkpoint and os.path.isfile(args.model_file + '.checkpoint'):
        # Just resume training, no modifications.
        logger.info('Found a checkpoint...')
        checkpoint_file = args.model_file + '.checkpoint'
        model, start_epoch = DocReader.load_checkpoint(checkpoint_file, args)
    else:
        # Training starts fresh. But the model state is either pretrained or
        # newly (randomly) initialized.
        if args.pretrained:
            logger.info('Using pretrained model...')
            model = DocReader.load(args.pretrained, args)
            if args.expand_dictionary:
                logger.info('Expanding dictionary for new data...')
                # Add words in training + dev examples
                words = utils.load_words(args, train_exs + dev_exs)
                added_words = model.expand_dictionary(words)
                # Load pretrained embeddings for added words
                if args.embedding_file:
                    model.load_embeddings(added_words, args.embedding_file)

                logger.info('Expanding char dictionary for new data...')
                # Add chars in training + dev examples
                chars = utils.load_chars(args, train_exs + dev_exs)
                added_chars = model.expand_char_dictionary(chars)
                # Load pretrained embeddings for added chars
                if args.char_embedding_file:
                    model.load_char_embeddings(added_chars, args.char_embedding_file)
        else:
            logger.info('Training model from scratch...')
            model = init_from_scratch(args, train_exs, dev_exs)

        # Set up partial tuning of embeddings
        if args.tune_partial > 0:
            logger.info('-' * 100)
            logger.info('Counting %d most frequent question words' % args.tune_partial)
            top_words = utils.top_question_words(args, train_exs, model.word_dict)
            for word in top_words[:5]:
                logger.info(word)
            logger.info('...')
            for word in top_words[-6:-1]:
                logger.info(word)
            model.tune_embeddings([w[0] for w in top_words])

        # Set up optimizer
        model.init_optimizer()

    # Use the GPU?
    if args.cuda:
        model.cuda()

    # Use multiple GPUs?
    if args.parallel:
        model.parallelize()

    # --------------------------------------------------------------------------
    # DATA ITERATORS
    # Two datasets: train and dev. If we sort by length it's faster.
    logger.info('-' * 100)
    logger.info('Make data loaders')
    train_dataset = data.ReaderDataset(train_exs, model, single_answer=True)
    if args.sort_by_len:
        train_sampler = data.SortedBatchSampler(train_dataset.lengths(),
                                                args.batch_size,
                                                shuffle=True)
    else:
        train_sampler = torch.utils.data.sampler.RandomSampler(train_dataset)
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        sampler=train_sampler,
        num_workers=args.data_workers,
        collate_fn=vector.batchify,
        pin_memory=args.cuda,
    )
    dev_dataset = data.ReaderDataset(dev_exs, model, single_answer=False)
    if args.sort_by_len:
        dev_sampler = data.SortedBatchSampler(dev_dataset.lengths(),
                                              args.test_batch_size,
                                              shuffle=False)
    else:
        dev_sampler = torch.utils.data.sampler.SequentialSampler(dev_dataset)
    dev_loader = torch.utils.data.DataLoader(
        dev_dataset,
        batch_size=args.test_batch_size,
        sampler=dev_sampler,
        num_workers=args.data_workers,
        collate_fn=vector.batchify,
        pin_memory=args.cuda,
    )

    # -------------------------------------------------------------------------
    # PRINT CONFIG
    logger.info('-' * 100)
    logger.info('CONFIG:\n%s' % json.dumps(vars(args), indent=4, sort_keys=True))

    # --------------------------------------------------------------------------
    # TRAIN/VALID LOOP
    logger.info('-' * 100)
    logger.info('Starting training...')
    stats = {'timer': utils.Timer(), 'epoch': 0, 'best_valid': 0}
    model_prefix = os.path.join(args.model_dir, args.model_name)
    kept_models = []
    best_model_path = ''
    for epoch in range(start_epoch, args.num_epochs):
        stats['epoch'] = epoch

        # Train
        train(args, train_loader, model, stats)

        # Validate unofficial (train)
        logger.info('eval: train split unofficially...')
        validate_unofficial(args, train_loader, model, stats, mode='train')

        if args.official_eval:
            # Validate official (dev)
            logger.info('eval: dev split officially...')
            result = validate_official(args, dev_loader, model, stats,
                                       dev_offsets, dev_texts, dev_answers)
        else:
            # Validate unofficial (dev)
            logger.info('eval: dev split unofficially...')
            result = validate_unofficial(args, dev_loader, model, stats, mode='dev')

        em = result['exact_match']
        f1 = result['f1']
        suffix = 'em_{:4.2f}-f1_{:4.2f}.mdl'.format(em, f1)

        # Save best valid
        model_file = '{}-epoch_{}-{}'.format(model_prefix, epoch, suffix)
        if args.valid_metric:
            if result[args.valid_metric] > stats['best_valid']:
                for f in glob.glob('{}-best*'.format(model_prefix)):
                    os.remove(f)
                logger.info('eval: dev best %s = %.2f (epoch %d, %d updates)' %
                            (args.valid_metric, result[args.valid_metric],
                             stats['epoch'], model.updates))
                model_file = '{}-best-epoch_{}-{}'.format(model_prefix, epoch, suffix)
                best_model_path = model_file
                model.save(model_file)
                stats['best_valid'] = result[args.valid_metric]
                for f in kept_models:
                    os.remove(f)
                kept_models.clear()
            else:
                model.save(model_file)
                kept_models.append(model_file)
                if len(kept_models) >= args.early_stop:
                    logger.info('Finished training due to %s not improved for %d epochs, '
                                'best model is at: %s' %
                                (args.valid_metric, args.early_stop, best_model_path))
                    return
        else:
            # just save the model every epoch since no validation metric is given
            model.save(model_file)
from onehot import OneHotEncoder
from tensorflow.keras.models import load_model
from utils import load_text, tokenize, get_padded_token, load_s2s_model, decode_sequences, prepare_word_tokens, get_batch_generator
from model import s2s_model

HIDDEN_SIZE = 512
ERR_RATE = 0.2
BATCH_SIZE = 256
DATA_DIR = './data'

if __name__ == '__main__':
    encoder, decoder = load_s2s_model(
        'test-no_reverse-hs-512_err-0.8_bs-256_e-100_drop-0.2.h5', HIDDEN_SIZE)

    text = load_text(DATA_DIR)
    test_text = load_text(DATA_DIR, 'test')

    word_set = list(filter(None, set(tokenize(text))))
    test_word_set = list(filter(None, set(tokenize(test_text))))

    train_max_word_len = max([len(token) for token in word_set]) + 2
    test_max_word_len = max([len(token) for token in test_word_set]) + 2

    train_enc_tokens, train_dec_tokens, _ = prepare_word_tokens(
        word_set, train_max_word_len, ERR_RATE)
    test_enc_tokens, test_dec_tokens, test_target_tokens = prepare_word_tokens(
        test_word_set, test_max_word_len, ERR_RATE)

    enc_charset = set(' '.join(train_enc_tokens))
    dec_charset = set(' '.join(train_dec_tokens))

    enc_oh = OneHotEncoder(enc_charset)
    dec_oh = OneHotEncoder(dec_charset)
ap.add_argument("--batch_size", type=int, default=64, help="size of mini-batch") ap.add_argument("--seq_length", type=int, default=100, help="numbers of time-steps in sequence") ap.add_argument("--learning_rate", type=float, default=0.001, help="learning rate") ap.add_argument("--embed_size", type=int, default=300, help="number of dimensions in word embeddings") ap.add_argument("--lstm_size", type=int, default=512, help="number of units in lstm") ap.add_argument("--lstm_layers", type=int, default=1, help="number of layers in lstm network") ap.add_argument("--temperature", type=float, default=1.0, help="higher value means more random words will be picked and lower value means less randomness") ap.add_argument("--dropout", type=float, default=0.3, help="dropout rate") ap.add_argument("--resume", action="store_true", help="resume training from last checkpoint") ap.add_argument("--word2vec", action="store_true", help="train word2vec embeddings on data and use for training instead of doing it from scratch") args = ap.parse_args() model = TranscriptNet(args) if args.mode == "train": text = utils.load_text(args.data_path) model.train(text) else: model.generate()