Example #1
def train_main(args):
    """
    Trains the model specified in args.
    Main entry point for the train subcommand.
    """
    # load text
    text = load_text(args.text_path)

    if args.test_path:
        test_text = load_text(args.test_path)
    else:
        test_text = None

    # load or build model
    if args.restore:
        load_path = args.checkpoint_path if args.restore is True else args.restore
        model = load_model(load_path)
        logger.info("model restored: %s.", load_path)
    else:
        model = build_model(batch_size=args.batch_size,
                            seq_len=args.seq_len,
                            vocab_size=get_VOCAB_SIZE(),
                            embedding_size=args.embedding_size,
                            rnn_size=args.rnn_size,
                            num_layers=args.num_layers,
                            drop_rate=args.drop_rate,
                            learning_rate=args.learning_rate,
                            clip_norm=args.clip_norm)

    # make and clear checkpoint directory
    log_dir = make_dirs(args.checkpoint_path, empty=True)
    model.save(args.checkpoint_path)
    logger.info("model saved: %s.", args.checkpoint_path)
    # callbacks
    callbacks = [
        ModelCheckpoint(args.checkpoint_path, verbose=1, save_best_only=False),
        TensorBoard(log_dir,
                    write_graph=True,
                    embeddings_freq=1,
                    embeddings_metadata={
                        "embedding_1":
                        os.path.abspath(os.path.join("data", "id2char.tsv"))
                    }),
        LoggerCallback(text, test_text, model, args.checkpoint_path)
    ]

    # training start
    num_batches = (len(text) - 1) // (args.batch_size * args.seq_len)
    model.reset_states()
    model.fit_generator(batch_generator(encode_text(text, get_CHAR2ID()),
                                        args.batch_size,
                                        args.seq_len,
                                        one_hot_labels=True),
                        num_batches,
                        args.num_epochs,
                        callbacks=callbacks)
    return model
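The fit_generator call above relies on encode_text and batch_generator, which are not part of the snippet. A minimal sketch of what such helpers could look like for stateful character-RNN training, assuming encode_text just maps characters through the char2id dict (names, signatures, and shapes here are inferred, not taken from the original project):

import numpy as np

def encode_text(text, char2id):
    # Map each character of the corpus to its integer id.
    return np.array([char2id[c] for c in text], dtype=np.int64)

def batch_generator(encoded, batch_size, seq_len, one_hot_labels=False, vocab_size=None):
    # Split the corpus into batch_size parallel streams so that batch i+1
    # continues exactly where batch i left off -- this is what keeps the
    # stateful RNN's hidden state meaningful across consecutive batches.
    num_batches = (len(encoded) - 1) // (batch_size * seq_len)
    rounded = num_batches * batch_size * seq_len
    xs = encoded[:rounded].reshape(batch_size, -1)
    ys = encoded[1:rounded + 1].reshape(batch_size, -1)
    while True:  # Keras generators are expected to cycle forever
        for i in range(num_batches):
            x = xs[:, i * seq_len:(i + 1) * seq_len]
            y = ys[:, i * seq_len:(i + 1) * seq_len]
            if one_hot_labels:
                y = np.eye(vocab_size)[y]  # (batch, seq, vocab)
            yield x, y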
Example #2
def preprocess():
    """
    Prepare data 
    """
    data = FLAGS.dataset
    text = load_text(data)
    vocab = sorted(set(text))

    # Creating a mapping from unique characters to indices
    char2idx = {u:i for i, u in enumerate(vocab)}
    idx2char = np.array(vocab)
    text_as_int = np.array([char2idx[c] for c in text])

    # The maximum length sentence we want for a single input in characters
    seq_length = 150
    examples_per_epoch = len(text)//(seq_length+1)

    # Create training examples / targets
    char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

    # The batch method lets us easily convert these individual characters to sequences of the desired size.
    sequences = char_dataset.batch(seq_length+1, drop_remainder=FLAGS.drop_remainder)

    # For each sequence, duplicate and shift it to form the input and target text by using the map method to apply a simple function to each batch:
    dataset = sequences.map(split_input_target)

    # shuffle the data and pack it into batches.
    dataset = dataset.shuffle(FLAGS.buffer_size).batch(FLAGS.batch_size, drop_remainder=FLAGS.drop_remainder)
    
    return vocab, dataset
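sequences.map above uses a split_input_target helper that is not shown. In the standard tf.data character-modeling recipe it simply splits each (seq_length + 1)-character chunk into an input/target pair shifted by one position:

def split_input_target(chunk):
    # "Hello" -> input "Hell", target "ello": the model learns to
    # predict the character that follows each position.
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text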
Example #3
def test_main(args):
    test_text = load_text(args.test_path)
    model = retrieve_model(args)
    model.reset_states()

    bpc = calculate_bpc(model, test_text)
    print(bpc)
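calculate_bpc is not included in the snippet; a minimal sketch of bits-per-character, assuming the model outputs a probability distribution over the next character at every position and that encode_text/get_CHAR2ID from the same project's utils (see Example #1) are available -- the original signature may differ:

import numpy as np

def calculate_bpc(model, text):
    # Bits per character: mean negative log2-probability the model
    # assigns to each ground-truth next character.
    encoded = encode_text(text, get_CHAR2ID())
    x, y = encoded[:-1], encoded[1:]
    probs = model.predict(x[np.newaxis, :])[0]     # (len(text) - 1, vocab)
    true_char_probs = probs[np.arange(len(y)), y]  # prob of the true char
    return float(-np.mean(np.log2(true_char_probs + 1e-12)))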
Example #4
def generate_RI_text_fast(N, RI_letters, cluster_sz, ordered, text_name, alph=alphabet):
    text_vector = np.zeros((1, N))
    text = utils.load_text(text_name)
    cluster = ''
    vector = np.ones((1, N))
    for char_num in range(len(text)):
        cluster = cluster + text[char_num]
        if len(cluster) < cluster_sz:
            continue
        elif len(cluster) > cluster_sz:
            # slide the window: multiply out the letter that drops off the
            # front (bipolar vectors are their own inverse), then fold in
            # the newly read letter
            prev_letter = cluster[0]
            prev_letter_idx = alphabet.find(prev_letter)
            inverse = np.roll(RI_letters[prev_letter_idx, :], cluster_sz - 1)
            vector = np.multiply(vector, inverse)
            vector = np.roll(vector, 1)
            letter = text[char_num]
            letter_idx = alphabet.find(letter)
            vector = np.multiply(vector, RI_letters[letter_idx, :])
            cluster = cluster[1:]
        else:  # len(cluster) == cluster_sz, happens once
            letters = list(cluster)
            for letter in letters:
                vector = np.roll(vector, 1)
                letter_idx = alphabet.find(letter)
                vector = np.multiply(vector, RI_letters[letter_idx, :])
        text_vector += vector
    return text_vector
Example #5
def generate_RI_text_fast(N, RI_letters, cluster_sz, ordered, text_name, alph=alphabet):
    text_vector = np.zeros((1, N))
    text = utils.load_text(text_name)
    cluster = ''
    vector = np.ones((1, N))
    for char_num in range(len(text)):
        cluster = cluster + text[char_num]
        if len(cluster) < cluster_sz:
            continue
        elif len(cluster) > cluster_sz:
            # slide the window: multiply out the letter that drops off the
            # front (bipolar vectors are their own inverse), then fold in
            # the newly read letter
            prev_letter = cluster[0]
            prev_letter_idx = alphabet.find(prev_letter)
            inverse = np.roll(RI_letters[prev_letter_idx, :], cluster_sz - 1)
            vector = np.multiply(vector, inverse)
            vector = np.roll(vector, 1)
            letter = text[char_num]
            letter_idx = alphabet.find(letter)
            vector = np.multiply(vector, RI_letters[letter_idx, :])
            cluster = cluster[1:]
        else:  # len(cluster) == cluster_sz, happens once
            letters = list(cluster)
            for letter in letters:
                vector = np.roll(vector, 1)
                letter_idx = alphabet.find(letter)
                vector = np.multiply(vector, RI_letters[letter_idx, :])
        text_vector += vector
    return text_vector
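Examples #4 and #5 are identical copies of generate_RI_text_fast: both keep a rolling cluster (character n-gram) hypervector, updating it incrementally as the window slides rather than recomputing the whole product. A hedged usage sketch with the usual random-indexing setup, where RI_letters holds one random ±1 vector per alphabet character (all names and values below are illustrative):

import numpy as np

alphabet = 'abcdefghijklmnopqrstuvwxyz '
N = 10000                                   # hypervector dimensionality
rng = np.random.default_rng(0)
RI_letters = rng.choice([-1.0, 1.0], size=(len(alphabet), N))

trigram_vec = generate_RI_text_fast(N, RI_letters, cluster_sz=3,
                                    ordered=True, text_name='sample.txt')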
Example #6
	def __init__(self, master, prev_sc, main_bg):
		# 1. Initialising GUI Components
		self.update_screen(master,main_bg)
		self.update_variables(prev_sc)
		
		self.start_log = 		"---------------------------------\n" + \
								"| LOG STAGE 2 START SCREEN      |\n" + \
								"---------------------------------"
		self.start_txt = 		"| Start Action Button Pressed   |"
		self.exit_txt = 		"| Exit Button Pressed           |"
		print(self.start_log)

		# 2. Setting the Screen Components
		self.title = tkinter.Label(master, bg='white',\
									 fg = 'black', text='FASE 2', font=Font(family='Helvetica', size=30, weight='bold'))
		self.title.place(x=self.sw/2,y=2*self.sh/10,anchor='center')

		# a. Start Button
		self.start_button = Button(master, anchor = 'center', compound = 'center', 
									text = 'JOGAR',font = Font(family='Helvetica', size=28, weight='bold'),
									bg = "#%02x%02x%02x" % (30, 30, 30), fg = 'white',
									command = self.start_button_click,
									highlightthickness = 0,
									bd = 0, padx=0,pady=0,height=2,width=13)
		self.start_button.place(x = self.sw/2, y = 8*self.sh/10, anchor= 'center')

		# b. Stage 1 Text
		text = utils.load_text(2)
		self.text_display = scrolledtext.ScrolledText(master, fg = 'black', font = Font(family='Helvetica', size=18),\
									 bg = "#%02x%02x%02x" % (255, 255, 255), insertbackground = 'black',\
									 highlightcolor = "#%02x%02x%02x" % (180,180,180), highlightbackground= "#%02x%02x%02x" % (50,50,50),\
									  bd=0, width =47, height=10, padx=10, pady=10, wrap='word')
		self.text_display.insert('insert',text)
		self.text_display.configure(state='disabled')
		self.text_display.place(x=self.sw/2,y=self.sh/2,anchor='center')
Example #7
def bio2typing(res_dir, test_fids, tag=0):
    res = non_integrated_results(res_dir, test_fids)
    merged_entities = extract_entities(res, test_fids)
    ner_typing_root = Path(NER_TYPING_ROOT)
    ner_typing_root.mkdir(parents=True, exist_ok=True)
    pkl_save(merged_entities, ner_typing_root / f"merged_entities_{tag}.pkl")
    for test_fid in test_fids:
        pre_txt = load_text(
            Path(PREPROCESSED_TEXT_DIR) /
            f"{test_fid}.preprocessed.txt").split("\n")
        sents = pkl_load(Path(TEXT_AS_SENT_DIR) / f"{test_fid}.sents.pkl")
        sent_bound = creat_sent_altered_boundary(sents)

        ens = merged_entities[test_fid]
        fm, ls, ob = [], [], []
        for en_idx, en in enumerate(ens):
            # ('son', 'FAMILYMEMBER', (334, 337), (342, 345))
            en_span = en[-1]
            en_type = en[1].lower()
            sidx = get_sent_idx(en_span, sent_bound)
            en_loc_sent = sents[sidx]
            pure_text = pre_txt[sidx]
            tagged_sent = insert_token_and_creat_text_for_testing(
                en_loc_sent, en_span)

            if valida_by_sent(tagged_sent):
                print(test_fid, en, tagged_sent)

            if en_type == "familymember":
                fm.append([
                    f"{test_fid}@{en_idx}", f"{test_fid}@{en_idx}", pure_text,
                    tagged_sent
                ])
            elif en_type == "observation":
                ob.append([f"{test_fid}@{en_idx}", pure_text, tagged_sent])
            elif en_type == "livingstatus":
                ls.append([f"{test_fid}@{en_idx}", pure_text, tagged_sent])
            else:
                raise RuntimeError(f"{en_type} is not recognized for {en}")

        # # fms, fmr share the same dir
        pfm = Path(FMS_TEST.format(tag))
        pfm.mkdir(exist_ok=True, parents=True)
        to_tsv(fm, pfm / "test.tsv")

        pfo = Path(OBN_TEST.format(tag))
        pfo.mkdir(exist_ok=True, parents=True)
        to_tsv(ob, pfo / "test.tsv")

        pfl = Path(LSS_TEST.format(tag))
        pfl.mkdir(exist_ok=True, parents=True)
        to_tsv(ls, pfl / "test.tsv")

        pkl_save(fm, ner_typing_root / f"fm_{tag}.pkl")
        pkl_save(ob, ner_typing_root / f"ob_{tag}.pkl")
        pkl_save(ls, ner_typing_root / f"ls_{tag}.pkl")
Example #8
def bio2relation():
    TAG = "pred"
    sdiff = []
    relation_types = []
    pred_relations_plan1 = []
    pred_relations_plan2 = []
    mapping = []

    typed_entities = pkl_load(Path(NER_TYPING_ROOT) / "typed_entities_ens.pkl")
    for doc_id, ens in typed_entities.items():
        pre_txt = load_text(
            Path(PREPROCESSED_TEXT_DIR) /
            f"{doc_id}.preprocessed.txt").split("\n")
        sents = pkl_load(Path(TEXT_AS_SENT_DIR) / f"{doc_id}.sents.pkl")
        sent_bound = creat_sent_altered_boundary(sents)
        enids = range(len(ens))
        all_pairs = []
        for e1, e2 in permutations(enids, 2):
            all_pairs.append((e1, e2))

        for each in all_pairs:
            eid1, eid2 = each
            # (('son', 'FAMILYMEMBER', (334, 337), (342, 345)), ['NA', 'Son'], 'FAMILYMEMBER')
            en1 = ens[eid1]
            en2 = ens[eid2]
            if en1[-1].upper() != "FAMILYMEMBER" or en2[-1].upper() == "FAMILYMEMBER":
                continue

            sie1 = get_sent_idx(en1[0][3], sent_bound)
            sie2 = get_sent_idx(en2[0][3], sent_bound)
            if abs(sie1 - sie2) > GLOBAL_CUTOFF:
                continue

            bert_rels = insert_tags_for_relation(sents[sie1], sents[sie2],
                                                 en1[0], en2[0])
            tagged_s1, tagged_s2, pure_text1, pure_text2 = bert_rels
            pred_relations_plan1.append([
                TAG, tagged_s1, tagged_s2, pure_text1, pure_text2,
                f"{abs(sie1 - sie2)}",
                str()
            ])
            tp = generate_bert_relation_without_extra_sentence(
                sents[sie1], sents[sie2], en1[0], en2[0], sents, sie1, sie2)
            pred_relations_plan2.append([TAG, tp, f"{abs(sie1 - sie2)}"])
            mapping.append((doc_id, en1, en2))

    prel = Path(REL_TEST)
    prel.mkdir(parents=True, exist_ok=True)
    pkl_save(mapping, prel / "relation_mappings.tsv")
    to_tsv(pred_relations_plan2, prel / "test.tsv")

    prel = Path(REL_TESTa)
    prel.mkdir(parents=True, exist_ok=True)
    pkl_save(mapping, prel / "relation_mappings.tsv")
    to_tsv(pred_relations_plan1, prel / "test.tsv")
Example #9
    def train(self, training_directory):
        from utils import load_text

        text = load_text(training_directory)
        sentences = self._preprocess(text)

        # what if the same word appears twice in one sentence? do we discount that?
        for sentence in sentences:
            for word in sentence:
                self._update_mapping(word, sentence)
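_update_mapping is not shown; judging from the loop, it accumulates per-word co-occurrence counts. A minimal sketch of the class state it implies, which also answers the question in the comment by counting each co-occurring word once per sentence (the class name and the deduplication choice are assumptions, not the original code):

from collections import Counter, defaultdict

class WordCooccurrence:
    def __init__(self):
        # word -> Counter of the words it shares a sentence with
        self._mapping = defaultdict(Counter)

    def _update_mapping(self, word, sentence):
        # set() counts each co-occurring word once per sentence, i.e.
        # repeats within one sentence are discounted
        for other in set(sentence):
            if other != word:
                self._mapping[word][other] += 1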
Example #10
    def match(self, x):
        input_text = [
            load_text(i, self.max_sentence_len, self.input2idx, self.choice)
            for i in x
        ]
        input_text = np.asarray(input_text)
        res = self.model.predict(input_text)
        res = concat(res)
        res = self.decode(res, True)
        return res
Example #11
def non_integrated_results(root, test_fids, original_offset_only=False):
    p = Path(root)
    ll = len(test_fids)
    result = dict()
    for fid in p.glob("*.txt"):
        fid_stem = fid.stem.split(".")[0]
        assert fid_stem in test_fids, f"{fid.stem} is not a test fid"
        ll -= 1
        cont = load_text(fid)
        sents = text2sents(cont.strip(), original_offset_only)
        result[fid_stem] = sents
    assert ll == 0, f"missing {ll} prediction files"
    return result
Example #12
def generate_RI_text_history(N, RI_letters, text_name, alph=alphabet):
    # generate RI vector for "text_name"
    # assumes text_name has .txt
    # each letter's contribution decays geometrically (factor 0.75) with age
    text_vector = np.zeros((1, N))
    history_vector = np.zeros((1, N))
    text = utils.load_text(text_name)
    for char_num in range(len(text)):
        char = text[char_num]
        letter_idx = alphabet.find(char)
        history_vector = 0.75 * history_vector + RI_letters[letter_idx, :]
        text_vector += history_vector
    return text_vector
Example #13
def generate_RI_text_history(N, RI_letters, text_name, alph=alphabet):
    # generate RI vector for "text_name"
    # assumes text_name has .txt
    # each letter's contribution decays geometrically (factor 0.75) with age
    text_vector = np.zeros((1, N))
    history_vector = np.zeros((1, N))
    text = utils.load_text(text_name)
    for char_num in range(len(text)):
        char = text[char_num]
        letter_idx = alphabet.find(char)
        history_vector = 0.75 * history_vector + RI_letters[letter_idx, :]
        text_vector += history_vector
    return text_vector
Example #14
def generate_text_vector(N, RI_letters, cluster_sz, text_name):
    text_vector = np.zeros((1,N))

    text = utils.load_text(text_name)

    for char_idx in range(len(text)-cluster_sz+1):
        sidx = char_idx
        eidx = char_idx+cluster_sz
        
        cluster = text[sidx:eidx]
        
        vector = np.ones((1,N))
        for letter in cluster:
            letter_idx = alphabet.find(letter)
            vector = np.roll(vector, 1)
            vector = np.multiply(vector, RI_letters[letter_idx, :])
            
        text_vector += vector
    return text_vector / (len(text)-cluster_sz+1)
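Because generate_text_vector returns the average n-gram hypervector of the whole text, two texts can be compared with a plain cosine similarity. A short usage sketch (the file names and the N/RI_letters setup from the earlier random-indexing sketch are illustrative):

import numpy as np

def cosine(a, b):
    a, b = a.ravel(), b.ravel()
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

v1 = generate_text_vector(N, RI_letters, cluster_sz=3, text_name='english.txt')
v2 = generate_text_vector(N, RI_letters, cluster_sz=3, text_name='french.txt')
print(cosine(v1, v2))  # closer to 1.0 for texts with similar n-gram statistics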
Example #15
def generate_vocab_lang_vectors(N, RI_letters, cluster_sz, ordered, text_name, min_, max_,alph=alphabet):
    text_vector = np.zeros((1, N))
    vocab_vec = np.zeros((1,N))
    text = utils.load_text(text_name)
    cluster = ''
    vector = np.ones((1,N))
    for char_num in range(len(text)):
        cluster = cluster + text[char_num]
        if len(cluster) < cluster_sz:
            continue
        elif len(cluster) > cluster_sz:
            prev_letter = cluster[0]
            prev_letter_idx = alphabet.find(prev_letter)
            inverse = np.roll(RI_letters[prev_letter_idx,:], cluster_sz-1)
            vector = np.multiply(vector, inverse)
            vector = np.roll(vector, 1)
            letter = text[char_num]
            letter_idx = alphabet.find(letter)
            vector = np.multiply(vector, RI_letters[letter_idx,:])
            cluster = cluster[1:]
        else:  # (len(cluster) == cluster_sz), happens once
            letters = list(cluster)
            for letter in letters:
                vector = np.roll(vector,1)
                letter_idx = alphabet.find(letter)
                vector = np.multiply(vector, RI_letters[letter_idx,:])

        if np.dot(vocab_vec, vector.T) < -10000:
            text_vector += vector
        else:
            vocab_vec += vector
            if np.dot(vocab_vec, vector.T) > min_:
                while True:
                    if np.dot(vocab_vec, vector.T) < -10000:
                        break
                    else:
                        vocab_vec -= vector


    return text_vector, vocab_vec
Example #16
def generate_vocab_lang_vectors(N, RI_letters, cluster_sz, ordered, text_name, min_, max_,alph=alphabet):
    text_vector = np.zeros((1, N))
    vocab_vec = np.zeros((1,N))
    text = utils.load_text(text_name)
    cluster = ''
    vector = np.ones((1,N))
    for char_num in range(len(text)):
        cluster = cluster + text[char_num]
        if len(cluster) < cluster_sz:
            continue
        elif len(cluster) > cluster_sz:
            prev_letter = cluster[0]
            prev_letter_idx = alphabet.find(prev_letter)
            inverse = np.roll(RI_letters[prev_letter_idx,:], cluster_sz-1)
            vector = np.multiply(vector, inverse)
            vector = np.roll(vector, 1)
            letter = text[char_num]
            letter_idx = alphabet.find(letter)
            vector = np.multiply(vector, RI_letters[letter_idx,:])
            cluster = cluster[1:]
        else:  # (len(cluster) == cluster_sz), happens once
            letters = list(cluster)
            for letter in letters:
                vector = np.roll(vector,1)
                letter_idx = alphabet.find(letter)
                vector = np.multiply(vector, RI_letters[letter_idx,:])

        if np.dot(vocab_vec, vector.T) < -10000:
            text_vector += vector
        else:
            vocab_vec += vector
            if np.dot(vocab_vec, vector.T) > min_:
                while True:
                    if np.dot(vocab_vec, vector.T) < -10000:
                        break
                    else:
                        vocab_vec -= vector


    return text_vector, vocab_vec
Example #17
File: train.py Project: xllg/Ernn
def main(args):
    # --------------------------------------------------------
    # Data
    logger.info('-' * 100)
    logger.info('Load data files')
    train_exs = utils.load_data(args, args.train_file, skip_no_answer=True)
    logger.info('Num train examples = %d' % len(train_exs))
    dev_exs = utils.load_data(args, args.dev_file)
    logger.info('Num dev examples = %d' % len(dev_exs))

    # If we are doing official evals then we need to:
    # 1) Load the original text to retrieve spans from offsets.
    # 2) Load the (multiple) text answers for each question.
    if args.official_eval:
        dev_texts = utils.load_text(args.dev_json)
        dev_offsets = {ex['id']: ex['offsets'] for ex in dev_exs}
        dev_answers = utils.load_answers(args.dev_json)
    # --------------------------------------------------------
    # Model
    logger.info('-' * 100)
    start_epoch = 0
    logger.info('Training model from scratch...')
    model = init_from_scratch(args, train_exs, dev_exs)

    # Set up partial tuning of embeddings
    if args.tune_partial > 0:
        logger.info('-' * 100)
        logger.info('Counting %d most frequent question words' %
                    args.tune_partial)
        top_words = utils.top_question_words(args, train_exs, model.word_dict)
        for word in top_words[:5]:
            logger.info(word)
        logger.info('...')
        for word in top_words[-6:-1]:
            logger.info(word)
        model.tune_embeddings([w[0] for w in top_words])

    # Set up optimizer
    model.init_optimizer()

    # Use the GPU?
    if args.cuda:
        model.cuda()

    # Use multiple GPUs?
    if args.parallel:
        model.parallelize()

    # --------------------------------------------------------------------------
    # DATA ITERATORS
    # Two datasets: train and dev. If we sort by length it's faster.
    logger.info('-' * 100)
    logger.info('Make data loaders')
    train_dataset = data.ReaderDataset(train_exs, model, single_answer=True)
    if args.sort_by_len:
        train_sampler = data.SortedBatchSampler(
            train_dataset.lengths(), args.batch_size,
            shuffle=True)  # setting shuffle to True reshuffles the order for each batch
    else:
        train_sampler = torch.utils.data.sampler.RandomSampler(train_dataset)

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,  # how many samples to load per batch
        sampler=train_sampler,  # strategy for drawing samples from the dataset; if given, shuffle is ignored
        num_workers=args.data_workers,  # number of worker subprocesses used for data loading
        collate_fn=vector.batchify,  # merges individual samples into a mini-batch
        pin_memory=args.cuda,  # if True, the loader copies tensors into CUDA pinned memory before returning them
    )
    dev_dataset = data.ReaderDataset(dev_exs, model, single_answer=False)
    if args.sort_by_len:
        dev_sampler = data.SortedBatchSampler(dev_dataset.lengths(),
                                              args.test_batch_size,
                                              shuffle=False)
    else:
        dev_sampler = torch.utils.data.sampler.SequentialSampler(dev_dataset)
    dev_loader = torch.utils.data.DataLoader(
        dev_dataset,
        batch_size=args.test_batch_size,
        sampler=dev_sampler,
        num_workers=args.data_workers,
        collate_fn=vector.batchify,
        pin_memory=args.cuda,
    )

    # -------------------------------------------------------------------------
    # PRINT CONFIG
    logger.info('-' * 100)
    logger.info('CONFIG:\n%s' %
                json.dumps(vars(args), indent=4, sort_keys=True))

    # --------------------------------------------------------------------------
    # TRAIN/VALID LOOP
    logger.info('-' * 100)
    logger.info('Starting training...')
    stats = {'timer': utils.Timer(), 'epoch': 0, 'best_valid': 0}
    for epoch in range(start_epoch, args.num_epochs):
        stats['epoch'] = epoch

        # train
        train(args, train_loader, model, stats)

        # Validate unofficial (train)
        validate_unofficial(args, train_loader, model, stats, mode='train')

        # Validate unofficial (dev)
        result = validate_unofficial(args,
                                     dev_loader,
                                     model,
                                     stats,
                                     mode='dev')

        # Validate official
        if args.official_eval:
            result = validate_official(args, dev_loader, model, stats,
                                       dev_offsets, dev_texts, dev_answers)

        # Save best valid
        if result[args.valid_metric] > stats['best_valid']:
            logger.info('Best valid: %s = %.2f (epoch %d, %d updates)' %
                        (args.valid_metric, result[args.valid_metric],
                         stats['epoch'], model.updates))
            model.save(args.model_file)
            stats['best_valid'] = result[args.valid_metric]
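data.SortedBatchSampler is not shown in the snippet. In DrQA-style training code the idea is to group examples of similar length into the same batch (to cut padding) while still shuffling the order of the batches; a simplified sketch of that idea, not the project's exact implementation:

import numpy as np
from torch.utils.data.sampler import Sampler

class SortedBatchSampler(Sampler):
    def __init__(self, lengths, batch_size, shuffle=True):
        self.lengths = lengths
        self.batch_size = batch_size
        self.shuffle = shuffle

    def __iter__(self):
        # Sort indices by example length, chunk into batches of similar
        # length, then shuffle the batch order so training is not biased
        # toward short or long examples early in the epoch.
        order = np.argsort(np.asarray(self.lengths))
        batches = [order[i:i + self.batch_size]
                   for i in range(0, len(order), self.batch_size)]
        if self.shuffle:
            np.random.shuffle(batches)
        return iter([int(i) for batch in batches for i in batch])

    def __len__(self):
        return len(self.lengths)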
Example #18
from tensorflow.keras import callbacks
from utils import get_batch_generator, get_data_generator
from utils import load_text, tokenize, prepare_word_tokens
from onehot import OneHotEncoder
from model import s2s_model

HIDDEN_SIZE = 512
ERR_RATE = 0.2
EPOCHS = 15
BATCH_SIZE = 256
DATA_DIR = './data'
R_DROPOUT = 0.2

if __name__ == '__main__':
    train_text = load_text(DATA_DIR)
    val_text = load_text(DATA_DIR, 'val')

    train_word_set = list(filter(None, set(tokenize(train_text))))
    val_word_set = list(filter(None, set(tokenize(val_text))))

    train_max_word_len = max([len(token) for token in train_word_set]) + 2
    val_max_word_len = max([len(token) for token in val_word_set]) + 2

    train_encoder_tokens, train_decoder_tokens, train_target_tokens = prepare_word_tokens(
        train_word_set, train_max_word_len, error_rate=ERR_RATE)
    val_encoder_tokens, val_decoder_tokens, val_target_tokens = prepare_word_tokens(
        val_word_set, val_max_word_len, error_rate=ERR_RATE)

    input_charset = set(' '.join(train_encoder_tokens))
    target_charset = set(' '.join(train_decoder_tokens))
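OneHotEncoder here (and in Examples #22 and #26) is a project-local class from the onehot module, not scikit-learn's; only its constructor appears in these snippets. A sketch of what a character-level version could look like (the method names are assumptions):

import numpy as np

class OneHotEncoder:
    def __init__(self, charset):
        self.chars = sorted(charset)
        self.char2idx = {c: i for i, c in enumerate(self.chars)}

    def encode(self, token, maxlen):
        # One row per character position, one column per known character.
        x = np.zeros((maxlen, len(self.chars)), dtype=np.float32)
        for i, c in enumerate(token[:maxlen]):
            x[i, self.char2idx[c]] = 1.0
        return x

    def decode(self, matrix):
        # Inverse mapping: take the argmax character at each position.
        return ''.join(self.chars[int(i)] for i in matrix.argmax(axis=1))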
Example #19
    i = 1
    while i != sentence_length:
        next_word = stochastic_sample(markov, word)
        if next_word == '<STOP>' or next_word == '<START>':
            next_word = stochastic_sample(markov, '<START>')
        sentence.append(next_word)

        word = next_word
        i += 1

    while i == sentence_length:
        next_word = stochastic_sample(markov, word)
        if next_word == '<START>':
            next_word = stochastic_sample(markov, '<START>')
        if next_word == '<STOP>':
            break
        sentence.append(next_word)
        word = next_word

    return " ".join(sentence)


if __name__ == "__main__":
    file = "corpus_data/cleaned/SS_TOKEN_complete.txt"
    corpus = load_text(file)
    markov = markov_histo(corpus)

    for i in range(10):
        walk = random_walk(markov, 10)
        print(walk)
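markov_histo and stochastic_sample are defined elsewhere (the snippet also begins mid-function, so the head of random_walk is missing). A minimal sketch consistent with how they are called, assuming markov maps each word to a histogram of its successors:

import random
from collections import Counter, defaultdict

def markov_histo(corpus):
    # First-order transition table: word -> Counter of following words.
    words = corpus.split()
    histo = defaultdict(Counter)
    for cur, nxt in zip(words, words[1:]):
        histo[cur][nxt] += 1
    return histo

def stochastic_sample(markov, word):
    # Draw the next word with probability proportional to its count.
    successors = markov[word]
    return random.choices(list(successors), weights=list(successors.values()))[0]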
Example #20
def main(args):
    # --------------------------------------------------------------------------
    # DATA
    logger.info('-' * 100)
    logger.info('Load data files')
    train_exs = utils.load_data(args, args.train_file, skip_no_answer=True)
    logger.info('Num train examples = %d' % len(train_exs))
    dev_exs = utils.load_data(args, args.dev_file)
    logger.info('Num dev examples = %d' % len(dev_exs))

    # If we are doing official evals then we need to:
    # 1) Load the original text to retrieve spans from offsets.
    # 2) Load the (multiple) text answers for each question.
    if args.official_eval:
        dev_texts = utils.load_text(args.dev_json)
        dev_offsets = {ex['id']: ex['offsets'] for ex in dev_exs}
        dev_answers = utils.load_answers(args.dev_json)

    # --------------------------------------------------------------------------
    # MODEL
    logger.info('-' * 100)
    start_epoch = 0
    if args.checkpoint and os.path.isfile(args.model_file + '.checkpoint'):
        # Just resume training, no modifications.
        logger.info('Found a checkpoint...')
        checkpoint_file = args.model_file + '.checkpoint'
        model, start_epoch = ParagraphRanker.load_checkpoint(checkpoint_file, 
                                                             args)
    else:
        # Training starts fresh. But the model state is either pretrained or
        # newly (randomly) initialized.
        if args.pretrained:
            logger.info('Using pretrained model...')
            model = ParagraphRanker.load(args.pretrained, args)
            if args.expand_dictionary:
                logger.info('Expanding dictionary for new data...')
                # Add words in training + dev examples
                words = utils.load_words(args, train_exs + dev_exs)
                added = model.expand_dictionary(words)
                # Load pretrained embeddings for added words
                if args.embedding_file:
                    model.load_embeddings(added, args.embedding_file, args.fasttext)

        else:
            logger.info('Training model from scratch...')
            model = init_from_scratch(args, train_exs, dev_exs)

        # Set up partial tuning of embeddings
        if args.tune_partial > 0:
            logger.info('-' * 100)
            logger.info('Counting %d most frequent question words' %
                        args.tune_partial)
            top_words = utils.top_question_words(
                args, train_exs, model.word_dict
            )
            for word in top_words[:5]:
                logger.info(word)
            logger.info('...')
            for word in top_words[-6:-1]:
                logger.info(word)
            model.tune_embeddings([w[0] for w in top_words])

        # Set up optimizer
        model.init_optimizer()

    # Use the GPU?
    if args.cuda:
        model.cuda()

    # Use multiple GPUs?
    if args.parallel:
        model.parallelize()

    # --------------------------------------------------------------------------
    # DATA ITERATORS
    # Two datasets: train and dev. If we sort by length it's faster.
    logger.info('-' * 100)
    logger.info('Make data loaders')
    train_dataset = data.RankerDataset(train_exs, model, 
                                       args.neg_size, args.allowed_size)
    if args.sort_by_len:
        train_sampler = data.RankerBatchSampler(train_dataset.lengths(),
                                                args.batch_size,
                                                shuffle=True)
    else:
        train_sampler = torch.utils.data.sampler.RandomSampler(train_dataset)
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        sampler=train_sampler,
        num_workers=args.data_workers,
        collate_fn=vector.ranker_train_batchify,
        pin_memory=args.cuda,
    )
    dev_dataset = data.RankerDataset(dev_exs, model,
                                     neg_size=1, allowed_size=1000)
    if args.sort_by_len:
        dev_sampler = data.RankerBatchSampler(dev_dataset.lengths(),
                                              args.test_batch_size,
                                              shuffle=False)
    else:
        dev_sampler = torch.utils.data.sampler.SequentialSampler(dev_dataset)
    dev_loader = torch.utils.data.DataLoader(
        dev_dataset,
        batch_size=args.test_batch_size,
        sampler=dev_sampler,
        num_workers=args.data_workers,
        collate_fn=vector.ranker_dev_batchify,
        pin_memory=args.cuda,
    )

    # -------------------------------------------------------------------------
    # PRINT CONFIG
    logger.info('-' * 100)
    logger.info('CONFIG:\n%s' %
                json.dumps(vars(args), indent=4, sort_keys=True))

    # --------------------------------------------------------------------------
    # TRAIN/VALID LOOP
    logger.info('-' * 100)
    logger.info('Starting training...')
    stats = {'timer': utils.Timer(), 'epoch': 0, 'best_valid': 0}
    for epoch in range(start_epoch, args.num_epochs):
        stats['epoch'] = epoch

        # Train
        train(args, train_loader, model, stats)

        # Filtering by questions
        # pre_selected_docs = filter_docs(args, dev_loader)

        # Encode documents for dev
        docs, qs = encode_docs_qs(args, dev_loader, model, stats, mode='dev')

        # Rank encoded documents
        result = rank_docs(args, docs, qs, stats, mode='dev')

        # Save best valid
        if result[args.valid_metric] > stats['best_valid']:
            logger.info('Best valid: %s = %.3f (epoch %d, %d updates)' %
                        (args.valid_metric, result[args.valid_metric],
                         stats['epoch'], model.updates))
            model.save(args.model_file)
            stats['best_valid'] = result[args.valid_metric]

    # Ranker final evaluation
    docs, qs = encode_docs_qs(args, dev_loader, model, stats, mode='dev')
    result = rank_docs(args, docs, qs, stats, mode='dev')
Example #21
from keras.preprocessing.sequence import pad_sequences
import pickle
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D, CuDNNLSTM
from keras.layers import Lambda, Add
from keras.models import Model
import keras
from sklearn.model_selection import train_test_split
from LossHistory import LossHistory
from keras.utils import np_utils
import random

embeddings_index = load_embedding()

use_text_length = 20000
japanese_text = load_text(use_text_length)
split_japanese_text = mecab_to_text(japanese_text)
dictionary = make_word_dictionary(split_japanese_text, lower_bound=10)
dictionary = clear_dictionary(dictionary, embeddings_index)

#all_embs = np.stack(embeddings_index.values())
#emb_mean, emb_std = all_embs.mean(), all_embs.std()

## Tokenize the sentences
tokenizer = Tokenizer(char_level=False)
tokenizer.fit_on_texts(dictionary)
"""
with open("..\result\tokenizer.pkl", 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
"""
Example #22
from tensorflow.keras.models import load_model

from onehot import OneHotEncoder
from utils import load_text, tokenize, get_padded_token, load_s2s_model, decode_sequences, prepare_word_tokens
from model import s2s_model

HIDDEN_SIZE = 512
ERR_RATE = 0.8
BATCH_SIZE = 256
DATA_DIR = './data'

if __name__ == '__main__':
    # Prepare model
    encoder, decoder = load_s2s_model(
        'test-no_reverse-hs-512_err-0.8_bs-256_e-30_drop-0.2.h5', HIDDEN_SIZE)

    text = load_text(DATA_DIR)
    word_set = list(filter(None, set(tokenize(text))))
    max_word_len = max([len(token) for token in word_set]) + 2
    train_enc_tokens, train_dec_tokens, _ = prepare_word_tokens(
        word_set, max_word_len, ERR_RATE)

    enc_charset = set(' '.join(train_enc_tokens))
    dec_charset = set(' '.join(train_dec_tokens))
    enc_oh = OneHotEncoder(enc_charset)
    dec_oh = OneHotEncoder(dec_charset)

    # Input decoding loop
    while True:
        sentence = input('\nEnter sentence to decode:\n')
        tokens = list(filter(None, tokenize(sentence)))
        nb_of_tokens = len(tokens)
Example #23
def main(args):
    # --------------------------------------------------------------------------
    # DATA
    logger.info('-' * 100)
    logger.info('Load data files')
    train_exs = utils.load_data(args, args.train_file, skip_no_answer=True)
    logger.info('Num train examples = %d' % len(train_exs))
    dev_exs = utils.load_data(args, args.dev_file)
    logger.info('Num dev examples = %d' % len(dev_exs))

    # If we are doing official evals then we need to:
    # 1) Load the original text to retrieve spans from offsets.
    # 2) Load the (multiple) text answers for each question.
    if args.official_eval:
        dev_texts = utils.load_text(args.dev_json)
        dev_offsets = {ex['id']: ex['offsets'] for ex in dev_exs}
        dev_answers = utils.load_answers(args.dev_json)

    # --------------------------------------------------------------------------
    # MODEL
    logger.info('-' * 100)
    start_epoch = 0
    if args.checkpoint and os.path.isfile(args.model_file + '.checkpoint'):
        # Just resume training, no modifications.
        logger.info('Found a checkpoint...')
        checkpoint_file = args.model_file + '.checkpoint'
        model, start_epoch = DocReader.load_checkpoint(checkpoint_file, args)
    else:
        # Training starts fresh. But the model state is either pretrained or
        # newly (randomly) initialized.
        if args.pretrained:
            logger.info('Using pretrained model...')
            model = DocReader.load(args.pretrained, args)
            if args.expand_dictionary:
                logger.info('Expanding dictionary for new data...')
                # Add words in training + dev examples
                words = utils.load_words(args, train_exs + dev_exs)
                added_words = model.expand_dictionary(words)
                # Load pretrained embeddings for added words
                if args.embedding_file:
                    model.load_embeddings(added_words, args.embedding_file)

                logger.info('Expanding char dictionary for new data...')
                # Add words in training + dev examples
                chars = utils.load_chars(args, train_exs + dev_exs)
                added_chars = model.expand_char_dictionary(chars)
                # Load pretrained embeddings for added words
                if args.char_embedding_file:
                    model.load_char_embeddings(added_chars, args.char_embedding_file)

        else:
            logger.info('Training model from scratch...')
            model = init_from_scratch(args, train_exs, dev_exs)

        # Set up partial tuning of embeddings
        if args.tune_partial > 0:
            logger.info('-' * 100)
            logger.info('Counting %d most frequent question words' %
                        args.tune_partial)
            top_words = utils.top_question_words(
                args, train_exs, model.word_dict
            )
            for word in top_words[:5]:
                logger.info(word)
            logger.info('...')
            for word in top_words[-6:-1]:
                logger.info(word)
            model.tune_embeddings([w[0] for w in top_words])

        # Set up optimizer
        model.init_optimizer()

    # Use the GPU?
    if args.cuda:
        model.cuda()

    # Use multiple GPUs?
    if args.parallel:
        model.parallelize()

    # --------------------------------------------------------------------------
    # DATA ITERATORS
    # Two datasets: train and dev. If we sort by length it's faster.
    logger.info('-' * 100)
    logger.info('Make data loaders')

    train_dataset = data.ReaderDataset(train_exs, model, single_answer=True)
    if args.sort_by_len:
        train_sampler = data.SortedBatchSampler(train_dataset.lengths(),
                                                args.batch_size,
                                                shuffle=True)
    else:
        train_sampler = torch.utils.data.sampler.RandomSampler(train_dataset)
    # if args.use_sentence_selector:
    #     train_batcher = vector.sentence_batchifier(model, single_answer=True)
    #     batching_function = train_batcher.batchify
    # else:
    batching_function = vector.batchify
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        sampler=train_sampler,
        num_workers=args.data_workers,
        collate_fn=batching_function,
        pin_memory=args.cuda,
    )
    dev_dataset = data.ReaderDataset(dev_exs, model, single_answer=False)
    if args.sort_by_len:
        dev_sampler = data.SortedBatchSampler(dev_dataset.lengths(),
                                              args.test_batch_size,
                                              shuffle=False)
    else:
        dev_sampler = torch.utils.data.sampler.SequentialSampler(dev_dataset)
    # if args.use_sentence_selector:
    #     dev_batcher = vector.sentence_batchifier(model, single_answer=False)
    #     batching_function = dev_batcher.batchify
    # else:
    batching_function = vector.batchify
    dev_loader = torch.utils.data.DataLoader(
        dev_dataset,
        batch_size=args.test_batch_size,
        sampler=dev_sampler,
        num_workers=args.data_workers,
        collate_fn=batching_function,
        pin_memory=args.cuda,
    )

    # -------------------------------------------------------------------------
    # PRINT CONFIG
    logger.info('-' * 100)
    logger.info('CONFIG:\n%s' %
                json.dumps(vars(args), indent=4, sort_keys=True))

    # --------------------------------------------------------------------------
    # TRAIN/VALID LOOP
    logger.info('-' * 100)
    logger.info('Starting training...')
    stats = {'timer': utils.Timer(), 'epoch': 0, 'best_valid': 0}

    # --------------------------------------------------------------------------
    # QUICKLY VALIDATE ON PRETRAINED MODEL

    if args.global_mode == "test":
        result1 = validate_unofficial(args, dev_loader, model, stats, mode='dev')
        result2 = validate_official(args, dev_loader, model, stats,
                                    dev_offsets, dev_texts, dev_answers)
        print(result2[args.valid_metric])
        print(result1["exact_match"])

        validate_adversarial(args, model, stats, mode="dev")
        exit(0)


    for epoch in range(start_epoch, args.num_epochs):
        stats['epoch'] = epoch

        # Train
        train(args, train_loader, model, stats)

        # Validate unofficial (train)
        validate_unofficial(args, train_loader, model, stats, mode='train')

        # Validate unofficial (dev)
        result = validate_unofficial(args, dev_loader, model, stats, mode='dev')

        # Validate official
        if args.official_eval:
            result = validate_official(args, dev_loader, model, stats,
                                       dev_offsets, dev_texts, dev_answers)

        # Save best valid
        if args.valid_metric is None or args.valid_metric == 'None':
            model.save(args.model_file)
        elif result[args.valid_metric] > stats['best_valid']:
            logger.info('Best valid: %s = %.2f (epoch %d, %d updates)' %
                        (args.valid_metric, result[args.valid_metric],
                         stats['epoch'], model.updates))
            model.save(args.model_file)
            stats['best_valid'] = result[args.valid_metric]
Example #24
def validate_adversarial(args, model, global_stats, mode="dev"):
    # create a dataloader for each dev set, load their jsons, and run the evaluation


    for idx, dataset_file in enumerate(args.adv_dev_json):

        predictions = {}

        logger.info("Validating Adversarial Dataset %s" % dataset_file)
        exs = utils.load_data(args, args.adv_dev_file[idx])
        logger.info('Num dev examples = %d' % len(exs))
        ## Create dataloader
        dev_dataset = data.ReaderDataset(exs, model, single_answer=False)
        if args.sort_by_len:
            dev_sampler = data.SortedBatchSampler(dev_dataset.lengths(),
                                                  args.test_batch_size,
                                                  shuffle=False)
        else:
            dev_sampler = torch.utils.data.sampler.SequentialSampler(dev_dataset)
        # if args.use_sentence_selector:
        #     batching_function = vector.batchify_sentences
        # else:
        batching_function = vector.batchify
        dev_loader = torch.utils.data.DataLoader(
            dev_dataset,
            batch_size=args.test_batch_size,
            sampler=dev_sampler,
            num_workers=args.data_workers,
            collate_fn=batching_function,
            pin_memory=args.cuda,
        )

        texts = utils.load_text(dataset_file)
        offsets = {ex['id']: ex['offsets'] for ex in exs}
        answers = utils.load_answers(dataset_file)

        eval_time = utils.Timer()
        f1 = utils.AverageMeter()
        exact_match = utils.AverageMeter()

        examples = 0
        bad_examples = 0
        for ex in dev_loader:
            ex_id, batch_size = ex[-1], ex[0].size(0)
            chosen_offset = ex[-2]
            pred_s, pred_e, _ = model.predict(ex)

            for i in range(batch_size):
                if pred_s[i][0] >= len(offsets[ex_id[i]]) or pred_e[i][0] >= len(offsets[ex_id[i]]):
                    bad_examples += 1
                    continue
                if args.use_sentence_selector:
                    s_offset = chosen_offset[i][pred_s[i][0]][0]
                    e_offset = chosen_offset[i][pred_e[i][0]][1]
                else:
                    s_offset = offsets[ex_id[i]][pred_s[i][0]][0]
                    e_offset = offsets[ex_id[i]][pred_e[i][0]][1]
                prediction = texts[ex_id[i]][s_offset:e_offset]

                predictions[ex_id[i]] = prediction

                ground_truths = answers[ex_id[i]]
                exact_match.update(utils.metric_max_over_ground_truths(
                    utils.exact_match_score, prediction, ground_truths))
                f1.update(utils.metric_max_over_ground_truths(
                    utils.f1_score, prediction, ground_truths))

            examples += batch_size

        logger.info('dev valid official for dev file %s : Epoch = %d | EM = %.2f | ' %
                    (dataset_file, global_stats['epoch'], exact_match.avg * 100) +
                    'F1 = %.2f | examples = %d | valid time = %.2f (s)' %
                    (f1.avg * 100, examples, eval_time.time()))

        orig_f1_score = 0.0
        orig_exact_match_score = 0.0
        adv_f1_scores = {}  # Map from original ID to F1 score
        adv_exact_match_scores = {}  # Map from original ID to exact match score
        adv_ids = {}
        all_ids = set()  # Set of all original IDs
        f1 = exact_match = 0
        dataset = json.load(open(dataset_file))['data']
        for article in dataset:
            for paragraph in article['paragraphs']:
                for qa in paragraph['qas']:
                    orig_id = qa['id'].split('-')[0]
                    all_ids.add(orig_id)
                    if qa['id'] not in predictions:
                        message = 'Unanswered question ' + qa['id'] + ' will receive score 0.'
                        # logger.info(message)
                        continue
                    ground_truths = list(map(lambda x: x['text'], qa['answers']))
                    prediction = predictions[qa['id']]
                    cur_exact_match = utils.metric_max_over_ground_truths(utils.exact_match_score,
                                                                    prediction, ground_truths)
                    cur_f1 = utils.metric_max_over_ground_truths(utils.f1_score, prediction, ground_truths)
                    if orig_id == qa['id']:
                        # This is an original example
                        orig_f1_score += cur_f1
                        orig_exact_match_score += cur_exact_match
                        if orig_id not in adv_f1_scores:
                            # Haven't seen adversarial example yet, so use original for adversary
                            adv_ids[orig_id] = orig_id
                            adv_f1_scores[orig_id] = cur_f1
                            adv_exact_match_scores[orig_id] = cur_exact_match
                    else:
                        # This is an adversarial example
                        if (orig_id not in adv_f1_scores or adv_ids[orig_id] == orig_id
                            or adv_f1_scores[orig_id] > cur_f1):
                            # Always override if the adversary is currently using orig_id
                            adv_ids[orig_id] = qa['id']
                            adv_f1_scores[orig_id] = cur_f1
                            adv_exact_match_scores[orig_id] = cur_exact_match
        orig_f1 = 100.0 * orig_f1_score / len(all_ids)
        orig_exact_match = 100.0 * orig_exact_match_score / len(all_ids)
        adv_exact_match = 100.0 * sum(adv_exact_match_scores.values()) / len(all_ids)
        adv_f1 = 100.0 * sum(adv_f1_scores.values()) / len(all_ids)
        logger.info("For the file %s Original Exact Match : %.4f ; Original F1 : : %.4f | "
                    % (dataset_file, orig_exact_match, orig_f1)
                    + "Adversarial Exact Match : %.4f ; Adversarial F1 : : %.4f " % (adv_exact_match, adv_f1))
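utils.metric_max_over_ground_truths follows the standard SQuAD evaluation convention: the prediction is scored against every reference answer and the best score is kept, so a model is not penalized when annotators disagree. In the official SQuAD evaluation script this helper is a one-liner; the utils version used here is presumably equivalent:

def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    # Score against each reference answer; keep the most favorable one.
    return max(metric_fn(prediction, gt) for gt in ground_truths)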
Example #25
def main(args):
    # --------------------------------------------------------------------------
    # DATA
    logger.info('-' * 100)
    logger.info('Load data files')
    train_exs = utils.load_data(args, args.train_file, skip_no_answer=True)
    logger.info('Num train examples = %d' % len(train_exs))
    dev_exs = utils.load_data(args, args.dev_file)
    logger.info('Num dev examples = %d' % len(dev_exs))

    # If we are doing official evals then we need to:
    # 1) Load the original text to retrieve spans from offsets.
    # 2) Load the (multiple) text answers for each question.
    if args.official_eval:
        dev_texts = utils.load_text(args.dev_json)
        dev_offsets = {ex['id']: ex['offsets'] for ex in dev_exs}
        dev_answers = utils.load_answers(args.dev_json)
    else:
        dev_texts = None
        dev_offsets = None
        dev_answers = None

    # --------------------------------------------------------------------------
    # MODEL
    logger.info('-' * 100)
    start_epoch = 0
    if args.checkpoint and os.path.isfile(args.model_file + '.checkpoint'):
        # Just resume training, no modifications.
        logger.info('Found a checkpoint...')
        checkpoint_file = args.model_file + '.checkpoint'
        model, start_epoch = DocReader.load_checkpoint(checkpoint_file, args)
    else:
        # Training starts fresh. But the model state is either pretrained or
        # newly (randomly) initialized.
        if args.pretrained:
            logger.info('Using pretrained model...')
            model = DocReader.load(args.pretrained, args)
            if args.expand_dictionary:
                logger.info('Expanding dictionary for new data...')
                # Add words in training + dev examples
                words = utils.load_words(args, train_exs + dev_exs)
                added_words = model.expand_dictionary(words)
                # Load pretrained embeddings for added words
                if args.embedding_file:
                    model.load_embeddings(added_words, args.embedding_file)

                logger.info('Expanding char dictionary for new data...')
                # Add words in training + dev examples
                chars = utils.load_chars(args, train_exs + dev_exs)
                added_chars = model.expand_char_dictionary(chars)
                # Load pretrained embeddings for added words
                if args.char_embedding_file:
                    model.load_char_embeddings(added_chars,
                                               args.char_embedding_file)

        else:
            logger.info('Training model from scratch...')
            model = init_from_scratch(args, train_exs, dev_exs)

        # Set up partial tuning of embeddings
        if args.tune_partial > 0:
            logger.info('-' * 100)
            logger.info('Counting %d most frequent question words' %
                        args.tune_partial)
            top_words = utils.top_question_words(args, train_exs,
                                                 model.word_dict)
            for word in top_words[:5]:
                logger.info(word)
            logger.info('...')
            for word in top_words[-6:-1]:
                logger.info(word)
            model.tune_embeddings([w[0] for w in top_words])

        # Set up optimizer
        model.init_optimizer()

    # Use the GPU?
    if args.cuda:
        model.cuda()

    # Use multiple GPUs?
    if args.parallel:
        model.parallelize()

    # --------------------------------------------------------------------------
    # DATA ITERATORS
    # Two datasets: train and dev. If we sort by length it's faster.
    logger.info('-' * 100)
    logger.info('Make data loaders')

    train_dataset = data.ReaderDataset(train_exs, model, single_answer=True)
    if args.sort_by_len:
        train_sampler = data.SortedBatchSampler(train_dataset.lengths(),
                                                args.batch_size,
                                                shuffle=True)
    else:
        train_sampler = torch.utils.data.sampler.RandomSampler(train_dataset)
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        sampler=train_sampler,
        num_workers=args.data_workers,
        collate_fn=vector.batchify,
        pin_memory=args.cuda,
    )
    dev_dataset = data.ReaderDataset(dev_exs, model, single_answer=False)
    if args.sort_by_len:
        dev_sampler = data.SortedBatchSampler(dev_dataset.lengths(),
                                              args.test_batch_size,
                                              shuffle=False)
    else:
        dev_sampler = torch.utils.data.sampler.SequentialSampler(dev_dataset)
    dev_loader = torch.utils.data.DataLoader(
        dev_dataset,
        batch_size=args.test_batch_size,
        sampler=dev_sampler,
        num_workers=args.data_workers,
        collate_fn=vector.batchify,
        pin_memory=args.cuda,
    )

    # -------------------------------------------------------------------------
    # PRINT CONFIG
    logger.info('-' * 100)
    logger.info('CONFIG:\n%s' %
                json.dumps(vars(args), indent=4, sort_keys=True))

    # --------------------------------------------------------------------------
    # TRAIN/VALID LOOP
    logger.info('-' * 100)
    logger.info('Starting training...')
    stats = {'timer': utils.Timer(), 'epoch': 0, 'best_valid': 0}
    model_prefix = os.path.join(args.model_dir, args.model_name)

    kept_models = []
    best_model_path = ''
    for epoch in range(start_epoch, args.num_epochs):
        stats['epoch'] = epoch

        # Train
        train(args, train_loader, model, stats)

        # Validate unofficial (train)
        logger.info('eval: train split unofficially...')
        validate_unofficial(args, train_loader, model, stats, mode='train')

        if args.official_eval:
            # Validate official (dev)
            logger.info('eval: dev split officially...')
            result = validate_official(args, dev_loader, model, stats,
                                       dev_offsets, dev_texts, dev_answers)
        else:
            # Validate unofficial (dev)
            logger.info('eval: dev split unofficially...')
            result = validate_unofficial(args,
                                         dev_loader,
                                         model,
                                         stats,
                                         mode='dev')

        em = result['exact_match']
        f1 = result['f1']
        suffix = 'em_{:4.2f}-f1_{:4.2f}.mdl'.format(em, f1)
        # Save best valid
        model_file = '{}-epoch_{}-{}'.format(model_prefix, epoch, suffix)
        if args.valid_metric:
            if result[args.valid_metric] > stats['best_valid']:
                for f in glob.glob('{}-best*'.format(model_prefix)):
                    os.remove(f)
                logger.info('eval: dev best %s = %.2f (epoch %d, %d updates)' %
                            (args.valid_metric, result[args.valid_metric],
                             stats['epoch'], model.updates))
                model_file = '{}-best-epoch_{}-{}'.format(
                    model_prefix, epoch, suffix)
                best_model_path = model_file
                model.save(model_file)
                stats['best_valid'] = result[args.valid_metric]
                for f in kept_models:
                    os.remove(f)
                kept_models.clear()
            else:
                model.save(model_file)
                kept_models.append(model_file)
                if len(kept_models) >= args.early_stop:
                    logger.info(
                        'Finished training due to %s not improved for %d epochs, best model is at: %s'
                        %
                        (args.valid_metric, args.early_stop, best_model_path))
                    return
        else:
            # just save model every epoch since no validation metric is given
            model.save(model_file)
Example #26
from onehot import OneHotEncoder
from tensorflow.keras.models import load_model

from utils import load_text, tokenize, get_padded_token, load_s2s_model, decode_sequences, prepare_word_tokens, get_batch_generator
from model import s2s_model

HIDDEN_SIZE = 512
ERR_RATE = 0.2
BATCH_SIZE = 256
DATA_DIR = './data'

if __name__ == '__main__':
    encoder, decoder = load_s2s_model(
        'test-no_reverse-hs-512_err-0.8_bs-256_e-100_drop-0.2.h5', HIDDEN_SIZE)

    text = load_text(DATA_DIR)
    test_text = load_text(DATA_DIR, 'test')
    word_set = list(filter(None, set(tokenize(text))))
    test_word_set = list(filter(None, set(tokenize(test_text))))
    train_max_word_len = max([len(token) for token in word_set]) + 2
    test_max_word_len = max([len(token) for token in test_word_set]) + 2
    train_enc_tokens, train_dec_tokens, _ = prepare_word_tokens(
        word_set, train_max_word_len, ERR_RATE)
    test_enc_tokens, test_dec_tokens, test_target_tokens = prepare_word_tokens(
        test_word_set, test_max_word_len, ERR_RATE)

    enc_charset = set(' '.join(train_enc_tokens))
    dec_charset = set(' '.join(train_dec_tokens))

    enc_oh = OneHotEncoder(enc_charset)
    dec_oh = OneHotEncoder(dec_charset)
Example #27
ap.add_argument("--batch_size", type=int, default=64,
                help="size of mini-batch")
ap.add_argument("--seq_length", type=int, default=100,
                help="numbers of time-steps in sequence")
ap.add_argument("--learning_rate", type=float, default=0.001,
                help="learning rate")
ap.add_argument("--embed_size", type=int, default=300,
                help="number of dimensions in word embeddings")
ap.add_argument("--lstm_size", type=int, default=512,
                help="number of units in lstm")
ap.add_argument("--lstm_layers", type=int, default=1,
                help="number of layers in lstm network")
ap.add_argument("--temperature", type=float, default=1.0,
                help="higher value means more random words will be picked and lower value means less randomness")
ap.add_argument("--dropout", type=float, default=0.3,
                help="dropout rate")
ap.add_argument("--resume", action="store_true",
                help="resume training from last checkpoint")
ap.add_argument("--word2vec", action="store_true",
                help="train word2vec embeddings on data and use for training instead of doing it from scratch")

args = ap.parse_args()

model = TranscriptNet(args)

if args.mode == "train":
    text = utils.load_text(args.data_path)
    model.train(text)
else:
    model.generate()
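The snippet begins after the parser is created; for it to run, a header along these lines must precede it. The --mode and --data_path arguments are inferred from the args.mode and args.data_path references above, and the import path for TranscriptNet is hypothetical, not copied from the original file:

import argparse
import utils
from transcript_net import TranscriptNet  # hypothetical module path

ap = argparse.ArgumentParser(description="train or sample a transcript language model")
ap.add_argument("--mode", type=str, default="train", choices=["train", "generate"],
                help="whether to train the model or generate text")
ap.add_argument("--data_path", type=str, default="data/transcripts.txt",
                help="path to the training text file")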