def check_haiku(sentence):
    """Check whether a tokenized sentence fits the (simplified) haiku form.

    Tokens are consumed in order and assigned to the first, second and third
    line until each line has reached its syllable quota (5, 7 and 5).

    Tokens for which ``count_syllables`` returns 0 do not count towards any
    quota. They are attached to the line currently being filled, or to the
    previous line when the next line has not yet received a syllable-bearing
    token, so that they stay next to the tokens they followed in the input.

    Args:
        sentence: Iterable of tokens; each one is passed to
            ``count_syllables``.

    Returns:
        A ``(first_line, second_line, third_line)`` tuple holding the results
        of ``detokenize`` for each line when the syllable counts are exactly
        5, 7 and 5; ``None`` otherwise.
    """
    first, second, third = 0, 0, 0
    first_line = []
    second_line = []
    third_line = []

    for token in sentence:
        syllables = count_syllables(token)
        if first < 5:
            first += syllables
            first_line.append(token)
        elif second < 7:
            if syllables == 0 and second == 0:
                # Line two has not started yet: the token trails line one.
                first_line.append(token)
            else:
                second += syllables
                second_line.append(token)
        elif syllables == 0 and third == 0:
            # Line three has not started yet: the token trails line two.
            second_line.append(token)
        else:
            third += syllables
            third_line.append(token)

    # A token may overshoot a quota (e.g. 4 + 2 syllables), so the totals
    # have to match exactly rather than merely reach the target.
    if first != 5 or second != 7 or third != 5:
        return None

    return (
        detokenize(first_line),
        detokenize(second_line),
        detokenize(third_line),
    )
def decoding(loaded_model, test_dataset, arg_parser):
    """Decode every example of ``test_dataset`` and collect the results.

    The decoding routine is ``loaded_model.beam_search`` when
    ``arg_parser.decoding`` equals ``'beam_search'`` and
    ``loaded_model.decode_greedy`` otherwise. The model is switched to eval
    mode and decoding runs under ``torch.no_grad()``.

    Args:
        loaded_model: Model exposing ``eval``, ``beam_search`` and
            ``decode_greedy``.
        test_dataset: Dataset handed to ``data_iterator`` (batch size 1,
            no shuffling).
        arg_parser: Object providing ``beam_size``, ``max_decode_len`` and
            ``decoding``.

    Returns:
        A 4-tuple ``(model_outputs, gold_queries, model_outputs_kb,
        gold_queries_kb)``. The first two lists hold the hypotheses and gold
        string as returned by ``format_lf``; the ``*_kb`` lists hold the
        detokenized hypotheses and gold string from before that call.
    """
    beam_size = arg_parser.beam_size
    max_len = arg_parser.max_decode_len
    if arg_parser.decoding == 'beam_search':
        decode_fn = loaded_model.beam_search
    else:
        decode_fn = loaded_model.decode_greedy

    loaded_model.eval()

    formatted_hyps, formatted_golds = [], []
    unformatted_hyps, unformatted_golds = [], []

    with torch.no_grad():
        batches = data_iterator(test_dataset, batch_size=1, shuffle=False)
        # NOTE(review): the progress-bar total is hard-coded to 280 examples.
        for src_batch, gold_batch in tqdm(batches, total=280):
            hypotheses = decode_fn(src_sent=src_batch,
                                   max_len=max_len,
                                   beam_size=beam_size)
            hyp_strings = [detokenize(hyp) for hyp in hypotheses]
            gold_string = gold_batch[0]

            # Keep the strings as they are before format_lf is applied.
            unformatted_hyps.append(hyp_strings)
            unformatted_golds.append(gold_string)

            hyp_strings, gold_string = format_lf(hyp_strings, gold_string)
            formatted_hyps.append(hyp_strings)
            formatted_golds.append(gold_string)

    return formatted_hyps, formatted_golds, unformatted_hyps, unformatted_golds
def main():
    """Report how often ``detokenize`` reproduces the original string.

    Iterates over the ``geoQueryData`` train split (files ending in
    ``_recomb.tsv``) one example at a time, tokenizes the second element of
    each batch with the ``bert-base-uncased`` vocabulary, detokenizes the
    result and compares it with the original string. Every mismatch is
    written to ``tokenization_tests.txt``; the overall match ratio is printed
    at the end.
    """
    vocab = Vocab('bert-base-uncased')
    test_inputs = get_dataset_finish_by('geoQueryData', 'train', '_recomb.tsv')

    num_matches = 0
    total_examples = 0
    # Opening in 'w' mode already empties the file, so no explicit truncate.
    with open('tokenization_tests.txt', 'w') as test_file:
        for batch_examples in data_iterator(test_inputs, batch_size=1, shuffle=False):
            originals = batch_examples[1]
            tokens_list = vocab.to_input_tokens(originals)[0]
            detokenized = detokenize(tokens_list)
            if detokenized == originals[0]:
                num_matches += 1
            else:
                test_file.write('wrong example:\n')
                test_file.write(originals[0] + '\n')
                test_file.write(detokenized + '\n')
                test_file.write('\n' + '-' * 15 + '\n')
            total_examples += 1

    # Guard against an empty dataset, which would otherwise divide by zero.
    accuracy = num_matches / total_examples if total_examples else 0.0
    print(
        f"we obtained the following result: {accuracy:.2f} accuracy for detokenization method on given dataset"
    )
def predictions2json(dataset, intents_pred, slots_pred, outfile):
    """Write intent and slot predictions to ``outfile`` as JSON.

    The output is a JSON object keyed by the example index (as a string).
    Each value has an ``'intent'`` entry (the predicted intent id converted
    with ``dataset.intent_converter.id2T``) and a ``'slots'`` entry mapping
    every slot label other than ``'-'`` to the detokenized text of the
    tokens carrying that label.

    Args:
        dataset: Object providing ``stcs_literals`` (one token sequence per
            example), ``intent_converter`` and ``slots_converter`` (each with
            an ``id2T`` method).
        intents_pred: Iterable with one predicted intent id per example.
        slots_pred: Iterable with one sequence of predicted slot ids per
            example, aligned with the tokens of that example.
        outfile: Path of the JSON file to write.
    """
    root = {}
    examples = zip(dataset.stcs_literals, intents_pred, slots_pred)
    for idx, (stc, intent, slot) in enumerate(examples):
        entry = {'intent': dataset.intent_converter.id2T(intent)}
        # entry ['text'] = stc

        previd = -1
        slot_tokens = {}
        for stc_idx, slot_id in enumerate(slot):
            key = dataset.slots_converter.id2T(slot_id)
            if key == '-':
                continue
            if previd == slot_id:
                # Same slot id as the last labelled token: extend its text.
                slot_tokens[key].append(stc[stc_idx])
            else:
                # A different slot id starts (or restarts) the text for key.
                previd = slot_id
                slot_tokens[key] = [stc[stc_idx]]

        # Joining and re-splitting on whitespace keeps the tokenization that
        # detokenize receives identical to a space-separated string.
        entry['slots'] = {
            key: detokenize(' '.join(tokens).split())
            for key, tokens in slot_tokens.items()
        }
        root[str(idx)] = entry

    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # must not depend on the platform default. The file is opened only after
    # the payload is complete, so a failure above cannot leave a truncated
    # file behind.
    with open(outfile, 'w', encoding='utf-8') as f:
        json.dump(root, f, indent=3, ensure_ascii=False)
# Emit DEBUG-level (and above) records through the root logger.
root.setLevel(logging.DEBUG)

# Build the vocabulary from the text file; the number of output classes is
# the size of the forward dictionary.
dictionary, rev_dict = utils.get_dictionary(args.text)
num_classes = len(dictionary)

# `iterator` supplies one training batch per `.next()` call below.
iterator = utils.tokenize(args.text,
                          dictionary,
                          batch_size=args.batch_size,
                          seq_len=args.seq_len)

sess = tf.Session()
model = SeqGAN(sess,
               num_classes,
               logdir=args.logdir,
               learn_phase=args.learn_phase,
               only_cpu=args.only_cpu)
model.build()
# NOTE(review): presumably restores previously saved weights and tolerates
# missing ones -- confirm against SeqGAN.load.
model.load(ignore_missing=True)

# Epochs and steps are numbered from 1 so the log lines read naturally.
for epoch in xrange(1, args.num_epochs + 1):
    for step in xrange(1, args.num_steps + 1):
        logging.info('epoch %d, step %d', epoch, step)
        model.train_batch(iterator.next())

    # Generates a sample from the model (requested length: 1000) and prints
    # it after mapping it back to text with the reverse dictionary.
    g = model.generate(1000)
    print(utils.detokenize(g, rev_dict))

    # Saves the model to the logdir.
    model.save()
if __name__ == '__main__':
    # Command-line interface: one positional sample length plus options for
    # the dictionary file, the model directory and CPU-only weight placement.
    cli = argparse.ArgumentParser(
        description='Sample from a trained SeqGAN model.')
    cli.add_argument('sample_len', metavar='N', type=int,
                     help='length of sample to generate')

    # NOTE(review): '--only_cpu' combines default=True with
    # action='store_true', so its value is True whether or not the flag is
    # passed.
    option_specs = [
        (('-t', '--dictionary'),
         dict(default='dictionary.pkl', type=str,
              help='path to dictionary file')),
        (('-d', '--logdir'),
         dict(default='model/', type=str,
              help='directory of the trained model')),
        (('-c', '--only_cpu'),
         dict(default=True, action='store_true',
              help='if set, only build weights on cpu')),
    ]
    for flags, options in option_specs:
        cli.add_argument(*flags, **options)

    args = cli.parse_args()

    # The dictionary is produced by the training script; fail early if absent.
    dictionary_missing = not os.path.exists(args.dictionary)
    if dictionary_missing:
        raise ValueError('No dictionary file found: "%s". To build it, '
                         'run train.py' % args.dictionary)

    # Only the reverse dictionary is needed here: it gives the number of
    # classes and is used to turn the generated sample back into text.
    rev_dict = utils.get_dictionary(None, dfile=args.dictionary)[1]

    session = tf.Session()
    model = SeqGAN(session,
                   len(rev_dict),
                   logdir=args.logdir,
                   only_cpu=args.only_cpu)
    model.build()
    model.load(ignore_missing=True)

    generated = model.generate(args.sample_len)
    print('Generated text:', utils.detokenize(generated, rev_dict))
args = parser.parse_args() # Turns on logging. import logging root = logging.getLogger() root.setLevel(logging.DEBUG) dictionary, rev_dict = utils.get_dictionary(args.text) num_classes = len(dictionary) iterator = utils.tokenize(args.text, dictionary, batch_size=args.batch_size, seq_len=args.seq_len) sess = tf.Session() model = SeqGAN(sess, num_classes, only_cpu=args.only_cpu) model.build() for epoch in xrange(args.num_epochs): for step in xrange(args.num_steps): logging.info('epoch %d, step %d', epoch, step) model.train_batch(iterator.next()) # Generates a sample from the model. g = model.generate(100) logging.info('Epoch %d: "%s"', epoch, utils.detokenize(g, rev_dict)) # Saves the model to the logdir. model.save()
# Change to image dir because textogif doesn't seem to work otherwise... oldcwd = os.getcwd() with open(sys.argv[1]) as fim2latex: images = {} for line in fim2latex: if len(line.strip()) > 0: tabular_id, image, mode = line.strip().split() tabular_id = int(tabular_id) images[tabular_id] = image renders = [] with open(sys.argv[3]) as ftabulars: for tabular_id, line in enumerate(ftabulars): if len(line.strip()) > 0: tokens = line.strip().split(' ') line_out = detokenize(tokens) renders.append((images[tabular_id], line_out)) # Check we are not in image dir yet (avoid exceptions) if not tabular_images_validate in os.getcwd(): os.chdir(tabular_images_validate) pool = Pool(THREADS) pool.map(tabular_to_image, renders) os.chdir(oldcwd) pool = Pool(THREADS) total_match = 0 num_total = 0 for match in pool.imap( calc_match,