def check_haiku(sentence):
    """
    Checks whether is a sentence fits the (simplified) criteria for a haiku.
    """
    first, second, third = 0, 0, 0
    first_line = []
    second_line = []
    third_line = []
    for token in sentence:
        syllables = count_syllables(token)
        if first < 5:
            first += syllables
            first_line.append(token)
        elif second < 7:
            if syllables == 0:
                # Zero-syllable tokens (e.g. punctuation) stay on the line
                # currently being built rather than adding to its count.
                second_line.append(token)
            else:
                second += syllables
                second_line.append(token)
        else:
            if syllables == 0:
                third_line.append(token)
            else:
                third += syllables
                third_line.append(token)
    if any([first != 5, second != 7, third != 5]):
        return None
    first_line = detokenize(first_line)
    second_line = detokenize(second_line)
    third_line = detokenize(third_line)
    return first_line, second_line, third_line
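
# A minimal usage sketch (not part of the original example). It assumes the
# count_syllables and detokenize helpers used by check_haiku above are
# available in this module; the sample tokens are purely illustrative.
def _demo_check_haiku():
    tokens = ['An', 'old', 'silent', 'pond', 'A', 'frog', 'jumps', 'into',
              'the', 'pond', 'Splash', 'Silence', 'again']
    result = check_haiku(tokens)
    if result is None:
        print('Not a 5-7-5 haiku under the simplified syllable count.')
    else:
        for line in result:
            print(line)
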
def decoding(loaded_model, test_dataset, arg_parser):
    beam_size = arg_parser.beam_size
    max_len = arg_parser.max_decode_len
    decoding_method = (loaded_model.beam_search
                       if arg_parser.decoding == 'beam_search'
                       else loaded_model.decode_greedy)
    loaded_model.eval()
    model_outputs = []        # hypotheses after format_lf
    model_outputs_kb = []     # detokenized hypotheses before format_lf
    gold_queries_kb = []      # gold targets before format_lf
    gold_queries = []         # gold targets after format_lf
    with torch.no_grad():
        for src_sent_batch, gold_target in tqdm(data_iterator(test_dataset,
                                                              batch_size=1,
                                                              shuffle=False),
                                                total=280):
            example_hyps = decoding_method(src_sent=src_sent_batch,
                                           max_len=max_len,
                                           beam_size=beam_size)
            strings_model = [
                detokenize(example_hyp) for example_hyp in example_hyps
            ]
            string_gold = gold_target[0]
            model_outputs_kb.append(strings_model)
            gold_queries_kb.append(string_gold)
            strings_model, string_gold = format_lf(strings_model, string_gold)
            model_outputs.append(strings_model)
            gold_queries.append(string_gold)
    return model_outputs, gold_queries, model_outputs_kb, gold_queries_kb
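
# Hedged usage sketch (an assumption, not from the original example): the
# formatted outputs returned above are typically scored against the gold
# queries, e.g. a top-1 exact match, assuming format_lf keeps the hypothesis
# list ordered best-first:
#
#     model_outputs, gold_queries, _, _ = decoding(model, test_data, args)
#     exact = sum(hyps[0] == gold
#                 for hyps, gold in zip(model_outputs, gold_queries))
#     print(f'exact match: {exact / len(gold_queries):.3f}')
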
Example #3
def main():
    vocab = Vocab('bert-base-uncased')
    test_inputs = get_dataset_finish_by('geoQueryData', 'train', '_recomb.tsv')
    with open('tokenization_tests.txt', 'w') as test_file:
        num_matches = 0
        total_examples = 0
        for batch_examples in data_iterator(test_inputs, batch_size=1,
                                            shuffle=False):
            tokens_list = vocab.to_input_tokens(batch_examples[1])[0]
            detokenized = detokenize(tokens_list)
            if detokenized == batch_examples[1][0]:
                num_matches += 1
            else:
                test_file.write('wrong example:\n')
                test_file.write(batch_examples[1][0] + '\n')
                test_file.write(detokenized + '\n')
                test_file.write('\n' + '-' * 15 + '\n')
            total_examples += 1
        print(f"Detokenization accuracy on the given dataset: "
              f"{num_matches / total_examples:.2f}")
    return
Example #4
def predictions2json(dataset, intents_pred, slots_pred, outfile):
    with open(outfile, 'w') as f:
        root = {}
        idx = 0
        for stc, intent, slot in zip(dataset.stcs_literals, intents_pred,
                                     slots_pred):
            entry = {}
            entry['intent'] = dataset.intent_converter.id2T(intent)
            # entry['text'] = stc

            # Group consecutive tokens that share the same slot id.
            previd = -1
            slot_entry = {}
            for stc_idx, i in enumerate(slot):
                key = dataset.slots_converter.id2T(i)
                if key != '-':
                    if previd == i:
                        slot_entry[key] += stc[stc_idx] + ' '
                    else:
                        previd = i
                        slot_entry[key] = stc[stc_idx] + ' '

            # Detokenize each collected slot value.
            for key in slot_entry:
                slot_entry[key] = detokenize(slot_entry[key].rstrip().split())

            entry['slots'] = slot_entry
            root[str(idx)] = entry
            idx += 1
        json.dump(root, f, indent=3, ensure_ascii=False)
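
# Hedged usage sketch (assumption, not from the original example): any object
# exposing stcs_literals, intent_converter.id2T and slots_converter.id2T can
# serve as `dataset`; the tiny stand-in below only illustrates the expected
# JSON shape, assuming detokenize simply rejoins the tokens.
#
#     class _Converter:
#         def __init__(self, labels):
#             self.labels = labels
#         def id2T(self, i):
#             return self.labels[i]
#
#     class _Dataset:
#         stcs_literals = [['play', 'some', 'jazz']]
#         intent_converter = _Converter(['PlayMusic'])
#         slots_converter = _Converter(['-', 'genre'])
#
#     predictions2json(_Dataset(), intents_pred=[0], slots_pred=[[0, 0, 1]],
#                      outfile='predictions.json')
#     # -> {"0": {"intent": "PlayMusic", "slots": {"genre": "jazz"}}}
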
Example #5
    # Turns on logging.
    import logging
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)

    dictionary, rev_dict = utils.get_dictionary(args.text)
    num_classes = len(dictionary)

    iterator = utils.tokenize(args.text,
                              dictionary,
                              batch_size=args.batch_size,
                              seq_len=args.seq_len)

    sess = tf.Session()
    model = SeqGAN(sess,
                   num_classes,
                   logdir=args.logdir,
                   learn_phase=args.learn_phase,
                   only_cpu=args.only_cpu)
    model.build()
    model.load(ignore_missing=True)

    for epoch in xrange(1, args.num_epochs + 1):
        for step in xrange(1, args.num_steps + 1):
            logging.info('epoch %d, step %d', epoch, step)
            model.train_batch(iterator.next())

        # Generates a sample from the model.
        g = model.generate(1000)
        print(utils.detokenize(g, rev_dict))

        # Saves the model to the logdir.
        model.save()
Example #6
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Sample from a trained SeqGAN model.')
    parser.add_argument('sample_len', metavar='N', type=int,
                        help='length of sample to generate')
    parser.add_argument('-t', '--dictionary', default='dictionary.pkl',
                        type=str, help='path to dictionary file')
    parser.add_argument('-d', '--logdir', default='model/', type=str,
                        help='directory of the trained model')
    parser.add_argument('-c', '--only_cpu', default=False, action='store_true',
                        help='if set, only build weights on cpu')
    args = parser.parse_args()

    if not os.path.exists(args.dictionary):
        raise ValueError('No dictionary file found: "%s". To build it, '
                         'run train.py' % args.dictionary)

    _, rev_dict = utils.get_dictionary(None, dfile=args.dictionary)
    num_classes = len(rev_dict)

    sess = tf.Session()
    model = SeqGAN(sess,
                   num_classes,
                   logdir=args.logdir,
                   only_cpu=args.only_cpu)
    model.build()
    model.load(ignore_missing=True)

    g = model.generate(args.sample_len)
    print('Generated text:', utils.detokenize(g, rev_dict))
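
# Usage sketch (assumption: the script above is saved as sample.py, per its
# argparse description): once train.py has produced dictionary.pkl and a
# model/ checkpoint, a 500-token sample could be drawn with
#
#     python sample.py 500 --dictionary dictionary.pkl --logdir model/
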
Example #7
    args = parser.parse_args()

    # Turns on logging.
    import logging
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)

    dictionary, rev_dict = utils.get_dictionary(args.text)
    num_classes = len(dictionary)

    iterator = utils.tokenize(args.text,
                              dictionary,
                              batch_size=args.batch_size,
                              seq_len=args.seq_len)

    sess = tf.Session()
    model = SeqGAN(sess, num_classes, only_cpu=args.only_cpu)
    model.build()

    for epoch in xrange(args.num_epochs):
        for step in xrange(args.num_steps):
            logging.info('epoch %d, step %d', epoch, step)
            model.train_batch(iterator.next())

        # Generates a sample from the model.
        g = model.generate(100)
        logging.info('Epoch %d: "%s"', epoch, utils.detokenize(g, rev_dict))

        # Saves the model to the logdir.
        model.save()
Example #8
    # Change to image dir because textogif doesn't seem to work otherwise...
    oldcwd = os.getcwd()

    with open(sys.argv[1]) as fim2latex:
        images = {}
        for line in fim2latex:
            if len(line.strip()) > 0:
                tabular_id, image, mode = line.strip().split()
                tabular_id = int(tabular_id)
                images[tabular_id] = image
    renders = []
    with open(sys.argv[3]) as ftabulars:
        for tabular_id, line in enumerate(ftabulars):
            if len(line.strip()) > 0:
                tokens = line.strip().split(' ')
                line_out = detokenize(tokens)
                renders.append((images[tabular_id], line_out))

    # Check we are not in image dir yet (avoid exceptions)
    if tabular_images_validate not in os.getcwd():
        os.chdir(tabular_images_validate)

    pool = Pool(THREADS)
    pool.map(tabular_to_image, renders)
    os.chdir(oldcwd)

    pool = Pool(THREADS)
    total_match = 0
    num_total = 0
    for match in pool.imap(
            calc_match,