Example #1
File: a.py Project: springle/aoc
def read_group_answers(path: str) -> list:
    """Collect each group's set of distinct answers."""
    group_answers = []
    for chunk in read_chunks(path):
        answers = set()
        for line in chunk:
            for letter in line:
                if re.match(ANSWER_PATTERN, letter):
                    answers.add(letter)

        group_answers.append(answers)

    return group_answers
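Examples #1 and #4 both lean on two project helpers that this page does not show. A minimal sketch of what they plausibly look like for this Advent of Code solution, assuming `ANSWER_PATTERN` matches a single lowercase answer letter and `read_chunks` yields blank-line-separated groups of lines; both bodies are guesses from usage, not the project's actual code:

import re
from collections import defaultdict  # defaultdict is used in Example #4 below

# Assumption: an answer is a single lowercase letter (not shown on this page).
ANSWER_PATTERN = re.compile(r"[a-z]")

def read_chunks(path: str):
    """Yield lists of stripped lines, split on blank lines (assumed grouping)."""
    with open(path, encoding="utf-8") as f:
        chunk = []
        for line in f:
            line = line.strip()
            if line:
                chunk.append(line)
            elif chunk:
                yield chunk
                chunk = []
        if chunk:
            yield chunk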
Example #2
def main_test():
    import util
    import vocabulary
    import parse
    label_list = util.load_label_list('data/labels.txt')
    label_vocab = vocabulary.Vocabulary()

    label_vocab.index(())

    for item in label_list:
        label_vocab.index((item,))

    for item in label_list:
        label_vocab.index((item + "'",))

    label_vocab.index((parse.EMPTY,))

    label_vocab.freeze()

    latent = latent_tree_builder(label_vocab, 'city')

    insts = util.read_chunks('data/trial.txt')

    # for k in range(3):
    #     trees = latent.build_latent_trees(insts)
    #     for tree in trees:
    #         print(tree.linearize())
    #     print()

    trees = latent.build_dynamicRBT_trees(insts)
    for x, tree, chunks, latentscope in trees:
        print(tree.linearize())
        tree = tree.convert()  # convert to the treebank representation
        print()
    print()

#main_test()
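The `Vocabulary` calls above (and in Example #5 below) touch only a small surface: `index`, `freeze`, `size`, and `values`. A minimal sketch of a class with that interface, assuming ids are assigned in insertion order; the real `vocabulary.Vocabulary` may differ:

# Sketch of the Vocabulary interface used in these examples; body is assumed.
class Vocabulary:
    def __init__(self):
        self.frozen = False
        self.values = []
        self._index = {}

    def index(self, value):
        """Return the id for value, adding it if the vocabulary is not frozen."""
        if value not in self._index:
            if self.frozen:
                raise ValueError("unknown value in frozen vocabulary: %r" % (value,))
            self._index[value] = len(self.values)
            self.values.append(value)
        return self._index[value]

    def freeze(self):
        self.frozen = True

    @property
    def size(self):
        return len(self.values)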
Example #3
def run_test2(args):

    model = dy.ParameterCollection()
    # [parser] = dy.load(args.model_path_base, model)
    [parser] = dy.load(
        "models/chartdyRBTC-model_addr_dytree_giga_0.4_200_1_chartdyRBTC_dytree_1_houseno_0_0_dev=0.90",
        model)

    test_chunk_insts = util.read_chunks(args.test_path, args.normal)

    # ftreelog = open(args.expname + '.test.predtree.txt', 'w', encoding='utf-8')
    ftreelog = open('aaa' + '.test.predtree.txt', 'w', encoding='utf-8')
    test_start_time = time.time()
    test_predicted = []
    test_gold = []
    for inst in test_chunk_insts:
        chunks = util.inst2chunks(inst)
        test_gold.append(chunks)

    for x, chunks in test_chunk_insts:
        dy.renew_cg()
        sentence = [(parse.XX, ch) for ch in x]
        predicted, _ = parser.parse(sentence)
        pred_tree = predicted.convert()
        ftreelog.write(pred_tree.linearize() + '\n')
        test_predicted.append(pred_tree.to_chunks())

    ftreelog.close()

    # test_fscore = evaluate.eval_chunks2(args.evalb_dir, test_gold, test_predicted, output_filename=args.expname + '.test.txt')  # evalb
    test_fscore = evaluate.eval_chunks2(args.evalb_dir,
                                        test_gold,
                                        test_predicted,
                                        output_filename='aaaabbbb' +
                                        '.test.txt')  # evalb

    print("test-fscore {} "
          "test-elapsed {} ".format(
              test_fscore,
              format_elapsed(test_start_time),
          ))
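Both run_test2 and run_train below unpack each instance as `(x, chunks)` and also hand whole instances to `util.inst2chunks`, and the training loop computes chunk length as `chunk[2] - chunk[1]`. A sketch of the instance shape those usages imply; the exact field layout and labels are assumptions:

# Shape implied by the loops above; labels and field order are assumptions.
inst = (
    ["XiDan", "Street", "100"],           # x: the chunk/token sequence
    [("addr", 0, 2), ("houseno", 2, 3)],  # chunks: (label, start, end) spans
)
x, chunks = inst
sentence = [("XX", ch) for ch in x]  # mirrors [(parse.XX, ch) for ch in x]
for chunk in chunks:
    length = chunk[2] - chunk[1]     # span length, as in the training loop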
Example #4
File: b.py Project: springle/aoc
def count_unanimous_answers(path: str) -> list:
    """
    In O(num_lines) time,
    read each group's answers,
    and count how many are unanimous.
    """
    group_answers = []
    for chunk in read_chunks(path):
        answers, responses = defaultdict(int), list(chunk)
        for response in responses:
            for letter in response:
                if re.match(ANSWER_PATTERN, letter):
                    answers[letter] += 1

        everyone_said_yes = 0
        for answer, count in answers.items():
            if count == len(responses):
                everyone_said_yes += 1

        group_answers.append(everyone_said_yes)

    return group_answers
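For comparison, the same per-group unanimity count can be written with set intersection; a sketch that keeps the `ANSWER_PATTERN` filter from the original (the function name is mine, not the project's):

def count_unanimous_answers_via_sets(path: str) -> list:
    """Per-group count of answers given by every member (intersection sketch)."""
    counts = []
    for chunk in read_chunks(path):
        # One set of valid answer letters per person in the group.
        responses = [{c for c in line if re.match(ANSWER_PATTERN, c)}
                     for line in chunk]
        # Letters present in every response are the unanimous ones.
        common = set.intersection(*responses) if responses else set()
        counts.append(len(common))
    return counts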
Example #5
def run_train(args):

    args.numpy_seed = seed  # `seed` is assumed to be a module-level global
    if args.numpy_seed is not None:
        print("Setting numpy random seed to {}...".format(args.numpy_seed))
        np.random.seed(args.numpy_seed)

    if args.trial == 1:
        args.train_path = 'data/trial.txt'
        args.dev_path = 'data/trial.txt'
        args.test_path = 'data/trial.txt'

    # args.train_path = args.train_path.replace('[*]', args.treetype)
    # args.dev_path = args.dev_path.replace('[*]', args.treetype)
    # args.test_path = args.test_path.replace('[*]', args.treetype)

    print("Loading training trees from {}...".format(args.train_path))
    train_chunk_insts = util.read_chunks(args.train_path, args.normal)
    print("Loaded {:,} training examples.".format(len(train_chunk_insts)))

    print("Loading development trees from {}...".format(args.dev_path))
    dev_chunk_insts = util.read_chunks(args.dev_path, args.normal)
    print("Loaded {:,} development examples.".format(len(dev_chunk_insts)))

    print("Loading test trees from {}...".format(args.test_path))
    test_chunk_insts = util.read_chunks(args.test_path, args.normal)
    print("Loaded {:,} test examples.".format(len(test_chunk_insts)))

    # print("Processing trees for training...")
    # train_parse = [tree.convert() for tree in train_treebank]

    print("Constructing vocabularies...")

    tag_vocab = vocabulary.Vocabulary()
    tag_vocab.index(parse.START)
    tag_vocab.index(parse.STOP)
    tag_vocab.index(parse.XX)

    word_vocab = vocabulary.Vocabulary()
    word_vocab.index(parse.START)
    word_vocab.index(parse.STOP)
    word_vocab.index(parse.UNK)
    word_vocab.index(parse.NUM)

    for x, chunks in train_chunk_insts + dev_chunk_insts + test_chunk_insts:
        for ch in x:
            word_vocab.index(ch)

    label_vocab = vocabulary.Vocabulary()
    label_vocab.index(())

    label_list = util.load_label_list(args.labellist_path)  # e.g. 'data/labels.txt'
    for item in label_list:
        label_vocab.index((item, ))

    if args.nontlabelstyle != 1:
        for item in label_list:
            label_vocab.index((item + "'", ))

    if args.nontlabelstyle == 1:
        label_vocab.index((parse.EMPTY, ))

    tag_vocab.freeze()
    word_vocab.freeze()
    label_vocab.freeze()

    latent_tree = latent.latent_tree_builder(label_vocab, args.RBTlabel,
                                             args.nontlabelstyle)

    def print_vocabulary(name, vocab):
        special = {parse.START, parse.STOP, parse.UNK}
        print("{} ({:,}): {}".format(
            name, vocab.size,
            sorted(value for value in vocab.values if value in special) +
            sorted(value for value in vocab.values if value not in special)))

    if args.print_vocabs:
        print_vocabulary("Tag", tag_vocab)
        print_vocabulary("Word", word_vocab)
        print_vocabulary("Label", label_vocab)

    print("Initializing model...")

    pretrain = {'giga': 'data/giga.vec100', 'none': 'none'}
    pretrainemb = util.load_pretrain(pretrain[args.pretrainemb],
                                     args.word_embedding_dim, word_vocab)

    model = dy.ParameterCollection()
    if args.parser_type == "chartdyRBTC":
        parser = parse.ChartDynamicRBTConstraintParser(
            model,
            tag_vocab,
            word_vocab,
            label_vocab,
            args.tag_embedding_dim,
            args.word_embedding_dim,
            args.lstm_layers,
            args.lstm_dim,
            args.label_hidden_dim,
            args.dropout,
            (args.pretrainemb, pretrainemb),
            args.chunkencoding,
            args.trainc == 1,
            True,
            (args.zerocostchunk == 1),
        )

    else:
        print('Model is not valid!')
        exit()

    if args.loadmodel != 'none':
        tmp = dy.load(args.loadmodel, model)
        parser = tmp[0]
        print('Model is loaded from ', args.loadmodel)

    trainer = dy.AdamTrainer(model)

    total_processed = 0
    current_processed = 0
    check_every = len(train_chunk_insts) / args.checks_per_epoch
    best_dev_fscore = -np.inf
    best_dev_model_path = None

    start_time = time.time()

    def check_dev():
        nonlocal best_dev_fscore
        nonlocal best_dev_model_path

        dev_start_time = time.time()

        dev_predicted = []
        #dev_gold = latent_tree.build_latent_trees(dev_chunk_insts)
        dev_gold = []
        for inst in dev_chunk_insts:
            chunks = util.inst2chunks(inst)
            dev_gold.append(chunks)

        for x, chunks in dev_chunk_insts:
            dy.renew_cg()
            #sentence = [(leaf.tag, leaf.word) for leaf in tree.leaves()]
            sentence = [(parse.XX, ch) for ch in x]
            predicted, _ = parser.parse(sentence)
            dev_predicted.append(predicted.convert().to_chunks())

        #dev_fscore = evaluate.evalb(args.evalb_dir, dev_gold, dev_predicted, args.expname + '.dev.') #evalb
        dev_fscore = evaluate.eval_chunks2(args.evalb_dir,
                                           dev_gold,
                                           dev_predicted,
                                           output_filename=args.expname +
                                           '.dev.txt')  # evalb

        print("dev-fscore {} "
              "dev-elapsed {} "
              "total-elapsed {}".format(
                  dev_fscore,
                  format_elapsed(dev_start_time),
                  format_elapsed(start_time),
              ))

        if dev_fscore.fscore > best_dev_fscore:
            if best_dev_model_path is not None:
                for ext in [".data", ".meta"]:
                    path = best_dev_model_path + ext
                    if os.path.exists(path):
                        print(
                            "Removing previous model file {}...".format(path))
                        os.remove(path)

            best_dev_fscore = dev_fscore.fscore
            best_dev_model_path = "{}_dev={:.2f}".format(
                args.model_path_base + "_" + args.expname, dev_fscore.fscore)
            print("Saving new best model to {}...".format(best_dev_model_path))
            dy.save(best_dev_model_path, [parser])

            test_start_time = time.time()
            test_predicted = []
            #test_gold = latent_tree.build_latent_trees(test_chunk_insts)
            test_gold = []
            for inst in test_chunk_insts:
                chunks = util.inst2chunks(inst)
                test_gold.append(chunks)

            ftreelog = open(args.expname + '.test.predtree.txt',
                            'w',
                            encoding='utf-8')

            for x, chunks in test_chunk_insts:
                dy.renew_cg()
                #sentence = [(leaf.tag, leaf.word) for leaf in tree.leaves()]
                sentence = [(parse.XX, ch) for ch in x]
                predicted, _ = parser.parse(sentence)
                pred_tree = predicted.convert()
                ftreelog.write(pred_tree.linearize() + '\n')
                test_predicted.append(pred_tree.to_chunks())

            ftreelog.close()

            #test_fscore = evaluate.evalb(args.evalb_dir, test_chunk_insts, test_predicted, args.expname + '.test.')
            test_fscore = evaluate.eval_chunks2(args.evalb_dir,
                                                test_gold,
                                                test_predicted,
                                                output_filename=args.expname +
                                                '.test.txt')  # evalb

            print("epoch {:,} "
                  "test-fscore {} "
                  "test-elapsed {} "
                  "total-elapsed {}".format(
                      epoch,
                      test_fscore,
                      format_elapsed(test_start_time),
                      format_elapsed(start_time),
                  ))

    train_trees = latent_tree.build_dynamicRBT_trees(train_chunk_insts)
    train_trees = [(x, tree.convert(), chunks, latentscope)
                   for x, tree, chunks, latentscope in train_trees]

    for epoch in itertools.count(start=1):
        if args.epochs is not None and epoch > args.epochs:
            break

        # Shuffle the list that the batch loop below actually slices;
        # train_chunk_insts is only used for its length here.
        np.random.shuffle(train_trees)
        epoch_start_time = time.time()

        for start_index in range(0, len(train_chunk_insts), args.batch_size):
            dy.renew_cg()
            batch_losses = []

            for x, tree, chunks, latentscope in train_trees[
                    start_index:start_index + args.batch_size]:

                discard = False
                for chunk in chunks:
                    length = chunk[2] - chunk[1]
                    if length > args.maxllimit:
                        discard = True
                        break

                if discard:
                    continue  # skip instances containing an over-long chunk

                sentence = [(parse.XX, ch) for ch in x]
                if args.parser_type == "top-down":
                    _, loss = parser.parse(sentence, tree, args.explore)
                else:
                    _, loss = parser.parse(sentence, tree, chunks, latentscope)
                batch_losses.append(loss)
                total_processed += 1
                current_processed += 1

            if not batch_losses:
                continue  # every instance in this batch was discarded
            batch_loss = dy.average(batch_losses)
            batch_loss_value = batch_loss.scalar_value()
            batch_loss.backward()
            trainer.update()

            print("Epoch {:,} "
                  "batch {:,}/{:,} "
                  "processed {:,} "
                  "batch-loss {:.4f} "
                  "epoch-elapsed {} "
                  "total-elapsed {}".format(
                      epoch,
                      start_index // args.batch_size + 1,
                      int(np.ceil(len(train_chunk_insts) / args.batch_size)),
                      total_processed,
                      batch_loss_value,
                      format_elapsed(epoch_start_time),
                      format_elapsed(start_time),
                  ),
                  flush=True)

            if current_processed >= check_every:
                current_processed -= check_every
                if epoch > 7:
                    check_dev()
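run_train reads a large number of fields from `args`; a minimal argparse sketch covering a subset of the attributes used above. The attribute names come from the code, but every flag spelling, type, and default here is an assumption:

import argparse

def build_arg_parser():
    p = argparse.ArgumentParser()
    p.add_argument("--parser-type", dest="parser_type", default="chartdyRBTC")
    p.add_argument("--train-path", dest="train_path", default="data/train.txt")
    p.add_argument("--dev-path", dest="dev_path", default="data/dev.txt")
    p.add_argument("--test-path", dest="test_path", default="data/test.txt")
    p.add_argument("--labellist-path", dest="labellist_path", default="data/labels.txt")
    p.add_argument("--batch-size", dest="batch_size", type=int, default=10)
    p.add_argument("--epochs", type=int, default=None)
    p.add_argument("--checks-per-epoch", dest="checks_per_epoch", type=int, default=4)
    p.add_argument("--trial", type=int, default=0)
    p.add_argument("--maxllimit", type=int, default=20)
    return p

# args = build_arg_parser().parse_args()
# run_train(args)  # plus the remaining fields run_train reads (expname, evalb_dir, ...)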