Example #1
# CLI entry point: load a grammar and either recognize or best-parse sentences
# with CKY.  `util` is the project's helper module; the CKY function is assumed
# to be defined earlier in this script.
import argparse

import util

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("mode", type=str, help="type in mode")
    parser.add_argument("filename",
                        type=str,
                        help="type in the grammar filepath")
    parser.add_argument("sentence",
                        type=str,
                        help="filepath that contains sentences")
    args = parser.parse_args()

    gr = util.load_grammar(args.filename)

    if args.mode == 'RECOGNIZER':
        sent = util.load_sentence(args.sentence)
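        # Run CKY on each sentence; it is grammatical iff the full-span chart
        # cell contains the ROOT symbol.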
        for s in sent:
            length = len(s.split())
            rslt = CKY(s, gr)
            ifroot = any(r[0] == 'ROOT' for r in rslt[0][length - 1])
            print('True' if ifroot else 'False')
        exit()

    if args.mode == 'BEST-PARSE':
        sent = util.load_sentence(args.sentence)
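
These snippets only assume that load_sentence reads a text file and returns its sentences. A minimal sketch of the behaviour Example #1 expects (one raw sentence string per line) is given below; it is an assumption for illustration, not the project's actual helper, and the later examples appear to feed token lists into Word2Vec/tokens2ids instead.

def load_sentence(path):
    # Hypothetical helper (assumed behaviour, not the original implementation):
    # return one stripped sentence string per non-empty line of the file.
    with open(path, encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip()]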
Example #2
    # load model
    init_model_name = os.path.join(model_dir, "model.npz")
    if os.path.exists(init_model_name):
        serializers.load_npz(init_model_name, model)
        print("load model {}".format(init_model_name))

    elif word2vec_init:
        # initialize the embedding layer with word2vec
        import numpy as np

        if os.path.exists(word2vec_model_file):
            print("load word2vec model")
            word2vec_model = word2vec.Word2Vec.load(word2vec_model_file)
        else:
            print("start learning word2vec model")
            word2vec_model = word2vec.Word2Vec(
                load_sentence(sent_file),
                size=n_units,
                window=5,
                min_count=1,
                workers=4
            )
            print("save word2vec model")
            word2vec_model.save(word2vec_model_file)

        # initialize word embedding layer with word2vec
        initial_W = np.array([
            word2vec_model[dictionary[wid]]
            if dictionary[wid] in word2vec_model
            else np.array(
                [np.random.random() for _ in range(n_units)],
                dtype=np.float32
            )
            for wid in range(dim)
        ], dtype=np.float32)
Example #3
# Imports assumed by this snippet (old Chainer-era API); load_sentence and
# tokens2ids are project-local helpers defined elsewhere.
import os
import sys
from datetime import datetime

import chainer
import numpy as np
from chainer import Variable, optimizers, serializers
from gensim import corpora


def train_encoder(model,
                  dictionary: corpora.Dictionary,
                  sentence_file: str,
                  model_dir: str,
                  epoch_size: int = 100,
                  batch_size: int = 30,
                  dropout: bool = True,
                  gpu: int = -1) -> None:
    # a non-negative gpu value is used as the CUDA device id; negative runs on CPU
    if gpu >= 0:
        model.to_gpu()
        print(model.xp)

    # setup SGD optimizer
    opt = optimizers.SGD()
    opt.setup(model)

    # optimizer hooks
    clip_threshold = 5.0
    print("set optimizer clip threshold: {}".format(clip_threshold))
    opt.add_hook(chainer.optimizer.GradientClipping(clip_threshold))

    # load conversation sentences
    sentences = load_sentence(sentence_file)
    data_size = len(sentences)

    print("data size: {}".format(data_size))
    for epoch in range(epoch_size):
        print("epoch {}".format(epoch))

        # shuffle the sample order for this epoch
        indexes = np.random.permutation(data_size)
        epoch_loss = 0.0  # summed mean batch losses for this epoch

        # iterate over the data in mini-batches
        for bat_i in range(0, data_size, batch_size):
            forward_start_time = datetime.now()
            batch_loss = Variable(model.xp.zeros((), dtype=model.xp.float32))

            # accumulate the loss sentence by sentence within the mini-batch
            for index in indexes[bat_i:bat_i + batch_size]:
                input_words = sentences[index]
                # convert the tokens to a list of word ids
                input_words_with_s = tokens2ids(input_words,
                                                dictionary,
                                                verbose=False)

                # forward pass
                try:
                    new_loss = model(input_words_with_s,
                                     dropout=dropout,
                                     state=None,
                                     train=True)
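                    # abort training if the loss has diverged to NaN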
                    if model.xp.isnan(new_loss.data):
                        sys.exit(1)

                    batch_loss += new_loss
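                # log the offending sample and keep training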
                except Exception:
                    print(index, input_words_with_s)
                    import traceback
                    traceback.print_exc()

            # average the loss over the mini-batch
            batch_size_array = model.xp.array(batch_size,
                                              dtype=model.xp.float32)
            # if gpu:
            #     batch_size_array = cuda.to_gpu(batch_size_array)
            batch_loss = batch_loss / Variable(batch_size_array)
            epoch_loss += batch_loss.data

            # time measurement
            forward_end_time = datetime.now()

            # optimization step
            opt_start_time = datetime.now()
            model.zerograds()
            batch_loss.backward()
            opt.update()
            opt_end_time = datetime.now()

            forward_delta = forward_end_time - forward_start_time
            opt_delta = opt_end_time - opt_start_time

            print_fmt = ("epoch {} batch {}: "
                         "loss {}, grad L2 norm: {}, forward {}, optimizer {}")
            print(
                print_fmt.format(
                    epoch,
                    int(bat_i / batch_size),
                    batch_loss.data,
                    opt.compute_grads_norm(),
                    forward_delta,
                    opt_delta,
                ))
            # save
            if ((bat_i / batch_size) + 1) % 100 == 0:
                serializers.save_npz(os.path.join(model_dir, "model.npz"),
                                     model)
            if ((bat_i / batch_size) + 1) % 1000 == 0:
                serializers.save_npz(
                    os.path.join(
                        model_dir, "model_{}_{}_{}.npz".format(
                            epoch,
                            int(bat_i / batch_size) + 1,
                            datetime.now().strftime("%Y%m%d-%H%M%S"))), model)
        print("finish epoch {}, loss {}".format(epoch,
                                                epoch_loss / epoch_size))
        # save
        serializers.save_npz(os.path.join(model_dir, "model.npz"), model)
        serializers.save_npz(
            os.path.join(
                model_dir, "model_{}_{}_{}.npz".format(
                    epoch,
                    int(bat_i / batch_size) + 1,
                    datetime.now().strftime("%Y%m%d-%H%M%S"))), model)
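
For orientation, here is a minimal, hypothetical call site for this trainer; the LSTMEncoder class and the file paths are placeholders rather than part of the original code.

# Hypothetical usage sketch; LSTMEncoder and the paths below are placeholders.
dictionary = corpora.Dictionary.load("data/dictionary.dict")
model = LSTMEncoder(n_vocab=len(dictionary), n_units=200)
train_encoder(model,
              dictionary,
              sentence_file="data/sentences.txt",
              model_dir="models",
              epoch_size=10,
              batch_size=30,
              dropout=True,
              gpu=-1)  # negative device id keeps training on the CPU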
Example #4
def train_encoder(
    model,
    dictionary: corpora.Dictionary,
    sentence_file: str,
    model_dir: str,
    epoch_size: int = 100,
    batch_size: int = 30,
    dropout: bool = True,
    gpu: int = -1  # device id; negative means CPU
) -> None:
    if gpu >= 0:
        model.to_gpu()
        print(model.xp)

    # setup SGD optimizer
    opt = optimizers.SGD()
    opt.setup(model)

    # optimizer hooks
    clip_threshold = 5.0
    print("set optimizer clip threshold: {}".format(clip_threshold))
    opt.add_hook(chainer.optimizer.GradientClipping(clip_threshold))

    # load conversation sentences
    sentences = load_sentence(sentence_file)
    data_size = len(sentences)

    print("data size: {}".format(data_size))
    for epoch in range(epoch_size):
        print("epoch {}".format(epoch))

        indexes = np.random.permutation(data_size)
        epoch_loss = 0.0  # summed mean batch losses for this epoch

        for bat_i in range(0, data_size, batch_size):
            forward_start_time = datetime.now()
            batch_loss = Variable(model.xp.zeros((), dtype=model.xp.float32))

            for index in indexes[bat_i:bat_i + batch_size]:
                input_words = sentences[index]
                # convert the tokens to a list of word ids
                input_words_with_s = tokens2ids(
                    input_words,
                    dictionary,
                    verbose=False
                )

                # forward pass
                try:
                    new_loss = model(
                        input_words_with_s,
                        dropout=dropout,
                        state=None,
                        train=True
                    )
                    if model.xp.isnan(new_loss.data):
                        sys.exit(1)

                    batch_loss += new_loss
                except Exception:
                    print(index, input_words_with_s)
                    import traceback
                    traceback.print_exc()

            # average the loss over the mini-batch
            batch_size_array = model.xp.array(
                batch_size,
                dtype=model.xp.float32
            )
            # if gpu:
            #     batch_size_array = cuda.to_gpu(batch_size_array)
            batch_loss = batch_loss / Variable(batch_size_array)
            epoch_loss += batch_loss.data

            # time measurement
            forward_end_time = datetime.now()

            # optimization step
            opt_start_time = datetime.now()
            model.zerograds()
            batch_loss.backward()
            opt.update()
            opt_end_time = datetime.now()

            forward_delta = forward_end_time - forward_start_time
            opt_delta = opt_end_time - opt_start_time

            print_fmt = (
                "epoch {} batch {}: "
                "loss {}, grad L2 norm: {}, forward {}, optimizer {}"
            )
            print(print_fmt.format(
                epoch,
                int(bat_i / batch_size),
                batch_loss.data,
                opt.compute_grads_norm(),
                forward_delta,
                opt_delta,
            ))
            # save
            if ((bat_i / batch_size) + 1) % 100 == 0:
                serializers.save_npz(
                    os.path.join(
                        model_dir,
                        "model.npz"
                    ),
                    model
                )
            if ((bat_i / batch_size) + 1) % 1000 == 0:
                serializers.save_npz(
                    os.path.join(
                        model_dir,
                        "model_{}_{}_{}.npz".format(
                            epoch,
                            int(bat_i / batch_size) + 1,
                            datetime.now().strftime("%Y%m%d-%H%M%S")
                        )
                    ),
                    model
                )
        print("finish epoch {}, loss {}".format(
            epoch,
            epoch_loss / epoch_size
        ))
        # save
        serializers.save_npz(
            os.path.join(
                model_dir,
                "model.npz"
            ),
            model
        )
        serializers.save_npz(
            os.path.join(
                model_dir,
                "model_{}_{}_{}.npz".format(
                    epoch,
                    int(bat_i / batch_size) + 1,
                    datetime.now().strftime("%Y%m%d-%H%M%S")
                )
            ),
            model
        )
Example #5
    # load model
    init_model_name = os.path.join(model_dir, "model.npz")
    if os.path.exists(init_model_name):
        serializers.load_npz(init_model_name, model)
        print("load model {}".format(init_model_name))

    elif word2vec_init:
        # initialize the embedding layer with word2vec
        import numpy as np

        if os.path.exists(word2vec_model_file):
            print("load word2vec model")
            word2vec_model = word2vec.Word2Vec.load(word2vec_model_file)
        else:
            print("start learning word2vec model")
            word2vec_model = word2vec.Word2Vec(load_sentence(sent_file),
                                               size=n_units,
                                               window=5,
                                               min_count=1,
                                               workers=4)
            print("save word2vec model")
            word2vec_model.save(word2vec_model_file)

        # initialize word embedding layer with word2vec
        initial_W = np.array([
            word2vec_model[dictionary[wid]]
            if dictionary[wid] in word2vec_model else np.array(
                [np.random.random() for _ in range(n_units)], dtype=np.float32)
            for wid in range(dim)
        ], dtype=np.float32)
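
The example ends here; in comparable Chainer code the matrix would then typically be copied into the model's embedding layer, roughly as sketched below (the embed attribute name is an assumption, not taken from this snippet).

        # hypothetical continuation; the attribute name `embed` is assumed
        # model.embed.W.data = initial_W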