Example #1
def main():
    parser = yaap.ArgParser(allow_config=True)
    parser.add("--vectorizer-config", type=yaap.path, required=True)
    parser.add("--senteval-data", type=yaap.path, required=True)
    parser.add("--tasks", type=str, action="append", required=True)
    parser.add("--batch-size", type=int, default=None)

    args = parser.parse_args()

    assert os.path.exists(args.vectorizer_config)

    with open(args.vectorizer_config, "r") as f:
        vec_conf = yaml.safe_load(f)

    if args.batch_size is None:
        assert "batch-size" in vec_conf

        batch_size = vec_conf.get("batch-size")
    else:
        batch_size = args.batch_size

    sv = SentenceVectorizer(args.vectorizer_config)

    params = {
        "usepytorch": True,
        "task_path": args.senteval_data,
        "batch_size": batch_size,
        "model": sv,
    }

    se = senteval.SentEval(dotdict(params), batcher, prepare)
    se.eval(args.tasks)
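
The snippet above assumes `dotdict`, `prepare`, and `batcher` helpers defined elsewhere in the file. A minimal sketch of what they could look like, following SentEval's usual `prepare(params, samples)` / `batcher(params, batch)` interface; the `vectorize` method on the stored model is a hypothetical stand-in for whatever SentenceVectorizer actually exposes:

import numpy as np

class dotdict(dict):
    # Attribute-style access to dict keys, e.g. params.model.
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__

def prepare(params, samples):
    # Called once per task before evaluation; the vectorizer is already
    # initialized, so there is nothing to set up here.
    return

def batcher(params, batch):
    # SentEval passes batches of tokenized sentences; return one embedding
    # row per sentence as a 2D numpy array.
    sentences = [" ".join(tokens) for tokens in batch]
    return np.vstack([params.model.vectorize(s) for s in sentences])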
Example #2
def create_parser():
    parser = yaap.ArgParser()
    parser.add_argument("--data-dir",
                        action="append",
                        type=yaap.path,
                        required=True,
                        help="")
    parser.add_argument("--vocab-path", type=argparse.path, required=True)
    parser.add_argument("--reserved", action="append", type=str)
    parser.add_argument("--cutoff", type=int, default=30000)

    return parser
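
A minimal usage sketch for the parser above (the `__main__` guard and attribute access are illustrative additions, not part of the original file):

if __name__ == "__main__":
    args = create_parser().parse_args()
    # Dashes in option names become underscores on the namespace.
    print(args.data_dir, args.vocab_path, args.reserved, args.cutoff)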
Example #3
def train():
    parser = yaap.ArgParser(allow_config=True)
    parser.add("--word-dim", type=int, required=True)
    parser.add("--ckpt-path", type=yaap.path, required=True)
    parser.add("--vocab-path", type=yaap.path, required=True)
    parser.add("--save-path", type=yaap.path, required=True)
    parser.add("--wordembed-type",
               type=str,
               required=True,
               choices=["glove", "fasttext"])
    parser.add("--wordembed-path", type=yaap.path, required=True)
    parser.add("--fasttext-path",
               type=yaap.path,
               default=None,
               help="Path to FastText binary.")
    parser.add("--wordembed-processes", type=int, default=1)
    parser.add("-v", "--verbose", action="store_true", default=False)

    group = parser.add_group("Training Options")
    group.add_argument("--epochs", type=int, default=10)
    group.add_argument("--batch-size", type=int, default=32)
    group.add_argument("--method",
                       type=str,
                       default="pytorch",
                       choices=["pytorch", "sklearn"])
    group.add_argument("--loss",
                       type=str,
                       default="smoothl1",
                       choices=["smoothl1", "l2", "l1"])
    group.add_argument("--gpu", action="store_true", default=False)
    group.add_argument("--no-shuffle", action="store_true", default=False)

    args = parser.parse_args()

    if args.verbose:
        loglvl = logging.INFO
    else:
        loglvl = logging.CRITICAL

    logging.basicConfig(level=loglvl)
    logging.info("initializing...")

    word_dim = args.word_dim
    assert os.path.exists(os.path.dirname(args.save_path)), \
        "base directory for saving translation weights does not exist"
    with open(args.vocab_path, "rb") as f:
        vocab = pickle.load(f)

    assert args.wordembed_processes >= 1, "number of processes must be " \
                                          "larger than or equal to 1"

    if args.wordembed_processes == 1:
        embedding_loader = load_embeddings
    else:

        def embedding_loader(path, word_dim):
            return load_embeddings_mp(path,
                                      word_dim,
                                      processes=args.wordembed_processes)

    logging.info("training translation model of word_dim={}".format(word_dim))

    logging.info("loading target word embeddings...")

    model_we = ModelWordEmbedding(vocab, word_dim)
    model_we.load_embeddings(torch.load(args.ckpt_path))
    target_we = model_we.embeddings.weight.data
    target_we = build_wordembed_dict(target_we, vocab)

    logging.info("loading source word embeddings...")
    if args.wordembed_type == "glove":
        glove_path = args.wordembed_path
        src_embeddings = embedding_loader(glove_path, word_dim)
    elif args.wordembed_type == "fasttext":
        fasttext_path = args.fasttext_path
        assert fasttext_path is not None, "if wordembed type is fasttext, "\
            "you must provide path to fasttext binary"

        model_path = args.wordembed_path
        src_embeddings = load_fasttext_embeddings(vocab, fasttext_path,
                                                  model_path)
    else:
        raise ValueError("Unrecognized wordembed type: {}".format(
            args.wordembed_type))

    logging.info("joining source and target word embeddings...")
    src_we, target_we = join_embeddings(src_embeddings, target_we)

    logging.info("preparing training environment...")
    if args.method == "sklearn":
        from sklearn.linear_model import LinearRegression

        model = LinearRegression(n_jobs=7)
    elif args.method == "pytorch":
        model = WordEmbeddingTranslator(word_dim)
    else:
        assert False

    logging.info("beginning training...")
    if args.method == "sklearn":
        src_we = src_we.numpy()
        target_we = target_we.numpy()
        model.fit(src_we, target_we)
    elif args.method == "pytorch":
        data_generator = WordEmbeddingTranslationGenerator(
            src=src_we,
            target=target_we,
            shuffle=not args.no_shuffle,
            batch_size=args.batch_size)
        trainer = WordEmbeddingTranslatorTrainer(model=model,
                                                 data_generator=data_generator,
                                                 epochs=args.epochs,
                                                 loss=args.loss)
        trainer.train()
    else:
        assert False

    logging.info("saving results...")
    if args.method == "sklearn":
        pickle.dump(model, open(args.save_path, "wb"))
    elif args.method == "pytorch":
        torch.save(model.state_dict(), args.save_path)

    logging.info("done!")
Example #4
import argparse

import numpy as np
import yaap
import tqdm
import torch
import torch.nn as nn
import torch.optim as O
import torch.autograd as A

import utils
import data as D
import model as M
import evaluate as E


parser = yaap.ArgParser(
    allow_config=True,
    formatter_class=argparse.ArgumentDefaultsHelpFormatter
)

group = parser.add_group("Basic Options")
group.add("--save-dir", type=yaap.path, required=True,
          help="Directory to save outputs (checkpoints, vocabs, etc.)")
group.add("--gpu", type=int, action="append",
          help="Device id of gpu to use. Could supply multiple gpu ids "
               "to denote multi-gpu utilization. If no gpus are "
               "specified, cpu is used as default.")
group.add("--tensorboard", action="store_true", default=False,
          help="Whether to enable tensorboard visualization. Requires "
               "standalone tensorboard, which can be installed via "
               "'https://github.com/dmlc/tensorboard'.")

group = parser.add_group("Word Embedding Options")
Example #5
import os
import pickle

import yaap

parser = yaap.ArgParser()
parser.add_argument("--pickle-path", type=yaap.path, default="atis.pkl")
parser.add_argument("--save-dir", type=yaap.path, default="data")


def to_text(words, nes, labels, i2w, i2n, i2l, save_dir):
    word_path = os.path.join(save_dir, "words.txt")
    nes_path = os.path.join(save_dir, "nes.txt")
    label_path = os.path.join(save_dir, "labels.txt")
    with open(word_path, "w") as word_f, \
            open(nes_path, "w") as nes_f, \
            open(label_path, "w") as label_f:
        for w, n, l in zip(words, nes, labels):
            word_f.write(" ".join(i2w[x] for x in w) + "\n")
            nes_f.write(" ".join(i2n[x] for x in n) + "\n")
            label_f.write(" ".join(i2l[x] for x in l) + "\n")


def preprocess(args):
    os.makedirs(args.save_dir, exist_ok=True)
    with open(args.pickle_path, "rb") as f:
        data = pickle.load(f, encoding="latin1")
    train, test, vocabs = data
    w2i, n2i, l2i = vocabs["words2idx"], vocabs["tables2idx"], vocabs[
        "labels2idx"]
    i2w, i2n, i2l = [{v: k