import os

import yaap
import yaml
import senteval


def main():
    parser = yaap.ArgParser(allow_config=True)
    parser.add("--vectorizer-config", type=yaap.path, required=True)
    parser.add("--senteval-data", type=yaap.path, required=True)
    parser.add("--tasks", type=str, action="append", required=True)
    parser.add("--batch-size", type=int, default=None)
    args = parser.parse_args()

    assert os.path.exists(args.vectorizer_config)
    with open(args.vectorizer_config, "r") as f:
        vec_conf = yaml.safe_load(f)

    # Fall back to the batch size specified in the vectorizer config.
    if args.batch_size is None:
        assert "batch-size" in vec_conf
        batch_size = vec_conf.get("batch-size")
    else:
        batch_size = args.batch_size

    sv = SentenceVectorizer(args.vectorizer_config)
    params = {
        "usepytorch": True,
        "task_path": args.senteval_data,
        "batch_size": batch_size,
        "model": sv,
    }
    se = senteval.SentEval(dotdict(params), batcher, prepare)
    se.eval(args.tasks)
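# --- A minimal sketch of the SentEval callbacks and the dotdict helper used
# above, for illustration only. These are assumptions: the repo's actual
# batcher, prepare, and SentenceVectorizer are defined elsewhere, and
# SentenceVectorizer.vectorize is a hypothetical method name.
import numpy as np


class dotdict(dict):
    """dict subclass that also allows attribute access (params.model)."""
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__


def prepare(params, samples):
    # SentEval calls this once per task before evaluation; nothing to
    # precompute here since the vectorizer is initialized up front.
    pass


def batcher(params, batch):
    # SentEval passes a batch as a list of tokenized sentences and expects
    # one embedding per sentence, stacked into a numpy array.
    sentences = [" ".join(tokens) if tokens else "." for tokens in batch]
    return np.vstack([params.model.vectorize(s) for s in sentences])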
import yaap


def create_parser():
    # Note: the original referenced argparse.ArgParser and argparse.path,
    # neither of which exists; yaap.ArgParser and yaap.path match the rest
    # of the repo.
    parser = yaap.ArgParser()
    parser.add_argument("--data-dir", action="append", type=yaap.path,
                        required=True,
                        help="Directory containing data files. Can be "
                             "supplied multiple times.")
    parser.add_argument("--vocab-path", type=yaap.path, required=True)
    parser.add_argument("--reserved", action="append", type=str)
    parser.add_argument("--cutoff", type=int, default=30000)
    return parser
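# --- A hypothetical consumer of create_parser(), shown only to illustrate
# how the options fit together (append-able --data-dir, reserved tokens,
# frequency cutoff). The repo's real vocabulary builder is not shown here,
# and whitespace tokenization is an assumption.
import os
import pickle
from collections import Counter


def build_vocab(args):
    counter = Counter()
    for data_dir in args.data_dir:
        for name in os.listdir(data_dir):
            with open(os.path.join(data_dir, name)) as f:
                for line in f:
                    counter.update(line.split())
    # Reserved tokens come first, followed by the most frequent words.
    words = list(args.reserved or [])
    words += [w for w, _ in counter.most_common(args.cutoff)]
    with open(args.vocab_path, "wb") as f:
        pickle.dump(words, f)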
import os
import pickle
import logging

import torch
import yaap


def train():
    parser = yaap.ArgParser(allow_config=True)
    parser.add("--word-dim", type=int, required=True)
    parser.add("--ckpt-path", type=yaap.path, required=True)
    parser.add("--vocab-path", type=yaap.path, required=True)
    parser.add("--save-path", type=yaap.path, required=True)
    parser.add("--wordembed-type", type=str, required=True,
               choices=["glove", "fasttext"])
    parser.add("--wordembed-path", type=yaap.path, required=True)
    parser.add("--fasttext-path", type=yaap.path, default=None,
               help="Path to FastText binary.")
    parser.add("--wordembed-processes", type=int, default=1)
    parser.add("-v", "--verbose", action="store_true", default=False)

    group = parser.add_group("Training Options")
    group.add_argument("--epochs", type=int, default=10)
    group.add_argument("--batch-size", type=int, default=32)
    group.add_argument("--method", type=str, default="pytorch",
                       choices=["pytorch", "sklearn"])
    group.add_argument("--loss", type=str, default="smoothl1",
                       choices=["smoothl1", "l2", "l1"])
    group.add_argument("--gpu", action="store_true", default=False)
    group.add_argument("--no-shuffle", action="store_true", default=False)
    args = parser.parse_args()

    loglvl = logging.INFO if args.verbose else logging.CRITICAL
    logging.basicConfig(level=loglvl)
    logging.info("initializing...")

    word_dim = args.word_dim
    assert os.path.exists(os.path.dirname(args.save_path)), \
        "base directory for saving translation weights does not exist"

    with open(args.vocab_path, "rb") as f:
        vocab = pickle.load(f)

    assert args.wordembed_processes >= 1, \
        "number of processes must be larger than or equal to 1"
    if args.wordembed_processes == 1:
        embedding_loader = load_embeddings
    else:
        def embedding_loader(path, word_dim):
            return load_embeddings_mp(path, word_dim,
                                      processes=args.wordembed_processes)

    logging.info("training translation model of word_dim={}".format(word_dim))
    logging.info("loading target word embeddings...")
    model_we = ModelWordEmbedding(vocab, word_dim)
    model_we.load_embeddings(torch.load(args.ckpt_path))
    target_we = model_we.embeddings.weight.data
    target_we = build_wordembed_dict(target_we, vocab)

    logging.info("loading source word embeddings...")
    if args.wordembed_type == "glove":
        glove_path = args.wordembed_path
        src_embeddings = embedding_loader(glove_path, word_dim)
    elif args.wordembed_type == "fasttext":
        fasttext_path = args.fasttext_path
        assert fasttext_path is not None, \
            "if wordembed type is fasttext, you must provide path to " \
            "fasttext binary"
        model_path = args.wordembed_path
        src_embeddings = load_fasttext_embeddings(vocab, fasttext_path,
                                                  model_path)
    else:
        raise ValueError("Unrecognized wordembed type: {}".format(
            args.wordembed_type))

    logging.info("joining source and target word embeddings...")
    src_we, target_we = join_embeddings(src_embeddings, target_we)

    logging.info("preparing training environment...")
    if args.method == "sklearn":
        from sklearn.linear_model import LinearRegression
        model = LinearRegression(n_jobs=7)
    elif args.method == "pytorch":
        model = WordEmbeddingTranslator(word_dim)
    else:
        assert False

    logging.info("beginning training...")
    if args.method == "sklearn":
        src_we = src_we.numpy()
        target_we = target_we.numpy()
        model.fit(src_we, target_we)
    elif args.method == "pytorch":
        data_generator = WordEmbeddingTranslationGenerator(
            src=src_we, target=target_we,
            shuffle=not args.no_shuffle,
            batch_size=args.batch_size)
        trainer = WordEmbeddingTranslatorTrainer(model=model,
                                                 data_generator=data_generator,
                                                 epochs=args.epochs,
                                                 loss=args.loss)
        trainer.train()
    else:
        assert False

    logging.info("saving results...")
    if args.method == "sklearn":
        with open(args.save_path, "wb") as f:
            pickle.dump(model, f)
    elif args.method == "pytorch":
        torch.save(model.state_dict(), args.save_path)
    logging.info("done!")
import argparse

import numpy as np
import yaap
import tqdm
import torch
import torch.nn as nn
import torch.optim as O
import torch.autograd as A

import utils
import data as D
import model as M
import evaluate as E

parser = yaap.ArgParser(
    allow_config=True,
    formatter_class=argparse.ArgumentDefaultsHelpFormatter
)

group = parser.add_group("Basic Options")
group.add("--save-dir", type=yaap.path, required=True,
          help="Directory to save outputs (checkpoints, vocabs, etc.)")
group.add("--gpu", type=int, action="append",
          help="Device id of gpu to use. Could supply multiple gpu ids "
               "to denote multi-gpu utilization. If no gpus are "
               "specified, cpu is used as default.")
group.add("--tensorboard", action="store_true", default=False,
          help="Whether to enable tensorboard visualization. Requires "
               "standalone tensorboard, which can be installed via "
               "'https://github.com/dmlc/tensorboard'.")

group = parser.add_group("Word Embedding Options")
import os
import pickle

import yaap

parser = yaap.ArgParser()
parser.add_argument("--pickle-path", type=yaap.path, default="atis.pkl")
parser.add_argument("--save-dir", type=yaap.path, default="data")


def to_text(words, nes, labels, i2w, i2n, i2l, save_dir):
    word_path = os.path.join(save_dir, "words.txt")
    nes_path = os.path.join(save_dir, "nes.txt")
    label_path = os.path.join(save_dir, "labels.txt")
    # Use context managers so the files are flushed and closed even on error.
    with open(word_path, "w") as word_f, \
            open(nes_path, "w") as nes_f, \
            open(label_path, "w") as label_f:
        for w, n, l in zip(words, nes, labels):
            word_f.write(" ".join(i2w[x] for x in w) + "\n")
            nes_f.write(" ".join(i2n[x] for x in n) + "\n")
            label_f.write(" ".join(i2l[x] for x in l) + "\n")


def preprocess(args):
    os.makedirs(args.save_dir, exist_ok=True)
    with open(args.pickle_path, "rb") as f:
        data = pickle.load(f, encoding="latin1")
    train, test, vocabs = data
    w2i, n2i, l2i = (vocabs["words2idx"], vocabs["tables2idx"],
                     vocabs["labels2idx"])
    # Invert the token-to-index maps into index-to-token maps.
    i2w, i2n, i2l = [{v: k for k, v in d.items()} for d in (w2i, n2i, l2i)]