def predict(self, data, pred=None, buckets=8, batch_size=5000, prob=False, **kwargs):
    args = self.args.update(locals())
    init_logger(logger, verbose=args.verbose)

    self.transform.eval()
    if args.prob:
        self.transform.append(Field('probs'))

    logger.info("Loading the data")
    dataset = Dataset(self.transform, data)
    dataset.build(args.batch_size, args.buckets)
    logger.info(f"\n{dataset}")

    logger.info("Making predictions on the dataset")
    start = datetime.now()
    preds = self._predict(dataset.loader)
    elapsed = datetime.now() - start

    for name, value in preds.items():
        setattr(dataset, name, value)
    if pred is not None and is_master():
        logger.info(f"Saving predicted results to {pred}")
        self.transform.save(pred, dataset.sentences)
    logger.info(f"{elapsed}s elapsed, {len(dataset) / elapsed.total_seconds():.2f} Sents/s")

    return dataset
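# Usage sketch for ``predict`` above. Hedged: ``Parser`` stands for whichever
# concrete subclass in this codebase defines ``load``/``_predict``, and the
# file paths are placeholders, not artifacts shipped with this repo.
#
#   parser = Parser.load('exp/model.pt')
#   dataset = parser.predict('data/test.conllx', pred='pred.conllx', prob=True)
#   print(dataset.sentences[0])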
def evaluate(self, data, buckets=8, batch_size=5000, **kwargs):
    args = self.args.update(locals())
    init_logger(logger, verbose=args.verbose)

    self.transform.train()
    logger.info("Loading the data")
    dataset = Dataset(self.transform, data)
    dataset.build(args.batch_size, args.buckets)
    logger.info(f"\n{dataset}")

    logger.info("Evaluating the dataset")
    start = datetime.now()
    loss, metric = self._evaluate(dataset.loader)
    elapsed = datetime.now() - start
    logger.info(f"loss: {loss:.4f} - {metric}")

    # decode the metric's cluster-to-tag mapping into tag strings
    tag_map = {k: self.CPOS.vocab[v] for k, v in metric.tag_map.items()}
    pprint(tag_map)
    recalled_tags = Counter(tag_map.values())
    unrecalled_tags = set(self.CPOS.vocab.stoi) - set(recalled_tags.keys())
    pprint(recalled_tags)
    pprint(unrecalled_tags)
    gold_tag_map = {self.CPOS.vocab[k]: v for k, v in metric.gold_tag_map.items()}
    pprint(gold_tag_map)
    unrecalled_tag_map = {g: tag_map[gold_tag_map[g]] for g in self.CPOS.vocab.stoi}
    unrecalled_tag_map = {k: v for k, v in unrecalled_tag_map.items() if k != v}
    pprint(unrecalled_tag_map)

    # heatmap(metric.clusters.cpu(), list(self.CPOS.vocab.stoi.keys()), f"{args.path}.evaluate.clusters")
    heatmap(self.model.T.softmax(-1).detach().cpu(),
            [f"#C{n}#" for n in range(len(self.CPOS.vocab))],
            f"{args.path}.T.clusters")
    logger.info(f"{elapsed}s elapsed, {len(dataset)/elapsed.total_seconds():.2f} Sents/s")

    return loss, metric
def evaluate(self, data, buckets=8, batch_size=5000, **kwargs):
    args = self.args.update(locals())
    init_logger(logger, verbose=args.verbose)

    self.transform.train()
    logger.info("Loading the data")
    dataset = Dataset(self.transform, data)
    dataset.build(args.batch_size, args.buckets)
    logger.info(f"\n{dataset}")

    logger.info("Evaluating the dataset")
    start = datetime.now()
    loss, metric = self._evaluate(dataset.loader)
    elapsed = datetime.now() - start
    logger.info(f"loss: {loss:.4f} - {metric}")
    logger.info(f"{elapsed}s elapsed, {len(dataset)/elapsed.total_seconds():.2f} Sents/s")

    return loss, metric
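# Usage sketch for ``evaluate`` above, under the same assumptions (``Parser``
# is a concrete subclass; paths are placeholders):
#
#   parser = Parser.load('exp/model.pt')
#   loss, metric = parser.evaluate('data/dev.conllx', batch_size=5000)
#   print(f"loss: {loss:.4f} - {metric}")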
@classmethod
def build(cls, path, min_freq=2, fix_len=20, **kwargs):
    """
    Build a brand-new Parser, including initialization of all data fields and model parameters.

    Args:
        path (str):
            The path of the model to be saved.
        min_freq (int):
            The minimum frequency needed to include a token in the vocabulary. Default: 2.
        fix_len (int):
            The max length of all subword pieces. The excess part of each piece will be truncated.
            Required if using CharLSTM/BERT. Default: 20.
        kwargs (dict):
            A dict holding the unconsumed arguments.

    Returns:
        The created parser.
    """
    args = Config(**locals())
    args.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    os.makedirs(os.path.dirname(path), exist_ok=True)
    if os.path.exists(path) and not args.build:
        parser = cls.load(**args)
        parser.model = cls.MODEL(**parser.args)
        parser.model.load_pretrained(parser.WORD.embed).to(args.device)
        return parser

    logger.info("Building the fields")
    WORD = Field('words', pad=pad, unk=unk, lower=True)
    CPOS = Field('tags')
    transform = CoNLL(FORM=WORD, CPOS=CPOS)

    train = Dataset(transform, args.train)
    WORD.build(train, args.min_freq,
               (Embedding.load(args.embed, args.unk) if args.embed else None),
               not_extend_vocab=True)
    # WORD.build(train, args.min_freq)
    CPOS.build(train)
    args.update({
        'n_words': len(WORD.vocab),
        'n_cpos': len(CPOS.vocab),
        'pad_index': WORD.pad_index,
        'unk_index': WORD.unk_index,
    })
    model = cls.MODEL(normalize_paras=not args.em_alg, **args)
    if args.em_alg:
        model.requires_grad_(False)
    # model.load_pretrained(WORD.embed).to(args.device)
    model.to(args.device)

    return cls(args, model, transform)
def train(self, train, dev, test, buckets=32, batch_size=5000,
          lr=2e-3, mu=.9, nu=.9, epsilon=1e-12, clip=5.0,
          decay=.75, decay_steps=5000,
          epochs=5000, patience=100, weight_decay=0,
          verbose=True, **kwargs):
    args = self.args.update(locals())
    init_logger(logger, verbose=args.verbose)

    self.transform.train()
    if dist.is_initialized():
        args.batch_size = args.batch_size // dist.get_world_size()
    logger.info("Loading the data")
    train = Dataset(self.transform, args.train, **args)
    dev = Dataset(self.transform, args.dev)
    test = Dataset(self.transform, args.test)
    train.build(args.batch_size, args.buckets, True, dist.is_initialized())
    dev.build(args.batch_size, args.buckets)
    test.build(args.batch_size, args.buckets)
    logger.info(f"\n{'train:':6} {train}\n{'dev:':6} {dev}\n{'test:':6} {test}\n")

    logger.info(f"{self.model}\n")
    if dist.is_initialized():
        self.model = DDP(self.model,
                         device_ids=[args.local_rank],
                         find_unused_parameters=True)
    self.optimizer = Adam(self.model.parameters(),
                          args.lr,
                          (args.mu, args.nu),
                          args.epsilon,
                          weight_decay=args.weight_decay)
    self.scheduler = ExponentialLR(self.optimizer, args.decay**(1/args.decay_steps))

    elapsed = timedelta()
    best_e, best_metric = 1, Metric()

    for epoch in range(1, args.epochs + 1):
        start = datetime.now()
        logger.info(f"Epoch {epoch} / {args.epochs}:")

        self._train(train.loader)
        loss, dev_metric = self._evaluate(dev.loader)
        logger.info(f"{'dev:':6} - loss: {loss:.4f} - {dev_metric}")
        loss, test_metric = self._evaluate(test.loader)
        logger.info(f"{'test:':6} - loss: {loss:.4f} - {test_metric}")

        t = datetime.now() - start
        # save the model if it is the best so far
        if dev_metric > best_metric:
            best_e, best_metric = epoch, dev_metric
            if is_master():
                self.save(args.path)
            logger.info(f"{t}s elapsed (saved)\n")
        else:
            logger.info(f"{t}s elapsed\n")
        elapsed += t
        if epoch - best_e >= args.patience:
            break
    loss, metric = self.load(**args)._evaluate(test.loader)

    logger.info(f"Epoch {best_e} saved")
    logger.info(f"{'dev:':6} - {best_metric}")
    logger.info(f"{'test:':6} - {metric}")
    logger.info(f"{elapsed}s elapsed, {elapsed / epoch}s/epoch")
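# Usage sketch for ``train`` above. Hedged: assumes a parser instance built or
# loaded beforehand; the data paths are placeholders.
#
#   parser.train('data/train.conllx', 'data/dev.conllx', 'data/test.conllx',
#                batch_size=5000, epochs=100, patience=10)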
@classmethod
def build(cls, path, min_freq=2, fix_len=20, **kwargs):
    r"""
    Build a brand-new Parser, including initialization of all data fields and model parameters.

    Args:
        path (str):
            The path of the model to be saved.
        min_freq (int):
            The minimum frequency needed to include a token in the vocabulary. Default: 2.
        fix_len (int):
            The max length of all subword pieces. The excess part of each piece will be truncated.
            Required if using CharLSTM/BERT. Default: 20.
        kwargs (dict):
            A dict holding the unconsumed arguments.
    """
    args = Config(**locals())
    args.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    os.makedirs(os.path.dirname(path) or './', exist_ok=True)
    if os.path.exists(path) and not args.build:
        parser = cls.load(**args)
        parser.model = cls.MODEL(**parser.args)
        parser.model.load_pretrained(parser.WORD.embed).to(args.device)
        return parser

    logger.info("Building the fields")
    TAG, CHAR, ELMO, BERT = None, None, None, None
    if args.encoder == 'bert':
        from transformers import (AutoTokenizer, GPT2Tokenizer, GPT2TokenizerFast)
        t = AutoTokenizer.from_pretrained(args.bert)
        WORD = SubwordField('words',
                            pad=t.pad_token,
                            unk=t.unk_token,
                            bos=t.bos_token or t.cls_token,
                            eos=t.eos_token or t.sep_token,
                            fix_len=args.fix_len,
                            tokenize=t.tokenize,
                            fn=None if not isinstance(t, (GPT2Tokenizer, GPT2TokenizerFast)) else lambda x: ' '+x)
        WORD.vocab = t.get_vocab()
    else:
        WORD = Field('words', pad=PAD, unk=UNK, bos=BOS, eos=EOS, lower=True)
        if 'tag' in args.feat:
            TAG = Field('tags', bos=BOS, eos=EOS)
        if 'char' in args.feat:
            CHAR = SubwordField('chars', pad=PAD, unk=UNK, bos=BOS, eos=EOS, fix_len=args.fix_len)
        if 'elmo' in args.feat:
            from allennlp.modules.elmo import batch_to_ids
            ELMO = RawField('elmo')
            ELMO.compose = lambda x: batch_to_ids(x).to(WORD.device)
        if 'bert' in args.feat:
            from transformers import (AutoTokenizer, GPT2Tokenizer, GPT2TokenizerFast)
            t = AutoTokenizer.from_pretrained(args.bert)
            BERT = SubwordField('bert',
                                pad=t.pad_token,
                                unk=t.unk_token,
                                bos=t.bos_token or t.cls_token,
                                eos=t.eos_token or t.sep_token,
                                fix_len=args.fix_len,
                                tokenize=t.tokenize,
                                fn=None if not isinstance(t, (GPT2Tokenizer, GPT2TokenizerFast)) else lambda x: ' '+x)
            BERT.vocab = t.get_vocab()
    TREE = RawField('trees')
    CHART = ChartField('charts')
    transform = Tree(WORD=(WORD, CHAR, ELMO, BERT), POS=TAG, TREE=TREE, CHART=CHART)

    train = Dataset(transform, args.train)
    if args.encoder != 'bert':
        WORD.build(train, args.min_freq, (Embedding.load(args.embed, args.unk) if args.embed else None))
        if TAG is not None:
            TAG.build(train)
        if CHAR is not None:
            CHAR.build(train)
    CHART.build(train)
    args.update({
        'n_words': len(WORD.vocab) if args.encoder == 'bert' else WORD.vocab.n_init,
        'n_labels': len(CHART.vocab),
        'n_tags': len(TAG.vocab) if TAG is not None else None,
        'n_chars': len(CHAR.vocab) if CHAR is not None else None,
        'char_pad_index': CHAR.pad_index if CHAR is not None else None,
        'bert_pad_index': BERT.pad_index if BERT is not None else None,
        'pad_index': WORD.pad_index,
        'unk_index': WORD.unk_index,
        'bos_index': WORD.bos_index,
        'eos_index': WORD.eos_index
    })
    logger.info(f"{transform}")

    logger.info("Building the model")
    model = cls.MODEL(**args).load_pretrained(WORD.embed if hasattr(WORD, 'embed') else None).to(args.device)
    logger.info(f"{model}\n")

    return cls(args, model, transform)
@classmethod
def build(cls, path,
          optimizer_args={'lr': 2e-3, 'betas': (.9, .9), 'eps': 1e-12},
          scheduler_args={'gamma': .75**(1/5000)},
          min_freq=2,
          fix_len=20,
          **kwargs):
    r"""
    Build a brand-new Parser, including initialization of all data fields and model parameters.

    Args:
        path (str):
            The path of the model to be saved.
        optimizer_args (dict):
            Arguments for creating an optimizer.
        scheduler_args (dict):
            Arguments for creating a scheduler.
        min_freq (int):
            The minimum frequency needed to include a token in the vocabulary. Default: 2.
        fix_len (int):
            The max length of all subword pieces. The excess part of each piece will be truncated.
            Required if using CharLSTM/BERT. Default: 20.
        kwargs (dict):
            A dict holding the unconsumed arguments.
    """
    args = Config(**locals())
    args.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    os.makedirs(os.path.dirname(path), exist_ok=True)
    if os.path.exists(path) and not args.build:
        parser = cls.load(**args)
        parser.model = cls.MODEL(**parser.args)
        parser.model.load_pretrained(parser.WORD.embed).to(args.device)
        return parser

    logger.info("Building the fields")
    WORD = Field('words', pad=pad, unk=unk, bos=bos, lower=True)
    if args.feat == 'char':
        FEAT = SubwordField('chars', pad=pad, unk=unk, bos=bos, fix_len=args.fix_len)
    elif args.feat == 'bert':
        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(args.bert)
        FEAT = SubwordField('bert',
                            pad=tokenizer.pad_token,
                            unk=tokenizer.unk_token,
                            bos=tokenizer.bos_token or tokenizer.cls_token,
                            fix_len=args.fix_len,
                            tokenize=tokenizer.tokenize)
        FEAT.vocab = tokenizer.get_vocab()
    else:
        FEAT = Field('tags', bos=bos)
    ARC = Field('arcs', bos=bos, use_vocab=False, fn=CoNLL.get_arcs)
    SIB = Field('sibs', bos=bos, use_vocab=False, fn=CoNLL.get_sibs)
    REL = Field('rels', bos=bos)
    if args.feat in ('char', 'bert'):
        transform = CoNLL(FORM=(WORD, FEAT), HEAD=(ARC, SIB), DEPREL=REL)
    else:
        transform = CoNLL(FORM=WORD, CPOS=FEAT, HEAD=(ARC, SIB), DEPREL=REL)

    train = Dataset(transform, args.train)
    WORD.build(train, args.min_freq, (Embedding.load(args.embed, args.unk) if args.embed else None))
    FEAT.build(train)
    REL.build(train)
    args.update({
        'n_words': WORD.vocab.n_init,
        'n_feats': len(FEAT.vocab),
        'n_rels': len(REL.vocab),
        'pad_index': WORD.pad_index,
        'unk_index': WORD.unk_index,
        'bos_index': WORD.bos_index,
        'feat_pad_index': FEAT.pad_index
    })
    logger.info(f"{transform}")

    logger.info("Building the model")
    model = cls.MODEL(**args).load_pretrained(WORD.embed).to(args.device)
    logger.info(f"{model}\n")
    optimizer = Adam(model.parameters(), **optimizer_args)
    scheduler = ExponentialLR(optimizer, **scheduler_args)

    return cls(args, model, transform, optimizer, scheduler)
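# Note on the ``scheduler_args`` default above: ExponentialLR multiplies the lr
# by ``gamma`` after every step, so ``gamma = .75**(1/5000)`` shrinks the lr by
# a factor of .75 every 5000 steps. A minimal self-contained check (the
# one-parameter model is a stand-in, not part of this codebase):
import torch
from torch.optim import Adam
from torch.optim.lr_scheduler import ExponentialLR

param = torch.nn.Parameter(torch.zeros(1))
optimizer = Adam([param], lr=2e-3)
scheduler = ExponentialLR(optimizer, gamma=.75 ** (1 / 5000))
for _ in range(5000):
    optimizer.step()
    scheduler.step()
print(optimizer.param_groups[0]['lr'])  # ~1.5e-3, i.e. 2e-3 * .75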
@classmethod
def build(cls, path, min_freq=2, fix_len=20, **kwargs):
    r"""
    Build a brand-new Parser, including initialization of all data fields and model parameters.

    Args:
        path (str):
            The path of the model to be saved.
        min_freq (int):
            The minimum frequency needed to include a token in the vocabulary. Default: 2.
        fix_len (int):
            The max length of all subword pieces. The excess part of each piece will be truncated.
            Required if using CharLSTM/BERT. Default: 20.
        kwargs (dict):
            A dict holding the unconsumed arguments.
    """
    args = Config(**locals())
    args.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    os.makedirs(os.path.dirname(path), exist_ok=True)
    if os.path.exists(path) and not args.build:
        parser = cls.load(**args)
        parser.model = cls.MODEL(**parser.args)
        parser.model.load_pretrained(parser.WORD.embed).to(args.device)
        return parser

    logger.info("Building the fields")
    WORD = Field('words', pad=pad, unk=unk, bos=bos, lower=True)
    if args.feat == 'char':
        FEAT = SubwordField('chars', pad=pad, unk=unk, bos=bos, fix_len=args.fix_len)
    elif args.feat == 'bert':
        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(args.bert)
        FEAT = SubwordField('bert',
                            pad=tokenizer.pad_token,
                            unk=tokenizer.unk_token,
                            bos=tokenizer.bos_token or tokenizer.cls_token,
                            fix_len=args.fix_len,
                            tokenize=tokenizer.tokenize)
        FEAT.vocab = tokenizer.get_vocab()
    elif args.feat == 'elmo':
        logger.info("Initializing ElmoField")
        FEAT = ElmoField('elmo', bos=bos)
    else:
        FEAT = Field('tags', bos=bos)
    ARC = Field('arcs', bos=bos, use_vocab=False, fn=CoNLL.get_arcs)
    REL = Field('rels', bos=bos)
    if args.feat in ('char', 'bert'):
        transform = CoNLL(FORM=(WORD, FEAT), HEAD=ARC, DEPREL=REL)
    elif args.feat == 'elmo':
        logger.info("Building the CoNLL transform")
        # FEAT has as many as 3 layers; this will need to be fixed somehow
        transform = CoNLL(FORM=(WORD, FEAT), HEAD=ARC, DEPREL=REL)
    else:
        transform = CoNLL(FORM=WORD, CPOS=FEAT, HEAD=ARC, DEPREL=REL)

    logger.info("Initializing the train Dataset")
    train = Dataset(transform, args.train)
    # WORD.build(train, args.min_freq, (Embedding.load(args.embed, args.unk) if args.embed else None))
    logger.info("Building the WORD, FEAT and REL fields")
    WORD.build(train)
    FEAT.build(train)
    REL.build(train)
    args.update({
        'n_words': WORD.vocab.n_init,
        'n_feats': len(FEAT.vocab),
        'n_rels': len(REL.vocab),
        'pad_index': WORD.pad_index,
        'unk_index': WORD.unk_index,
        'bos_index': WORD.bos_index,
        'feat_pad_index': FEAT.pad_index,
    })
    logger.info("Loading the model")
    model = cls.MODEL(**args)
    model.load_pretrained(WORD.embed).to(args.device)

    return cls(args, model, transform)
def train(self, train, dev, test, buckets=32, batch_size=5000, update_steps=1,
          clip=5.0, epochs=5000, patience=100, **kwargs):
    args = self.args.update(locals())
    init_logger(logger, verbose=args.verbose)

    self.transform.train()
    if dist.is_initialized():
        args.batch_size = args.batch_size // dist.get_world_size()
    logger.info("Loading the data")
    train = Dataset(self.transform, args.train, **args)
    dev = Dataset(self.transform, args.dev)
    test = Dataset(self.transform, args.test)
    train.build(args.batch_size // args.update_steps, args.buckets, True, dist.is_initialized())
    dev.build(args.batch_size, args.buckets)
    test.build(args.batch_size, args.buckets)
    logger.info(f"\n{'train:':6} {train}\n{'dev:':6} {dev}\n{'test:':6} {test}\n")

    if args.encoder == 'lstm':
        self.optimizer = Adam(self.model.parameters(), args.lr, (args.mu, args.nu), args.eps, args.weight_decay)
        self.scheduler = ExponentialLR(self.optimizer, args.decay**(1/args.decay_steps))
    else:
        from transformers import AdamW, get_linear_schedule_with_warmup
        steps = len(train.loader) * epochs // args.update_steps
        self.optimizer = AdamW([{'params': c.parameters(),
                                 'lr': args.lr * (1 if n == 'encoder' else args.lr_rate)}
                                for n, c in self.model.named_children()],
                               args.lr)
        self.scheduler = get_linear_schedule_with_warmup(self.optimizer, int(steps*args.warmup), steps)

    if dist.is_initialized():
        self.model = DDP(self.model, device_ids=[args.local_rank], find_unused_parameters=True)

    elapsed = timedelta()
    best_e, best_metric = 1, Metric()
    for epoch in range(1, args.epochs + 1):
        start = datetime.now()
        logger.info(f"Epoch {epoch} / {args.epochs}:")

        self._train(train.loader)
        loss, dev_metric = self._evaluate(dev.loader)
        logger.info(f"{'dev:':5} loss: {loss:.4f} - {dev_metric}")
        loss, test_metric = self._evaluate(test.loader)
        logger.info(f"{'test:':5} loss: {loss:.4f} - {test_metric}")

        t = datetime.now() - start
        # save the model if it is the best so far
        if dev_metric > best_metric:
            best_e, best_metric = epoch, dev_metric
            if is_master():
                self.save(args.path)
            logger.info(f"{t}s elapsed (saved)\n")
        else:
            logger.info(f"{t}s elapsed\n")
        elapsed += t
        if epoch - best_e >= args.patience:
            break
    loss, metric = self.load(**args)._evaluate(test.loader)

    logger.info(f"Epoch {best_e} saved")
    logger.info(f"{'dev:':5} {best_metric}")
    logger.info(f"{'test:':5} {metric}")
    logger.info(f"{elapsed}s elapsed, {elapsed / epoch}s/epoch")
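# Sketch of the layer-wise lr grouping above: with a pretrained encoder, the
# 'encoder' child keeps the base lr while every other submodule is scaled by
# ``lr_rate``. A self-contained toy version (``encoder``/``head`` are stand-in
# names; torch.optim.AdamW substitutes for transformers' AdamW here):
import torch.nn as nn
from torch.optim import AdamW

model = nn.ModuleDict({'encoder': nn.Linear(4, 4), 'head': nn.Linear(4, 2)})
lr, lr_rate = 5e-5, 20
optimizer = AdamW([{'params': c.parameters(),
                    'lr': lr * (1 if n == 'encoder' else lr_rate)}
                   for n, c in model.named_children()], lr=lr)
print([group['lr'] for group in optimizer.param_groups])  # [5e-05, 0.001]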
@classmethod
def build(cls, path, min_freq=2, fix_len=20, **kwargs):
    r"""
    Build a brand-new Parser, including initialization of all data fields and model parameters.

    Args:
        path (str):
            The path of the model to be saved.
        min_freq (int):
            The minimum frequency needed to include a token in the vocabulary. Default: 2.
        fix_len (int):
            The max length of all subword pieces. The excess part of each piece will be truncated.
            Required if using CharLSTM/BERT. Default: 20.
        kwargs (dict):
            A dict holding the unconsumed arguments.
    """
    args = Config(**locals())
    args.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    os.makedirs(os.path.dirname(path), exist_ok=True)
    if os.path.exists(path) and not args.build:
        # load the existing model
        parser = cls.load(**args)
        parser.model = cls.MODEL(**parser.args)
        parser.model.load_pretrained(parser.WORD.embed).to(args.device)
        return parser

    logger.info("Building the fields")
    WORD = Field('words', pad=pad, unk=unk, lower=True)
    if args.feat == 'char':
        FEAT = SubwordField('chars', pad=pad, unk=unk, fix_len=args.fix_len)
    # how to use BERT here: worth studying
    elif args.feat == 'bert':
        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(args.bert)
        FEAT = SubwordField('bert',
                            pad=tokenizer.pad_token,
                            unk=tokenizer.unk_token,
                            fix_len=args.fix_len,
                            tokenize=tokenizer.tokenize)
        FEAT.vocab = tokenizer.get_vocab()
    else:
        FEAT = Field('tags')
    EDGE = ChartField('edges', use_vocab=False, fn=CoNLL.get_edges)
    LABEL = ChartField('labels', fn=CoNLL.get_labels)
    # new fields are defined for extracting the edges and labels of the graph
    if args.feat in ('char', 'bert'):
        transform = CoNLL(FORM=(WORD, FEAT), PHEAD=(EDGE, LABEL))
    else:
        transform = CoNLL(FORM=WORD, POS=FEAT, PHEAD=(EDGE, LABEL))

    train = Dataset(transform, args.train)
    WORD.build(train, args.min_freq, (Embedding.load(args.embed, args.unk) if args.embed else None))
    FEAT.build(train)
    LABEL.build(train)
    args.update({
        'n_words': WORD.vocab.n_init,
        'n_feats': len(FEAT.vocab),
        'n_labels': len(LABEL.vocab),
        'pad_index': WORD.pad_index,
        'unk_index': WORD.unk_index,
        'feat_pad_index': FEAT.pad_index
    })
    model = cls.MODEL(**args)
    model.load_pretrained(WORD.embed).to(args.device)

    return cls(args, model, transform)
def train(self, train, dev, test, buckets=32, batch_size=5000, clip=5.0,
          epochs=5000, patience=100, **kwargs):
    args = self.args.update(locals())
    init_logger(logger, verbose=args.verbose)

    self.transform.train()
    if dist.is_initialized():
        args.batch_size = args.batch_size // dist.get_world_size()
    logger.info("Loading the data")
    train = Dataset(self.transform, args.train, **args)
    dev = Dataset(self.transform, args.dev)
    test = Dataset(self.transform, args.test)
    train.build(args.batch_size, args.buckets, True, dist.is_initialized())
    dev.build(args.batch_size, args.buckets)
    test.build(args.batch_size, args.buckets)
    logger.info(f"\n{'train:':6} {train}\n{'dev:':6} {dev}\n{'test:':6} {test}\n")

    if dist.is_initialized():
        self.model = DDP(self.model, device_ids=[args.local_rank], find_unused_parameters=True)

    elapsed = timedelta()
    best_e, best_metric = 1, Metric()
    for epoch in range(1, args.epochs + 1):
        start = datetime.now()
        logger.info(f"Epoch {epoch} / {args.epochs}:")

        # if epoch < 2:
        #     self._train(train.loader)
        # else:
        #     print('Using margin loss')
        self._train(train.loader, loss_type='margin')
        loss, dev_metric = self._evaluate(dev.loader)
        logger.info(f"{'dev:':5} loss: {loss:.4f} - {dev_metric}")
        loss, test_metric = self._evaluate(test.loader)
        logger.info(f"{'test:':5} loss: {loss:.4f} - {test_metric}")

        t = datetime.now() - start
        # save the model if it is the best so far
        if dev_metric > best_metric:
            best_e, best_metric = epoch, dev_metric
            if is_master():
                self.save(args.path)
            logger.info(f"{t}s elapsed (saved)\n")
        else:
            logger.info(f"{t}s elapsed\n")
        elapsed += t
        if epoch - best_e >= args.patience:
            break
    loss, metric = self.load(**args)._evaluate(test.loader)

    logger.info(f"Epoch {best_e} saved")
    logger.info(f"{'dev:':5} {best_metric}")
    logger.info(f"{'test:':5} {metric}")
    logger.info(f"{elapsed}s elapsed, {elapsed / epoch}s/epoch")
@classmethod
def build(cls, path, min_freq=2, fix_len=20, **kwargs):
    """
    Build a brand-new Parser, including initialization of all data fields and model parameters.

    Args:
        path (str):
            The path of the model to be saved.
        min_freq (int):
            The minimum frequency needed to include a token in the vocabulary. Default: 2.
        fix_len (int):
            The max length of all subword pieces. The excess part of each piece will be truncated.
            Required if using CharLSTM/BERT. Default: 20.
        kwargs (dict):
            A dict holding the unconsumed arguments.

    Returns:
        The created parser.
    """
    args = Config(**locals())
    args.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    os.makedirs(os.path.dirname(path), exist_ok=True)
    if os.path.exists(path) and not args.build:
        parser = cls.load(**args)
        parser.model = cls.MODEL(**parser.args)
        parser.model.load_pretrained(parser.WORD.embed).to(args.device)
        return parser

    logger.info("Building the fields")
    WORD = Field('words', pad=pad, unk=unk, bos=bos, eos=eos, lower=True)
    if args.feat == 'char':
        FEAT = SubwordField('chars', pad=pad, unk=unk, bos=bos, eos=eos, fix_len=args.fix_len)
    elif args.feat == 'bert':
        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(args.bert)
        FEAT = SubwordField('bert',
                            pad=tokenizer.pad_token,
                            unk=tokenizer.unk_token,
                            bos=tokenizer.bos_token or tokenizer.cls_token,
                            eos=tokenizer.eos_token or tokenizer.sep_token,
                            fix_len=args.fix_len,
                            tokenize=tokenizer.tokenize)
        FEAT.vocab = tokenizer.get_vocab()
    else:
        FEAT = Field('tags', bos=bos, eos=eos)
    TREE = RawField('trees')
    CHART = ChartField('charts')
    if args.feat in ('char', 'bert'):
        transform = Tree(WORD=(WORD, FEAT), TREE=TREE, CHART=CHART)
    else:
        transform = Tree(WORD=WORD, POS=FEAT, TREE=TREE, CHART=CHART)

    train = Dataset(transform, args.train)
    WORD.build(train, args.min_freq, (Embedding.load(args.embed, args.unk) if args.embed else None))
    FEAT.build(train)
    CHART.build(train)
    args.update({
        'n_words': WORD.vocab.n_init,
        'n_feats': len(FEAT.vocab),
        'n_labels': len(CHART.vocab),
        'pad_index': WORD.pad_index,
        'unk_index': WORD.unk_index,
        'bos_index': WORD.bos_index,
        'eos_index': WORD.eos_index,
        'feat_pad_index': FEAT.pad_index
    })
    model = cls.MODEL(**args)
    model.load_pretrained(WORD.embed).to(args.device)

    return cls(args, model, transform)
@classmethod
def build(cls, path, min_freq=2, fix_len=20, **kwargs):
    r"""
    Build a brand-new Parser, including initialization of all data fields and model parameters.

    Args:
        path (str):
            The path of the model to be saved.
        min_freq (int):
            The minimum frequency needed to include a token in the vocabulary. Default: 2.
        fix_len (int):
            The max length of all subword pieces. The excess part of each piece will be truncated.
            Required if using CharLSTM/BERT. Default: 20.
        kwargs (dict):
            A dict holding the unconsumed arguments.
    """
    args = Config(**locals())
    args.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    os.makedirs(os.path.dirname(path) or './', exist_ok=True)
    if os.path.exists(path) and not args.build:
        parser = cls.load(**args)
        parser.model = cls.MODEL(**parser.args)
        parser.model.load_pretrained(parser.WORD.embed).to(args.device)
        return parser

    logger.info("Building the fields")
    TAG, CHAR, BERT = None, None, None
    if args.encoder != 'lstm':
        from transformers import (AutoTokenizer, GPT2Tokenizer, GPT2TokenizerFast)
        t = AutoTokenizer.from_pretrained(args.bert)
        WORD = SubwordField('words',
                            pad=t.pad_token,
                            unk=t.unk_token,
                            bos=t.bos_token or t.cls_token,
                            fix_len=args.fix_len,
                            tokenize=t.tokenize,
                            fn=None if not isinstance(t, (GPT2Tokenizer, GPT2TokenizerFast)) else lambda x: ' '+x)
        WORD.vocab = t.get_vocab()
    else:
        WORD = Field('words', pad=pad, unk=unk, bos=bos, lower=True)
        if 'tag' in args.feat:
            TAG = Field('tags', bos=bos)
        if 'char' in args.feat:
            CHAR = SubwordField('chars', pad=pad, unk=unk, bos=bos, fix_len=args.fix_len)
        if 'bert' in args.feat:
            from transformers import (AutoTokenizer, GPT2Tokenizer, GPT2TokenizerFast)
            t = AutoTokenizer.from_pretrained(args.bert)
            BERT = SubwordField('bert',
                                pad=t.pad_token,
                                unk=t.unk_token,
                                bos=t.bos_token or t.cls_token,
                                fix_len=args.fix_len,
                                tokenize=t.tokenize,
                                fn=None if not isinstance(t, (GPT2Tokenizer, GPT2TokenizerFast)) else lambda x: ' '+x)
            BERT.vocab = t.get_vocab()
    TEXT = RawField('texts')
    ARC = Field('arcs', bos=bos, use_vocab=False, fn=CoNLL.get_arcs)
    SIB = ChartField('sibs', bos=bos, use_vocab=False, fn=CoNLL.get_sibs)
    REL = Field('rels', bos=bos)
    transform = CoNLL(FORM=(WORD, TEXT, CHAR, BERT), CPOS=TAG, HEAD=(ARC, SIB), DEPREL=REL)

    train = Dataset(transform, args.train)
    if args.encoder == 'lstm':
        WORD.build(train, args.min_freq, (Embedding.load(args.embed, args.unk) if args.embed else None))
        if TAG is not None:
            TAG.build(train)
        if CHAR is not None:
            CHAR.build(train)
    REL.build(train)
    args.update({
        'n_words': len(WORD.vocab) if args.encoder != 'lstm' else WORD.vocab.n_init,
        'n_rels': len(REL.vocab),
        'n_tags': len(TAG.vocab) if TAG is not None else None,
        'n_chars': len(CHAR.vocab) if CHAR is not None else None,
        'char_pad_index': CHAR.pad_index if CHAR is not None else None,
        'bert_pad_index': BERT.pad_index if BERT is not None else None,
        'pad_index': WORD.pad_index,
        'unk_index': WORD.unk_index,
        'bos_index': WORD.bos_index
    })
    logger.info(f"{transform}")

    logger.info("Building the model")
    model = cls.MODEL(**args).load_pretrained(WORD.embed if hasattr(WORD, 'embed') else None).to(args.device)
    logger.info(f"{model}\n")

    return cls(args, model, transform)
@classmethod
def build(cls, path,
          optimizer_args={'lr': 1e-3, 'betas': (.0, .95), 'eps': 1e-12, 'weight_decay': 3e-9},
          scheduler_args={'gamma': .75**(1/5000)},
          min_freq=7,
          fix_len=20,
          **kwargs):
    r"""
    Build a brand-new Parser, including initialization of all data fields and model parameters.

    Args:
        path (str):
            The path of the model to be saved.
        optimizer_args (dict):
            Arguments for creating an optimizer.
        scheduler_args (dict):
            Arguments for creating a scheduler.
        min_freq (int):
            The minimum frequency needed to include a token in the vocabulary. Default: 7.
        fix_len (int):
            The max length of all subword pieces. The excess part of each piece will be truncated.
            Required if using CharLSTM/BERT. Default: 20.
        kwargs (dict):
            A dict holding the unconsumed arguments.
    """
    args = Config(**locals())
    args.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    os.makedirs(os.path.dirname(path), exist_ok=True)
    if os.path.exists(path) and not args.build:
        parser = cls.load(**args)
        parser.model = cls.MODEL(**parser.args)
        parser.model.load_pretrained(parser.WORD.embed).to(args.device)
        return parser

    logger.info("Building the fields")
    WORD = Field('words', pad=pad, unk=unk, bos=bos, lower=True)
    TAG, CHAR, LEMMA, BERT = None, None, None, None
    if 'tag' in args.feat:
        TAG = Field('tags', bos=bos)
    if 'char' in args.feat:
        CHAR = SubwordField('chars', pad=pad, unk=unk, bos=bos, fix_len=args.fix_len)
    if 'lemma' in args.feat:
        LEMMA = Field('lemmas', pad=pad, unk=unk, bos=bos, lower=True)
    if 'bert' in args.feat:
        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(args.bert)
        BERT = SubwordField('bert',
                            pad=tokenizer.pad_token,
                            unk=tokenizer.unk_token,
                            bos=tokenizer.bos_token or tokenizer.cls_token,
                            fix_len=args.fix_len,
                            tokenize=tokenizer.tokenize)
        BERT.vocab = tokenizer.get_vocab()
    EDGE = ChartField('edges', use_vocab=False, fn=CoNLL.get_edges)
    LABEL = ChartField('labels', fn=CoNLL.get_labels)
    transform = CoNLL(FORM=(WORD, CHAR, BERT), LEMMA=LEMMA, POS=TAG, PHEAD=(EDGE, LABEL))

    train = Dataset(transform, args.train)
    WORD.build(train, args.min_freq, (Embedding.load(args.embed, args.unk) if args.embed else None))
    if TAG is not None:
        TAG.build(train)
    if CHAR is not None:
        CHAR.build(train)
    if LEMMA is not None:
        LEMMA.build(train)
    LABEL.build(train)
    args.update({
        'n_words': WORD.vocab.n_init,
        'n_labels': len(LABEL.vocab),
        'n_tags': len(TAG.vocab) if TAG is not None else None,
        'n_chars': len(CHAR.vocab) if CHAR is not None else None,
        'char_pad_index': CHAR.pad_index if CHAR is not None else None,
        'n_lemmas': len(LEMMA.vocab) if LEMMA is not None else None,
        'bert_pad_index': BERT.pad_index if BERT is not None else None,
        'pad_index': WORD.pad_index,
        'unk_index': WORD.unk_index
    })
    logger.info(f"{transform}")

    logger.info("Building the model")
    model = cls.MODEL(**args).load_pretrained(WORD.embed).to(args.device)
    logger.info(f"{model}\n")
    optimizer = Adam(model.parameters(), **optimizer_args)
    scheduler = ExponentialLR(optimizer, **scheduler_args)

    return cls(args, model, transform, optimizer, scheduler)
def train(self, train, dev, test, buckets=32, batch_size=5000, update_steps=1,
          clip=5.0, epochs=5000, patience=100, **kwargs):
    args = self.args.update(locals())
    init_logger(logger, verbose=args.verbose)

    self.transform.train()
    batch_size = batch_size // update_steps
    if dist.is_initialized():
        batch_size = batch_size // dist.get_world_size()
    logger.info("Loading the data")
    train = Dataset(self.transform, args.train, **args).build(batch_size, buckets, True, dist.is_initialized())
    dev = Dataset(self.transform, args.dev).build(batch_size, buckets)
    test = Dataset(self.transform, args.test).build(batch_size, buckets)
    logger.info(f"\n{'train:':6} {train}\n{'dev:':6} {dev}\n{'test:':6} {test}\n")

    if args.encoder == 'lstm':
        self.optimizer = Adam(self.model.parameters(), args.lr, (args.mu, args.nu), args.eps, args.weight_decay)
        self.scheduler = ExponentialLR(self.optimizer, args.decay**(1/args.decay_steps))
    else:
        from transformers import AdamW, get_linear_schedule_with_warmup
        steps = len(train.loader) * epochs // args.update_steps
        self.optimizer = AdamW([{'params': p,
                                 'lr': args.lr * (1 if n.startswith('encoder') else args.lr_rate)}
                                for n, p in self.model.named_parameters()],
                               args.lr)
        self.scheduler = get_linear_schedule_with_warmup(self.optimizer, int(steps*args.warmup), steps)

    if dist.is_initialized():
        self.model = DDP(self.model, device_ids=[args.local_rank], find_unused_parameters=True)

    self.epoch, self.best_e, self.patience = 1, 1, patience
    self.best_metric, self.elapsed = Metric(), timedelta()
    if self.args.checkpoint:
        self.optimizer.load_state_dict(self.checkpoint_state_dict.pop('optimizer_state_dict'))
        self.scheduler.load_state_dict(self.checkpoint_state_dict.pop('scheduler_state_dict'))
        set_rng_state(self.checkpoint_state_dict.pop('rng_state'))
        for k, v in self.checkpoint_state_dict.items():
            setattr(self, k, v)
        train.loader.batch_sampler.epoch = self.epoch

    for epoch in range(self.epoch, args.epochs + 1):
        start = datetime.now()
        logger.info(f"Epoch {epoch} / {args.epochs}:")

        self._train(train.loader)
        loss, dev_metric = self._evaluate(dev.loader)
        logger.info(f"{'dev:':5} loss: {loss:.4f} - {dev_metric}")
        loss, test_metric = self._evaluate(test.loader)
        logger.info(f"{'test:':5} loss: {loss:.4f} - {test_metric}")

        t = datetime.now() - start
        self.epoch += 1
        self.patience -= 1
        self.elapsed += t

        if dev_metric > self.best_metric:
            self.best_e, self.patience, self.best_metric = epoch, patience, dev_metric
            if is_master():
                self.save_checkpoint(args.path)
            logger.info(f"{t}s elapsed (saved)\n")
        else:
            logger.info(f"{t}s elapsed\n")
        if self.patience < 1:
            break
    parser = self.load(**args)
    loss, metric = parser._evaluate(test.loader)
    parser.save(args.path)

    logger.info(f"Epoch {self.best_e} saved")
    logger.info(f"{'dev:':5} {self.best_metric}")
    logger.info(f"{'test:':5} {metric}")
    logger.info(f"{self.elapsed}s elapsed, {self.elapsed / epoch}s/epoch")
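# Resume sketch for the checkpoint-aware ``train`` above. Hedged: assumes the
# entry point passes ``checkpoint=True`` and that ``load`` restores
# ``checkpoint_state_dict`` the way this method expects; paths are placeholders.
#
#   parser = Parser.load('exp/model.pt', checkpoint=True)
#   parser.train('data/train.conllx', 'data/dev.conllx', 'data/test.conllx')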