def __init__(self, trn: str = None, dev: str = None, tst: str = None, sampler_builder: SamplerBuilder = None,
             dependencies: str = None, scalar_mix: ScalarMixWithDropoutBuilder = None, use_raw_hidden_states=False,
             lr=2e-3, separate_optimizer=False, punct=False, tree=False, apply_constraint=True, n_mlp_arc=500,
             n_mlp_rel=100, mlp_dropout=.33, pad_rel=None, joint=True, mu=.9, nu=.9, epsilon=1e-12,
             cls_is_bos=True, **kwargs) -> None:
    super().__init__(**merge_locals_kwargs(locals(), kwargs))
    self.vocabs = VocabDict()
def tensorize(raw_batch: Dict[str, Any], vocabs: VocabDict, pad_dict: Dict[str, int] = None, device=None):
    for field, data in raw_batch.items():
        if isinstance(data, torch.Tensor):
            continue
        vocab_key = field[:-len('_id')] if field.endswith('_id') else None
        vocab: Vocab = vocabs.get(vocab_key, None) if vocabs and vocab_key else None
        if vocab:
            pad = vocab.safe_pad_token_idx
            dtype = torch.long
        elif pad_dict is not None and field in pad_dict:
            pad = pad_dict[field]
            dtype = dtype_of(pad)
        elif field.endswith('_offset') or field.endswith('_id') or field.endswith('_count') \
                or field.endswith('_ids') or field.endswith('_score') or field.endswith('_length') \
                or field.endswith('_span'):
            # guess some common fields to pad
            pad = 0
            dtype = torch.long
        elif field.endswith('_mask'):
            pad = False
            dtype = torch.bool
        else:
            # no need to pad
            continue
        data = PadSequenceDataLoader.pad_data(data, pad, dtype)
        raw_batch[field] = data
    if device is not None:
        for field, data in raw_batch.items():
            if isinstance(data, torch.Tensor):
                data = data.to(device)
            raw_batch[field] = data
    return raw_batch
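# Hedged, self-contained sketch of the naming-convention padding idea above: fields ending in
# '_id'/'_ids'/'_length' are padded with 0 as torch.long, '_mask' fields with False as torch.bool.
# The helper and the field names are illustrative assumptions, not part of the function above.
import torch


def pad_lists(rows, pad_value, dtype):
    """Right-pad a list of variable-length lists into a (batch, max_len) tensor."""
    max_len = max(len(r) for r in rows)
    out = torch.full((len(rows), max_len), pad_value, dtype=dtype)
    for i, r in enumerate(rows):
        out[i, :len(r)] = torch.tensor(r, dtype=dtype)
    return out


raw_batch = {
    'token_ids': [[2, 7, 5], [9, 4]],        # padded with 0 as long
    'token_mask': [[True] * 3, [True] * 2],  # padded with False as bool
}
batch = {
    'token_ids': pad_lists(raw_batch['token_ids'], 0, torch.long),
    'token_mask': pad_lists(raw_batch['token_mask'], False, torch.bool),
}
print(batch['token_ids'].shape)  # torch.Size([2, 3])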
def __init__(self, trn: str = None, dev: str = None, tst: str = None, sampler_builder: SamplerBuilder = None,
             dependencies: str = None, scalar_mix: ScalarMixWithDropoutBuilder = None, use_raw_hidden_states=False,
             lr=2e-3, separate_optimizer=False, punct=False, tree=True, pad_rel=None, apply_constraint=False,
             single_root=True, no_zero_head=None, n_mlp_arc=500, n_mlp_rel=100, mlp_dropout=.33, mu=.9, nu=.9,
             epsilon=1e-12, decay=.75, decay_steps=5000, cls_is_bos=True, use_pos=False, **kwargs) -> None:
    r"""Implementation of "Stanford's Graph-based Neural Dependency Parser at the CoNLL 2017 Shared Task"
    (:cite:`dozat2017stanford`).

    Args:
        trn: Path to training set.
        dev: Path to dev set.
        tst: Path to test set.
        sampler_builder: A builder which builds a sampler.
        dependencies: Its dependencies on other tasks.
        scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
        use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
        lr: Learning rate for this task.
        separate_optimizer: Use customized separate optimizer for this task.
        punct: ``True`` to include punctuations in evaluation.
        tree: ``True`` to enforce tree constraint.
        pad_rel: Padding token for relations.
        apply_constraint: Enforce constraints (see following parameters).
        single_root: Force single root.
        no_zero_head: Every token has at least one head.
        n_mlp_arc: Number of features for arc representation.
        n_mlp_rel: Number of features for rel representation.
        mlp_dropout: Dropout applied to MLPs.
        mu: First coefficient used for computing running averages of gradient and its square in Adam.
        nu: Second coefficient used for computing running averages of gradient and its square in Adam.
        epsilon: Term added to the denominator to improve numerical stability.
        decay: Decay rate for the exponential lr scheduler.
        decay_steps: Decay every ``decay_steps`` steps.
        cls_is_bos: ``True`` to treat the first token as ``BOS``.
        use_pos: Use pos feature.
        **kwargs: Not used.
    """
    super().__init__(**merge_locals_kwargs(locals(), kwargs))
    self.vocabs = VocabDict()
def __init__(self, trn: str = None, dev: str = None, tst: str = None, sampler_builder: SamplerBuilder = None,
             dependencies: str = None, scalar_mix: ScalarMixWithDropoutBuilder = None, use_raw_hidden_states=False,
             lr=1e-3, separate_optimizer=False, lexical_dropout=0.5, dropout=0.2, span_width_feature_size=20,
             ffnn_size=150, ffnn_depth=2, argument_ratio=0.8, predicate_ratio=0.4, max_arg_width=30,
             mlp_label_size=100, enforce_srl_constraint=False, use_gold_predicates=False, doc_level_offset=True,
             use_biaffine=False, loss_reduction='mean', with_argument=' ', **kwargs) -> None:
    r"""An implementation of "Jointly Predicting Predicates and Arguments in Neural Semantic Role Labeling"
    (:cite:`he-etal-2018-jointly`). It generates candidate triples of (predicate, arg_start, arg_end) and ranks them.

    Args:
        trn: Path to training set.
        dev: Path to dev set.
        tst: Path to test set.
        sampler_builder: A builder which builds a sampler.
        dependencies: Its dependencies on other tasks.
        scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
        use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
        lr: Learning rate for this task.
        separate_optimizer: Use customized separate optimizer for this task.
        lexical_dropout: Dropout applied to hidden states of encoder.
        dropout: Dropout used for other layers except the encoder.
        span_width_feature_size: Span width feature size.
        ffnn_size: Feedforward size.
        ffnn_depth: Number of layers of feedforward MLPs.
        argument_ratio: Ratio of candidate arguments over number of tokens.
        predicate_ratio: Ratio of candidate predicates over number of tokens.
        max_arg_width: Maximum argument width.
        mlp_label_size: Feature size for label representation.
        enforce_srl_constraint: Enforce SRL constraints (number of core ARGs etc.).
        use_gold_predicates: Use gold predicates instead of predicting them.
        doc_level_offset: ``True`` to indicate the offsets in ``jsonlines`` are of document level.
        use_biaffine: ``True`` to use biaffine (:cite:`dozat:17a`) instead of a linear layer for label prediction.
        loss_reduction: The loss reduction used in aggregating losses.
        with_argument: The delimiter between tokens in arguments, used for joining tokens when producing outputs.
        **kwargs: Not used.
    """
    super().__init__(**merge_locals_kwargs(locals(), kwargs))
    self.vocabs = VocabDict()
def __init__(self, trn: str = None, dev: str = None, tst: str = None, sampler_builder: SamplerBuilder = None,
             dependencies: str = None, scalar_mix: ScalarMixWithDropoutBuilder = None, use_raw_hidden_states=False,
             lr=1e-3, separate_optimizer=False, cls_is_bos=False, sep_is_eos=False, max_seq_len=None,
             sent_delimiter=None, char_level=False, hard_constraint=False, crf=False, token_key='token',
             dict_tags: Union[DictInterface, Union[Dict[Union[str, Sequence[str]], Union[str, Sequence[str]]]]] = None,
             **kwargs) -> None:
    """A simple tagger using a linear layer with an optional CRF (:cite:`lafferty2001conditional`) layer for any
    tagging tasks including PoS tagging and many others. It also features a custom dictionary ``dict_tags`` which
    performs longest-prefix-matching and replaces matched tokens with the given tags.

    .. Note:: For algorithm beginners, longest-prefix-matching is the prerequisite to understand what a dictionary
        can do and what it can't do. The tutorial in `this book <http://nlp.hankcs.com/book.php>`_ can be very
        helpful.

    Args:
        trn: Path to training set.
        dev: Path to dev set.
        tst: Path to test set.
        sampler_builder: A builder which builds a sampler.
        dependencies: Its dependencies on other tasks.
        scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
        use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
        lr: Learning rate for this task.
        separate_optimizer: Use customized separate optimizer for this task.
        cls_is_bos: ``True`` to treat the first token as ``BOS``.
        sep_is_eos: ``True`` to treat the last token as ``EOS``.
        max_seq_len: Sentences longer than ``max_seq_len`` will be split into shorter ones if possible.
        sent_delimiter: Delimiter between sentences, like period or comma, which indicates a long sentence can be split here.
        char_level: Whether the sequence length is measured at char level, which is never the case for lemmatization.
        hard_constraint: Whether to enforce hard length constraint on sentences. If there is no ``sent_delimiter`` in a sentence, it will be split at a token anyway.
        crf: ``True`` to enable CRF (:cite:`lafferty2001conditional`).
        token_key: The key to tokens in dataset. This should always be set to ``token`` in MTL.
        dict_tags: A custom dictionary to override predicted tags by performing longest-prefix-matching.
        **kwargs: Not used.
    """
    super().__init__(**merge_locals_kwargs(locals(), kwargs))
    self.vocabs = VocabDict()
    self.dict_tags = dict_tags
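# Hedged sketch of longest-prefix-matching over a tag dictionary, the mechanism ``dict_tags``
# relies on: at each position, the longest dictionary entry starting there wins and its tags
# override the model's predictions. Illustrative only; not this task's actual implementation.
def longest_prefix_match(tokens, dictionary):
    """dictionary maps token tuples to tag tuples; yields (start, end, tags) matches."""
    i, matches = 0, []
    while i < len(tokens):
        best = None
        for j in range(len(tokens), i, -1):
            key = tuple(tokens[i:j])
            if key in dictionary:
                best = (i, j, dictionary[key])
                break  # longest match first, thanks to descending j
        if best:
            matches.append(best)
            i = best[1]
        else:
            i += 1
    return matches


dictionary = {('New', 'York'): ('B-LOC', 'E-LOC'), ('York',): ('S-LOC',)}
print(longest_prefix_match(['I', 'love', 'New', 'York'], dictionary))
# [(2, 4, ('B-LOC', 'E-LOC'))]  -- the longer entry shadows ('York',)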
def __init__(self, trn: str = None, dev: str = None, tst: str = None, sampler_builder: SamplerBuilder = None,
             dependencies: str = None, scalar_mix: ScalarMixWithDropoutBuilder = None, use_raw_hidden_states=False,
             lr=2e-3, separate_optimizer=False, cls_is_bos=True, sep_is_eos=False, punct=False, tree=False,
             proj=False, n_mlp_arc=500, n_mlp_rel=100, mlp_dropout=.33, mu=.9, nu=.9, epsilon=1e-12, decay=.75,
             decay_steps=5000, use_pos=False, max_seq_len=None, **kwargs) -> None:
    """Biaffine dependency parsing (:cite:`dozat:17a`).

    Args:
        trn: Path to training set.
        dev: Path to dev set.
        tst: Path to test set.
        sampler_builder: A builder which builds a sampler.
        dependencies: Its dependencies on other tasks.
        scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
        use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
        lr: Learning rate for this task.
        separate_optimizer: Use customized separate optimizer for this task.
        cls_is_bos: ``True`` to treat the first token as ``BOS``.
        sep_is_eos: ``True`` to treat the last token as ``EOS``.
        punct: ``True`` to include punctuations in evaluation.
        tree: ``True`` to enforce tree constraint.
        proj: ``True`` for projective parsing.
        n_mlp_arc: Number of features for arc representation.
        n_mlp_rel: Number of features for rel representation.
        mlp_dropout: Dropout applied to MLPs.
        mu: First coefficient used for computing running averages of gradient and its square in Adam.
        nu: Second coefficient used for computing running averages of gradient and its square in Adam.
        epsilon: Term added to the denominator to improve numerical stability.
        decay: Decay rate for the exponential lr scheduler.
        decay_steps: Decay every ``decay_steps`` steps.
        use_pos: Use pos feature.
        max_seq_len: Prune samples longer than this length.
        **kwargs: Not used.
    """
    super().__init__(**merge_locals_kwargs(locals(), kwargs))
    self.vocabs = VocabDict()
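# Hedged sketch of how mu/nu/epsilon/decay/decay_steps are conventionally wired up for biaffine
# parsers: Adam with betas=(mu, nu) and eps=epsilon, plus an exponentially decaying learning rate
# lr(t) = lr * decay ** (t / decay_steps). The actual optimizer builder in this codebase may
# differ; treat this as an illustration only.
import torch

mu, nu, epsilon = .9, .9, 1e-12
lr, decay, decay_steps = 2e-3, .75, 5000

model = torch.nn.Linear(500, 100)  # stand-in for the real parser
optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=(mu, nu), eps=epsilon)
# multiply the learning rate by `decay` every `decay_steps` updates
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda t: decay ** (t / decay_steps))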
def __init__(self, trn: str = None, dev: str = None, tst: str = None, sampler_builder: SamplerBuilder = None,
             dependencies: str = None, scalar_mix: ScalarMixWithDropoutBuilder = None, use_raw_hidden_states=False,
             lr=1e-3, separate_optimizer=False, cls_is_bos=True, sep_is_eos=True, delimiter=None, max_seq_len=None,
             sent_delimiter=None, char_level=False, hard_constraint=False, transform=None, tagging_scheme='BMES',
             crf=False, token_key='token',
             dict_force: Union[DictInterface, Union[Dict[str, Any], Set[str]]] = None,
             dict_combine: Union[DictInterface, Union[Dict[str, Any], Set[str]]] = None, **kwargs) -> None:
    """Tokenization which casts a chunking problem into a tagging problem. This task has to create batches of
    tokens containing both [CLS] and [SEP], since it's usually the first task and later tasks might need them.

    Args:
        trn: Path to training set.
        dev: Path to dev set.
        tst: Path to test set.
        sampler_builder: A builder which builds a sampler.
        dependencies: Its dependencies on other tasks.
        scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
        use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
        lr: Learning rate for this task.
        separate_optimizer: Use customized separate optimizer for this task.
        cls_is_bos: ``True`` to treat the first token as ``BOS``.
        sep_is_eos: ``True`` to treat the last token as ``EOS``.
        delimiter: Delimiter used to split a line in the corpus.
        max_seq_len: Sentences longer than ``max_seq_len`` will be split into shorter ones if possible.
        sent_delimiter: Delimiter between sentences, like period or comma, which indicates a long sentence can be split here.
        char_level: Whether the sequence length is measured at char level.
        hard_constraint: Whether to enforce hard length constraint on sentences. If there is no ``sent_delimiter`` in a sentence, it will be split at a token anyway.
        transform: An optional transform to be applied to samples. Usually a character normalization transform is passed in.
        tagging_scheme: Either ``BMES`` or ``BI``.
        crf: ``True`` to enable CRF (:cite:`lafferty2001conditional`).
        token_key: The key to tokens in dataset. This should always be set to ``token`` in MTL.
        **kwargs: Not used.
    """
    super().__init__(**merge_locals_kwargs(locals(), kwargs,
                                           excludes=('self', 'kwargs', '__class__',
                                                     'dict_force', 'dict_combine')))  # exclude dicts from config
    self.transform = transform
    self.vocabs = VocabDict()
    self.dict_force = dict_force
    self.dict_combine = dict_combine
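# Hedged sketch of the BMES scheme this task relies on to cast chunking into tagging: each
# character is tagged B(egin)/M(iddle)/E(nd) of a multi-char word, or S(ingle) for a one-char
# word. The helper names are illustrative, not part of the task's API.
def words_to_bmes(words):
    tags = []
    for w in words:
        if len(w) == 1:
            tags.append('S')
        else:
            tags.extend(['B'] + ['M'] * (len(w) - 2) + ['E'])
    return tags


def bmes_to_words(chars, tags):
    words, buf = [], ''
    for c, t in zip(chars, tags):
        buf += c
        if t in ('E', 'S'):
            words.append(buf)
            buf = ''
    if buf:
        words.append(buf)
    return words


words = ['商品', '和', '服务']
tags = words_to_bmes(words)                  # ['B', 'E', 'S', 'B', 'E']
print(bmes_to_words(''.join(words), tags))   # ['商品', '和', '服务']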
def __init__(self, trn: str = None, dev: str = None, tst: str = None, sampler_builder: SamplerBuilder = None,
             dependencies: str = None, scalar_mix: ScalarMixWithDropoutBuilder = None, use_raw_hidden_states=False,
             lr=None, separate_optimizer=False, cls_is_bos=True, sep_is_eos=True,
             delete=('', ':', '``', "''", '.', '?', '!', '-NONE-', 'TOP', ',', 'S1'), equal=(('ADVP', 'PRT'),),
             mbr=True, n_mlp_span=500, n_mlp_label=100, mlp_dropout=.33, no_subcategory=True, **kwargs) -> None:
    r"""Two-stage CRF Parsing (:cite:`ijcai2020-560`).

    Args:
        trn: Path to training set.
        dev: Path to dev set.
        tst: Path to test set.
        sampler_builder: A builder which builds a sampler.
        dependencies: Its dependencies on other tasks.
        scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
        use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
        lr: Learning rate for this task.
        separate_optimizer: Use customized separate optimizer for this task.
        cls_is_bos: ``True`` to treat the first token as ``BOS``.
        sep_is_eos: ``True`` to treat the last token as ``EOS``.
        delete: Constituencies to be deleted from training and evaluation.
        equal: Constituencies that are regarded as equal during evaluation.
        mbr: ``True`` to enable Minimum Bayes Risk (MBR) decoding (:cite:`smith-smith-2007-probabilistic`).
        n_mlp_span: Number of features for span decoder.
        n_mlp_label: Number of features for label decoder.
        mlp_dropout: Dropout applied to MLPs.
        no_subcategory: Strip out subcategories.
        **kwargs: Not used.
    """
    if isinstance(equal, tuple):
        equal = dict(equal)
    super().__init__(**merge_locals_kwargs(locals(), kwargs))
    self.vocabs = VocabDict()
def __init__(self, trn: str = None, dev: str = None, tst: str = None, sampler_builder: SamplerBuilder = None,
             dependencies: str = None, scalar_mix: ScalarMixWithDropoutBuilder = None, use_raw_hidden_states=False,
             lr=1e-3, separate_optimizer=False, cls_is_bos=False, sep_is_eos=False, max_seq_len=None,
             sent_delimiter=None, char_level=False, hard_constraint=False, crf=False, token_key='token',
             **kwargs) -> None:
    """A simple tagger using a linear layer with an optional CRF (:cite:`lafferty2001conditional`) layer for any
    tagging tasks including PoS tagging and many others.

    Args:
        trn: Path to training set.
        dev: Path to dev set.
        tst: Path to test set.
        sampler_builder: A builder which builds a sampler.
        dependencies: Its dependencies on other tasks.
        scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
        use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
        lr: Learning rate for this task.
        separate_optimizer: Use customized separate optimizer for this task.
        cls_is_bos: ``True`` to treat the first token as ``BOS``.
        sep_is_eos: ``True`` to treat the last token as ``EOS``.
        max_seq_len: Sentences longer than ``max_seq_len`` will be split into shorter ones if possible.
        sent_delimiter: Delimiter between sentences, like period or comma, which indicates a long sentence can be split here.
        char_level: Whether the sequence length is measured at char level, which is never the case for lemmatization.
        hard_constraint: Whether to enforce hard length constraint on sentences. If there is no ``sent_delimiter`` in a sentence, it will be split at a token anyway.
        crf: ``True`` to enable CRF (:cite:`lafferty2001conditional`).
        token_key: The key to tokens in dataset. This should always be set to ``token`` in MTL.
        **kwargs: Not used.
    """
    super().__init__(**merge_locals_kwargs(locals(), kwargs))
    self.vocabs = VocabDict()
def __init__(self, trn: str = None, dev: str = None, tst: str = None, sampler_builder: SamplerBuilder = None,
             dependencies: str = None, scalar_mix: ScalarMixWithDropoutBuilder = None, use_raw_hidden_states=False,
             lr=None, separate_optimizer=False, cls_is_bos=True, sep_is_eos=False, n_mlp_arc=768, n_mlp_rel=256,
             mlp_dropout=.33, tree=False, proj=False, punct=False, max_seq_len=None, **kwargs) -> None:
    r"""Universal Dependencies Parsing (lemmatization, features, PoS tagging and dependency parsing) implementation
    of "75 Languages, 1 Model: Parsing Universal Dependencies Universally" (:cite:`kondratyuk-straka-2019-75`).

    Args:
        trn: Path to training set.
        dev: Path to dev set.
        tst: Path to test set.
        sampler_builder: A builder which builds a sampler.
        dependencies: Its dependencies on other tasks.
        scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
        use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
        lr: Learning rate for this task.
        separate_optimizer: Use customized separate optimizer for this task.
        cls_is_bos: ``True`` to treat the first token as ``BOS``.
        sep_is_eos: ``True`` to treat the last token as ``EOS``.
        n_mlp_arc: Number of features for arc representation.
        n_mlp_rel: Number of features for rel representation.
        mlp_dropout: Dropout applied to MLPs.
        tree: ``True`` to enforce tree constraint.
        proj: ``True`` for projective parsing.
        punct: ``True`` to include punctuations in evaluation.
        max_seq_len: Prune samples longer than this length. Useful for reducing GPU consumption.
        **kwargs: Not used.
    """
    super().__init__(**merge_locals_kwargs(locals(), kwargs))
    self.vocabs = VocabDict()
def __init__(self, trn: str = None, dev: str = None, tst: str = None, sampler_builder: SamplerBuilder = None,
             dependencies: str = None, scalar_mix: ScalarMixWithDropoutBuilder = None, use_raw_hidden_states=False,
             lr=None, separate_optimizer=False, cls_is_bos=False, sep_is_eos=False, crf=False, n_mlp_rel=300,
             mlp_dropout=0.2, loss_reduction='mean', doc_level_offset=True, **kwargs) -> None:
    """A span based Semantic Role Labeling task using BIO scheme for tagging the role of each token. Given a
    predicate and a token, it uses biaffine (:cite:`dozat:17a`) to predict their relations as one of BIO-ROLE.

    Args:
        trn: Path to training set.
        dev: Path to dev set.
        tst: Path to test set.
        sampler_builder: A builder which builds a sampler.
        dependencies: Its dependencies on other tasks.
        scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
        use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
        lr: Learning rate for this task.
        separate_optimizer: Use customized separate optimizer for this task.
        cls_is_bos: ``True`` to treat the first token as ``BOS``.
        sep_is_eos: ``True`` to treat the last token as ``EOS``.
        crf: ``True`` to enable CRF (:cite:`lafferty2001conditional`).
        n_mlp_rel: Output size of MLPs for representing predicate and tokens.
        mlp_dropout: Dropout applied to MLPs.
        loss_reduction: Loss reduction for aggregating losses.
        doc_level_offset: ``True`` to indicate the offsets in ``jsonlines`` are of document level.
        **kwargs: Not used.
    """
    super().__init__(**merge_locals_kwargs(locals(), kwargs))
    self.vocabs = VocabDict()
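# Hedged sketch of decoding the BIO-ROLE tags this task predicts (per token, w.r.t. one
# predicate) into argument spans, the output format of SRL. Illustrative only; not this
# task's actual decoder.
def bio_to_spans(tags):
    spans, start, role = [], None, None
    for i, t in enumerate(tags + ['O']):  # sentinel flushes the last span
        if t.startswith('B-') or t == 'O':
            if start is not None:
                spans.append((start, i, role))
                start, role = None, None
            if t.startswith('B-'):
                start, role = i, t[2:]
    return spans


print(bio_to_spans(['B-ARG0', 'I-ARG0', 'O', 'B-ARGM-TMP']))
# [(0, 2, 'ARG0'), (3, 4, 'ARGM-TMP')]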
def __init__(self, trn: str = None, dev: str = None, tst: str = None, sampler_builder: SamplerBuilder = None,
             dependencies: str = None, scalar_mix: ScalarMixWithDropoutBuilder = None, use_raw_hidden_states=False,
             lr=None, separate_optimizer=False, doc_level_offset=True, is_flat_ner=True, tagset=None,
             ret_tokens=' ', ffnn_size=150, loss_reduction='mean', **kwargs) -> None:
    """An implementation of Named Entity Recognition as Dependency Parsing (:cite:`yu-etal-2020-named`). It treats
    every possible span as a candidate entity and predicts its entity label. Non-entity spans are assigned the NULL
    label to be excluded. The label prediction is done with a biaffine layer (:cite:`dozat:17a`). As it makes no
    assumption about the spans, it naturally supports both flat NER and nested NER.

    Args:
        trn: Path to training set.
        dev: Path to dev set.
        tst: Path to test set.
        sampler_builder: A builder which builds a sampler.
        dependencies: Its dependencies on other tasks.
        scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
        use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
        lr: Learning rate for this task.
        separate_optimizer: Use customized separate optimizer for this task.
        doc_level_offset: ``True`` to indicate the offsets in ``jsonlines`` are of document level.
        is_flat_ner: ``True`` for flat NER, otherwise nested NER.
        tagset: Optional tagset to prune entities outside of this tagset from datasets.
        ret_tokens: A delimiter between tokens in entities so that the surface form of an entity can be rebuilt.
        ffnn_size: Feedforward size for MLPs extracting the head/tail representations.
        loss_reduction: The loss reduction used in aggregating losses.
        **kwargs: Not used.
    """
    super().__init__(**merge_locals_kwargs(locals(), kwargs))
    self.vocabs = VocabDict()
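# Hedged sketch of the span-enumeration idea behind "NER as dependency parsing": every
# (start, end) span is a candidate, a scorer (a biaffine layer in the real model) assigns it a
# label or NULL, and flat NER additionally drops spans that overlap a higher-scoring one. This
# toy decoder is an illustration, not the paper's exact algorithm (which also handles partially
# crossing spans in the nested setting).
def decode_spans(scores, is_flat_ner=True):
    """scores maps (start, end) -> (label, score); NULL spans are assumed already removed."""
    picked = []
    for (s, e), (label, score) in sorted(scores.items(), key=lambda kv: -kv[1][1]):
        if is_flat_ner and any(s <= pe and ps <= e for ps, pe, _ in picked):
            continue  # overlaps an already selected, higher-scoring span
        picked.append((s, e, label))
    return sorted(picked)


scores = {(0, 1): ('PER', 2.5), (0, 3): ('ORG', 1.0), (2, 3): ('LOC', 3.1)}
print(decode_spans(scores))                     # flat: [(0, 1, 'PER'), (2, 3, 'LOC')]
print(decode_spans(scores, is_flat_ner=False))  # nested: all three candidate spans survive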
def __init__(self, trn: str = None, dev: str = None, tst: str = None, sampler_builder: SamplerBuilder = None,
             dependencies: str = None, scalar_mix: ScalarMixWithDropoutBuilder = None, use_raw_hidden_states=False,
             lr=1e-3, separate_optimizer=False, cls_is_bos=True, sep_is_eos=False, char2concept_dim=128,
             cnn_filters=((3, 256),), concept_char_dim=32, concept_dim=300, dropout=0.2, embed_dim=512,
             eval_every=20, ff_embed_dim=1024, graph_layers=2, inference_layers=4, num_heads=8, rel_dim=100,
             snt_layers=4, unk_rate=0.33, vocab_min_freq=5, beam_size=8, alpha=0.6, max_time_step=100,
             amr_version='2.0', **kwargs) -> None:
    super().__init__(**merge_locals_kwargs(locals(), kwargs))
    self.vocabs = VocabDict()
    utils_dir = get_resource(get_amr_utils(amr_version))
    self.sense_restore = NodeRestore(NodeUtilities.from_json(utils_dir))
class TorchComponent(Component, ABC):
    def __init__(self, **kwargs) -> None:
        """The base class for all components using PyTorch as backend. It provides common workflows of building
        vocabs, datasets, dataloaders and models. These workflows are more of a conventional guideline than enforced
        protocols, which means a subclass has the freedom to override or completely skip some steps.

        Args:
            **kwargs: Additional arguments to be stored in the ``config`` property.
        """
        super().__init__()
        self.model: Optional[torch.nn.Module] = None
        self.config = SerializableDict(**kwargs)
        self.vocabs = VocabDict()

    def _capture_config(self, locals_: Dict,
                        exclude=('trn_data', 'dev_data', 'save_dir', 'kwargs', 'self', 'logger', 'verbose',
                                 'dev_batch_size', '__class__', 'devices', 'eval_trn')):
        """Save arguments to config.

        Args:
            locals_: A dict of local variables, usually obtained from ``locals()``.
            exclude: Argument names to be excluded from the config.

        Returns:
            The updated config.
        """
        if 'kwargs' in locals_:
            locals_.update(locals_['kwargs'])
        locals_ = dict((k, v) for k, v in locals_.items() if k not in exclude and not k.startswith('_'))
        self.config.update(locals_)
        return self.config

    def save_weights(self, save_dir, filename='model.pt', trainable_only=True, **kwargs):
        """Save model weights to a directory.

        Args:
            save_dir: The directory to save weights into.
            filename: A file name for weights.
            trainable_only: ``True`` to only save trainable weights. Useful when the model contains lots of static embeddings.
            **kwargs: Not used for now.
        """
        model = self.model_
        state_dict = model.state_dict()
        if trainable_only:
            trainable_names = set(n for n, p in model.named_parameters() if p.requires_grad)
            state_dict = dict((n, p) for n, p in state_dict.items() if n in trainable_names)
        torch.save(state_dict, os.path.join(save_dir, filename))

    def load_weights(self, save_dir, filename='model.pt', **kwargs):
        """Load weights from a directory.

        Args:
            save_dir: The directory to load weights from.
            filename: A file name for weights.
            **kwargs: Not used.
        """
        save_dir = get_resource(save_dir)
        filename = os.path.join(save_dir, filename)
        # flash(f'Loading model: {filename} [blink]...[/blink][/yellow]')
        self.model_.load_state_dict(torch.load(filename, map_location='cpu'), strict=False)
        # flash('')

    def save_config(self, save_dir, filename='config.json'):
        """Save config into a directory.

        Args:
            save_dir: The directory to save config.
            filename: A file name for config.
        """
        self._savable_config.save_json(os.path.join(save_dir, filename))

    def load_config(self, save_dir, filename='config.json', **kwargs):
        """Load config from a directory.

        Args:
            save_dir: The directory to load config.
            filename: A file name for config.
            **kwargs: K-V pairs to override config.
        """
        save_dir = get_resource(save_dir)
        self.config.load_json(os.path.join(save_dir, filename))
        self.config.update(kwargs)  # overwrite config loaded from disk
        for k, v in self.config.items():
            if isinstance(v, dict) and 'classpath' in v:
                self.config[k] = Configurable.from_config(v)
        self.on_config_ready(**self.config)

    def save_vocabs(self, save_dir, filename='vocabs.json'):
        """Save vocabularies to a directory.

        Args:
            save_dir: The directory to save vocabularies.
            filename: The name for vocabularies.
        """
        if hasattr(self, 'vocabs'):
            self.vocabs.save_vocabs(save_dir, filename)

    def load_vocabs(self, save_dir, filename='vocabs.json'):
        """Load vocabularies from a directory.

        Args:
            save_dir: The directory to load vocabularies.
            filename: The name for vocabularies.
        """
        if hasattr(self, 'vocabs'):
            self.vocabs = VocabDict()
            self.vocabs.load_vocabs(save_dir, filename)

    def save(self, save_dir: str, **kwargs):
        """Save this component to a directory.

        Args:
            save_dir: The directory to save this component.
            **kwargs: Not used.
        """
        self.save_config(save_dir)
        self.save_vocabs(save_dir)
        self.save_weights(save_dir)

    def load(self, save_dir: str, devices=None, verbose=HANLP_VERBOSE, **kwargs):
        """Load from a local/remote component.

        Args:
            save_dir: An identifier which can be a local path or a remote URL or a pre-defined string.
            devices: The devices this component will be moved onto.
            verbose: ``True`` to log loading progress.
            **kwargs: To override some configs.
        """
        save_dir = get_resource(save_dir)
        # flash('Loading config and vocabs [blink][yellow]...[/yellow][/blink]')
        if devices is None and self.model:
            devices = self.devices
        self.load_config(save_dir, **kwargs)
        self.load_vocabs(save_dir)
        if verbose:
            flash('Building model [blink][yellow]...[/yellow][/blink]')
        self.model = self.build_model(**merge_dict(self.config, training=False, **kwargs, overwrite=True,
                                                   inplace=True))
        if verbose:
            flash('')
        self.load_weights(save_dir, **kwargs)
        self.to(devices)
        self.model.eval()

    def fit(self, trn_data, dev_data, save_dir, batch_size, epochs, devices=None, logger=None, seed=None,
            finetune: Union[bool, str] = False, eval_trn=True, _device_placeholder=False, **kwargs):
        """Fit to data and trigger the training procedure. Both the training set and the dev set shall be local or
        remote files.

        Args:
            trn_data: Training set.
            dev_data: Development set.
            save_dir: The directory to save trained component.
            batch_size: The number of samples in a batch.
            epochs: Number of epochs.
            devices: Devices this component will live on.
            logger: Any :class:`logging.Logger` instance.
            seed: Random seed to reproduce this training.
            finetune: ``True`` to load from ``save_dir`` instead of creating a randomly initialized component. ``str`` to specify a different ``save_dir`` to load from.
            eval_trn: Evaluate training set after each update. This can slow down the training but provides a quick diagnostic for debugging.
            _device_placeholder: ``True`` to create a placeholder tensor which triggers PyTorch to occupy devices so other components won't take these devices as first choices.
            **kwargs: Hyperparameters used by sub-classes.

        Returns:
            Any results sub-classes would like to return. Usually the best metrics on training set.
        """
        # Common initialization steps
        config = self._capture_config(locals())
        if not logger:
            logger = self.build_logger('train', save_dir)
        if not seed:
            self.config.seed = 233 if isdebugging() else int(time.time())
        set_seed(self.config.seed)
        logger.info(self._savable_config.to_json(sort=True))
        if isinstance(devices, list) or devices is None or isinstance(devices, float):
            flash('[yellow]Querying CUDA devices [blink]...[/blink][/yellow]')
            devices = -1 if isdebugging() else cuda_devices(devices)
            flash('')
        # flash(f'Available GPUs: {devices}')
        if isinstance(devices, list):
            first_device = (devices[0] if devices else -1)
        elif isinstance(devices, dict):
            first_device = next(iter(devices.values()))
        elif isinstance(devices, int):
            first_device = devices
        else:
            first_device = -1
        if _device_placeholder and first_device >= 0:
            _dummy_placeholder = self._create_dummy_placeholder_on(first_device)
        if finetune:
            if isinstance(finetune, str):
                self.load(finetune, devices=devices)
            else:
                self.load(save_dir, devices=devices)
            logger.info(
                f'Finetune model loaded with {sum(p.numel() for p in self.model.parameters() if p.requires_grad)}'
                f'/{sum(p.numel() for p in self.model.parameters())} trainable/total parameters.')
        self.on_config_ready(**self.config)
        trn = self.build_dataloader(**merge_dict(config, data=trn_data, batch_size=batch_size, shuffle=True,
                                                 training=True, device=first_device, logger=logger,
                                                 vocabs=self.vocabs, overwrite=True))
        dev = self.build_dataloader(**merge_dict(config, data=dev_data, batch_size=batch_size, shuffle=False,
                                                 training=None, device=first_device, logger=logger,
                                                 vocabs=self.vocabs, overwrite=True)) if dev_data else None
        if not finetune:
            flash('[yellow]Building model [blink]...[/blink][/yellow]')
            self.model = self.build_model(**merge_dict(config, training=True))
            flash('')
            logger.info(f'Model built with {sum(p.numel() for p in self.model.parameters() if p.requires_grad)}'
                        f'/{sum(p.numel() for p in self.model.parameters())} trainable/total parameters.')
            assert self.model, 'build_model is not properly implemented.'
        _description = repr(self.model)
        if len(_description.split('\n')) < 10:
            logger.info(_description)
        self.save_config(save_dir)
        self.save_vocabs(save_dir)
        self.to(devices, logger)
        if _device_placeholder and first_device >= 0:
            del _dummy_placeholder
        criterion = self.build_criterion(**merge_dict(config, trn=trn))
        optimizer = self.build_optimizer(**merge_dict(config, trn=trn, criterion=criterion))
        metric = self.build_metric(**self.config)
        if hasattr(trn.dataset, '__len__') and dev and hasattr(dev.dataset, '__len__'):
            logger.info(f'{len(trn.dataset)}/{len(dev.dataset)} samples in trn/dev set.')
            trn_size = len(trn) // self.config.get('gradient_accumulation', 1)
            ratio_width = len(f'{trn_size}/{trn_size}')
        else:
            ratio_width = None
        return self.execute_training_loop(**merge_dict(config, trn=trn, dev=dev, epochs=epochs, criterion=criterion,
                                                       optimizer=optimizer, metric=metric, logger=logger,
                                                       save_dir=save_dir, devices=devices, ratio_width=ratio_width,
                                                       trn_data=trn_data, dev_data=dev_data, eval_trn=eval_trn,
                                                       overwrite=True))

    def build_logger(self, name, save_dir):
        """Build a :class:`logging.Logger`.

        Args:
            name: The name of this logger.
            save_dir: The directory this logger should save logs into.

        Returns:
            logging.Logger: A logger.
        """
        logger = init_logger(name=name, root_dir=save_dir, level=logging.INFO, fmt="%(message)s")
        return logger

    @abstractmethod
    def build_dataloader(self, data, batch_size, shuffle=False, device=None, logger: logging.Logger = None,
                         **kwargs) -> DataLoader:
        """Build dataloader for training, dev and test sets. It's suggested to build vocabs in this method if they
        are not built yet.

        Args:
            data: Data representing samples, which can be a path or a list of samples.
            batch_size: Number of samples per batch.
            shuffle: Whether to shuffle this dataloader.
            device: Device tensors should be loaded onto.
            logger: Logger for reporting some message if dataloader takes a long time or if vocabs have to be built.
            **kwargs: Arguments from ``**self.config``.
        """
        pass

    def build_vocabs(self, trn: torch.utils.data.Dataset, logger: logging.Logger):
        """Override this method to build vocabs.

        Args:
            trn: Training set.
            logger: Logger for reporting progress.
        """
        pass

    @property
    def _savable_config(self):
        def convert(k, v):
            if not isinstance(v, SerializableDict) and hasattr(v, 'config'):
                v = v.config
            elif isinstance(v, (set, tuple)):
                v = list(v)
            if isinstance(v, dict):
                v = dict(convert(_k, _v) for _k, _v in v.items())
            return k, v

        config = SerializableDict(convert(k, v) for k, v in sorted(self.config.items()))
        config.update({
            # 'create_time': now_datetime(),
            'classpath': classpath_of(self),
            'hanlp_version': hanlp.__version__,
        })
        return config

    @abstractmethod
    def build_optimizer(self, **kwargs):
        """Implement this method to build an optimizer.

        Args:
            **kwargs: The subclass decides the method signature.
        """
        pass

    @abstractmethod
    def build_criterion(self, decoder, **kwargs):
        """Implement this method to build criterion (loss function).

        Args:
            decoder: The model or decoder.
            **kwargs: The subclass decides the method signature.
        """
        pass

    @abstractmethod
    def build_metric(self, **kwargs):
        """Implement this to build metric(s).

        Args:
            **kwargs: The subclass decides the method signature.
        """
        pass

    @abstractmethod
    def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
                              logger: logging.Logger, devices, ratio_width=None, **kwargs):
        """Implement this to run training loop.

        Args:
            trn: Training set.
            dev: Development set.
            epochs: Number of epochs.
            criterion: Loss function.
            optimizer: Optimizer(s).
            metric: Metric(s).
            save_dir: The directory to save this component.
            logger: Logger for reporting progress.
            devices: Devices this component and dataloader will live on.
            ratio_width: The width of dataset size measured in number of characters. Used for logger to align messages.
            **kwargs: Other hyper-parameters passed from sub-class.
        """
        pass

    @abstractmethod
    def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, **kwargs):
        """Fit onto a dataloader.

        Args:
            trn: Training set.
            criterion: Loss function.
            optimizer: Optimizer.
            metric: Metric(s).
            logger: Logger for reporting progress.
            **kwargs: Other hyper-parameters passed from sub-class.
        """
        pass

    @abstractmethod
    def evaluate_dataloader(self, data: DataLoader, criterion: Callable, metric=None, output=False, **kwargs):
        """Evaluate on a dataloader.

        Args:
            data: Dataloader which can build from any data source.
            criterion: Loss function.
            metric: Metric(s).
            output: Whether to save outputs into some file.
            **kwargs: Not used.
        """
        pass

    @abstractmethod
    def build_model(self, training=True, **kwargs) -> torch.nn.Module:
        """Build model.

        Args:
            training: ``True`` if called during training.
            **kwargs: ``**self.config``.
        """
        raise NotImplementedError

    def evaluate(self, tst_data, save_dir=None, logger: logging.Logger = None, batch_size=None, output=False,
                 **kwargs):
        """Evaluate test set.

        Args:
            tst_data: Test set, which is usually a file path.
            save_dir: The directory to save evaluation scores or predictions.
            logger: Logger for reporting progress.
            batch_size: Batch size for test dataloader.
            output: Whether to save outputs into some file.
            **kwargs: Not used.

        Returns:
            (metric, outputs) where outputs are the return values of ``evaluate_dataloader``.
        """
        if not self.model:
            raise RuntimeError('Call fit or load before evaluate.')
        if isinstance(tst_data, str):
            tst_data = get_resource(tst_data)
            filename = os.path.basename(tst_data)
        else:
            filename = None
        if output is True:
            output = self.generate_prediction_filename(tst_data if isinstance(tst_data, str) else 'test.txt',
                                                       save_dir)
        if logger is None:
            _logger_name = basename_no_ext(filename) if filename else None
            logger = self.build_logger(_logger_name, save_dir)
        if not batch_size:
            batch_size = self.config.get('batch_size', 32)
        data = self.build_dataloader(**merge_dict(self.config, data=tst_data, batch_size=batch_size, shuffle=False,
                                                  device=self.devices[0], logger=logger, overwrite=True))
        dataset = data
        while dataset and hasattr(dataset, 'dataset'):
            dataset = dataset.dataset
        num_samples = len(dataset) if dataset else None
        if output and isinstance(dataset, TransformableDataset):
            def add_idx(samples):
                for idx, sample in enumerate(samples):
                    if sample:
                        sample[IDX] = idx

            add_idx(dataset.data)
            if dataset.cache:
                add_idx(dataset.cache)
        criterion = self.build_criterion(**self.config)
        metric = self.build_metric(**self.config)
        start = time.time()
        outputs = self.evaluate_dataloader(data, criterion=criterion, filename=filename, output=output,
                                           input=tst_data, save_dir=save_dir, test=True, num_samples=num_samples,
                                           **merge_dict(self.config, batch_size=batch_size, metric=metric,
                                                        logger=logger, **kwargs))
        elapsed = time.time() - start
        if logger:
            if num_samples:
                logger.info(f'speed: {num_samples / elapsed:.0f} samples/second')
            else:
                logger.info(f'speed: {len(data) / elapsed:.0f} batches/second')
        return metric, outputs

    def generate_prediction_filename(self, tst_data, save_dir):
        assert isinstance(tst_data, str), 'tst_data has to be a str in order to infer the output name'
        output = os.path.splitext(os.path.basename(tst_data))
        output = os.path.join(save_dir, output[0] + '.pred' + output[1])
        return output

    def to(self, devices: Union[int, float, List[int], Dict[str, Union[int, torch.device]]] = None,
           logger: logging.Logger = None, verbose=HANLP_VERBOSE):
        """Move this component to devices.

        Args:
            devices: Target devices.
            logger: Logger for printing progress report, as copying a model from CPU to GPU can take several seconds.
            verbose: ``True`` to print progress when logger is None.
        """
        if devices == -1 or devices == [-1]:
            devices = []
        elif isinstance(devices, (int, float)) or devices is None:
            devices = cuda_devices(devices)
        if devices:
            if logger:
                logger.info(f'Using GPUs: [on_blue][cyan][bold]{devices}[/bold][/cyan][/on_blue]')
            if isinstance(devices, list):
                if verbose:
                    flash(f'Moving model to GPUs {devices} [blink][yellow]...[/yellow][/blink]')
                self.model = self.model.to(devices[0])
                if len(devices) > 1 and not isdebugging() and not isinstance(self.model, nn.DataParallel):
                    self.model = self.parallelize(devices)
            elif isinstance(devices, dict):
                for name, module in self.model.named_modules():
                    for regex, device in devices.items():
                        try:
                            on_device: torch.device = next(module.parameters()).device
                        except StopIteration:
                            continue
                        if on_device == device:
                            continue
                        if isinstance(device, int):
                            if on_device.index == device:
                                continue
                        if re.match(regex, name):
                            if not name:
                                name = '*'
                            flash(f'Moving module [yellow]{name}[/yellow] to [on_yellow][magenta][bold]{device}'
                                  f'[/bold][/magenta][/on_yellow]: [red]{regex}[/red]\n')
                            module.to(device)
            else:
                raise ValueError(f'Unrecognized devices {devices}')
            if verbose:
                flash('')
        else:
            if logger:
                logger.info('Using CPU')

    def parallelize(self, devices: List[Union[int, torch.device]]):
        return nn.DataParallel(self.model, device_ids=devices)

    @property
    def devices(self):
        """The devices this component lives on."""
        if self.model is None:
            return None
        # next(parser.model.parameters()).device
        if hasattr(self.model, 'device_ids'):
            return self.model.device_ids
        device: torch.device = next(self.model.parameters()).device
        return [device]

    @property
    def device(self):
        """The first device this component lives on."""
        devices = self.devices
        if not devices:
            return None
        return devices[0]

    def on_config_ready(self, **kwargs):
        """Called when config is ready, either during ``fit`` or ``load``. Subclasses can perform extra
        initialization tasks in this callback.

        Args:
            **kwargs: Not used.
        """
        pass

    @property
    def model_(self) -> nn.Module:
        """The actual model, even when it's wrapped by a ``DataParallel``.

        Returns:
            The "real" model.
        """
        if isinstance(self.model, nn.DataParallel):
            return self.model.module
        return self.model

    # noinspection PyMethodOverriding
    @abstractmethod
    def predict(self, data: Union[str, List[str]], batch_size: int = None, **kwargs):
        """Predict on data fed by user. Users shall avoid calling this method directly, since it is not guarded with
        ``torch.no_grad`` and would introduce unnecessary gradient computation. Use ``__call__`` instead.

        Args:
            data: Sentences or tokens.
            batch_size: Decoding batch size.
            **kwargs: Used in sub-classes.
        """
        pass

    @staticmethod
    def _create_dummy_placeholder_on(device):
        if device < 0:
            device = 'cpu:0'
        return torch.zeros(16, 16, device=device)

    @torch.no_grad()
    def __call__(self, data, batch_size=None, **kwargs):
        """Predict on data fed by user. This method calls :meth:`~hanlp.common.torch_component.predict` but decorates
        it with ``torch.no_grad``.

        Args:
            data: Sentences or tokens.
            batch_size: Decoding batch size.
            **kwargs: Used in sub-classes.
        """
        return super().__call__(data, **merge_dict(self.config, overwrite=True,
                                                   batch_size=batch_size or self.config.get('batch_size', None),
                                                   **kwargs))
def batchify(data, vocabs: VocabDict, unk_rate=0., device=None, squeeze=False,
             tokenizer: TransformerSequenceTokenizer = None, shuffle_sibling=True, levi_graph=False,
             extra_arc=False, bart=False):
    rel_vocab: VocabWithFrequency = vocabs.rel
    _tok = list_to_tensor(data['token'], vocabs['token'], unk_rate=unk_rate) if 'token' in vocabs else None
    _lem = list_to_tensor(data['lemma'], vocabs['lemma'], unk_rate=unk_rate)
    _pos = list_to_tensor(data['pos'], vocabs['pos'], unk_rate=unk_rate) if 'pos' in vocabs else None
    _ner = list_to_tensor(data['ner'], vocabs['ner'], unk_rate=unk_rate) if 'ner' in vocabs else None
    _word_char = lists_of_string_to_tensor(data['token'], vocabs['word_char']) if 'word_char' in vocabs else None

    local_token2idx = data['token2idx']
    local_idx2token = data['idx2token']
    _cp_seq = list_to_tensor(data['cp_seq'], vocabs['predictable_concept'], local_token2idx)
    _mp_seq = list_to_tensor(data['mp_seq'], vocabs['predictable_concept'], local_token2idx)

    ret = copy(data)
    if 'amr' in data:
        concept, edge = [], []
        for amr in data['amr']:
            if levi_graph == 'kahn':
                concept_i, edge_i = amr.to_levi(rel_vocab.get_frequency, shuffle=shuffle_sibling)
            else:
                concept_i, edge_i, _ = amr.root_centered_sort(rel_vocab.get_frequency, shuffle=shuffle_sibling)
            concept.append(concept_i)
            edge.append(edge_i)
        if levi_graph is True:
            concept_with_rel, edge_with_rel = levi_amr(concept, edge, extra_arc=extra_arc)
            concept = concept_with_rel
            edge = edge_with_rel
        augmented_concept = [[DUM] + x + [END] for x in concept]

        _concept_in = list_to_tensor(augmented_concept, vocabs.get('concept_and_rel', vocabs['concept']),
                                     unk_rate=unk_rate)[:-1]
        _concept_char_in = lists_of_string_to_tensor(augmented_concept, vocabs['concept_char'])[:-1]
        _concept_out = list_to_tensor(augmented_concept, vocabs['predictable_concept'], local_token2idx)[1:]

        out_conc_len, bsz = _concept_out.shape
        _rel = np.full((1 + out_conc_len, bsz, out_conc_len), rel_vocab.pad_idx)
        # v: [<dummy>, concept_0, ..., concept_l, ..., concept_{n-1}, <end>]
        # u: [<dummy>, concept_0, ..., concept_l, ..., concept_{n-1}]
        for bidx, (x, y) in enumerate(zip(edge, concept)):
            for l, _ in enumerate(y):
                if l > 0:
                    # l=1 => pos=l+1=2
                    _rel[l + 1, bidx, 1:l + 1] = rel_vocab.get_idx(NIL)
            for v, u, r in x:
                if levi_graph:
                    r = 1
                else:
                    r = rel_vocab.get_idx(r)
                assert v > u, 'Invalid topological order'
                _rel[v + 1, bidx, u + 1] = r
        ret.update({'concept_in': _concept_in, 'concept_char_in': _concept_char_in, 'concept_out': _concept_out,
                    'rel': _rel})
    else:
        augmented_concept = None

    token_length = ret.get('token_length', None)
    if token_length is not None and not isinstance(token_length, torch.Tensor):
        ret['token_length'] = torch.tensor(token_length, dtype=torch.long,
                                           device=device if (isinstance(device, torch.device) or device >= 0)
                                           else 'cpu:0')
    ret.update({'lem': _lem, 'tok': _tok, 'pos': _pos, 'ner': _ner, 'word_char': _word_char,
                'copy_seq': np.stack([_cp_seq, _mp_seq], -1), 'local_token2idx': local_token2idx,
                'local_idx2token': local_idx2token})
    if squeeze:
        token_field = make_batch_for_squeeze(data, augmented_concept, tokenizer, device, ret)
    else:
        token_field = 'token'
    subtoken_to_tensor(token_field, ret)
    if bart:
        make_batch_for_bart(augmented_concept, ret, tokenizer, device)
    move_dict_to_device(ret, device)
    return ret
def __init__(self, trn: str = None, dev: str = None, tst: str = None, sampler_builder: SamplerBuilder = None,
             dependencies: str = None, scalar_mix: ScalarMixWithDropoutBuilder = None, use_raw_hidden_states=False,
             lr=1e-3, separate_optimizer=False, max_seq_len=None, sent_delimiter=None, char_level=False,
             hard_constraint=False, tagging_scheme=None, crf=False, delimiter_in_entity=None,
             merge_types: List[str] = None, secondary_encoder=None, token_key='token',
             dict_whitelist: Union[DictInterface, Union[Dict[str, Any], Set[str]]] = None,
             dict_blacklist: Union[DictInterface, Union[Dict[str, Any], Set[str]]] = None,
             dict_tags: Union[DictInterface, Union[Dict[Union[str, Sequence[str]], Union[str, Sequence[str]]]]] = None,
             **kwargs) -> None:
    r"""A simple tagger using a linear layer with an optional CRF (:cite:`lafferty2001conditional`) layer for the
    NER task. It can utilize whitelist gazetteers, which is a dict mapping from entity name to entity type. During
    decoding, it performs longest-prefix-matching of these words to override the prediction from the underlying
    statistical model. It also uses a blacklist to mask out mis-predicted entities.

    .. Note:: For algorithm beginners, longest-prefix-matching is the prerequisite to understand what a dictionary
        can do and what it can't do. The tutorial in `this book <http://nlp.hankcs.com/book.php>`_ can be very
        helpful.

    Args:
        trn: Path to training set.
        dev: Path to dev set.
        tst: Path to test set.
        sampler_builder: A builder which builds a sampler.
        dependencies: Its dependencies on other tasks.
        scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
        use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
        lr: Learning rate for this task.
        separate_optimizer: Use customized separate optimizer for this task.
        max_seq_len: Sentences longer than ``max_seq_len`` will be split into shorter ones if possible.
        sent_delimiter: Delimiter between sentences, like period or comma, which indicates a long sentence can be split here.
        char_level: Whether the sequence length is measured at char level.
        hard_constraint: Whether to enforce hard length constraint on sentences. If there is no ``sent_delimiter`` in a sentence, it will be split at a token anyway.
        crf: ``True`` to enable CRF (:cite:`lafferty2001conditional`).
        delimiter_in_entity: The delimiter between tokens in entity, which is used to rebuild entity by joining tokens during decoding.
        merge_types: The types of consecutive entities to be merged.
        secondary_encoder: An optional secondary encoder to provide enhanced representation by taking the hidden states from the main encoder as input.
        token_key: The key to tokens in dataset. This should always be set to ``token`` in MTL.
        dict_whitelist: A :class:`dict` or a :class:`~hanlp_trie.dictionary.DictInterface` of gazetteers to be included into the final results.
        dict_blacklist: A :class:`set` or a :class:`~hanlp_trie.dictionary.DictInterface` of badcases to be excluded from the final results.
        dict_tags: A custom dictionary to override predicted tags by performing longest-prefix-matching.
        **kwargs: Not used.
    """
    super().__init__(**merge_locals_kwargs(locals(), kwargs))
    self.vocabs = VocabDict()
    self.secondary_encoder = secondary_encoder
    self.dict_whitelist = dict_whitelist
    self.dict_blacklist = dict_blacklist
    self.dict_tags = dict_tags
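# Hedged, self-contained mimic of the locals()-to-config pattern used by the constructors above
# (``merge_locals_kwargs(locals(), kwargs)``) and by ``TorchComponent._capture_config``: merge
# locals() with **kwargs into one flat dict, dropping bookkeeping names. This illustrates the
# idea only; it is not the library's actual implementation.
def capture_config(locals_, exclude=('self', 'kwargs', '__class__')):
    if 'kwargs' in locals_:
        locals_ = {**locals_, **locals_['kwargs']}
    return {k: v for k, v in locals_.items() if k not in exclude and not k.startswith('_')}


class Demo:
    def __init__(self, lr=1e-3, dropout=0.5, **kwargs):
        self.config = capture_config(locals())


print(Demo(dropout=0.1, hidden_size=256).config)
# {'lr': 0.001, 'dropout': 0.1, 'hidden_size': 256}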