Example #1
    def load_vocabs(self, save_dir, filename='vocabs.json'):
        """Load vocabularies from a directory.

        Args:
            save_dir: The directory to load vocabularies.
            filename:  The name for vocabularies.
        """
        if hasattr(self, 'vocabs'):
            self.vocabs = VocabDict()
            self.vocabs.load_vocabs(save_dir, filename)
Example #2
    def __init__(self, **kwargs) -> None:
        """The base class for all components using PyTorch as backend. It provides common workflows of building vocabs,
        datasets, dataloaders and models. These workflows are more of a conventional guideline than en-forced
        protocols, which means subclass has the freedom to override or completely skip some steps.

        Args:
            **kwargs: Additional arguments to be stored in the ``config`` property.
        """
        super().__init__()
        self.model: Optional[torch.nn.Module] = None
        self.config = SerializableDict(**kwargs)
        self.vocabs = VocabDict()
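
A minimal sketch of what a concrete subclass has to provide; the class name, sizes and bodies below are hypothetical, and only a few of the abstract methods are shown (Example #16 lists them all):

import torch

class MyTagger(TorchComponent):  # TorchComponent as defined in Example #16 (hanlp.common.torch_component)
    def build_model(self, training=True, **kwargs) -> torch.nn.Module:
        # hypothetical decoder over a 'tag' vocab built by build_vocabs
        return torch.nn.Linear(768, len(self.vocabs['tag']))

    def build_optimizer(self, **kwargs):
        return torch.optim.Adam(self.model.parameters(), lr=self.config.get('lr', 1e-3))

    def build_dataloader(self, data, batch_size, shuffle=False, device=None, logger=None, **kwargs):
        ...  # build the dataset, call self.build_vocabs(...) on first use, return a DataLoader

    # build_criterion, build_metric, fit_dataloader, evaluate_dataloader,
    # execute_training_loop and predict must be implemented as well.
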
Example #3
 def __init__(self,
              trn: str = None,
              dev: str = None,
              tst: str = None,
              sampler_builder: SamplerBuilder = None,
              dependencies: str = None,
              scalar_mix: ScalarMixWithDropoutBuilder = None,
              use_raw_hidden_states=False,
              lr=2e-3,
              separate_optimizer=False,
              punct=False,
              tree=False,
              apply_constraint=True,
              n_mlp_arc=500,
              n_mlp_rel=100,
              mlp_dropout=.33,
              pad_rel=None,
              joint=True,
              mu=.9,
              nu=.9,
              epsilon=1e-12,
              cls_is_bos=True,
              **kwargs) -> None:
     super().__init__(**merge_locals_kwargs(locals(), kwargs))
     self.vocabs = VocabDict()
Example #4
 def tensorize(raw_batch: Dict[str, Any], vocabs: VocabDict, pad_dict: Dict[str, int] = None, device=None):
     for field, data in raw_batch.items():
         if isinstance(data, torch.Tensor):
             continue
         vocab_key = field[:-len('_id')] if field.endswith('_id') else None
         vocab: Vocab = vocabs.get(vocab_key, None) if vocabs and vocab_key else None
         if vocab:
             pad = vocab.safe_pad_token_idx
             dtype = torch.long
         elif pad_dict is not None and field in pad_dict:
             pad = pad_dict[field]
             dtype = dtype_of(pad)
         elif field.endswith('_offset') or field.endswith('_id') or field.endswith(
                 '_count') or field.endswith('_ids') or field.endswith('_score') or field.endswith(
             '_length') or field.endswith('_span'):
             # guess some common fields to pad
             pad = 0
             dtype = torch.long
         elif field.endswith('_mask'):
             pad = False
             dtype = torch.bool
         else:
             # no need to pad
             continue
         data = PadSequenceDataLoader.pad_data(data, pad, dtype)
         raw_batch[field] = data
     if device is not None:
         for field, data in raw_batch.items():
             if isinstance(data, torch.Tensor):
                 data = data.to(device)
                 raw_batch[field] = data
     return raw_batch
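
A minimal usage sketch of ``tensorize``; the field names, values and the pre-built ``vocabs`` (a VocabDict with a 'token' vocab) are hypothetical, only the padding rules follow the code above:

raw_batch = {
    'token_id': [[2, 5, 7], [3, 4]],                    # *_id with a 'token' vocab -> padded with its safe pad index
    'token_length': [3, 2],                             # *_length -> guessed pad 0, dtype torch.long
    'span_mask': [[True, True, False], [True, False]],  # *_mask -> padded with False, dtype torch.bool
}
batch = tensorize(raw_batch, vocabs, device='cpu')
# every entry is now a padded torch.Tensor on the requested device
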
Example #5
    def __init__(self,
                 trn: str = None,
                 dev: str = None,
                 tst: str = None,
                 sampler_builder: SamplerBuilder = None,
                 dependencies: str = None,
                 scalar_mix: ScalarMixWithDropoutBuilder = None,
                 use_raw_hidden_states=False,
                 lr=2e-3,
                 separate_optimizer=False,
                 punct=False,
                 tree=True,
                 pad_rel=None,
                 apply_constraint=False,
                 single_root=True,
                 no_zero_head=None,
                 n_mlp_arc=500,
                 n_mlp_rel=100,
                 mlp_dropout=.33,
                 mu=.9,
                 nu=.9,
                 epsilon=1e-12,
                 decay=.75,
                 decay_steps=5000,
                 cls_is_bos=True,
                 use_pos=False,
                 **kwargs) -> None:
        r"""Implementation of "Stanford's graph-based neural dependency parser at
        the CoNLL 2017 shared task" (:cite:`dozat2017stanford`).

        Args:
            trn: Path to training set.
            dev: Path to dev set.
            tst: Path to test set.
            sampler_builder: A builder which builds a sampler.
            dependencies: Its dependencies on other tasks.
            scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
            use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
            lr: Learning rate for this task.
            separate_optimizer: Use customized separate optimizer for this task.
            punct: ``True`` to include punctuations in evaluation.
            pad_rel: Padding token for relations.
            apply_constraint: Enforce constraints (see following parameters).
            single_root: Force single root.
            no_zero_head: Every token has at least one head.
            n_mlp_arc: Number of features for arc representation.
            n_mlp_rel: Number of features for rel representation.
            mlp_dropout: Dropout applied to MLPs.
            mu: First coefficient used for computing running averages of gradient and its square in Adam.
            nu: Second coefficient used for computing running averages of gradient and its square in Adam.
            epsilon: Term added to the denominator to improve numerical stability.
            decay: Decay rate of the exponential lr scheduler.
            decay_steps: Decay every ``decay_steps`` steps.
            cls_is_bos: ``True`` to treat the first token as ``BOS``.
            use_pos: Use pos feature.
            **kwargs: Not used.
        """
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.vocabs = VocabDict()
Example #6
    def __init__(self,
                 trn: str = None,
                 dev: str = None,
                 tst: str = None,
                 sampler_builder: SamplerBuilder = None,
                 dependencies: str = None,
                 scalar_mix: ScalarMixWithDropoutBuilder = None,
                 use_raw_hidden_states=False,
                 lr=1e-3,
                 separate_optimizer=False,
                 lexical_dropout=0.5,
                 dropout=0.2,
                 span_width_feature_size=20,
                 ffnn_size=150,
                 ffnn_depth=2,
                 argument_ratio=0.8,
                 predicate_ratio=0.4,
                 max_arg_width=30,
                 mlp_label_size=100,
                 enforce_srl_constraint=False,
                 use_gold_predicates=False,
                 doc_level_offset=True,
                 use_biaffine=False,
                 loss_reduction='mean',
                 with_argument=' ',
                 **kwargs) -> None:
        r""" An implementation of "Jointly Predicting Predicates and Arguments in Neural Semantic Role Labeling"
        (:cite:`he-etal-2018-jointly`). It generates candidate triples of (predicate, arg_start, arg_end) and ranks them.

        Args:
            trn: Path to training set.
            dev: Path to dev set.
            tst: Path to test set.
            sampler_builder: A builder which builds a sampler.
            dependencies: Its dependencies on other tasks.
            scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
            use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
            lr: Learning rate for this task.
            separate_optimizer: Use customized separate optimizer for this task.
            lexical_dropout: Dropout applied to hidden states of encoder.
            dropout: Dropout used for other layers except the encoder.
            span_width_feature_size: Span width feature size.
            ffnn_size: Feedforward size.
            ffnn_depth: Number of layers of feedforward MLPs.
            argument_ratio: Ratio of candidate arguments over number of tokens.
            predicate_ratio: Ratio of candidate predicates over number of tokens.
            max_arg_width: Maximum argument width.
            mlp_label_size: Feature size for label representation.
            enforce_srl_constraint: Enforce SRL constraints (number of core ARGs etc.).
            use_gold_predicates: Use gold predicates instead of predicting them.
            doc_level_offset: ``True`` to indicate the offsets in ``jsonlines`` are of document level.
            use_biaffine: ``True`` to use biaffine (:cite:`dozat:17a`) instead of a linear layer for label prediction.
            loss_reduction: The loss reduction used in aggregating losses.
            with_argument: The delimiter inserted between tokens when joining an argument's tokens for output.
            **kwargs: Not used.
        """
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.vocabs = VocabDict()
Example #7
File: pos.py Project: lei1993/HanLP
    def __init__(self,
                 trn: str = None,
                 dev: str = None,
                 tst: str = None,
                 sampler_builder: SamplerBuilder = None,
                 dependencies: str = None,
                 scalar_mix: ScalarMixWithDropoutBuilder = None,
                 use_raw_hidden_states=False,
                 lr=1e-3,
                 separate_optimizer=False,
                 cls_is_bos=False,
                 sep_is_eos=False,
                 max_seq_len=None,
                 sent_delimiter=None,
                 char_level=False,
                 hard_constraint=False,
                 crf=False,
                 token_key='token',
                 dict_tags: Union[DictInterface,
                                  Union[Dict[Union[str, Sequence[str]],
                                             Union[str,
                                                   Sequence[str]]]]] = None,
                 **kwargs) -> None:
        """A simple tagger using a linear layer with an optional CRF (:cite:`lafferty2001conditional`) layer for
        any tagging tasks including PoS tagging and many others. It also features a custom dictionary ``dict_tags``
        to perform ``longest-prefix-matching``, which replaces matched tokens with the given tags.


        .. Note:: For algorithm beginners, longest-prefix-matching is the prerequisite to understand what a dictionary
            can and cannot do. The tutorial in `this book <http://nlp.hankcs.com/book.php>`_ can be very helpful.

        Args:
            trn: Path to training set.
            dev: Path to dev set.
            tst: Path to test set.
            sampler_builder: A builder which builds a sampler.
            dependencies: Its dependencies on other tasks.
            scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
            use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
            lr: Learning rate for this task.
            separate_optimizer: Use customized separate optimizer for this task.
            cls_is_bos: ``True`` to treat the first token as ``BOS``.
            sep_is_eos: ``True`` to treat the last token as ``EOS``.
            max_seq_len: Sentences longer than ``max_seq_len`` will be split into shorter ones if possible.
            sent_delimiter: Delimiter between sentences, like period or comma, which indicates a long sentence can
                be split here.
            char_level: Whether the sequence length is measured at char level, which is never the case for
                lemmatization.
            hard_constraint: Whether to enforce hard length constraint on sentences. If there is no ``sent_delimiter``
                in a sentence, it will be split at a token anyway.
            crf: ``True`` to enable CRF (:cite:`lafferty2001conditional`).
            token_key: The key to tokens in dataset. This should always be set to ``token`` in MTL.
            dict_tags: A custom dictionary to override predicted tags by performing longest-prefix-matching.
            **kwargs: Not used.
        """
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.vocabs = VocabDict()
        self.dict_tags = dict_tags
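
A hypothetical ``dict_tags`` for the constructor above (both the tokens and the tags are made up): keys are single tokens or token sequences, values are the tag(s) forced onto them via longest-prefix-matching:

dict_tags = {
    'HanLP': 'NR',                           # a single token mapped to a single tag
    ('natural', 'language'): ('JJ', 'NN'),   # a token sequence mapped to a tag sequence
}
# passed as ``dict_tags=dict_tags``, matched tokens have their predicted tags overridden
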
Example #8
    def __init__(self,
                 trn: str = None,
                 dev: str = None,
                 tst: str = None,
                 sampler_builder: SamplerBuilder = None,
                 dependencies: str = None,
                 scalar_mix: ScalarMixWithDropoutBuilder = None,
                 use_raw_hidden_states=False,
                 lr=2e-3,
                 separate_optimizer=False,
                 cls_is_bos=True,
                 sep_is_eos=False,
                 punct=False,
                 tree=False,
                 proj=False,
                 n_mlp_arc=500,
                 n_mlp_rel=100,
                 mlp_dropout=.33,
                 mu=.9,
                 nu=.9,
                 epsilon=1e-12,
                 decay=.75,
                 decay_steps=5000,
                 use_pos=False,
                 max_seq_len=None,
                 **kwargs) -> None:
        """Biaffine dependency parsing (:cite:`dozat:17a`).

        Args:
            trn: Path to training set.
            dev: Path to dev set.
            tst: Path to test set.
            sampler_builder: A builder which builds a sampler.
            dependencies: Its dependencies on other tasks.
            scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
            use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
            lr: Learning rate for this task.
            separate_optimizer: Use customized separate optimizer for this task.
            cls_is_bos: ``True`` to treat the first token as ``BOS``.
            sep_is_eos: ``True`` to treat the last token as ``EOS``.
            punct: ``True`` to include punctuations in evaluation.
            tree: ``True`` to enforce tree constraint.
            proj: ``True`` for projective parsing.
            n_mlp_arc: Number of features for arc representation.
            n_mlp_rel: Number of features for rel representation.
            mlp_dropout: Dropout applied to MLPs.
            mu: First coefficient used for computing running averages of gradient and its square in Adam.
            nu: Second coefficient used for computing running averages of gradient and its square in Adam.
            epsilon: Term added to the denominator to improve numerical stability.
            decay: Decay rate of the exponential lr scheduler.
            decay_steps: Decay every ``decay_steps`` steps.
            use_pos: Use pos feature.
            max_seq_len: Prune samples longer than this length.
            **kwargs: Not used.
        """
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.vocabs = VocabDict()
Example #9
    def __init__(self,
                 trn: str = None,
                 dev: str = None,
                 tst: str = None,
                 sampler_builder: SamplerBuilder = None,
                 dependencies: str = None,
                 scalar_mix: ScalarMixWithDropoutBuilder = None,
                 use_raw_hidden_states=False,
                 lr=1e-3, separate_optimizer=False,
                 cls_is_bos=True,
                 sep_is_eos=True,
                 delimiter=None,
                 max_seq_len=None, sent_delimiter=None, char_level=False, hard_constraint=False,
                 transform=None,
                 tagging_scheme='BMES',
                 crf=False,
                 token_key='token',
                 dict_force: Union[DictInterface, Union[Dict[str, Any], Set[str]]] = None,
                 dict_combine: Union[DictInterface, Union[Dict[str, Any], Set[str]]] = None,
                 **kwargs) -> None:
        """Tokenization which casts a chunking problem into a tagging problem.
        This task has to create a batch of tokens containing both [CLS] and [SEP] since it's usually the first task
        and later tasks might need them.

        Args:
            trn: Path to training set.
            dev: Path to dev set.
            tst: Path to test set.
            sampler_builder: A builder which builds a sampler.
            dependencies: Its dependencies on other tasks.
            scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
            use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
            lr: Learning rate for this task.
            separate_optimizer: Use customized separate optimizer for this task.
            cls_is_bos: ``True`` to treat the first token as ``BOS``.
            sep_is_eos: ``True`` to treat the last token as ``EOS``.
            delimiter: Delimiter used to split a line in the corpus.
            max_seq_len: Sentences longer than ``max_seq_len`` will be split into shorter ones if possible.
            sent_delimiter: Delimiter between sentences, like period or comma, which indicates a long sentence can
                be split here.
            char_level: Whether the sequence length is measured at char level.
            hard_constraint: Whether to enforce hard length constraint on sentences. If there is no ``sent_delimiter``
                in a sentence, it will be split at a token anyway.
            transform: An optional transform to be applied to samples. Usually a character normalization transform is
                passed in.
            tagging_scheme: Either ``BMES`` or ``BI``.
            crf: ``True`` to enable CRF (:cite:`lafferty2001conditional`).
            token_key: The key to tokens in dataset. This should always be set to ``token`` in MTL.
            **kwargs: Not used.
        """
        super().__init__(**merge_locals_kwargs(locals(), kwargs, excludes=(
            'self', 'kwargs', '__class__', 'dict_force', 'dict_combine')))  # keep these out of config
        self.transform = transform
        self.vocabs = VocabDict()
        self.dict_force = dict_force
        self.dict_combine = dict_combine
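
A hedged illustration of the two dictionaries stored above, assuming the usual HanLP semantics (``dict_force`` forces its entries to be segmented as whole tokens, ``dict_combine`` merges matching tokens after decoding); the entries are placeholders:

dict_force = {'自然语言处理'}     # a Set[str], Dict[str, Any] or DictInterface implementation
dict_combine = {'新冠', '疫苗'}
# note they are deliberately excluded from ``config`` (see ``excludes`` above) and kept on the instance
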
Example #10
    def __init__(self,
                 trn: str = None,
                 dev: str = None,
                 tst: str = None,
                 sampler_builder: SamplerBuilder = None,
                 dependencies: str = None,
                 scalar_mix: ScalarMixWithDropoutBuilder = None,
                 use_raw_hidden_states=False,
                 lr=None,
                 separate_optimizer=False,
                 cls_is_bos=True,
                 sep_is_eos=True,
                 delete=('', ':', '``', "''", '.', '?', '!', '-NONE-', 'TOP',
                         ',', 'S1'),
                 equal=(('ADVP', 'PRT'), ),
                 mbr=True,
                 n_mlp_span=500,
                 n_mlp_label=100,
                 mlp_dropout=.33,
                 no_subcategory=True,
                 **kwargs) -> None:
        r"""Two-stage CRF Parsing (:cite:`ijcai2020-560`).

        Args:
            trn: Path to training set.
            dev: Path to dev set.
            tst: Path to test set.
            sampler_builder: A builder which builds a sampler.
            dependencies: Its dependencies on other tasks.
            scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
            use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
            lr: Learning rate for this task.
            separate_optimizer: Use customized separate optimizer for this task.
            cls_is_bos: ``True`` to treat the first token as ``BOS``.
            sep_is_eos: ``True`` to treat the last token as ``EOS``.
            delete: Constituencies to be deleted from training and evaluation.
            equal: Constituencies that are regarded as equal during evaluation.
            mbr: ``True`` to enable Minimum Bayes Risk (MBR) decoding (:cite:`smith-smith-2007-probabilistic`).
            n_mlp_span: Number of features for span decoder.
            n_mlp_label: Number of features for label decoder.
            mlp_dropout: Dropout applied to MLPs.
            no_subcategory: Strip out subcategories.
            **kwargs: Not used.
        """
        if isinstance(equal, tuple):
            equal = dict(equal)
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.vocabs = VocabDict()
Example #11
    def __init__(self,
                 trn: str = None,
                 dev: str = None,
                 tst: str = None,
                 sampler_builder: SamplerBuilder = None,
                 dependencies: str = None,
                 scalar_mix: ScalarMixWithDropoutBuilder = None,
                 use_raw_hidden_states=False,
                 lr=1e-3,
                 separate_optimizer=False,
                 cls_is_bos=False,
                 sep_is_eos=False,
                 max_seq_len=None,
                 sent_delimiter=None,
                 char_level=False,
                 hard_constraint=False,
                 crf=False,
                 token_key='token',
                 **kwargs) -> None:
        """A simple tagger using a linear layer with an optional CRF (:cite:`lafferty2001conditional`) layer for
        any tagging tasks including PoS tagging and many others.

        Args:
            trn: Path to training set.
            dev: Path to dev set.
            tst: Path to test set.
            sampler_builder: A builder which builds a sampler.
            dependencies: Its dependencies on other tasks.
            scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
            use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
            lr: Learning rate for this task.
            separate_optimizer: Use customized separate optimizer for this task.
            cls_is_bos: ``True`` to treat the first token as ``BOS``.
            sep_is_eos: ``True`` to treat the last token as ``EOS``.
            max_seq_len: Sentences longer than ``max_seq_len`` will be split into shorter ones if possible.
            sent_delimiter: Delimiter between sentences, like period or comma, which indicates a long sentence can
                be split here.
            char_level: Whether the sequence length is measured at char level, which is never the case for
                lemmatization.
            hard_constraint: Whether to enforce hard length constraint on sentences. If there is no ``sent_delimiter``
                in a sentence, it will be split at a token anyway.
            crf: ``True`` to enable CRF (:cite:`lafferty2001conditional`).
            token_key: The key to tokens in dataset. This should always be set to ``token`` in MTL.
            **kwargs: Not used.
        """
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.vocabs = VocabDict()
Example #12
    def __init__(self,
                 trn: str = None,
                 dev: str = None,
                 tst: str = None,
                 sampler_builder: SamplerBuilder = None,
                 dependencies: str = None,
                 scalar_mix: ScalarMixWithDropoutBuilder = None,
                 use_raw_hidden_states=False,
                 lr=None,
                 separate_optimizer=False,
                 cls_is_bos=True,
                 sep_is_eos=False,
                 n_mlp_arc=768,
                 n_mlp_rel=256,
                 mlp_dropout=.33,
                 tree=False,
                 proj=False,
                 punct=False,
                 max_seq_len=None,
                 **kwargs) -> None:
        r"""Universal Dependencies Parsing (lemmatization, features, PoS tagging and dependency parsing) implementation
        of "75 Languages, 1 Model: Parsing Universal Dependencies Universally" (:cite:`kondratyuk-straka-2019-75`).

        Args:
            trn: Path to training set.
            dev: Path to dev set.
            tst: Path to test set.
            sampler_builder: A builder which builds a sampler.
            dependencies: Its dependencies on other tasks.
            scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
            use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
            lr: Learning rate for this task.
            separate_optimizer: Use customized separate optimizer for this task.
            cls_is_bos: ``True`` to treat the first token as ``BOS``.
            sep_is_eos: ``True`` to treat the last token as ``EOS``.
            n_mlp_arc: Number of features for arc representation.
            n_mlp_rel: Number of features for rel representation.
            mlp_dropout: Dropout applied to MLPs.
            tree: ``True`` to enforce tree constraint.
            proj: ``True`` for projective parsing.
            punct: ``True`` to include punctuations in evaluation.
            max_seq_len: Prune samples longer than this length. Useful for reducing GPU consumption.
            **kwargs: Not used.
        """
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.vocabs = VocabDict()
Example #13
    def __init__(self,
                 trn: str = None,
                 dev: str = None,
                 tst: str = None,
                 sampler_builder: SamplerBuilder = None,
                 dependencies: str = None,
                 scalar_mix: ScalarMixWithDropoutBuilder = None,
                 use_raw_hidden_states=False,
                 lr=None,
                 separate_optimizer=False,
                 cls_is_bos=False,
                 sep_is_eos=False,
                 crf=False,
                 n_mlp_rel=300,
                 mlp_dropout=0.2,
                 loss_reduction='mean',
                 doc_level_offset=True,
                 **kwargs) -> None:
        """A span based Semantic Role Labeling task using BIO scheme for tagging the role of each token. Given a
        predicate and a token, it uses biaffine (:cite:`dozat:17a`) to predict their relations as one of BIO-ROLE.

        Args:
            trn: Path to training set.
            dev: Path to dev set.
            tst: Path to test set.
            sampler_builder: A builder which builds a sampler.
            dependencies: Its dependencies on other tasks.
            scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
            use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
            lr: Learning rate for this task.
            separate_optimizer: Use customized separate optimizer for this task.
            cls_is_bos: ``True`` to treat the first token as ``BOS``.
            sep_is_eos: ``True`` to treat the last token as ``EOS``.
            crf: ``True`` to enable CRF (:cite:`lafferty2001conditional`).
            n_mlp_rel: Output size of MLPs for representing predicate and tokens.
            mlp_dropout: Dropout applied to MLPs.
            loss_reduction: Loss reduction for aggregating losses.
            doc_level_offset: ``True`` to indicate the offsets in ``jsonlines`` are of document level.
            **kwargs: Not used.
        """
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.vocabs = VocabDict()
Example #14
    def __init__(self,
                 trn: str = None,
                 dev: str = None,
                 tst: str = None,
                 sampler_builder: SamplerBuilder = None,
                 dependencies: str = None,
                 scalar_mix: ScalarMixWithDropoutBuilder = None,
                 use_raw_hidden_states=False,
                 lr=None,
                 separate_optimizer=False,
                 doc_level_offset=True,
                 is_flat_ner=True,
                 tagset=None,
                 ret_tokens=' ',
                 ffnn_size=150,
                 loss_reduction='mean',
                 **kwargs) -> None:
        """An implementation of Named Entity Recognition as Dependency Parsing (:cite:`yu-etal-2020-named`). It treats
        every possible span as an entity candidate and predicts its entity label. Non-entity spans are assigned a NULL
        label so that they can be excluded. The label prediction is done with a biaffine layer (:cite:`dozat:17a`). As it makes no
        assumption about the spans, it naturally supports flat NER and nested NER.

        Args:
            trn: Path to training set.
            dev: Path to dev set.
            tst: Path to test set.
            sampler_builder: A builder which builds a sampler.
            dependencies: Its dependencies on other tasks.
            scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
            use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
            lr: Learning rate for this task.
            separate_optimizer: Use customized separate optimizer for this task.
            doc_level_offset: ``True`` to indicate the offsets in ``jsonlines`` are of document level.
            is_flat_ner: ``True`` for flat NER, otherwise nested NER.
            tagset: Optional tagset to prune entities outside of this tagset from datasets.
            ret_tokens: A delimiter between tokens in entities so that the surface form of an entity can be rebuilt.
            ffnn_size: Feedforward size for MLPs extracting the head/tail representations.
            loss_reduction: The loss reduction used in aggregating losses.
            **kwargs: Not used.
        """
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.vocabs = VocabDict()
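
To make the "every possible span is a candidate" idea concrete, a tiny standalone sketch (not part of the task) enumerating candidate spans of a 4-token sentence:

tokens = ['Alice', 'visited', 'New', 'York']
spans = [(i, j) for i in range(len(tokens)) for j in range(i, len(tokens))]
print(len(spans))   # 10 candidates; each gets a label, NULL meaning "not an entity",
                    # e.g. the span (2, 3) covering 'New York' could be labeled as a location
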
Example #15
File: amr.py Project: lei1993/HanLP
 def __init__(self,
              trn: str = None,
              dev: str = None,
              tst: str = None,
              sampler_builder: SamplerBuilder = None,
              dependencies: str = None,
              scalar_mix: ScalarMixWithDropoutBuilder = None,
              use_raw_hidden_states=False,
              lr=1e-3,
              separate_optimizer=False,
              cls_is_bos=True,
              sep_is_eos=False,
              char2concept_dim=128,
              cnn_filters=((3, 256), ),
              concept_char_dim=32,
              concept_dim=300,
              dropout=0.2,
              embed_dim=512,
              eval_every=20,
              ff_embed_dim=1024,
              graph_layers=2,
              inference_layers=4,
              num_heads=8,
              rel_dim=100,
              snt_layers=4,
              unk_rate=0.33,
              vocab_min_freq=5,
              beam_size=8,
              alpha=0.6,
              max_time_step=100,
              amr_version='2.0',
              **kwargs) -> None:
     super().__init__(**merge_locals_kwargs(locals(), kwargs))
     self.vocabs = VocabDict()
     utils_dir = get_resource(get_amr_utils(amr_version))
     self.sense_restore = NodeRestore(NodeUtilities.from_json(utils_dir))
Example #16
class TorchComponent(Component, ABC):
    def __init__(self, **kwargs) -> None:
        """The base class for all components using PyTorch as backend. It provides common workflows of building vocabs,
        datasets, dataloaders and models. These workflows are more of a conventional guideline than en-forced
        protocols, which means subclass has the freedom to override or completely skip some steps.

        Args:
            **kwargs: Additional arguments to be stored in the ``config`` property.
        """
        super().__init__()
        self.model: Optional[torch.nn.Module] = None
        self.config = SerializableDict(**kwargs)
        self.vocabs = VocabDict()

    def _capture_config(self, locals_: Dict,
                        exclude=(
                                'trn_data', 'dev_data', 'save_dir', 'kwargs', 'self', 'logger', 'verbose',
                                'dev_batch_size', '__class__', 'devices', 'eval_trn')):
        """Save arguments to config

        Args:
          locals_: Dict: 
          exclude:  (Default value = ('trn_data')
          'dev_data': 
          'save_dir': 
          'kwargs': 
          'self': 
          'logger': 
          'verbose': 
          'dev_batch_size': 
          '__class__': 
          'devices'): 

        Returns:

        
        """
        if 'kwargs' in locals_:
            locals_.update(locals_['kwargs'])
        locals_ = dict((k, v) for k, v in locals_.items() if k not in exclude and not k.startswith('_'))
        self.config.update(locals_)
        return self.config

    def save_weights(self, save_dir, filename='model.pt', trainable_only=True, **kwargs):
        """Save model weights to a directory.

        Args:
            save_dir: The directory to save weights into.
            filename: A file name for weights.
            trainable_only: ``True`` to only save trainable weights. Useful when the model contains lots of static
                embeddings.
            **kwargs: Not used for now.
        """
        model = self.model_
        state_dict = model.state_dict()
        if trainable_only:
            trainable_names = set(n for n, p in model.named_parameters() if p.requires_grad)
            state_dict = dict((n, p) for n, p in state_dict.items() if n in trainable_names)
        torch.save(state_dict, os.path.join(save_dir, filename))

    def load_weights(self, save_dir, filename='model.pt', **kwargs):
        """Load weights from a directory.

        Args:
            save_dir: The directory to load weights from.
            filename: A file name for weights.
            **kwargs: Not used.
        """
        save_dir = get_resource(save_dir)
        filename = os.path.join(save_dir, filename)
        # flash(f'Loading model: {filename} [blink]...[/blink][/yellow]')
        self.model_.load_state_dict(torch.load(filename, map_location='cpu'), strict=False)
        # flash('')

    def save_config(self, save_dir, filename='config.json'):
        """Save config into a directory.

        Args:
            save_dir: The directory to save config.
            filename: A file name for config.
        """
        self._savable_config.save_json(os.path.join(save_dir, filename))

    def load_config(self, save_dir, filename='config.json', **kwargs):
        """Load config from a directory.

        Args:
            save_dir: The directory to load config.
            filename: A file name for config.
            **kwargs: K-V pairs to override config.
        """
        save_dir = get_resource(save_dir)
        self.config.load_json(os.path.join(save_dir, filename))
        self.config.update(kwargs)  # overwrite config loaded from disk
        for k, v in self.config.items():
            if isinstance(v, dict) and 'classpath' in v:
                self.config[k] = Configurable.from_config(v)
        self.on_config_ready(**self.config)

    def save_vocabs(self, save_dir, filename='vocabs.json'):
        """Save vocabularies to a directory.

        Args:
            save_dir: The directory to save vocabularies.
            filename:  The name for vocabularies.
        """
        if hasattr(self, 'vocabs'):
            self.vocabs.save_vocabs(save_dir, filename)

    def load_vocabs(self, save_dir, filename='vocabs.json'):
        """Load vocabularies from a directory.

        Args:
            save_dir: The directory to load vocabularies.
            filename:  The name for vocabularies.
        """
        if hasattr(self, 'vocabs'):
            self.vocabs = VocabDict()
            self.vocabs.load_vocabs(save_dir, filename)

    def save(self, save_dir: str, **kwargs):
        """Save this component to a directory.

        Args:
            save_dir: The directory to save this component.
            **kwargs: Not used.
        """
        self.save_config(save_dir)
        self.save_vocabs(save_dir)
        self.save_weights(save_dir)

    def load(self, save_dir: str, devices=None, verbose=HANLP_VERBOSE, **kwargs):
        """Load from a local/remote component.

        Args:
            save_dir: An identifier which can be a local path or a remote URL or a pre-defined string.
            devices: The devices this component will be moved onto.
            verbose: ``True`` to log loading progress.
            **kwargs: To override some configs.
        """
        save_dir = get_resource(save_dir)
        # flash('Loading config and vocabs [blink][yellow]...[/yellow][/blink]')
        if devices is None and self.model:
            devices = self.devices
        self.load_config(save_dir, **kwargs)
        self.load_vocabs(save_dir)
        if verbose:
            flash('Building model [blink][yellow]...[/yellow][/blink]')
        self.model = self.build_model(
            **merge_dict(self.config, training=False, **kwargs, overwrite=True,
                         inplace=True))
        if verbose:
            flash('')
        self.load_weights(save_dir, **kwargs)
        self.to(devices)
        self.model.eval()

    def fit(self,
            trn_data,
            dev_data,
            save_dir,
            batch_size,
            epochs,
            devices=None,
            logger=None,
            seed=None,
            finetune: Union[bool, str] = False,
            eval_trn=True,
            _device_placeholder=False,
            **kwargs):
        """Fit to data, triggers the training procedure. For training set and dev set, they shall be local or remote
        files.

        Args:
            trn_data: Training set.
            dev_data: Development set.
            save_dir: The directory to save trained component.
            batch_size: The number of samples in a batch.
            epochs: Number of epochs.
            devices: Devices this component will live on.
            logger: Any :class:`logging.Logger` instance.
            seed: Random seed to reproduce this training.
            finetune: ``True`` to load from ``save_dir`` instead of creating a randomly initialized component. ``str``
                to specify a different ``save_dir`` to load from.
            eval_trn: Evaluate training set after each update. This can slow down the training but provides a quick
                diagnostic for debugging.
            _device_placeholder: ``True`` to create a placeholder tensor which triggers PyTorch to occupy devices so
                other components won't take these devices as first choices.
            **kwargs: Hyperparameters used by sub-classes.

        Returns:
            Any results sub-classes would like to return. Usually the best metrics on training set.

        """
        # Common initialization steps
        config = self._capture_config(locals())
        if not logger:
            logger = self.build_logger('train', save_dir)
        if not seed:
            self.config.seed = 233 if isdebugging() else int(time.time())
        set_seed(self.config.seed)
        logger.info(self._savable_config.to_json(sort=True))
        if isinstance(devices, list) or devices is None or isinstance(devices, float):
            flash('[yellow]Querying CUDA devices [blink]...[/blink][/yellow]')
            devices = -1 if isdebugging() else cuda_devices(devices)
            flash('')
        # flash(f'Available GPUs: {devices}')
        if isinstance(devices, list):
            first_device = (devices[0] if devices else -1)
        elif isinstance(devices, dict):
            first_device = next(iter(devices.values()))
        elif isinstance(devices, int):
            first_device = devices
        else:
            first_device = -1
        if _device_placeholder and first_device >= 0:
            _dummy_placeholder = self._create_dummy_placeholder_on(first_device)
        if finetune:
            if isinstance(finetune, str):
                self.load(finetune, devices=devices)
            else:
                self.load(save_dir, devices=devices)
            logger.info(
                f'Finetune model loaded with {sum(p.numel() for p in self.model.parameters() if p.requires_grad)}'
                f'/{sum(p.numel() for p in self.model.parameters())} trainable/total parameters.')
        self.on_config_ready(**self.config)
        trn = self.build_dataloader(**merge_dict(config, data=trn_data, batch_size=batch_size, shuffle=True,
                                                 training=True, device=first_device, logger=logger, vocabs=self.vocabs,
                                                 overwrite=True))
        dev = self.build_dataloader(**merge_dict(config, data=dev_data, batch_size=batch_size, shuffle=False,
                                                 training=None, device=first_device, logger=logger, vocabs=self.vocabs,
                                                 overwrite=True)) if dev_data else None
        if not finetune:
            flash('[yellow]Building model [blink]...[/blink][/yellow]')
            self.model = self.build_model(**merge_dict(config, training=True))
            flash('')
            logger.info(f'Model built with {sum(p.numel() for p in self.model.parameters() if p.requires_grad)}'
                        f'/{sum(p.numel() for p in self.model.parameters())} trainable/total parameters.')
            assert self.model, 'build_model is not properly implemented.'
        _description = repr(self.model)
        if len(_description.split('\n')) < 10:
            logger.info(_description)
        self.save_config(save_dir)
        self.save_vocabs(save_dir)
        self.to(devices, logger)
        if _device_placeholder and first_device >= 0:
            del _dummy_placeholder
        criterion = self.build_criterion(**merge_dict(config, trn=trn))
        optimizer = self.build_optimizer(**merge_dict(config, trn=trn, criterion=criterion))
        metric = self.build_metric(**self.config)
        if hasattr(trn.dataset, '__len__') and dev and hasattr(dev.dataset, '__len__'):
            logger.info(f'{len(trn.dataset)}/{len(dev.dataset)} samples in trn/dev set.')
            trn_size = len(trn) // self.config.get('gradient_accumulation', 1)
            ratio_width = len(f'{trn_size}/{trn_size}')
        else:
            ratio_width = None
        return self.execute_training_loop(**merge_dict(config, trn=trn, dev=dev, epochs=epochs, criterion=criterion,
                                                       optimizer=optimizer, metric=metric, logger=logger,
                                                       save_dir=save_dir,
                                                       devices=devices,
                                                       ratio_width=ratio_width,
                                                       trn_data=trn_data,
                                                       dev_data=dev_data,
                                                       eval_trn=eval_trn,
                                                       overwrite=True))

    def build_logger(self, name, save_dir):
        """Build a :class:`logging.Logger`.

        Args:
            name: The name of this logger.
            save_dir: The directory this logger should save logs into.

        Returns:
            logging.Logger: A logger.
        """
        logger = init_logger(name=name, root_dir=save_dir, level=logging.INFO, fmt="%(message)s")
        return logger

    @abstractmethod
    def build_dataloader(self, data, batch_size, shuffle=False, device=None, logger: logging.Logger = None,
                         **kwargs) -> DataLoader:
        """Build dataloader for training, dev and test sets. It's suggested to build vocabs in this method if they are
        not built yet.

        Args:
            data: Data representing samples, which can be a path or a list of samples.
            batch_size: Number of samples per batch.
            shuffle: Whether to shuffle this dataloader.
            device: Device tensors should be loaded onto.
            logger: Logger for reporting messages if the dataloader takes a long time or if vocabs have to be built.
            **kwargs: Arguments from ``**self.config``.
        """
        pass

    def build_vocabs(self, trn: torch.utils.data.Dataset, logger: logging.Logger):
        """Override this method to build vocabs.

        Args:
            trn: Training set.
            logger: Logger for reporting progress.
        """
        pass

    @property
    def _savable_config(self):
        def convert(k, v):
            if not isinstance(v, SerializableDict) and hasattr(v, 'config'):
                v = v.config
            elif isinstance(v, (set, tuple)):
                v = list(v)
            if isinstance(v, dict):
                v = dict(convert(_k, _v) for _k, _v in v.items())
            return k, v

        config = SerializableDict(
            convert(k, v) for k, v in sorted(self.config.items()))
        config.update({
            # 'create_time': now_datetime(),
            'classpath': classpath_of(self),
            'hanlp_version': hanlp.__version__,
        })
        return config

    @abstractmethod
    def build_optimizer(self, **kwargs):
        """Implement this method to build an optimizer.

        Args:
            **kwargs: The subclass decides the method signature.
        """
        pass

    @abstractmethod
    def build_criterion(self, decoder, **kwargs):
        """Implement this method to build criterion (loss function).

        Args:
            decoder: The model or decoder.
            **kwargs: The subclass decides the method signature.
        """
        pass

    @abstractmethod
    def build_metric(self, **kwargs):
        """Implement this to build metric(s).

        Args:
            **kwargs: The subclass decides the method signature.
        """
        pass

    @abstractmethod
    def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs, criterion, optimizer, metric, save_dir,
                              logger: logging.Logger, devices, ratio_width=None,
                              **kwargs):
        """Implement this to run training loop.

        Args:
            trn: Training set.
            dev: Development set.
            epochs: Number of epochs.
            criterion: Loss function.
            optimizer: Optimizer(s).
            metric: Metric(s).
            save_dir: The directory to save this component.
            logger: Logger for reporting progress.
            devices: Devices this component and dataloader will live on.
            ratio_width: The width of dataset size measured in number of characters. Used for logger to align messages.
            **kwargs: Other hyper-parameters passed from sub-class.
        """
        pass

    @abstractmethod
    def fit_dataloader(self, trn: DataLoader, criterion, optimizer, metric, logger: logging.Logger, **kwargs):
        """Fit onto a dataloader.

        Args:
            trn: Training set.
            criterion: Loss function.
            optimizer: Optimizer.
            metric: Metric(s).
            logger: Logger for reporting progress.
            **kwargs: Other hyper-parameters passed from sub-class.
        """
        pass

    @abstractmethod
    def evaluate_dataloader(self, data: DataLoader, criterion: Callable, metric=None, output=False, **kwargs):
        """Evaluate on a dataloader.

        Args:
            data: Dataloader which can be built from any data source.
            criterion: Loss function.
            metric: Metric(s).
            output: Whether to save outputs into some file.
            **kwargs: Not used.
        """
        pass

    @abstractmethod
    def build_model(self, training=True, **kwargs) -> torch.nn.Module:
        """Build model.

        Args:
            training: ``True`` if called during training.
            **kwargs: ``**self.config``.
        """
        raise NotImplementedError

    def evaluate(self, tst_data, save_dir=None, logger: logging.Logger = None, batch_size=None, output=False, **kwargs):
        """Evaluate test set.

        Args:
            tst_data: Test set, which is usually a file path.
            save_dir: The directory to save evaluation scores or predictions.
            logger: Logger for reporting progress.
            batch_size: Batch size for test dataloader.
            output: Whether to save outputs into some file.
            **kwargs: Not used.

        Returns:
            (metric, outputs) where outputs are the return values of ``evaluate_dataloader``.
        """
        if not self.model:
            raise RuntimeError('Call fit or load before evaluate.')
        if isinstance(tst_data, str):
            tst_data = get_resource(tst_data)
            filename = os.path.basename(tst_data)
        else:
            filename = None
        if output is True:
            output = self.generate_prediction_filename(tst_data if isinstance(tst_data, str) else 'test.txt', save_dir)
        if logger is None:
            _logger_name = basename_no_ext(filename) if filename else None
            logger = self.build_logger(_logger_name, save_dir)
        if not batch_size:
            batch_size = self.config.get('batch_size', 32)
        data = self.build_dataloader(**merge_dict(self.config, data=tst_data, batch_size=batch_size, shuffle=False,
                                                  device=self.devices[0], logger=logger, overwrite=True))
        dataset = data
        while dataset and hasattr(dataset, 'dataset'):
            dataset = dataset.dataset
        num_samples = len(dataset) if dataset else None
        if output and isinstance(dataset, TransformableDataset):
            def add_idx(samples):
                for idx, sample in enumerate(samples):
                    if sample:
                        sample[IDX] = idx

            add_idx(dataset.data)
            if dataset.cache:
                add_idx(dataset.cache)

        criterion = self.build_criterion(**self.config)
        metric = self.build_metric(**self.config)
        start = time.time()
        outputs = self.evaluate_dataloader(data, criterion=criterion, filename=filename, output=output, input=tst_data,
                                           save_dir=save_dir,
                                           test=True,
                                           num_samples=num_samples,
                                           **merge_dict(self.config, batch_size=batch_size, metric=metric,
                                                        logger=logger, **kwargs))
        elapsed = time.time() - start
        if logger:
            if num_samples:
                logger.info(f'speed: {num_samples / elapsed:.0f} samples/second')
            else:
                logger.info(f'speed: {len(data) / elapsed:.0f} batches/second')
        return metric, outputs

    def generate_prediction_filename(self, tst_data, save_dir):
        assert isinstance(tst_data,
                          str), 'tst_data has to be a str in order to infer the output name'
        output = os.path.splitext(os.path.basename(tst_data))
        output = os.path.join(save_dir, output[0] + '.pred' + output[1])
        return output

    def to(self,
           devices: Union[int, float, List[int], Dict[str, Union[int, torch.device]]] = None,
           logger: logging.Logger = None, verbose=HANLP_VERBOSE):
        """Move this component to devices.

        Args:
            devices: Target devices.
            logger: Logger for printing progress reports, as copying a model from CPU to GPU can take several seconds.
            verbose: ``True`` to print progress when logger is None.
        """
        if devices == -1 or devices == [-1]:
            devices = []
        elif isinstance(devices, (int, float)) or devices is None:
            devices = cuda_devices(devices)
        if devices:
            if logger:
                logger.info(f'Using GPUs: [on_blue][cyan][bold]{devices}[/bold][/cyan][/on_blue]')
            if isinstance(devices, list):
                if verbose:
                    flash(f'Moving model to GPUs {devices} [blink][yellow]...[/yellow][/blink]')
                self.model = self.model.to(devices[0])
                if len(devices) > 1 and not isdebugging() and not isinstance(self.model, nn.DataParallel):
                    self.model = self.parallelize(devices)
            elif isinstance(devices, dict):
                for name, module in self.model.named_modules():
                    for regex, device in devices.items():
                        try:
                            on_device: torch.device = next(module.parameters()).device
                        except StopIteration:
                            continue
                        if on_device == device:
                            continue
                        if isinstance(device, int):
                            if on_device.index == device:
                                continue
                        if re.match(regex, name):
                            if not name:
                                name = '*'
                            flash(f'Moving module [yellow]{name}[/yellow] to [on_yellow][magenta][bold]{device}'
                                  f'[/bold][/magenta][/on_yellow]: [red]{regex}[/red]\n')
                            module.to(device)
            else:
                raise ValueError(f'Unrecognized devices {devices}')
            if verbose:
                flash('')
        else:
            if logger:
                logger.info('Using CPU')

    def parallelize(self, devices: List[Union[int, torch.device]]):
        return nn.DataParallel(self.model, device_ids=devices)

    @property
    def devices(self):
        """The devices this component lives on.
        """
        if self.model is None:
            return None
        # next(parser.model.parameters()).device
        if hasattr(self.model, 'device_ids'):
            return self.model.device_ids
        device: torch.device = next(self.model.parameters()).device
        return [device]

    @property
    def device(self):
        """The first device this component lives on.
        """
        devices = self.devices
        if not devices:
            return None
        return devices[0]

    def on_config_ready(self, **kwargs):
        """Called when config is ready, either during ``fit`` ot ``load``. Subclass can perform extra initialization
        tasks in this callback.

        Args:
            **kwargs: Not used.
        """
        pass

    @property
    def model_(self) -> nn.Module:
        """
        The actual model, unwrapped from `DataParallel` when necessary.

        Returns: The "real" model

        """
        if isinstance(self.model, nn.DataParallel):
            return self.model.module
        return self.model

    # noinspection PyMethodOverriding
    @abstractmethod
    def predict(self, data: Union[str, List[str]], batch_size: int = None, **kwargs):
        """Predict on data fed by user. Users shall avoid directly call this method since it is not guarded with
        ``torch.no_grad`` and will introduces unnecessary gradient computation. Use ``__call__`` instead.

        Args:
            data: Sentences or tokens.
            batch_size: Decoding batch size.
            **kwargs: Used in sub-classes.
        """
        pass

    @staticmethod
    def _create_dummy_placeholder_on(device):
        if device < 0:
            device = 'cpu:0'
        return torch.zeros(16, 16, device=device)

    @torch.no_grad()
    def __call__(self, data, batch_size=None, **kwargs):
        """Predict on data fed by user. This method calls :meth:`~hanlp.common.torch_component.predict` but decorates
        it with ``torch.no_grad``.

        Args:
            data: Sentences or tokens.
            batch_size: Decoding batch size.
            **kwargs: Used in sub-classes.
        """
        return super().__call__(data, **merge_dict(self.config, overwrite=True,
                                                   batch_size=batch_size or self.config.get('batch_size', None),
                                                   **kwargs))
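
A hedged end-to-end usage sketch of the workflow documented above, reusing the hypothetical ``MyTagger`` subclass sketched after Example #2; all paths and inputs are placeholders:

tagger = MyTagger(lr=1e-3)                   # extra kwargs are stored in ``config``
tagger.fit(
    trn_data='data/train.tsv',
    dev_data='data/dev.tsv',
    save_dir='exp/my_tagger',                # config.json and vocabs.json are written here during fit
    batch_size=32,
    epochs=10,
    devices=None,                            # auto-detect GPUs; -1 forces CPU
)
metric, outputs = tagger.evaluate('data/test.tsv', save_dir='exp/my_tagger', output=True)

# later, in another process: restore and run inference
tagger = MyTagger()
tagger.load('exp/my_tagger', devices=-1)     # loads config, vocabs and weights (model.pt as written
                                             # by the training loop), then switches to eval mode on CPU
print(tagger(['a', 'list', 'of', 'tokens'])) # __call__ wraps predict() in torch.no_grad()

# ``to`` also accepts a dict mapping module-name regexes to devices, e.g.:
# tagger.to({r'encoder.*': 0, r'decoder.*': 1})
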
Example #17
File: amr.py Project: lei1993/HanLP
def batchify(data,
             vocabs: VocabDict,
             unk_rate=0.,
             device=None,
             squeeze=False,
             tokenizer: TransformerSequenceTokenizer = None,
             shuffle_sibling=True,
             levi_graph=False,
             extra_arc=False,
             bart=False):
    rel_vocab: VocabWithFrequency = vocabs.rel
    _tok = list_to_tensor(data['token'], vocabs['token'],
                          unk_rate=unk_rate) if 'token' in vocabs else None
    _lem = list_to_tensor(data['lemma'], vocabs['lemma'], unk_rate=unk_rate)
    _pos = list_to_tensor(data['pos'], vocabs['pos'],
                          unk_rate=unk_rate) if 'pos' in vocabs else None
    _ner = list_to_tensor(data['ner'], vocabs['ner'],
                          unk_rate=unk_rate) if 'ner' in vocabs else None
    _word_char = lists_of_string_to_tensor(
        data['token'], vocabs['word_char']) if 'word_char' in vocabs else None

    local_token2idx = data['token2idx']
    local_idx2token = data['idx2token']
    _cp_seq = list_to_tensor(data['cp_seq'], vocabs['predictable_concept'],
                             local_token2idx)
    _mp_seq = list_to_tensor(data['mp_seq'], vocabs['predictable_concept'],
                             local_token2idx)

    ret = copy(data)
    if 'amr' in data:
        concept, edge = [], []
        for amr in data['amr']:
            if levi_graph == 'kahn':
                concept_i, edge_i = amr.to_levi(rel_vocab.get_frequency,
                                                shuffle=shuffle_sibling)
            else:
                concept_i, edge_i, _ = amr.root_centered_sort(
                    rel_vocab.get_frequency, shuffle=shuffle_sibling)
            concept.append(concept_i)
            edge.append(edge_i)
        if levi_graph is True:
            concept_with_rel, edge_with_rel = levi_amr(concept,
                                                       edge,
                                                       extra_arc=extra_arc)
            concept = concept_with_rel
            edge = edge_with_rel

        augmented_concept = [[DUM] + x + [END] for x in concept]

        _concept_in = list_to_tensor(augmented_concept,
                                     vocabs.get('concept_and_rel',
                                                vocabs['concept']),
                                     unk_rate=unk_rate)[:-1]
        _concept_char_in = lists_of_string_to_tensor(
            augmented_concept, vocabs['concept_char'])[:-1]
        _concept_out = list_to_tensor(augmented_concept,
                                      vocabs['predictable_concept'],
                                      local_token2idx)[1:]

        out_conc_len, bsz = _concept_out.shape
        _rel = np.full((1 + out_conc_len, bsz, out_conc_len),
                       rel_vocab.pad_idx)
        # v indexes [<dummy>, concept_0, ..., concept_{n-1}, <end>]
        # u indexes [<dummy>, concept_0, ..., concept_{n-1}]

        for bidx, (x, y) in enumerate(zip(edge, concept)):
            for l, _ in enumerate(y):
                if l > 0:
                    # l=1 => pos=l+1=2
                    _rel[l + 1, bidx, 1:l + 1] = rel_vocab.get_idx(NIL)
            for v, u, r in x:
                if levi_graph:
                    r = 1
                else:
                    r = rel_vocab.get_idx(r)
                assert v > u, 'Invalid topological order'
                _rel[v + 1, bidx, u + 1] = r
        ret.update({
            'concept_in': _concept_in,
            'concept_char_in': _concept_char_in,
            'concept_out': _concept_out,
            'rel': _rel
        })
    else:
        augmented_concept = None

    token_length = ret.get('token_length', None)
    if token_length is not None and not isinstance(token_length, torch.Tensor):
        ret['token_length'] = torch.tensor(
            token_length,
            dtype=torch.long,
            device=device if
            (isinstance(device, torch.device) or device >= 0) else 'cpu:0')
    ret.update({
        'lem': _lem,
        'tok': _tok,
        'pos': _pos,
        'ner': _ner,
        'word_char': _word_char,
        'copy_seq': np.stack([_cp_seq, _mp_seq], -1),
        'local_token2idx': local_token2idx,
        'local_idx2token': local_idx2token
    })
    if squeeze:
        token_field = make_batch_for_squeeze(data, augmented_concept,
                                             tokenizer, device, ret)
    else:
        token_field = 'token'
    subtoken_to_tensor(token_field, ret)
    if bart:
        make_batch_for_bart(augmented_concept, ret, tokenizer, device)
    move_dict_to_device(ret, device)

    return ret
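
A toy walk-through of the relation-matrix construction in ``batchify`` above, for a single graph with three
concepts and two edges. The vocabulary indices below are made up for illustration; only the indexing pattern
mirrors the loop in the source.

import numpy as np

PAD_IDX, NIL_IDX = 0, 1                      # stand-ins for rel_vocab.pad_idx / rel_vocab.get_idx(NIL)
concept = [['want', 'boy', 'go']]            # one graph in the batch
edge = [[(1, 0, 3), (2, 0, 4)]]              # (v, u, rel_idx) triples with v > u (topological order)

out_conc_len, bsz = len(concept[0]) + 1, 1   # +1 accounts for the <end> token appended to the concepts
rel = np.full((1 + out_conc_len, bsz, out_conc_len), PAD_IDX)
for bidx, (x, y) in enumerate(zip(edge, concept)):
    for l, _ in enumerate(y):
        if l > 0:
            rel[l + 1, bidx, 1:l + 1] = NIL_IDX   # by default, no arc between a concept and earlier ones
    for v, u, r in x:
        rel[v + 1, bidx, u + 1] = r               # overwrite the cell for the gold head with its relation
print(rel[:, 0])                                  # rows: decoding step, columns: candidate head position
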
Example No. 18
    def __init__(self,
                 trn: str = None,
                 dev: str = None,
                 tst: str = None,
                 sampler_builder: SamplerBuilder = None,
                 dependencies: str = None,
                 scalar_mix: ScalarMixWithDropoutBuilder = None,
                 use_raw_hidden_states=False,
                 lr=1e-3,
                 separate_optimizer=False,
                 max_seq_len=None,
                 sent_delimiter=None,
                 char_level=False,
                 hard_constraint=False,
                 tagging_scheme=None,
                 crf=False,
                 delimiter_in_entity=None,
                 merge_types: List[str] = None,
                 secondary_encoder=None,
                 token_key='token',
                 dict_whitelist: Union[DictInterface, Union[Dict[str, Any],
                                                            Set[str]]] = None,
                 dict_blacklist: Union[DictInterface, Union[Dict[str, Any],
                                                            Set[str]]] = None,
                 dict_tags: Union[DictInterface,
                                  Union[Dict[Union[str, Sequence[str]],
                                             Union[str,
                                                   Sequence[str]]]]] = None,
                 **kwargs) -> None:
        r"""A simple tagger using a linear layer with an optional CRF (:cite:`lafferty2001conditional`) layer for
        NER task. It can utilize whitelist gazetteers which is dict mapping from entity name to entity type.
        During decoding, it performs longest-prefix-matching of these words to override the prediction from
        underlying statistical model. It also uses a blacklist to mask out mis-predicted  entities.

        .. Note:: For algorithm beginners, longest-prefix-matching is the prerequisite to understand what dictionary can
            do and what it can't do. The tutorial in `this book <http://nlp.hankcs.com/book.php>`_ can be very helpful.

        Args:
            trn: Path to training set.
            dev: Path to dev set.
            tst: Path to test set.
            sampler_builder: A builder which builds a sampler.
            dependencies: Its dependencies on other tasks.
            scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
            use_raw_hidden_states: Whether to use raw hidden states from transformer without any pooling.
            lr: Learning rate for this task.
            separate_optimizer: Use customized separate optimizer for this task.
            max_seq_len: Sentences longer than ``max_seq_len`` will be split into shorter ones if possible.
            sent_delimiter: Delimiter between sentences, like period or comma, which indicates a long sentence can
                be split here.
            char_level: Whether the sequence length is measured at the character level.
            hard_constraint: Whether to enforce a hard length constraint on sentences. If there is no
                ``sent_delimiter`` in a sentence, it will be split at a token anyway.
            crf: ``True`` to enable CRF (:cite:`lafferty2001conditional`).
            delimiter_in_entity: The delimiter between tokens in an entity, which is used to rebuild the entity by
                joining its tokens during decoding.
            merge_types: The types of consecutive entities to be merged.
            secondary_encoder: An optional secondary encoder to provide enhanced representation by taking the hidden
                states from the main encoder as input.
            token_key: The key to tokens in the dataset. This should always be set to ``token`` in MTL.
            dict_whitelist: A :class:`dict` or a :class:`~hanlp_trie.dictionary.DictInterface` of gazetteer entries
                to be included in the final results.
            dict_blacklist: A :class:`set` or a :class:`~hanlp_trie.dictionary.DictInterface` of known bad cases to
                be excluded from the final results.
            dict_tags: A dict or a :class:`~hanlp_trie.dictionary.DictInterface` mapping a token or a sequence of
                tokens to the tag or sequence of tags it should receive.
            **kwargs: Additional arguments stored in ``config``.
        """
        super().__init__(**merge_locals_kwargs(locals(), kwargs))
        self.vocabs = VocabDict()
        self.secondary_encoder = secondary_encoder
        self.dict_whitelist = dict_whitelist
        self.dict_blacklist = dict_blacklist
        self.dict_tags = dict_tags
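
An illustrative sketch (not HanLP's implementation) of the longest-prefix-matching described in the docstring
above: a whitelist gazetteer maps entity surface forms to types, and at each position the longest matching
entry overrides whatever the statistical model predicted for those tokens.

def longest_prefix_match(tokens, gazetteer):
    """Return (start, end, type) spans for the longest gazetteer entry starting at each position."""
    spans, i = [], 0
    while i < len(tokens):
        best = None
        for j in range(len(tokens), i, -1):        # try the longest candidate span first
            if ' '.join(tokens[i:j]) in gazetteer:
                best = (i, j, gazetteer[' '.join(tokens[i:j])])
                break
        if best:
            spans.append(best)
            i = best[1]                            # jump past the matched entity
        else:
            i += 1
    return spans

print(longest_prefix_match(['New', 'York', 'City', 'is', 'big'],
                           {'New York': 'GPE', 'New York City': 'GPE'}))
# -> [(0, 3, 'GPE')]  (the longer entry wins)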