Python Logging.warn примеры использования

Язык программирования: Python

Пространство имен/Пакет: nnlib.utils

Класс/Тип: Logging

Метод/Функция: warn

Примеров на hotexamples.com: 8

Python Logging.warn - 8 примеров найдено. Это лучшие примеры Python кода для nnlib.utils.Logging.warn, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать / Скрыть

color(9)

warn(8)

verbose(1)

Основные методы

color (9)

warn (8)

verbose (1)

Пример #1

Показать файл

 def _validate(self):
     """
     Check this object's public attributes against the rules from ``self.validate()``.

     ``self.validate()`` must return an iterable of ``(pattern, validator)``
     pairs, or a falsy value when there are no rules. Each ``pattern`` is
     anchored with ``^...$`` and matched against every attribute name in
     ``dir(self)`` that does not start with an underscore; ``validator`` is
     then called with the attribute's value.

     :raises ValidationError: If a validator raises, or returns a falsy value.
         When the validator itself raised, that exception is chained as the
         cause.
     """
     rules = self.validate() or []
     for pattern, validator in rules:
         regex = re.compile(rf"^{pattern}$")
         # Not every callable carries ``__name__`` (``functools.partial``
         # objects and callable instances do not). Fall back to ``repr`` so
         # that reporting a failure cannot itself die with AttributeError.
         validator_name = getattr(validator, '__name__', None)
         if validator_name is None:
             validator_name = repr(validator)

         matches = 0
         for k in dir(self):
             if k.startswith('_') or not regex.match(k):
                 continue
             matches += 1
             v = getattr(self, k)
             try:
                 result = validator(v)
             except Exception as err:
                 # a crashing validator counts as a failed validation
                 raise ValidationError(k, v, validator_name) from err
             if not result:
                 raise ValidationError(k, v, validator_name)
         if matches == 0:
             # a rule that applies to nothing is most likely a typo
             Logging.warn(
                 f"regex \"{pattern}\" did not match any arguments")

Пример #2

Показать файл

    def _parse_type_spec(cls) -> Dict[str, _ArgTypeSpec]:
        """
        Build the type specification of every argument declared on ``cls``.

        Annotations are gathered from ``cls`` and all of its bases (except
        ``object`` and ``Arguments``). ``Switch`` attributes may be left
        un-annotated; every other non-reserved attribute must carry one. The
        result is cached on the class under ``__type_dict__``.

        :return: A dict mapping argument names to their type-specs
        :raises ArgumentError: If an attribute has no type annotation, an
            annotated name is reserved, or a name starts or ends with an
            underscore.
        """
        _attr_name = '__type_dict__'
        # Look in the class's own namespace only: `hasattr` follows the MRO and
        # would hand a subclass the dict cached on one of its base classes.
        if _attr_name in cls.__dict__:
            return cls.__dict__[_attr_name]

        type_dict = {}

        # get annotations from the current class and all its base classes as well
        annotations = {}
        for base in reversed(cls.__mro__):
            if base not in [object, Arguments]:
                annotations.update(base.__dict__.get('__annotations__', {}))

        bad_names = []
        warn_names = []

        def check_name_conventions(name):
            if name.startswith('_') or name.endswith('_'):
                # names should not start or end with underscores
                bad_names.append(name)
            if name != name.lower() or any(ord(c) >= 128 for c in name):
                # names are recommended to contain non-uppercase ASCII characters only
                warn_names.append(name)

        # check that all attributes are annotated, except for `Switch`es
        for arg_name in dir(cls):
            if arg_name.startswith('__') or cls._check_reserved(
                    arg_name):  # magic stuff
                continue
            arg_val = getattr(cls, arg_name)
            if isinstance(arg_val, Arguments.Switch):
                check_name_conventions(arg_name)
                # noinspection PyProtectedMember,PyCallByClass
                type_dict[arg_name.lower()] = Arguments._ArgTypeSpec(
                    Arguments.Switch,
                    nullable=False,
                    required=False,
                    default=arg_val._default)
            elif arg_name not in annotations:
                # Use the merged mapping: `dir(cls)` lists inherited attributes
                # too, and those are annotated on a base class, not on `cls`.
                raise ArgumentError(
                    f"Type is not specified for argument '{arg_name}'. "
                    f"Type annotation can omitted only when argument is a `Switch`."
                )

        # iterate over annotated values and generate type-specs
        for arg_name, arg_typ in annotations.items():
            if cls._check_reserved(arg_name):
                raise ArgumentError(
                    f"'{arg_name}' cannot be used as argument name because it is reserved."
                )

            check_name_conventions(arg_name)

            nullable = False
            # hacky check of whether `arg_typ` is `Optional`: `Optional` is `Union` with `type(None)`
            if getattr(arg_typ, '__origin__',
                       None) is Union and NoneType in arg_typ.__args__:
                nullable = True
                # Extract the type wrapped inside `Optional`. The members of
                # `__args__` are types, so compare by identity; an `isinstance`
                # test never filters `NoneType` out.
                arg_typ = next(t for t in arg_typ.__args__
                               if t is not NoneType)

            arg_val = getattr(cls, arg_name, None)
            # required: no class-level default, or a `None` default on a
            # non-nullable argument
            required = not hasattr(cls, arg_name) or (arg_val is None
                                                      and not nullable)
            type_dict[arg_name] = Arguments._ArgTypeSpec(arg_typ,
                                                         nullable=nullable,
                                                         required=required,
                                                         default=arg_val)

        if len(bad_names) > 0:
            bad_names_str = ', '.join(f"'{s}'" for s in bad_names)
            raise ArgumentError(f"Invalid argument names: {bad_names_str}. "
                                f"Names cannot begin or end with underscores.")
        if len(warn_names) > 0:
            warn_names_str = ', '.join(f"'{s}'" for s in warn_names)
            Logging.warn(
                f"Consider changing these argument names: {warn_names_str}. "
                f"Names are recommended to contain non-uppercase ASCII characters only."
            )

        setattr(cls, _attr_name, type_dict)
        return type_dict

Пример #3

Показать файл

    def __init__(self, *args, **kwargs) -> None:
        """
        Populate the arguments from keyword overrides and a command line.

        Keyword arguments are applied first with ``setattr``; values parsed
        from the command line afterwards overwrite them. Once parsing is done,
        ``preprocess``, the non-null check, ``_validate`` and ``postprocess``
        run in that order, and finally switches are converted to ``bool``.

        :param args: Nothing (``sys.argv`` is parsed) or exactly one positional
            argument, the argument list to parse. Element 0 of the list is
            skipped in both cases.
        :param kwargs: Attribute values set on the instance before parsing.
        :raises ValueError: If more than one positional argument is given, an
            unknown ``--name`` is seen, a ``--name`` has no value following it,
            a value cannot be converted to the declared type, or a
            non-nullable argument ends up ``None``.
        :raises AssertionError: From the ``assert``-based checks below
            (``none`` for a non-nullable argument, a missing path, or an
            invalid choice).
        """
        self._check_types()

        for k, v in kwargs.items():
            setattr(self, k, v)

        # TODO: Add non-null checks
        # TODO: Add "no-" prefix stuff for switches
        # TODO: Generate help by inspecting comments

        if len(args) == 0:
            argv = sys.argv
        elif len(args) == 1:
            argv = args[0]
        else:
            raise ValueError(
                f"Argument class takes zero or one positional arguments but {len(args)} were given"
            )
        i = 1
        while i < len(argv):
            arg: str = argv[i]
            if arg.startswith('--'):
                argname = arg[2:].replace('-', '_')
                # `--no-xxx` turns the switch `xxx` off, unless an argument
                # literally named `no_xxx` exists
                if argname.startswith('no_') and not hasattr(
                        self, argname) and hasattr(self, argname[3:]):
                    attr = getattr(self, argname[3:])
                    if isinstance(attr, Arguments.Switch):
                        attr._value = False
                        i += 1
                        continue

                if hasattr(self, argname):
                    attr = getattr(self, argname)
                    if isinstance(attr, Arguments.Switch):
                        attr._value = True
                        i += 1
                        continue

                    nullable, typ = self._get_arg_type(argname)
                    if i + 1 >= len(argv):
                        # report a trailing `--name` like every other parse
                        # error instead of leaking a bare IndexError
                        raise ValueError(
                            f"Missing value for argument '{argname}'")
                    argval: str = argv[i + 1]
                    if argval.lower() == 'none':
                        if nullable:
                            val = None
                        else:
                            assert typ is str or is_choices(typ), \
                                f"Cannot assign None to non-nullable, non-str argument '{argname}'"
                            val = argval
                    elif isinstance(typ,
                                    custom_types.NoneType):  # type: ignore
                        val = None  # just to suppress "ref before assign" warning
                        try:
                            # priority: low -> high
                            # (the last conversion that succeeds wins)
                            for target_typ in [str, float, int]:
                                val = target_typ(argval)
                        except ValueError:
                            pass
                    elif typ is str:
                        val = argval
                    elif isinstance(
                            typ,
                            custom_types.Path) or typ is custom_types.Path:
                        val = Path(argval)
                        if isinstance(typ, custom_types.Path) and typ.exists:
                            assert val.exists(), ValueError(
                                f"Argument '{argname}' requires an existing path, "
                                f"but '{argval}' does not exist")
                    elif is_choices(typ):
                        val = argval
                        assert val in typ.__values__, f"Invalid value '{val}' for argument '{arg}', " \
                                                      f"available choices are: {typ.__values__}"
                    elif isinstance(typ, type) and issubclass(
                            typ, Arguments.Enum):
                        # experimental support for custom enum
                        # (`issubclass` raises TypeError when its first argument
                        # is not a class, e.g. a `typing` generic alias, hence
                        # the `isinstance` guard)
                        try:
                            # noinspection PyCallingNonCallable
                            val = typ(argval)
                        except ValueError:
                            valid_args = {x.value for x in typ}
                            raise ValueError(
                                f"Invalid value '{argval}' for argument '{argname}', "
                                f"available choices are: {valid_args}"
                            ) from None

                    elif typ is bool:
                        val = argval in ['true', '1', 'True', 'y', 'yes']
                    else:
                        try:
                            val = ast.literal_eval(argval)
                        except (ValueError, SyntaxError):
                            # `literal_eval` raises SyntaxError for text that is
                            # not a Python expression at all
                            raise ValueError(
                                f"Invalid value '{argval}' for argument '{argname}'"
                            ) from None
                    setattr(self, argname, val)
                    i += 2
                else:
                    raise ValueError(f"Invalid argument: '{arg}'")
            else:
                Logging.warn(f"Unrecognized command line argument: '{arg}'")
                i += 1

        if self.pdb:
            # enter IPython debugger on exception
            from IPython.core import ultratb
            ipython_hook = ultratb.FormattedTB(mode='Context',
                                               color_scheme='Linux',
                                               call_pdb=1)

            def excepthook(exc_type, exc_value, exc_traceback):
                if exc_type is KeyboardInterrupt:
                    # don't capture keyboard interrupts (Ctrl+C)
                    sys.__excepthook__(exc_type, exc_value, exc_traceback)
                else:
                    ipython_hook(exc_type, exc_value, exc_traceback)

            sys.excepthook = excepthook

        self.preprocess()

        # check whether non-optional attributes are none
        for arg in dir(self):
            if not arg.startswith('_') and arg not in self._reserved_keys:
                attr = getattr(self, arg)
                nullable, _ = self._get_arg_type(arg)
                if attr is None and not nullable:
                    raise ValueError(f"argument '{arg}' cannot be none")

        self._validate()
        self.postprocess()

        # convert switches to bool
        for arg in dir(self):
            if not arg.startswith('_') and arg not in self._reserved_keys:
                attr = getattr(self, arg)
                typ = self.__annotations__.get(arg, None)
                if isinstance(attr, Arguments.Switch):
                    # noinspection PyProtectedMember
                    setattr(self, arg, bool(attr))
                # string values of `Path`-annotated arguments become `Path`s
                if isinstance(typ, type) and issubclass(
                        typ, Path) and isinstance(attr, str):
                    setattr(self, arg, Path(attr))

Пример #4

Показать файл

    def __init__(self, **kwargs) -> None:
        """
        Populate the arguments from keyword overrides and ``sys.argv``.

        Keyword arguments are applied first with ``setattr``; values parsed
        from ``sys.argv[1:]`` afterwards overwrite them. Once parsing is done,
        ``preprocess``, ``_validate`` and ``postprocess`` run in that order.

        :param kwargs: Attribute values set on the instance before parsing.
        :raises ValueError: If an unknown ``--name`` is seen, a ``--name`` has
            no value following it, or a value cannot be converted to the
            declared type.
        :raises AssertionError: From the ``assert``-based checks below
            (``none`` for a non-nullable argument, a missing path, or an
            invalid choice).
        """
        self._check_types()

        for k, v in kwargs.items():
            setattr(self, k, v)

        # TODO: Add non-null checks
        # TODO: Add "no-" prefix stuff for switches
        # TODO: Generate help by inspecting comments

        i = 1
        while i < len(sys.argv):
            arg: str = sys.argv[i]
            if arg.startswith('--'):
                argname = arg[2:].replace('-', '_')
                # `--no-xxx` turns the switch `xxx` off, unless an argument
                # literally named `no_xxx` exists
                if argname.startswith('no_') and not hasattr(
                        self, argname) and hasattr(self, argname[3:]):
                    attr = getattr(self, argname[3:])
                    if isinstance(attr, Arguments.Switch):
                        attr._value = False
                        i += 1
                        continue

                if hasattr(self, argname):
                    attr = getattr(self, argname)
                    if isinstance(attr, Arguments.Switch):
                        attr._value = True
                        i += 1
                        continue

                    # fall back to the type of the current value when the
                    # argument is not annotated
                    typ = self.__annotations__.get(argname, type(attr))
                    nullable = False
                    # TODO: hacks here
                    if getattr(typ, '__origin__', None) is Union \
                            and type(None) in typ.__args__:
                        # hacky check of whether `typ` is `Optional`
                        nullable = True
                        # The members of `__args__` are types, so pick the
                        # wrapped type by identity; an `isinstance` test never
                        # filters `type(None)` out.
                        typ = next(t for t in typ.__args__
                                   if t is not type(None))
                    if i + 1 >= len(sys.argv):
                        # report a trailing `--name` like every other parse
                        # error instead of leaking a bare IndexError
                        raise ValueError(
                            f"Missing value for argument '{argname}'")
                    argval: str = sys.argv[i + 1]
                    if argval.lower() == 'none':
                        if nullable:
                            val = None
                        else:
                            assert typ is str or is_choices(typ), \
                                f"Cannot assign None to non-nullable, non-str argument '{argname}'"
                            val = argval
                    # NOTE(review): this tests whether `typ` is an *instance* of
                    # `custom_types.NoneType`. If that name is `type(None)`, an
                    # un-annotated argument defaulting to `None` gives
                    # `typ is type(None)` and never reaches this branch.
                    # Confirm the intent before changing it.
                    elif isinstance(typ,
                                    custom_types.NoneType):  # type: ignore
                        val = None  # just to suppress "ref before assign" warning
                        try:
                            # priority: low -> high
                            # (the last conversion that succeeds wins)
                            for target_typ in [str, float, int]:
                                val = target_typ(argval)
                        except ValueError:
                            pass
                    elif typ is str:
                        val = argval
                    elif isinstance(
                            typ,
                            custom_types.Path) or typ is custom_types.Path:
                        val = Path(argval)
                        if isinstance(typ, custom_types.Path) and typ.exists:
                            assert val.exists(), ValueError(
                                f"Argument '{argname}' requires an existing path, "
                                f"but '{argval}' does not exist")
                    elif is_choices(typ):
                        val = argval
                        assert val in typ.__values__, f"Invalid value '{val}' for argument '{arg}', " \
                            f"available choices are: {typ.__values__}"
                    elif isinstance(typ, type) and issubclass(
                            typ, Arguments.Enum):
                        # Experimental support for custom enum. The subclass
                        # test must ask "is `typ` an `Arguments.Enum`?"; the
                        # reverse order never matches a user-defined enum. The
                        # `isinstance` guard keeps non-class annotations (e.g.
                        # `typing` generic aliases) from raising TypeError.
                        try:
                            # noinspection PyCallingNonCallable
                            val = typ(argval)
                        except ValueError:
                            valid_args = {x.value for x in typ}
                            raise ValueError(
                                f"Invalid value '{argval}' for argument '{argname}', "
                                f"available choices are: {valid_args}"
                            ) from None

                    elif typ is bool:
                        val = argval in ['true', '1', 'True', 'y', 'yes']
                    else:
                        try:
                            val = ast.literal_eval(argval)
                        except (ValueError, SyntaxError):
                            # `literal_eval` raises SyntaxError for text that is
                            # not a Python expression at all
                            raise ValueError(
                                f"Invalid value '{argval}' for argument '{argname}'"
                            ) from None
                    setattr(self, argname, val)
                    i += 2
                else:
                    raise ValueError(f"Invalid argument: '{arg}'")
            else:
                Logging.warn(f"Unrecognized command line argument: '{arg}'")
                i += 1

        if self.ipdb:
            # enter IPython debugger on exception
            from IPython.core import ultratb
            sys.excepthook = ultratb.FormattedTB(mode='Context',
                                                 color_scheme='Linux',
                                                 call_pdb=1)

        self.preprocess()
        self._validate()
        self.postprocess()

Пример #5

Показать файл

    def sampling_decode(self, vocab: Dict[str, Vocab], example: LRLMExample,
                        begin_symbol: int = 2, end_symbol: int = 5,
                        initial_hidden: Optional[HiddenState] = None, warm_up: Optional[int] = None,
                        max_length: int = 200, greedy: bool = False, topk: Optional[int] = None,
                        print_info: bool = True, color_outputs: bool = False, show_rel_type: bool = True,
                        sanity_check: bool = False, unkinfo: Optional[Tuple[Tensor, List[str]]] = None, **kwargs) \
            -> SampledOutput:
        r"""
        Sampling for LRLM.

        At every step a predictor is sampled from the selector distribution:
        either a single word is generated from the vocabulary, or a relation
        of ``example`` is chosen and one of its object aliases is copied as a
        multi-token span.

        Output format:
        - Red words:       Copied from canonical form of entity (only with ``color_outputs``).
        - Green words:     Copied from alias form of entity (only with ``color_outputs``).
        - Yellow words:    Warm-up context (only with ``color_outputs``).
        - Blue words:      Replacement for a generated unknown word, taken from ``unkinfo``
                           (colored regardless of ``color_outputs``).
        - word_[type]:     "word" is an entity of type "type".
        - @-@:             A dash in the original text without spaces around, e.g. M @-@ 82 => M-82.

        :param vocab: Vocabulary containing id2word mapping; the ``'word'`` and ``'rel'`` entries are used.
        :param example: The :class:`Example` object of the current topic.
        :param begin_symbol: Start of sentence symbol.
        :param end_symbol: End of sentence symbol. Sampling stops when this symbol is generated.
        :param initial_hidden: Hidden state to start from. If not specified, ``None`` is passed on to the model.
        :param warm_up: Number of tokens to provide as context before performing sampling.
        :param max_length: If generated sentence exceeds specified length, sampling is force terminated.
        :param greedy: If ``True``, use greedy decoding instead of sampling.
        :param topk: If not ``None``, only sample from indices with top-k probabilites.

        :param print_info: If ``True``, print information about sampled result.
        :param color_outputs: If ``True``, include annotations for each output token. Tokens from entities will be
            colored red or green, warm-up tokens yellow.
        :param show_rel_type: If ``True``, show relation types for copied entities.

        :param sanity_check: If ``True``, perform sanity check on generated sample. Mismatches are only reported
            through ``Logging.warn``.

        :param unkinfo: Precomputed unkprobs and the index-to-vocabulary mapping. Only the entries from index
            ``self._vocab_size`` onwards are used.
        :param kwargs: Not used in this method body.

        :return: A :class:`SampledOutput` holding the formatted list of words (``sentence``), the sample loss
            (``sample_loss``), the number of copied entities (``complete_copies``) and ``incomplete_copies=0``.
        :raises ValueError: If the sampled predictor is neither the word nor the relation predictor.
        """
        if unkinfo is not None:
            unkprob, unki2w = unkinfo
            # drop the first `_vocab_size` entries and renormalize what is left
            unkprob = unkprob[self._vocab_size:]
            unki2w = unki2w[self._vocab_size:]
            normalized_unkprob = F.log_softmax(unkprob, dim=0)

        # UNK / INVALID are sentinels stored in `rel_ids` and `surface_indices`;
        # WORD_PREDICTOR / REL_PREDICTOR are the two values the sampled
        # predictor is compared against; EPS is the sanity-check tolerance.
        # noinspection PyPep8Naming
        UNK, INVALID, CANONICAL_IDX, WORD_PREDICTOR, REL_PREDICTOR, EPS = -100, -1, 0, 0, 1, 1e-4

        self.eval()
        self.init_hidden(1, [example.relations])

        word_vocab, rel_vocab = vocab['word'], vocab['rel']

        # pre-bind the device and the decoding strategy to the sampling helpers
        tensor = functools.partial(sample_utils.tensor, device=self.device)
        sample = functools.partial(sample_utils.sample,
                                   greedy=greedy,
                                   topk=topk)
        np_sample = functools.partial(sample_utils.np_sample,
                                      greedy=greedy,
                                      topk=topk)

        # Wraps `inputs` / `spans` into a single-example batch and runs it
        # through `calc_loss`; used for the warm-up prefix and the sanity check.
        # noinspection PyShadowingNames
        def compute_loss(
                inputs: List[int],
                spans: List[MatchedSpan],
                hidden: Optional[HiddenState] = None
        ) -> Tuple[float, HiddenState]:
            batch = SimpleNamespace(
                sequence=tensor(inputs[:-1]),
                target=tensor(inputs[1:]),
                spans=[spans],
                unkprob=None,
                lengths=torch.tensor([len(inputs) - 1], device=self.device),
                ntokens=len(inputs) - 1,
            )
            loss, next_hidden = self.calc_loss(batch,
                                               hidden=hidden)  # type: ignore
            return loss.item(), next_hidden

        # `inputs`, `rel_ids` and `surface_indices` are parallel lists with one
        # entry per generated token
        if warm_up is None:
            inputs = [begin_symbol]
            rel_ids = [INVALID]
            surface_indices = [INVALID]
            spans: List[MatchedSpan] = []
            total_log_prob = 0.0
            marginal_log_prob = 0.0
            hidden = initial_hidden
        else:
            inputs = list(word_vocab.numericalize(example.sentence[:warm_up]))
            rel_ids = [INVALID] * len(
                inputs)  # assume everything is generated from vocabulary
            surface_indices = [INVALID] * len(inputs)
            # keep only the annotated spans that end before the warm-up boundary
            spans = [span for span in example.spans if span.end < warm_up]
            loss, hidden = compute_loss(inputs, spans, initial_hidden)
            # `loss` is multiplied back by the number of predicted tokens to
            # seed both running log-probabilities
            total_log_prob = -loss * (len(inputs) - 1)
            marginal_log_prob = -loss * (len(inputs) - 1)

        while len(inputs) < max_length and inputs[-1] != end_symbol:
            computed_log_probs, new_hidden = self._compute_log_probs(
                tensor(inputs[-1]), hidden)
            predictor, selector_loss = sample(computed_log_probs.selector)

            if predictor == REL_PREDICTOR:
                rel_id, rel_loss = sample(computed_log_probs.rel[0])

                if self._alias_disamb is AliasDisamb.FastText:
                    assert computed_log_probs.alias_logits is not None
                    aliases = example.relations[rel_id].obj_alias
                    alias_vecs = self.alias_vec[aliases]
                    # score every alias of the chosen relation against the alias logits
                    surface_log_prob = F.log_softmax(torch.mv(
                        alias_vecs, computed_log_probs.alias_logits.flatten()),
                                                     dim=0)
                    surface_idx, alias_loss = sample(surface_log_prob)
                    alias = self.alias_list[aliases[surface_idx]]
                else:
                    # can't tell which one under oracle, use the canonical (first) alias
                    surface_idx = 0
                    alias_loss = 0.0
                    alias = example.relations[rel_id].obj_alias[
                        0]  # type: ignore

                # forward the hidden state according to the generated in-vocab tokens
                raw_tokens: List[str] = alias.split()
                token_ids: List[int] = word_vocab.numericalize(raw_tokens)
                if len(raw_tokens) > 1:
                    _, new_hidden = self._compute_log_probs(
                        tensor(token_ids[:-1]), new_hidden)

                # compute marginal probability for current span
                span_inputs = tensor([inputs[-1]] + token_ids[:-1])
                span_computed_log_probs, _ = self._compute_log_probs(
                    span_inputs, hidden)
                # log-probability of producing the same tokens one by one with the word predictor
                word_gen_loss = torch.sum(
                    span_computed_log_probs.selector[0, :, WORD_PREDICTOR] +
                    torch.gather(span_computed_log_probs.word,
                                 index=tensor(token_ids).unsqueeze(-1),
                                 dim=2).flatten()).item()
                # combine the copy path and the word-by-word path
                marginal_log_prob += torch.logsumexp(tensor(
                    [selector_loss + rel_loss + alias_loss, word_gen_loss]),
                                                     dim=1).item()

                spans.append(
                    MatchedSpan(
                        len(inputs) - 1,
                        len(inputs) + len(token_ids) - 1,
                        example.relations[rel_id].rel_typ, rel_id,
                        surface_idx))
                inputs.extend(token_ids)
                # only the first token of a copied span carries the relation /
                # surface index; the remaining ones are marked INVALID
                rel_ids.extend([rel_id] + [INVALID] * (len(token_ids) - 1))
                surface_indices.extend([surface_idx] + [INVALID] *
                                       (len(token_ids) - 1))

                total_log_prob += selector_loss + rel_loss + alias_loss
            elif predictor == WORD_PREDICTOR:
                word, word_loss = sample(computed_log_probs.word)
                total_log_prob += selector_loss + word_loss
                marginal_log_prob += selector_loss + word_loss

                if word == 0 and unkinfo is not None:  # unk
                    unk_idx, unk_loss = np_sample(normalized_unkprob)
                    total_log_prob += unk_loss
                    marginal_log_prob += unk_loss
                    # Ugly multi-purpose use of variables.
                    surface_indices.append(
                        unk_idx)  # Record unk word index in surface_indices.
                    rel_ids.append(UNK)  # Record UNK in rel_ids.
                else:
                    rel_ids.append(INVALID)
                    surface_indices.append(INVALID)

                inputs.append(word)
            else:
                raise ValueError

            hidden = new_hidden

        # average over the number of predicted tokens (the first input is given)
        sample_loss = -total_log_prob / (len(inputs) - 1)
        marginal_loss = -marginal_log_prob / (len(inputs) - 1)
        if print_info:
            print(
                f"Sample loss: {sample_loss:.3f}, PPL: {math.exp(sample_loss):.3f}"
            )
            print(
                f"Marginal sample loss: {marginal_loss:.3f}, PPL: {math.exp(marginal_loss):.3f}"
            )
        # Sanity checks
        if sanity_check:
            # re-score the whole sample in one pass and compare it with the
            # step-by-step results; mismatches are warned about, not raised
            # noinspection PyTypeChecker
            loss_val, gold_hidden = compute_loss(inputs, spans, initial_hidden)
            assert hidden is not None
            hidden_state_diff = max(
                torch.max(torch.abs(g - h)).item()
                for g, h in zip(gold_hidden, hidden))
            if hidden_state_diff > EPS:
                Logging.warn(
                    f"Hidden states do not match. Difference: {hidden_state_diff}"
                )
            if abs(marginal_loss - loss_val) > EPS:
                Logging.warn(
                    f"Marginal loss values do not match. "
                    f"Forward loss: {loss_val}, difference: {abs(marginal_loss - loss_val)}"
                )

        # counts every entry that is not INVALID, so UNK entries are included
        num_rels_generated = sum(int(rel_id != INVALID) for rel_id in rel_ids)
        if print_info:
            print(
                f"Relations [Generated / Annotated]: "
                f"[{num_rels_generated} / {len([s for s in example.spans if s.end < max_length])}]"
            )

        # turn the generated ids back into (optionally colored) text tokens
        words = []
        idx = 0
        copy_count = 0
        while idx < len(inputs):
            is_warm_up = (warm_up is not None and idx < warm_up)
            token_id, rel_id, surface_idx = inputs[idx], rel_ids[
                idx], surface_indices[idx]
            if rel_id == INVALID:
                token = word_vocab.i2w[token_id]
                idx += 1
            elif rel_id == UNK:
                # for UNK entries `surface_idx` holds the sampled unk word index
                token = Logging.color('blue', unki2w[surface_idx])
                idx += 1
            else:
                copy_count += 1
                word_id = example.relations[rel_id].obj_alias[
                    surface_idx]  # multiple words
                token = self.alias_list[word_id]
                # skip all tokens of the copied span at once
                idx += len(token.split())
                if show_rel_type:
                    token = f"{token}_[{rel_vocab.i2w[example.relations[rel_id].rel_typ]}]"
                if color_outputs and not is_warm_up:
                    token = Logging.color(
                        'red' if surface_idx == CANONICAL_IDX else 'green',
                        token)
            if color_outputs and is_warm_up:
                token = Logging.color('yellow', token)
            words.append(token)

        if print_info:
            print(f"# of copied entities: {copy_count}")

        output = SampledOutput(sentence=words,
                               sample_loss=sample_loss,
                               complete_copies=copy_count,
                               incomplete_copies=0)
        return output

Пример #6

Показать файл

def main():
    """
    Convert the aligned dataset into its numericalized form and pickle it.

    Steps, in order: load the article titles / Wikidata IDs of every split,
    load the property-name mapping, read the aligned data, build the relation
    and entity index maps, save the relation names, and finally write one
    ``<split>.pkl`` and one ``<split>.span.pkl`` file per split into
    ``SAVE_DIR``.
    """
    Logging.verbosity_level = Logging.VERBOSE
    Logging.warn("This program requires lots of memory (preferably >= 30GB).")

    output_dir_present = SAVE_DIR.exists()
    if not output_dir_present:
        SAVE_DIR.mkdir(parents=True)

    # Collect the Wikidata ID of every article, split by split
    topic_ids: Set[WikidataID] = set()
    split_title_id: Dict[str, List[Tuple[str, WikidataID]]] = {}
    for split_name in ('train', 'valid', 'test'):
        with utils.work_in_progress(f"Loading {split_name} set titles"):
            with open(TOPIC_JSON_PATH(split=split_name)) as json_file:
                articles = json.load(json_file)
        title_id_pairs = [(article['title'], WikidataID(article['id']))
                          for article in articles]
        split_title_id[split_name] = title_id_pairs
        topic_ids.update(wid for _, wid in title_id_pairs)
        # release the parsed JSON before loading the next split
        del articles

    with utils.work_in_progress("Loading Wikidata ID mapping"):
        id2rel = load_id2str(WIKIDATA_DUMP_DIR / 'properties.txt')

    # Read the aligned examples
    matched_dataset = read_data(ALIGNED_DATA_DIR)

    # Gather every entity and relation type that actually occurs
    found_entities = set()
    found_rels = set()
    for examples in matched_dataset.values():
        for example in examples:
            found_entities.add(example.topic_id)
            for relation in example.relations:
                found_entities.add(relation.obj_id)
                found_rels.add(relation.rel_typ)
    found_entities.discard(UNK_ENTITY)
    found_rels.difference_update((NAF, ANCHOR, TOPIC_ITSELF))

    with utils.work_in_progress("Building rel vecs"):
        rel_map = load_relations(found_rels)
        rel_map[NAF] = -1
        rel_map[ANCHOR] = -2
        rel_map[TOPIC_ITSELF] = -3
        unknown_rels = found_rels.difference(rel_map)
        # A set has no defined order; sorting keeps the numbering identical
        # between runs. Unknown relations get -4, -5, ... going downwards.
        for negated_index, rel_typ in enumerate(sorted(unknown_rels), start=4):
            rel_map[rel_typ] = -negated_index

    with utils.work_in_progress("Building entity vecs"):
        entity_map = load_entities(found_entities)
        entity_map[UNK_ENTITY] = -1
        covered = len(topic_ids.intersection(entity_map))
        print(f"Topic ID coverage: {covered}/{len(topic_ids)}")

    # Relation type names are saved for use during generation
    id_to_rel_name = dict(id2rel)
    id_to_rel_name[NAF] = 'Not-A-Fact'
    id_to_rel_name[ANCHOR] = 'ANCHOR'
    id_to_rel_name[TOPIC_ITSELF] = 'TITLE'
    rel_names: Dict[int, str] = {
        rel_index: id_to_rel_name[raw_rel]
        for raw_rel, rel_index in rel_map.items()
    }
    rel_names_path = SAVE_DIR / 'rel_names.pkl'
    with rel_names_path.open('wb') as out_file:
        pickle.dump(rel_names, out_file)
        print(f"Relation names saved to {rel_names_path}")

    # Numericalize every split and write the final dataset files
    for split_name, raw_examples in matched_dataset.items():
        with utils.work_in_progress(f"Converting {split_name} set"):
            dataset, matched_spans = numericalize_rel(raw_examples, rel_map,
                                                      entity_map)

        dataset_path = SAVE_DIR / f'{split_name}.pkl'
        with dataset_path.open('wb') as out_file:
            pickle.dump(dataset, out_file)
        print(
            f"Dataset split '{split_name}' saved to {dataset_path}, {len(dataset)} examples"
        )

        spans_path = SAVE_DIR / f'{split_name}.span.pkl'
        with spans_path.open('wb') as out_file:
            pickle.dump(matched_spans, out_file)
        print(f"Matched spans split '{split_name}' saved to {spans_path}")

Пример #7

Показать файл

def read_data(path: Path) -> Dict[str, List[RawExampleWikiID]]:
    """
    Load the pickled ``train`` / ``valid`` / ``test`` dumps under ``path`` and convert them into raw examples.

    Every dump entry is a ``(sentence, rels)`` pair. Each element of ``rels`` unpacks as
    ``(obj_id, head_id, span, rel_info, surface, alias)``, with ``rel_info[0][0]`` being the relation type.
    An entry that has no ``"@TITLE@"`` relation provides no topic ID; it is skipped and reported in a
    single warning once all splits have been read.

    :param path: Directory containing ``train.pkl``, ``valid.pkl`` and ``test.pkl``.
    :return: A dict mapping each split name to its list of converted examples.
    """
    title_rel = "@TITLE@"
    bad_examples: List[Tuple[str, int, str]] = []
    data = {}
    for split in ['train', 'valid', 'test']:
        with (path / f'{split}.pkl').open('rb') as f:
            with utils.work_in_progress(f"Loading {split} set"):
                dump: List[RawDump] = pickle.load(f)

            examples = []
            entries = utils.progress(dump, desc='Reading data')
            for example_idx, (sent, rels) in enumerate(entries):
                # Key aliases by (relation type, object ID); a later duplicate key overwrites the earlier value.
                alias_by_key: Dict[Tuple[str, str], List[str]] = {}
                for obj_id, _, _, rel_info, _, alias in rels:
                    alias_by_key[(rel_info[0][0], obj_id)] = alias

                # Sorted, so that the index assigned to each relation is the same on every run.
                relations: List[RelationWikiID] = [
                    RelationWikiID(WikidataID(rel_typ), WikidataID(obj_id), aliases)
                    for (rel_typ, obj_id), aliases in alias_by_key.items()
                ]
                relations.sort()

                index_of_rel: Dict[Tuple[str, str], int] = {}
                for position, (rel_typ, obj_id, _) in enumerate(relations):
                    index_of_rel[(rel_typ, obj_id)] = position

                # Going through a set drops duplicated mentions.
                unique_mentions = {
                    EntityMention(span, surface, index_of_rel[(rel_info[0][0], obj_id)])
                    for obj_id, head_id, span, rel_info, surface, _ in rels
                }
                mentions: List[EntityMention] = list(unique_mentions)

                # The head ID of the "@TITLE@" relation is the topic's WikidataID; without it the entry is unusable.
                for _, head_id, _, rel_info, _, _ in rels:
                    if rel_info[0][0] == title_rel:
                        topic_id = head_id
                        break
                else:
                    bad_examples.append((split, example_idx, ' '.join(sent)[:100]))
                    continue

                # Replace the raw title marker with the TOPIC_ITSELF relation type.
                converted_relations = [
                    RelationWikiID(
                        TOPIC_ITSELF if rel.rel_typ == title_rel else rel.rel_typ,
                        rel.obj, rel.obj_alias)
                    for rel in relations
                ]

                examples.append(
                    RawExampleWikiID(WikidataID(topic_id), sent,
                                     converted_relations, mentions))
            data[split] = examples

    if not bad_examples:
        Logging.verbose("All examples are good")
    else:
        Logging.warn(f"{len(bad_examples)} bad examples:\n"
                     f"{pprint.pformat(bad_examples)}")

    return data

Пример #8

Показать файл

Файл: nklm.py Проект: neulab/lrlm

    def sampling_decode(self, vocab: Dict[str, Vocab], example: NKLMExample,
                        begin_symbol: int = 2, end_symbol: int = 5,
                        initial_hidden: Optional[HiddenState] = None, warm_up: Optional[int] = None,
                        max_length: int = 200, greedy=False, topk=None,
                        fill_incomplete=False, allow_invalid_pos=False,
                        print_info=True, color_outputs=False, color_incomplete=True,
                        show_ellipses=True, show_rel_type=True, show_copy_pos=False,
                        sanity_check=False, unkinfo: Optional[Tuple[Tensor, List[str]]] = None, **kwargs) \
            -> SampledOutput:
        """
        Sampling for NKLM.

        Tokens are produced one at a time: a fact (relation) is sampled first, then the token is either copied
        from an alias of that fact's entity or generated from the word vocabulary. The log-probability of every
        decision is accumulated so that a per-token loss can be reported for the sample.

        Output format:
        - Red words:       Copied from canonical form of entity.
        - Green words:     Copied from alias form of entity.
        - Yellow words:    Warm-up context.
        - Blue words:      An unknown word sampled from ``unkinfo`` (colored regardless of ``color_outputs``).
        - word_[type]:     "word" is an entity of type "type".
        - word...(a_b_c):  "word" is a partially copied entity with remaining suffix "a b c".
        - (a_b_c)...word:  "word" is a partially copied entity with remaining prefix "a b c".
        - @-@:             A dash in the original text without spaces around, e.g. M @-@ 82 => M-82.
        - <X>:             A token copied from an invalid position of an entity.

        :param vocab: Vocabulary containing id2word mapping. The ``'word'`` and ``'rel'`` entries are used.
        :param example: The :class:`Example` object of the current topic.
        :param begin_symbol: Start of sentence symbol.
        :param end_symbol: End of sentence symbol. Sampling stops when this symbol is generated.
        :param initial_hidden: If not specified, default hidden states returned by :meth:`init_hidden` is used.
        :param warm_up: Number of tokens to provide as context before performing sampling.
        :param max_length: If generated sentence exceeds specified length, sampling is force terminated.
        :param greedy: If ``True``, use greedy decoding instead of sampling.
        :param topk: If not ``None``, only sample from indices with top-k probabilities.
        :param fill_incomplete: If ``True``, entities that are partially copied will be completed.
        :param allow_invalid_pos: If ``True``, allowing copying from invalid positions, and use <unk> as input.

        :param print_info: If ``True``, print information about sampled result.
        :param color_outputs: If ``True``, include annotations for each output token. Tokens from entities will be
            colored red.
        :param color_incomplete: If ``True`` and ``color_outputs`` is ``True``, also color partially copied entities.
        :param show_ellipses: If ``True``, show ellipses at beginning or end of partially copied entities.
        :param show_rel_type: If ``True``, show relation types for copied entities.
        :param show_copy_pos: If ``True``, show the position from which the entity tokens are copied.

        :param sanity_check: If ``True``, perform sanity check on generated sample.
        :param unkinfo: Optional pair ``(unkprob, unki2w)``. Entries before ``self._vocab_size`` are dropped from
            both; the remaining scores are log-softmax normalized and used to sample a concrete word whenever
            token ``0`` is generated.
        :param kwargs: Accepted but not referenced in this method.

        :return: A :class:`SampledOutput` with the formatted list of words, the per-token sample loss, and the
            copy counters.
        """

        # Keep only the unk entries at index >= self._vocab_size and normalize their scores, so that a
        # concrete unknown word can be sampled later on.
        if unkinfo is not None:
            unkprob, unki2w = unkinfo
            unkprob = unkprob[self._vocab_size:]
            unki2w = unki2w[self._vocab_size:]
            normalized_unkprob = F.log_softmax(unkprob, dim=0)

        # Sentinels used below: UNK marks a sampled unknown word in rel_ids; INVALID marks "no relation /
        # copy position / surface form"; UNK_TOKEN is the fallback word index; EPS is the tolerance for the
        # sanity check. CANONICAL_IDX is not referenced anywhere else in this method.
        # noinspection PyPep8Naming
        UNK, INVALID, UNK_TOKEN, CANONICAL_IDX, EPS = -100, -1, 0, 0, 1e-4

        self.eval()
        # presumably a batch of size 1 holding this example's relations -- confirm against init_hidden
        self.init_hidden(1, [example.relations])

        word_vocab, rel_vocab = vocab['word'], vocab['rel']

        # Bind the device and decoding options once so the call sites below stay short.
        # NOTE: randint is assigned here but not used in this method.
        tensor = functools.partial(sample_utils.tensor, device=self.device)
        randint = sample_utils.randint
        sample = functools.partial(sample_utils.sample,
                                   greedy=greedy,
                                   topk=topk)
        np_sample = functools.partial(sample_utils.np_sample,
                                      greedy=greedy,
                                      topk=topk)

        # noinspection PyShadowingNames
        def compute_loss(
                inputs: List[int],
                rel_ids: List[int],
                copy_pos: List[int],
                surface_indices: List[int],
                hidden: Optional[HiddenState] = None
        ) -> Tuple[float, HiddenState]:
            """
            Run :meth:`calc_loss` over a whole sequence in one pass.

            ``inputs`` is shifted by one to form the input sequence and the target; the other lists are passed
            through unchanged. Returns the loss as a Python float together with the resulting hidden state.
            """
            batch = SimpleNamespace(
                sequence=tensor(inputs[:-1]),
                target=tensor(inputs[1:]),
                unkprob=None,
                seqs={
                    'rel_ids': tensor(rel_ids),
                    'copy_pos': tensor(copy_pos),
                    'surface_indices': tensor(surface_indices)
                },
                ntokens=len(inputs) - 1,
            )
            loss, next_hidden = self.calc_loss(batch,
                                               hidden=hidden)  # type: ignore
            return loss.item(), next_hidden

        # Initialization
        if warm_up is None:
            # No context: start from the begin symbol with no fact attached to it.
            inputs = [begin_symbol]
            rel_ids = [INVALID]
            copy_pos = [INVALID]
            surface_indices = [INVALID]
            total_log_prob = 0.0
            hidden = initial_hidden
        else:
            # Use the first warm_up positions of the gold example as context.
            inputs = list(word_vocab.numericalize(example.sentence[:warm_up]))
            rel_ids = list(example.rel_ids[:warm_up])
            copy_pos = list(example.copy_pos[:warm_up])
            surface_indices = list(example.surface_indices[:warm_up])
            total_log_prob, hidden = compute_loss(inputs, rel_ids, copy_pos,
                                                  surface_indices,
                                                  initial_hidden)
            # Negate and scale by the number of predicted tokens; presumably this turns an averaged loss
            # back into a summed log-probability -- confirm against calc_loss.
            total_log_prob = -total_log_prob * (len(inputs) - 1)

        # Sampling procedure
        while len(inputs) < max_length and inputs[-1] != end_symbol:
            # Choose the next fact given the last emitted token, relation and copy position.
            fact_log_probs, output, _, next_hidden = \
                self._compute_fact_log_probs(tensor(inputs[-1]), tensor(rel_ids[-1]), tensor(copy_pos[-1]), hidden)
            rel_id, fact_loss = sample(fact_log_probs[0])
            # Shift by one so that sampled index 0 becomes -1 ("no fact"), the value asserted in the
            # generate-word branch below.
            rel_id -= 1
            total_log_prob += fact_loss

            # next_fact_embed: (1, 1, fact_embed_dim)
            next_fact_embed = self._get_fact_embeds(tensor(rel_id))
            copy_indicator, alias_log_probs, pos_log_probs, vocab_log_probs = \
                self._compute_generate_log_probs(output, next_fact_embed, tensor([rel_ids[-1], rel_id]))
            # Draw the copy-vs-generate decision and account for the log-probability of the drawn outcome.
            if torch.bernoulli(copy_indicator).item():
                total_log_prob += torch.log(copy_indicator).item()
                assert rel_id != -1
                # copy entity
                aliases = example.relations[rel_id].obj_alias
                if self._alias_disamb is AliasDisamb.FastText:
                    assert alias_log_probs is not None
                    surface_idx, surface_loss = sample(alias_log_probs[0])
                else:
                    # Without FastText disambiguation the first alias is always used, adding nothing to the total.
                    surface_idx, surface_loss = 0, 0.0
                alias = self.alias_list[aliases[surface_idx]]

                entity: List[str] = alias.split()
                # normalization not required
                # TODO: keep consistent with _mask_invalid_pos setting
                pos, pos_loss = sample(pos_log_probs if allow_invalid_pos else
                                       pos_log_probs.squeeze()[:len(entity)])
                if self._mask_invalid_pos:
                    # Renormalize the position log-probability over the first len(entity) positions.
                    pos_loss -= torch.logsumexp(
                        pos_log_probs.squeeze()[:len(entity)], dim=0).item()
                total_log_prob += surface_loss + pos_loss
                # A position at or past the entity's length maps to UNK_TOKEN; otherwise look the word up,
                # falling back to UNK_TOKEN when it is not in the word vocabulary.
                token = UNK_TOKEN if pos >= len(
                    entity) else word_vocab.w2i.get(entity[pos], UNK_TOKEN)
            else:
                total_log_prob += torch.log(1.0 - copy_indicator).item()
                assert rel_id == -1
                # generate word
                token, token_loss = sample(vocab_log_probs)
                total_log_prob += token_loss
                pos = INVALID
                surface_idx = INVALID

                if token == 0 and unkinfo is not None:  # unk
                    unk_idx, unk_loss = np_sample(normalized_unkprob)
                    total_log_prob += unk_loss
                    # Ugly multi-purpose use of variables.
                    surface_idx = unk_idx  # Record unk word index in surface_indices.
                    rel_id = UNK  # Record UNK in rel_ids.

            inputs.append(token)
            rel_ids.append(rel_id)
            copy_pos.append(pos)
            surface_indices.append(surface_idx)
            hidden = next_hidden

        # Negative log-probability averaged over the predicted tokens.
        # NOTE(review): len(inputs) - 1 is zero when the loop never runs on a single-token input
        # (e.g. max_length <= 1), which would raise ZeroDivisionError -- confirm callers never do this.
        sample_loss = -total_log_prob / (len(inputs) - 1)
        if print_info:
            print(
                f"Sample loss: {sample_loss:.3f}, PPL: {math.exp(sample_loss):.3f}"
            )
        # Sanity checks
        if sanity_check:
            # Re-score the whole sampled sequence in one forward pass and compare it with the values
            # tracked step by step during sampling; mismatches are only warned about.
            loss_val, gold_hidden = compute_loss(inputs, rel_ids, copy_pos,
                                                 surface_indices,
                                                 initial_hidden)
            assert hidden is not None
            hidden_state_diff = max(
                torch.max(torch.abs(g - h)).item()
                for g, h in zip(gold_hidden, hidden))
            if hidden_state_diff > EPS:
                Logging.warn(
                    f"Hidden states do not match. Difference: {hidden_state_diff}"
                )
            if abs(sample_loss - loss_val) > EPS:
                Logging.warn(
                    f"Loss values do not match. "
                    f"Forward loss: {loss_val}, difference: {abs(sample_loss - loss_val)}"
                )

        # Format the output
        sentence = list(zip(inputs, rel_ids, copy_pos, surface_indices))
        words = []
        copy_count = 0  # number of tokens that were copied from an entity
        complete_count = 0  # number of entities copied from position 0 through their last token
        # (rel_id, surface_idx, pos) of the entity copy currently in progress, if any
        last_entity = None
        entity_continuing = False
        for idx, (token, rel_id, pos, surface_idx) in enumerate(sentence):
            is_warm_up = (warm_up is not None and idx < warm_up)
            if rel_id == INVALID:
                # Plain vocabulary word.
                word = word_vocab.i2w[token]
            elif rel_id == UNK:
                # NOTE(review): unki2w is only bound when unkinfo is not None; a warm-up context carrying
                # UNK rel_ids without unkinfo would fail here -- confirm this cannot happen.
                word = Logging.color('blue', unki2w[surface_idx])
            else:
                copy_count += 1
                entity_id = example.relations[rel_id].obj_alias[surface_idx]
                entity = self.alias_list[entity_id].split()
                if pos >= len(entity):
                    word = "<X>"
                else:
                    word = entity[pos]
                if show_copy_pos:
                    word = f"{pos}_{rel_id}_{surface_idx}_{word}"
                # A neighbour belongs to the same copy only if it has the same relation and surface form
                # and the adjacent copy position.
                is_last_word_in_entity = (idx == len(sentence) - 1
                                          or sentence[idx + 1][1:] !=
                                          (rel_id, pos + 1, surface_idx))
                is_first_word_in_entity = (idx == 0 or sentence[idx - 1][1:] !=
                                           (rel_id, pos - 1, surface_idx))
                # add entity tag after the last word
                if show_rel_type and is_last_word_in_entity:
                    word = f"{word}_[{rel_vocab.i2w[example.relations[rel_id].rel_typ]}]"

                # check whether fully copied
                if show_ellipses:
                    # Copy stopped before the entity's end: mark (and optionally spell out) the missing suffix.
                    if pos < len(entity) - 1 and is_last_word_in_entity:
                        word = word + '...' + (
                            f"({'_'.join(entity[(pos + 1):])})"
                            if fill_incomplete else "")
                    # Copy started after the entity's beginning: mark (and optionally spell out) the prefix.
                    if pos > 0 and is_first_word_in_entity:
                        word = (f"({'_'.join(entity[:pos])})"
                                if fill_incomplete else "") + '...' + word

                # Track whether an entity is being copied in order from its first token.
                if entity_continuing:
                    if last_entity == (rel_id, surface_idx,
                                       pos - 1):  # Continuing
                        last_entity = (rel_id, surface_idx, pos)
                    else:
                        entity_continuing = False
                        last_entity = None

                if pos == 0 and not entity_continuing:  # reset
                    entity_continuing = True
                    last_entity = (rel_id, surface_idx, 0)

                # Red for the first surface form (index 0), green for any other alias.
                if color_outputs and not is_warm_up and (color_incomplete
                                                         or entity_continuing):
                    word = Logging.color(
                        'red' if surface_idx == 0 else 'green', word)

                if pos == len(entity) - 1 and entity_continuing:  # commit
                    entity_continuing = False
                    complete_count += 1

            if color_outputs and is_warm_up:
                word = Logging.color('yellow', word)
            words.append(word)

        if print_info:
            print(f"Copied, Completed: {copy_count}, {complete_count}")
        # NOTE(review): incomplete_copies receives the total number of copied tokens (copy_count), not a
        # count of incomplete entities -- confirm this is what consumers of SampledOutput expect.
        sampled_output = SampledOutput(sentence=words,
                                       sample_loss=sample_loss,
                                       complete_copies=complete_count,
                                       incomplete_copies=copy_count)
        return sampled_output