Пример #1
0
 def _validate(self):
     rules = self.validate() or []
     for pattern, validator in rules:
         regex = re.compile(rf"^{pattern}$")
         matches = 0
         for k in dir(self):
             if not k.startswith('_') and regex.match(k):
                 matches += 1
                 v = getattr(self, k)
                 try:
                     result = validator(v)
                 except Exception:
                     raise ValidationError(k, v, validator.__name__)
                 else:
                     if not result:
                         raise ValidationError(k, v, validator.__name__)
         if matches == 0:
             Logging.warn(
                 f"regex \"{pattern}\" did not match any arguments")
Пример #2
0
    def _parse_type_spec(cls) -> Dict[str, _ArgTypeSpec]:
        """
        :return: A dict mapping argument names to their type-specs
        """
        _attr_name = '__type_dict__'
        if hasattr(cls, _attr_name):
            return getattr(cls, _attr_name)

        type_dict = {}

        # get annotations from the current class and all its base classes as well
        annotations = {}
        for base in reversed(cls.__mro__):
            if base not in [object, Arguments]:
                annotations.update(base.__dict__.get('__annotations__', {}))

        bad_names = []
        warn_names = []

        def check_name_conventions(name):
            if name.startswith('_') or name.endswith('_'):
                # names should not start or begin with underscores
                bad_names.append(name)
            if name != name.lower() or any(ord(c) >= 128 for c in name):
                # names are recommended to contain non-uppercase ASCII characters only
                warn_names.append(name)

        # check that all attributes are annotated, except for `Switch`es
        for arg_name in dir(cls):
            if arg_name.startswith('__') or cls._check_reserved(
                    arg_name):  # magic stuff
                continue
            arg_val = getattr(cls, arg_name)
            if isinstance(arg_val, Arguments.Switch):
                check_name_conventions(arg_name)
                # noinspection PyProtectedMember,PyCallByClass
                type_dict[arg_name.lower()] = Arguments._ArgTypeSpec(
                    Arguments.Switch,
                    nullable=False,
                    required=False,
                    default=arg_val._default)
            elif arg_name not in cls.__annotations__:
                raise ArgumentError(
                    f"Type is not specified for argument '{arg_name}'. "
                    f"Type annotation can omitted only when argument is a `Switch`."
                )

        # iterate over annotated values and generate type-specs
        for arg_name, arg_typ in annotations.items():
            if cls._check_reserved(arg_name):
                raise ArgumentError(
                    f"'{arg_name}' cannot be used as argument name because it is reserved."
                )

            check_name_conventions(arg_name)

            nullable = False
            # hacky check of whether `arg_typ` is `Optional`: `Optional` is `Union` with `type(None)`
            if getattr(arg_typ, '__origin__',
                       None) is Union and NoneType in arg_typ.__args__:
                nullable = True
                # extract the type wrapped inside `Optional`
                arg_typ = next(t for t in arg_typ.__args__
                               if not isinstance(t, NoneType))  # type: ignore

            arg_val = getattr(cls, arg_name, None)
            required = not hasattr(cls, arg_name) or (arg_val is None
                                                      and not nullable)
            type_dict[arg_name] = Arguments._ArgTypeSpec(arg_typ,
                                                         nullable=nullable,
                                                         required=required,
                                                         default=arg_val)

        if len(bad_names) > 0:
            bad_names_str = ', '.join(f"'{s}'" for s in bad_names)
            raise ArgumentError(f"Invalid argument names: {bad_names_str}. "
                                f"Names cannot begin or end with underscores.")
        if len(warn_names) > 0:
            warn_names_str = ', '.join(f"'{s}'" for s in warn_names)
            Logging.warn(
                f"Consider changing these argument names: {warn_names_str}. "
                f"Names are recommended to contain non-uppercase ASCII characters only."
            )

        setattr(cls, _attr_name, type_dict)
        return type_dict
Пример #3
0
    def __init__(self, *args, **kwargs) -> None:
        self._check_types()

        for k, v in kwargs.items():
            setattr(self, k, v)

        # TODO: Add non-null checks
        # TODO: Add "no-" prefix stuff for switches
        # TODO: Generate help by inspecting comments

        if len(args) == 0:
            argv = sys.argv
        elif len(args) == 1:
            argv = args[0]
        else:
            raise ValueError(
                f"Argument class takes zero or one positional arguments but {len(args)} were given"
            )
        i = 1
        while i < len(argv):
            arg: str = argv[i]
            if arg.startswith('--'):
                argname = arg[2:].replace('-', '_')
                if argname.startswith('no_') and not hasattr(
                        self, argname) and hasattr(self, argname[3:]):
                    attr = getattr(self, argname[3:])
                    if isinstance(attr, Arguments.Switch):
                        attr._value = False
                        i += 1
                        continue

                if hasattr(self, argname):
                    attr = getattr(self, argname)
                    if isinstance(attr, Arguments.Switch):
                        attr._value = True
                        i += 1
                        continue

                    nullable, typ = self._get_arg_type(argname)
                    argval: str = argv[i + 1]
                    if argval.lower() == 'none':
                        if nullable:
                            val = None
                        else:
                            assert typ is str or is_choices(typ), \
                                f"Cannot assign None to non-nullable, non-str argument '{argname}'"
                            val = argval
                    elif isinstance(typ,
                                    custom_types.NoneType):  # type: ignore
                        val = None  # just to suppress "ref before assign" warning
                        try:
                            # priority: low -> high
                            for target_typ in [str, float, int]:
                                val = target_typ(argval)
                        except ValueError:
                            pass
                    elif typ is str:
                        val = argval
                    elif isinstance(
                            typ,
                            custom_types.Path) or typ is custom_types.Path:
                        val = Path(argval)
                        if isinstance(typ, custom_types.Path) and typ.exists:
                            assert val.exists(), ValueError(
                                f"Argument '{argname}' requires an existing path, "
                                f"but '{argval}' does not exist")
                    elif is_choices(typ):
                        val = argval
                        assert val in typ.__values__, f"Invalid value '{val}' for argument '{arg}', " \
                                                      f"available choices are: {typ.__values__}"
                    elif issubclass(typ, Arguments.Enum):
                        # experimental support for custom enum
                        try:
                            # noinspection PyCallingNonCallable
                            val = typ(argval)
                        except ValueError:
                            valid_args = {x.value for x in typ}
                            raise ValueError(
                                f"Invalid value '{argval}' for argument '{argname}', "
                                f"available choices are: {valid_args}"
                            ) from None

                    elif typ is bool:
                        val = argval in ['true', '1', 'True', 'y', 'yes']
                    else:
                        try:
                            val = ast.literal_eval(argval)
                        except ValueError:
                            raise ValueError(
                                f"Invalid value '{argval}' for argument '{argname}'"
                            ) from None
                    setattr(self, argname, val)
                    i += 2
                else:
                    raise ValueError(f"Invalid argument: '{arg}'")
            else:
                Logging.warn(f"Unrecognized command line argument: '{arg}'")
                i += 1

        if self.pdb:
            # enter IPython debugger on exception
            from IPython.core import ultratb
            ipython_hook = ultratb.FormattedTB(mode='Context',
                                               color_scheme='Linux',
                                               call_pdb=1)

            def excepthook(type, value, traceback):
                if type is KeyboardInterrupt:
                    # don't capture keyboard interrupts (Ctrl+C)
                    sys.__excepthook__(type, value, traceback)
                else:
                    ipython_hook(type, value, traceback)

            sys.excepthook = excepthook

        self.preprocess()

        # check whether non-optional attributes are none
        for arg in dir(self):
            if not arg.startswith('_') and arg not in self._reserved_keys:
                attr = getattr(self, arg)
                nullable, _ = self._get_arg_type(arg)
                if attr is None and not nullable:
                    raise ValueError(f"argument '{arg}' cannot be none")

        self._validate()
        self.postprocess()

        # convert switches to bool
        for arg in dir(self):
            if not arg.startswith('_') and arg not in self._reserved_keys:
                attr = getattr(self, arg)
                typ = self.__annotations__.get(arg, None)
                if isinstance(attr, Arguments.Switch):
                    # noinspection PyProtectedMember
                    setattr(self, arg, bool(attr))
                if isinstance(typ, type) and issubclass(
                        typ, Path) and isinstance(attr, str):
                    setattr(self, arg, Path(attr))
Пример #4
0
    def __init__(self, **kwargs) -> None:
        self._check_types()

        for k, v in kwargs.items():
            setattr(self, k, v)

        # TODO: Add non-null checks
        # TODO: Add "no-" prefix stuff for switches
        # TODO: Generate help by inspecting comments

        i = 1
        while i < len(sys.argv):
            arg: str = sys.argv[i]
            if arg.startswith('--'):
                argname = arg[2:].replace('-', '_')
                if argname.startswith('no_') and not hasattr(
                        self, argname) and hasattr(self, argname[3:]):
                    attr = getattr(self, argname[3:])
                    if isinstance(attr, Arguments.Switch):
                        attr._value = False
                        i += 1
                        continue

                if hasattr(self, argname):
                    attr = getattr(self, argname)
                    if isinstance(attr, Arguments.Switch):
                        attr._value = True
                        i += 1
                        continue

                    typ = self.__annotations__.get(argname, type(attr))
                    nullable = False
                    # TODO: hacks here
                    if hasattr(
                            typ,
                            '__origin__') and typ.__origin__ == Union and type(
                                None) in typ.__args__:
                        # hacky check of whether `typ` is `Optional`
                        nullable = True
                        typ = next(t for t in typ.__args__
                                   if not isinstance(t, custom_types.NoneType)
                                   )  # type: ignore
                    argval: str = sys.argv[i + 1]
                    if argval.lower() == 'none':
                        if nullable:
                            val = None
                        else:
                            assert typ is str or is_choices(typ), \
                                f"Cannot assign None to non-nullable, non-str argument '{argname}'"
                            val = argval
                    elif isinstance(typ,
                                    custom_types.NoneType):  # type: ignore
                        val = None  # just to suppress "ref before assign" warning
                        try:
                            # priority: low -> high
                            for target_typ in [str, float, int]:
                                val = target_typ(argval)
                        except ValueError:
                            pass
                    elif typ is str:
                        val = argval
                    elif isinstance(
                            typ,
                            custom_types.Path) or typ is custom_types.Path:
                        val = Path(argval)
                        if isinstance(typ, custom_types.Path) and typ.exists:
                            assert val.exists(), ValueError(
                                f"Argument '{argname}' requires an existing path, "
                                f"but '{argval}' does not exist")
                    elif is_choices(typ):
                        val = argval
                        assert val in typ.__values__, f"Invalid value '{val}' for argument '{arg}', " \
                            f"available choices are: {typ.__values__}"
                    elif issubclass(Arguments.Enum, typ):
                        # experimental support for custom enum
                        try:
                            # noinspection PyCallingNonCallable
                            val = typ(argval)
                        except ValueError:
                            valid_args = {x.value for x in typ}
                            raise ValueError(
                                f"Invalid value '{argval}' for argument '{argname}', "
                                f"available choices are: {valid_args}"
                            ) from None

                    elif typ is bool:
                        val = argval in ['true', '1', 'True', 'y', 'yes']
                    else:
                        try:
                            val = ast.literal_eval(argval)
                        except ValueError:
                            raise ValueError(
                                f"Invalid value '{argval}' for argument '{argname}'"
                            ) from None
                    setattr(self, argname, val)
                    i += 2
                else:
                    raise ValueError(f"Invalid argument: '{arg}'")
            else:
                Logging.warn(f"Unrecognized command line argument: '{arg}'")
                i += 1

        if self.ipdb:
            # enter IPython debugger on exception
            from IPython.core import ultratb
            sys.excepthook = ultratb.FormattedTB(mode='Context',
                                                 color_scheme='Linux',
                                                 call_pdb=1)

        self.preprocess()
        self._validate()
        self.postprocess()
Пример #5
0
    def sampling_decode(self, vocab: Dict[str, Vocab], example: LRLMExample,
                        begin_symbol: int = 2, end_symbol: int = 5,
                        initial_hidden: Optional[HiddenState] = None, warm_up: Optional[int] = None,
                        max_length: int = 200, greedy: bool = False, topk: Optional[int] = None,
                        print_info: bool = True, color_outputs: bool = False, show_rel_type: bool = True,
                        sanity_check: bool = False, unkinfo: Optional[Tuple[Tensor, List[str]]] = None, **kwargs) \
            -> SampledOutput:
        r"""
        Sampling for LRLM.

        Output format:
        - Red words:       Copied from canonical form of entity.
        - Green words:     Copied from alias form of entity.
        - Yellow words:    Warm-up context.
        - word_[type]:     "word" is an entity of type "type".
        - @-@:             A dash in the original text without spaces around, e.g. M @-@ 82 => M-82.

        :param vocab: Vocabulary containing id2word mapping.
        :param example: The :class:`Example` object of the current topic.
        :param begin_symbol: Start of sentence symbol.
        :param end_symbol: End of sentence symbol. Sampling stops when this symbol is generated.
        :param initial_hidden: If not specified, default hidden states returned by :meth:`init_hidden` is used.
        :param warm_up: Number of tokens to provide as context before performing sampling.
        :param max_length: If generated sentence exceeds specified length, sampling is force terminated.
        :param greedy: If ``True``, use greedy decoding instead of sampling.
        :param topk: If not ``None``, only sample from indices with top-k probabilites.

        :param print_info: If ``True``, print information about sampled result.
        :param color_outputs: If ``True``, include annotations for each output token. Tokens from entities will be
            colored red.
        :param show_rel_type: If ``True``, show relation types for copied entities.

        :param sanity_check: If ``True``, perform sanity check on generated sample.

        :param unkinfo: Precomputed unkprobs and the index-to-vocabulary mapping.

        :return: A tuple of (loss_value, formatted list of words).
        """
        if unkinfo is not None:
            unkprob, unki2w = unkinfo
            unkprob = unkprob[self._vocab_size:]
            unki2w = unki2w[self._vocab_size:]
            normalized_unkprob = F.log_softmax(unkprob, dim=0)

        # noinspection PyPep8Naming
        UNK, INVALID, CANONICAL_IDX, WORD_PREDICTOR, REL_PREDICTOR, EPS = -100, -1, 0, 0, 1, 1e-4

        self.eval()
        self.init_hidden(1, [example.relations])

        word_vocab, rel_vocab = vocab['word'], vocab['rel']

        tensor = functools.partial(sample_utils.tensor, device=self.device)
        sample = functools.partial(sample_utils.sample,
                                   greedy=greedy,
                                   topk=topk)
        np_sample = functools.partial(sample_utils.np_sample,
                                      greedy=greedy,
                                      topk=topk)

        # noinspection PyShadowingNames
        def compute_loss(
                inputs: List[int],
                spans: List[MatchedSpan],
                hidden: Optional[HiddenState] = None
        ) -> Tuple[float, HiddenState]:
            batch = SimpleNamespace(
                sequence=tensor(inputs[:-1]),
                target=tensor(inputs[1:]),
                spans=[spans],
                unkprob=None,
                lengths=torch.tensor([len(inputs) - 1], device=self.device),
                ntokens=len(inputs) - 1,
            )
            loss, next_hidden = self.calc_loss(batch,
                                               hidden=hidden)  # type: ignore
            return loss.item(), next_hidden

        if warm_up is None:
            inputs = [begin_symbol]
            rel_ids = [INVALID]
            surface_indices = [INVALID]
            spans: List[MatchedSpan] = []
            total_log_prob = 0.0
            marginal_log_prob = 0.0
            hidden = initial_hidden
        else:
            inputs = list(word_vocab.numericalize(example.sentence[:warm_up]))
            rel_ids = [INVALID] * len(
                inputs)  # assume everything is generated from vocabulary
            surface_indices = [INVALID] * len(inputs)
            spans = [span for span in example.spans if span.end < warm_up]
            loss, hidden = compute_loss(inputs, spans, initial_hidden)
            total_log_prob = -loss * (len(inputs) - 1)
            marginal_log_prob = -loss * (len(inputs) - 1)

        while len(inputs) < max_length and inputs[-1] != end_symbol:
            computed_log_probs, new_hidden = self._compute_log_probs(
                tensor(inputs[-1]), hidden)
            predictor, selector_loss = sample(computed_log_probs.selector)

            if predictor == REL_PREDICTOR:
                rel_id, rel_loss = sample(computed_log_probs.rel[0])

                if self._alias_disamb is AliasDisamb.FastText:
                    assert computed_log_probs.alias_logits is not None
                    aliases = example.relations[rel_id].obj_alias
                    alias_vecs = self.alias_vec[aliases]
                    surface_log_prob = F.log_softmax(torch.mv(
                        alias_vecs, computed_log_probs.alias_logits.flatten()),
                                                     dim=0)
                    surface_idx, alias_loss = sample(surface_log_prob)
                    alias = self.alias_list[aliases[surface_idx]]
                else:
                    # can't tell which one under oracle, use the canonical (first) alias
                    surface_idx = 0
                    alias_loss = 0.0
                    alias = example.relations[rel_id].obj_alias[
                        0]  # type: ignore

                # forward the hidden state according to the generated in-vocab tokens
                raw_tokens: List[str] = alias.split()
                token_ids: List[int] = word_vocab.numericalize(raw_tokens)
                if len(raw_tokens) > 1:
                    _, new_hidden = self._compute_log_probs(
                        tensor(token_ids[:-1]), new_hidden)

                # compute marginal probability for current span
                span_inputs = tensor([inputs[-1]] + token_ids[:-1])
                span_computed_log_probs, _ = self._compute_log_probs(
                    span_inputs, hidden)
                word_gen_loss = torch.sum(
                    span_computed_log_probs.selector[0, :, WORD_PREDICTOR] +
                    torch.gather(span_computed_log_probs.word,
                                 index=tensor(token_ids).unsqueeze(-1),
                                 dim=2).flatten()).item()
                marginal_log_prob += torch.logsumexp(tensor(
                    [selector_loss + rel_loss + alias_loss, word_gen_loss]),
                                                     dim=1).item()

                spans.append(
                    MatchedSpan(
                        len(inputs) - 1,
                        len(inputs) + len(token_ids) - 1,
                        example.relations[rel_id].rel_typ, rel_id,
                        surface_idx))
                inputs.extend(token_ids)
                rel_ids.extend([rel_id] + [INVALID] * (len(token_ids) - 1))
                surface_indices.extend([surface_idx] + [INVALID] *
                                       (len(token_ids) - 1))

                total_log_prob += selector_loss + rel_loss + alias_loss
            elif predictor == WORD_PREDICTOR:
                word, word_loss = sample(computed_log_probs.word)
                total_log_prob += selector_loss + word_loss
                marginal_log_prob += selector_loss + word_loss

                if word == 0 and unkinfo is not None:  # unk
                    unk_idx, unk_loss = np_sample(normalized_unkprob)
                    total_log_prob += unk_loss
                    marginal_log_prob += unk_loss
                    # Ugly multi-purpose use of variables.
                    surface_indices.append(
                        unk_idx)  # Record unk word index in surface_indices.
                    rel_ids.append(UNK)  # Record UNK in rel_ids.
                else:
                    rel_ids.append(INVALID)
                    surface_indices.append(INVALID)

                inputs.append(word)
            else:
                raise ValueError

            hidden = new_hidden

        sample_loss = -total_log_prob / (len(inputs) - 1)
        marginal_loss = -marginal_log_prob / (len(inputs) - 1)
        if print_info:
            print(
                f"Sample loss: {sample_loss:.3f}, PPL: {math.exp(sample_loss):.3f}"
            )
            print(
                f"Marginal sample loss: {marginal_loss:.3f}, PPL: {math.exp(marginal_loss):.3f}"
            )
        # Sanity checks
        if sanity_check:
            # noinspection PyTypeChecker
            loss_val, gold_hidden = compute_loss(inputs, spans, initial_hidden)
            assert hidden is not None
            hidden_state_diff = max(
                torch.max(torch.abs(g - h)).item()
                for g, h in zip(gold_hidden, hidden))
            if hidden_state_diff > EPS:
                Logging.warn(
                    f"Hidden states do not match. Difference: {hidden_state_diff}"
                )
            if abs(marginal_loss - loss_val) > EPS:
                Logging.warn(
                    f"Marginal loss values do not match. "
                    f"Forward loss: {loss_val}, difference: {abs(marginal_loss - loss_val)}"
                )

        num_rels_generated = sum(int(rel_id != INVALID) for rel_id in rel_ids)
        if print_info:
            print(
                f"Relations [Generated / Annotated]: "
                f"[{num_rels_generated} / {len([s for s in example.spans if s.end < max_length])}]"
            )

        words = []
        idx = 0
        copy_count = 0
        while idx < len(inputs):
            is_warm_up = (warm_up is not None and idx < warm_up)
            token_id, rel_id, surface_idx = inputs[idx], rel_ids[
                idx], surface_indices[idx]
            if rel_id == INVALID:
                token = word_vocab.i2w[token_id]
                idx += 1
            elif rel_id == UNK:
                token = Logging.color('blue', unki2w[surface_idx])
                idx += 1
            else:
                copy_count += 1
                word_id = example.relations[rel_id].obj_alias[
                    surface_idx]  # multiple words
                token = self.alias_list[word_id]
                idx += len(token.split())
                if show_rel_type:
                    token = f"{token}_[{rel_vocab.i2w[example.relations[rel_id].rel_typ]}]"
                if color_outputs and not is_warm_up:
                    token = Logging.color(
                        'red' if surface_idx == CANONICAL_IDX else 'green',
                        token)
            if color_outputs and is_warm_up:
                token = Logging.color('yellow', token)
            words.append(token)

        if print_info:
            print(f"# of copied entities: {copy_count}")

        output = SampledOutput(sentence=words,
                               sample_loss=sample_loss,
                               complete_copies=copy_count,
                               incomplete_copies=0)
        return output
Пример #6
0
def main():
    Logging.verbosity_level = Logging.VERBOSE

    Logging.warn("This program requires lots of memory (preferably >= 30GB).")

    if not SAVE_DIR.exists():
        SAVE_DIR.mkdir(parents=True)

    # Read the Wikimedia IDs for each article, and filter the relations
    topic_ids: Set[WikidataID] = set()
    split_title_id: Dict[str, List[Tuple[str, WikidataID]]] = {}
    for split in ['train', 'valid', 'test']:
        with utils.work_in_progress(f"Loading {split} set titles"), \
             open(TOPIC_JSON_PATH(split=split)) as f:
            j = json.load(f)
        split_title_id[split] = [(article['title'], WikidataID(article['id']))
                                 for article in j]
        topic_ids.update([wid for _, wid in split_title_id[split]])
        del j

    with utils.work_in_progress("Loading Wikidata ID mapping"):
        id2rel = load_id2str(WIKIDATA_DUMP_DIR / 'properties.txt')

    # Match the relations
    matched_dataset = read_data(ALIGNED_DATA_DIR)

    # Gather entities & relation vectors
    found_entities = set()
    found_rels = set()
    for split in matched_dataset:
        for example in matched_dataset[split]:
            found_entities.add(example.topic_id)
            for rel in example.relations:
                found_entities.add(rel.obj_id)
                found_rels.add(rel.rel_typ)
    found_entities -= {UNK_ENTITY}
    found_rels -= {NAF, ANCHOR, TOPIC_ITSELF}
    with utils.work_in_progress("Building rel vecs"):
        rel_map = load_relations(found_rels)
        rel_map.update({NAF: -1, ANCHOR: -2, TOPIC_ITSELF: -3})
        unk_rels = found_rels.difference(rel_map)
        # NOTE: unk_rels is a set, its order is undetermined, so we sort it to make sure it's consistent between runs
        for idx, rel in enumerate(sorted(unk_rels)):
            rel_map[rel] = -idx - 4  # starting from -4, going towards -inf
    with utils.work_in_progress("Building entity vecs"):
        entity_map = load_entities(found_entities)
        entity_map.update({UNK_ENTITY: -1})
        print(
            f"Topic ID coverage: {len(topic_ids.intersection(entity_map))}/{len(topic_ids)}"
        )

    # save relation type names for use during generation
    id_to_rel_name = dict(id2rel)
    id_to_rel_name.update({
        NAF: 'Not-A-Fact',
        ANCHOR: 'ANCHOR',
        TOPIC_ITSELF: 'TITLE'
    })
    rel_names: Dict[int, str] = {}
    for r_rel, rel_id in rel_map.items():
        rel_names[rel_id] = id_to_rel_name[r_rel]
    with (SAVE_DIR / 'rel_names.pkl').open('wb') as f:
        pickle.dump(rel_names, f)
        print(f"Relation names saved to {(SAVE_DIR / 'rel_names.pkl')}")

    # Convert into numbers to create the final dataset
    for split in matched_dataset:
        with utils.work_in_progress(f"Converting {split} set"):
            dataset, matched_spans = numericalize_rel(matched_dataset[split],
                                                      rel_map, entity_map)

        path = SAVE_DIR / f'{split}.pkl'
        with path.open('wb') as f:
            pickle.dump(dataset, f)
        print(
            f"Dataset split '{split}' saved to {path}, {len(dataset)} examples"
        )

        path = SAVE_DIR / f'{split}.span.pkl'
        with path.open('wb') as f:
            pickle.dump(matched_spans, f)
        print(f"Matched spans split '{split}' saved to {path}")
Пример #7
0
def read_data(path: Path) -> Dict[str, List[RawExampleWikiID]]:
    bad_examples: List[Tuple[str, int, str]] = []
    data = {}
    for split in ['train', 'valid', 'test']:
        with (path / f'{split}.pkl').open('rb') as f:
            # relation tuple: (span, rel_type_desc, name, canonical_name)
            with utils.work_in_progress(f"Loading {split} set"):
                dump: List[RawDump] = pickle.load(f)

            examples = []
            for idx, (sent, rels) in enumerate(
                    utils.progress(dump, desc='Reading data')):
                # map (rel_typ, canonical) to list of aliases, since lists aren't hashable
                rel_to_alias: Dict[Tuple[str, str], List[str]] = \
                    {(rel[0][0], obj_id): alias for obj_id, _, _, rel, _, alias in rels}

                # sort it so the order is consistent
                relations: List[RelationWikiID] = sorted([
                    RelationWikiID(WikidataID(rel_id), WikidataID(obj_id),
                                   obj_alias)
                    for (rel_id, obj_id), obj_alias in rel_to_alias.items()
                ])
                rel_to_id: Dict[Tuple[str, str], int] = {
                    (rel_id, obj_id): idx
                    for idx, (rel_id, obj_id,
                              obj_alias) in enumerate(relations)
                }
                # dedup to remove duplicate (-1, -1)
                mentions: List[EntityMention] = list(
                    set(
                        EntityMention(span, surface, rel_to_id[(rel_info[0][0],
                                                                obj_id)]) for
                        obj_id, head_id, span, rel_info, surface, _ in rels))
                try:
                    # must exist - head id with the relation: @TITLE@ is the topic WikidataID
                    topic_id = next(
                        head_id
                        for _, head_id, _, rel_info, surface, alias in rels
                        if rel_info[0][0] == "@TITLE@")
                except StopIteration:
                    bad_examples.append((split, idx, ' '.join(sent)[:100]))
                    continue

                converted_relations = []
                for r in relations:
                    converted_relations.append(
                        RelationWikiID(
                            TOPIC_ITSELF if r.rel_typ == "@TITLE@" else
                            r.rel_typ, r.obj, r.obj_alias))

                example = RawExampleWikiID(WikidataID(topic_id), sent,
                                           converted_relations, mentions)
                examples.append(example)
            data[split] = examples

    if len(bad_examples) > 0:
        Logging.warn(f"{len(bad_examples)} bad examples:\n"
                     f"{pprint.pformat(bad_examples)}")
    else:
        Logging.verbose("All examples are good")

    return data
Пример #8
0
    def sampling_decode(self, vocab: Dict[str, Vocab], example: NKLMExample,
                        begin_symbol: int = 2, end_symbol: int = 5,
                        initial_hidden: Optional[HiddenState] = None, warm_up: Optional[int] = None,
                        max_length: int = 200, greedy=False, topk=None,
                        fill_incomplete=False, allow_invalid_pos=False,
                        print_info=True, color_outputs=False, color_incomplete=True,
                        show_ellipses=True, show_rel_type=True, show_copy_pos=False,
                        sanity_check=False, unkinfo: Optional[Tuple[Tensor, List[str]]] = None, **kwargs) \
            -> SampledOutput:
        """
        Sampling for NKLM.

        Output format:
        - Red words:       Copied from canonical form of entity.
        - Green words:     Copied from alias form of entity.
        - Yellow words:    Warm-up context.
        - word_[type]:     "word" is an entity of type "type".
        - word...(a_b_c):  "word" is a partially copied entity with remaining suffix "a b c".
        - (a_b_c)...word:  "word" is a partially copied entity with remaining prefix "a b c".
        - @-@:             A dash in the original text without spaces around, e.g. M @-@ 82 => M-82.
        - <X>:             A token copied from an invalid position of an entity.

        :param vocab: Vocabulary containing id2word mapping.
        :param example: The :class:`Example` object of the current topic.
        :param begin_symbol: Start of sentence symbol.
        :param end_symbol: End of sentence symbol. Sampling stops when this symbol is generated.
        :param initial_hidden: If not specified, default hidden states returned by :meth:`init_hidden` is used.
        :param warm_up: Number of tokens to provide as context before performing sampling.
        :param max_length: If generated sentence exceeds specified length, sampling is force terminated.
        :param greedy: If ``True``, use greedy decoding instead of sampling.
        :param topk: If not ``None``, only sample from indices with top-k probabilites.
        :param fill_incomplete: If ``True``, entities that are partially copied will be completed.
        :param allow_invalid_pos: If ``True``, allowing copying from invalid positions, and use <unk> as input.

        :param print_info: If ``True``, print information about sampled result.
        :param color_outputs: If ``True``, include annotations for each output token. Tokens from entities will be
            colored red.
        :param color_incomplete: If ``True`` and ``color_outputs`` is ``True``, also color partially copied entities.
        :param show_ellipses: If ``True``, show ellipses at beginning or end of partially copied entities.
        :param show_rel_type: If ``True``, show relation types for copied entities.
        :param show_copy_pos: If ``True``, show the position from which the entity tokens are copied.

        :param sanity_check: If ``True``, perform sanity check on generated sample.

        :return: A tuple of (loss_value, formatted list of words).
        """

        if unkinfo is not None:
            unkprob, unki2w = unkinfo
            unkprob = unkprob[self._vocab_size:]
            unki2w = unki2w[self._vocab_size:]
            normalized_unkprob = F.log_softmax(unkprob, dim=0)

        # noinspection PyPep8Naming
        UNK, INVALID, UNK_TOKEN, CANONICAL_IDX, EPS = -100, -1, 0, 0, 1e-4

        self.eval()
        self.init_hidden(1, [example.relations])

        word_vocab, rel_vocab = vocab['word'], vocab['rel']

        tensor = functools.partial(sample_utils.tensor, device=self.device)
        randint = sample_utils.randint
        sample = functools.partial(sample_utils.sample,
                                   greedy=greedy,
                                   topk=topk)
        np_sample = functools.partial(sample_utils.np_sample,
                                      greedy=greedy,
                                      topk=topk)

        # noinspection PyShadowingNames
        def compute_loss(
                inputs: List[int],
                rel_ids: List[int],
                copy_pos: List[int],
                surface_indices: List[int],
                hidden: Optional[HiddenState] = None
        ) -> Tuple[float, HiddenState]:
            batch = SimpleNamespace(
                sequence=tensor(inputs[:-1]),
                target=tensor(inputs[1:]),
                unkprob=None,
                seqs={
                    'rel_ids': tensor(rel_ids),
                    'copy_pos': tensor(copy_pos),
                    'surface_indices': tensor(surface_indices)
                },
                ntokens=len(inputs) - 1,
            )
            loss, next_hidden = self.calc_loss(batch,
                                               hidden=hidden)  # type: ignore
            return loss.item(), next_hidden

        # Initialization
        if warm_up is None:
            inputs = [begin_symbol]
            rel_ids = [INVALID]
            copy_pos = [INVALID]
            surface_indices = [INVALID]
            total_log_prob = 0.0
            hidden = initial_hidden
        else:
            inputs = list(word_vocab.numericalize(example.sentence[:warm_up]))
            rel_ids = list(example.rel_ids[:warm_up])
            copy_pos = list(example.copy_pos[:warm_up])
            surface_indices = list(example.surface_indices[:warm_up])
            total_log_prob, hidden = compute_loss(inputs, rel_ids, copy_pos,
                                                  surface_indices,
                                                  initial_hidden)
            total_log_prob = -total_log_prob * (len(inputs) - 1)

        # Sampling procedure
        while len(inputs) < max_length and inputs[-1] != end_symbol:
            fact_log_probs, output, _, next_hidden = \
                self._compute_fact_log_probs(tensor(inputs[-1]), tensor(rel_ids[-1]), tensor(copy_pos[-1]), hidden)
            rel_id, fact_loss = sample(fact_log_probs[0])
            rel_id -= 1
            total_log_prob += fact_loss

            # next_fact_embed: (1, 1, fact_embed_dim)
            next_fact_embed = self._get_fact_embeds(tensor(rel_id))
            copy_indicator, alias_log_probs, pos_log_probs, vocab_log_probs = \
                self._compute_generate_log_probs(output, next_fact_embed, tensor([rel_ids[-1], rel_id]))
            if torch.bernoulli(copy_indicator).item():
                total_log_prob += torch.log(copy_indicator).item()
                assert rel_id != -1
                # copy entity
                aliases = example.relations[rel_id].obj_alias
                if self._alias_disamb is AliasDisamb.FastText:
                    assert alias_log_probs is not None
                    surface_idx, surface_loss = sample(alias_log_probs[0])
                else:
                    surface_idx, surface_loss = 0, 0.0
                alias = self.alias_list[aliases[surface_idx]]

                entity: List[str] = alias.split()
                # normalization not required
                # TODO: keep consistent with _mask_invalid_pos setting
                pos, pos_loss = sample(pos_log_probs if allow_invalid_pos else
                                       pos_log_probs.squeeze()[:len(entity)])
                if self._mask_invalid_pos:
                    pos_loss -= torch.logsumexp(
                        pos_log_probs.squeeze()[:len(entity)], dim=0).item()
                total_log_prob += surface_loss + pos_loss
                token = UNK_TOKEN if pos >= len(
                    entity) else word_vocab.w2i.get(entity[pos], UNK_TOKEN)
            else:
                total_log_prob += torch.log(1.0 - copy_indicator).item()
                assert rel_id == -1
                # generate word
                token, token_loss = sample(vocab_log_probs)
                total_log_prob += token_loss
                pos = INVALID
                surface_idx = INVALID

                if token == 0 and unkinfo is not None:  # unk
                    unk_idx, unk_loss = np_sample(normalized_unkprob)
                    total_log_prob += unk_loss
                    # Ugly multi-purpose use of variables.
                    surface_idx = unk_idx  # Record unk word index in surface_indices.
                    rel_id = UNK  # Record UNK in rel_ids.

            inputs.append(token)
            rel_ids.append(rel_id)
            copy_pos.append(pos)
            surface_indices.append(surface_idx)
            hidden = next_hidden

        sample_loss = -total_log_prob / (len(inputs) - 1)
        if print_info:
            print(
                f"Sample loss: {sample_loss:.3f}, PPL: {math.exp(sample_loss):.3f}"
            )
        # Sanity checks
        if sanity_check:
            loss_val, gold_hidden = compute_loss(inputs, rel_ids, copy_pos,
                                                 surface_indices,
                                                 initial_hidden)
            assert hidden is not None
            hidden_state_diff = max(
                torch.max(torch.abs(g - h)).item()
                for g, h in zip(gold_hidden, hidden))
            if hidden_state_diff > EPS:
                Logging.warn(
                    f"Hidden states do not match. Difference: {hidden_state_diff}"
                )
            if abs(sample_loss - loss_val) > EPS:
                Logging.warn(
                    f"Loss values do not match. "
                    f"Forward loss: {loss_val}, difference: {abs(sample_loss - loss_val)}"
                )

        # Format the output
        sentence = list(zip(inputs, rel_ids, copy_pos, surface_indices))
        words = []
        copy_count = 0
        complete_count = 0
        last_entity = None
        entity_continuing = False
        for idx, (token, rel_id, pos, surface_idx) in enumerate(sentence):
            is_warm_up = (warm_up is not None and idx < warm_up)
            if rel_id == INVALID:
                word = word_vocab.i2w[token]
            elif rel_id == UNK:
                word = Logging.color('blue', unki2w[surface_idx])
            else:
                copy_count += 1
                entity_id = example.relations[rel_id].obj_alias[surface_idx]
                entity = self.alias_list[entity_id].split()
                if pos >= len(entity):
                    word = "<X>"
                else:
                    word = entity[pos]
                if show_copy_pos:
                    word = f"{pos}_{rel_id}_{surface_idx}_{word}"
                is_last_word_in_entity = (idx == len(sentence) - 1
                                          or sentence[idx + 1][1:] !=
                                          (rel_id, pos + 1, surface_idx))
                is_first_word_in_entity = (idx == 0 or sentence[idx - 1][1:] !=
                                           (rel_id, pos - 1, surface_idx))
                # add entity tag after the last word
                if show_rel_type and is_last_word_in_entity:
                    word = f"{word}_[{rel_vocab.i2w[example.relations[rel_id].rel_typ]}]"

                # check whether fully copied
                if show_ellipses:
                    if pos < len(entity) - 1 and is_last_word_in_entity:
                        word = word + '...' + (
                            f"({'_'.join(entity[(pos + 1):])})"
                            if fill_incomplete else "")
                    if pos > 0 and is_first_word_in_entity:
                        word = (f"({'_'.join(entity[:pos])})"
                                if fill_incomplete else "") + '...' + word

                if entity_continuing:
                    if last_entity == (rel_id, surface_idx,
                                       pos - 1):  # Continuing
                        last_entity = (rel_id, surface_idx, pos)
                    else:
                        entity_continuing = False
                        last_entity = None

                if pos == 0 and not entity_continuing:  # reset
                    entity_continuing = True
                    last_entity = (rel_id, surface_idx, 0)

                if color_outputs and not is_warm_up and (color_incomplete
                                                         or entity_continuing):
                    word = Logging.color(
                        'red' if surface_idx == 0 else 'green', word)

                if pos == len(entity) - 1 and entity_continuing:  # commit
                    entity_continuing = False
                    complete_count += 1

            if color_outputs and is_warm_up:
                word = Logging.color('yellow', word)
            words.append(word)

        if print_info:
            print(f"Copied, Completed: {copy_count}, {complete_count}")
        sampled_output = SampledOutput(sentence=words,
                                       sample_loss=sample_loss,
                                       complete_copies=complete_count,
                                       incomplete_copies=copy_count)
        return sampled_output