import torch

from torchnlp.encoders import LabelEncoder


# ``MAX_SENTENCE_LEN`` and the fixed-length ``pad_sequence`` helper used below
# are module-level in the original source (note: this ``pad_sequence`` is not
# ``torch.nn.utils.rnn.pad_sequence``; see the stand-in sketched after Example 4).
def encode_ner_y(y_ner_list_train, y_ner_list_test, CLASS_COUNT_DICT):
    # Seed the label vocabulary with the class names.
    y_ner_encoder = LabelEncoder(sample=CLASS_COUNT_DICT.keys())
    y_ner_encoded_train = [[
        y_ner_encoder.encode(label) for label in label_list
    ] for label_list in y_ner_list_train]
    # Stack each sentence's scalar label tensors into one 1-D tensor, then pad
    # every sentence to a fixed width of MAX_SENTENCE_LEN + 1.
    y_ner_encoded_train = [torch.stack(tens) for tens in y_ner_encoded_train]
    y_ner_padded_train = torch.LongTensor(
        pad_sequence(y_ner_encoded_train, MAX_SENTENCE_LEN + 1))

    y_ner_encoded_test = [[
        y_ner_encoder.encode(label) for label in label_list
    ] for label_list in y_ner_list_test]
    y_ner_encoded_test = [torch.stack(tens) for tens in y_ner_encoded_test]
    y_ner_padded_test = torch.LongTensor(
        pad_sequence(y_ner_encoded_test, MAX_SENTENCE_LEN + 1))

    # If the train tensor is wider, right-pad the test tensor with zeros so
    # both share the same sequence dimension.
    if y_ner_padded_train.shape[1] > y_ner_padded_test.shape[1]:
        y_ner_padded_test = torch.cat(
            (
                y_ner_padded_test,
                torch.zeros(
                    y_ner_padded_test.shape[0],
                    y_ner_padded_train.shape[1] - y_ner_padded_test.shape[1],
                ),
            ),
            dim=1,
        ).type(torch.long)

    return y_ner_padded_train, y_ner_padded_test
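A minimal usage sketch with toy inputs (hypothetical data; assumes the module-level ``MAX_SENTENCE_LEN`` constant and ``pad_sequence`` helper this snippet relies on are defined):

y_train = [['O', 'B-PER', 'I-PER'], ['O', 'B-LOC']]
y_test = [['O', 'O', 'B-LOC']]
class_counts = {'O': 4, 'B-PER': 1, 'I-PER': 1, 'B-LOC': 2}

y_train_padded, y_test_padded = encode_ner_y(y_train, y_test, class_counts)
assert y_train_padded.shape[1] == y_test_padded.shape[1]  # test is padded up to the train width when needed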
Example 2
def test_label_encoder_no_reserved():
    sample = [
        'people/deceased_person/place_of_death',
        'symbols/name_source/namesakes'
    ]
    label_encoder = LabelEncoder(sample,
                                 reserved_labels=[],
                                 unknown_index=None)

    label_encoder.encode('people/deceased_person/place_of_death')

    # No ``unknown_index`` defined causes a ``TypeError`` if an unknown label is used.
    with pytest.raises(TypeError):
        label_encoder.encode('symbols/namesake/named_after')
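For contrast, a sketch of the default behaviour (assuming torchnlp's ``LabelEncoder``, which reserves an 'unknown' label and maps unseen inputs to it):

from torchnlp.encoders import LabelEncoder

encoder = LabelEncoder(['label_a', 'label_b'])  # defaults: reserved_labels=['unknown'], unknown_index=0
token = encoder.encode('never_seen_label')      # no error; falls back to the reserved index
assert encoder.decode(token) == 'unknown'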
Example 3
def test_label_encoder_known():
    input_ = 'symbols/namesake/named_after'
    sample = [
        'people/deceased_person/place_of_death',
        'symbols/name_source/namesakes'
    ]
    sample.append(input_)
    label_encoder = LabelEncoder(sample)
    output = label_encoder.encode(input_)
    assert label_encoder.decode(output) == input_
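The same round trip also works batched; a short sketch assuming torchnlp's ``batch_encode``/``batch_decode``:

from torchnlp.encoders import LabelEncoder

sample = [
    'people/deceased_person/place_of_death',
    'symbols/name_source/namesakes'
]
encoder = LabelEncoder(sample)
batch = encoder.batch_encode(sample)          # 1-D LongTensor of label ids
assert encoder.batch_decode(batch) == sample  # decodes back to the original labels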
Example 4
def encode_ner_y(y_ner_list_train, y_ner_list_test, class_count_dict, max_sent_len):
    """
    Tokenize y
    :param y_ner_list_train:
    :param y_ner_list_test:
    :param class_count_dict:
    :param max_sent_len:
    :return:
    """
    y_ner_encoder = LabelEncoder(sample=class_count_dict.keys())
    y_ner_encoded_train = [
        [y_ner_encoder.encode(label) for label in label_list]
        for label_list in y_ner_list_train
    ]
    y_ner_encoded_train = [torch.stack(tens) for tens in y_ner_encoded_train]
    y_ner_padded_train = torch.LongTensor(
        pad_sequence(y_ner_encoded_train, max_sent_len + 1)
    )

    y_ner_encoded_test = [
        [y_ner_encoder.encode(label) for label in label_list]
        for label_list in y_ner_list_test
    ]
    y_ner_encoded_test = [torch.stack(tens) for tens in y_ner_encoded_test]
    y_ner_padded_test = torch.LongTensor(
        pad_sequence(y_ner_encoded_test, max_sent_len + 1)
    )

    if y_ner_padded_train.shape[1] > y_ner_padded_test.shape[1]:
        y_ner_padded_test = torch.cat(
            (
                y_ner_padded_test,
                torch.zeros(
                    y_ner_padded_test.shape[0],
                    y_ner_padded_train.shape[1] - y_ner_padded_test.shape[1],
                ),
            ),
            dim=1,
        ).type(torch.long)

    return y_ner_encoder, y_ner_padded_train, y_ner_padded_test
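Note that ``pad_sequence`` here is called as ``pad_sequence(sequences, length)``, so it is not ``torch.nn.utils.rnn.pad_sequence`` (whose second positional argument is ``batch_first``). A minimal stand-in consistent with the call sites above (an assumption, not the original helper):

def pad_sequence(sequences, max_len, padding_value=0):
    # Pad (or truncate) each 1-D label tensor to ``max_len`` and return a
    # list of lists, which ``torch.LongTensor`` converts to a
    # (batch, max_len) tensor at the call site.
    padded = []
    for seq in sequences:
        values = seq.tolist()[:max_len]
        values += [padding_value] * (max_len - len(values))
        padded.append(values)
    return padded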
Example 5
# Make Encoders
sentence_corpus = [row['premise'] for row in itertools.chain(train, dev, test)]
sentence_corpus += [
    row['hypothesis'] for row in itertools.chain(train, dev, test)
]
sentence_encoder = WhitespaceEncoder(sentence_corpus)

label_corpus = [row['label'] for row in itertools.chain(train, dev, test)]
label_encoder = LabelEncoder(label_corpus)

# Encode
for row in itertools.chain(train, dev, test):
    row['premise'] = sentence_encoder.encode(row['premise'])
    row['hypothesis'] = sentence_encoder.encode(row['hypothesis'])
    row['label'] = label_encoder.encode(row['label'])

config = args
config.n_embed = sentence_encoder.vocab_size
config.d_out = label_encoder.vocab_size
config.n_cells = config.n_layers

# double the number of cells for bidirectional networks
if config.birnn:
    config.n_cells *= 2

if args.resume_snapshot:
    model = torch.load(
        args.resume_snapshot,
        map_location=lambda storage, location: storage.cuda(args.gpu))
else:
Example 6
class ActionSequenceEncoder:
    def __init__(self, samples: Samples, token_threshold: int):
        reserved_labels: List[Union[Unknown,
                                    CloseVariadicFieldRule]] = [Unknown()]
        reserved_labels.append(CloseVariadicFieldRule())
        self._rule_encoder = LabelEncoder(samples.rules,
                                          reserved_labels=reserved_labels,
                                          unknown_index=0)
        self._node_type_encoder = LabelEncoder(samples.node_types)
        reserved_labels = [Unknown()]
        self._token_encoder = LabelEncoder(samples.tokens,
                                           min_occurrences=token_threshold,
                                           reserved_labels=reserved_labels,
                                           unknown_index=0)
        self.value_to_idx: Dict[str, List[int]] = {}
        for kind, value in self._token_encoder.vocab[len(reserved_labels):]:
            # ``encode`` returns a scalar tensor; store plain ints so the
            # mapping matches its ``Dict[str, List[int]]`` annotation and the
            # return type of ``encode_raw_value``.
            idx = self._token_encoder.encode((kind, value)).item()
            if value not in self.value_to_idx:
                self.value_to_idx[value] = []
            self.value_to_idx[value].append(idx)

    def decode(self, tensor: torch.LongTensor, reference: List[Token]) \
            -> Optional[ActionSequence]:
        """
        Return the action sequence corresponding to the tensor

        Parameters
        ----------
        tensor: torch.LongTensor
            The encoded tensor with the shape of
            (len(action_sequence), 3). Each action will be encoded by the tuple
            of (ID of the applied rule, ID of the inserted token,
            the index of the word copied from the reference).
            The padding value should be -1.
        reference: List[Token]
            The input tokens that the copy mechanism may copy from.

        Returns
        -------
        Optional[ActionSequence]
            The action sequence corresponding to the tensor
            None if the action sequence cannot be generated.
        """

        retval = ActionSequence()
        for i in range(tensor.shape[0]):
            if tensor[i, 0] > 0:
                # ApplyRule
                rule = self._rule_encoder.decode(tensor[i, 0])
                retval.eval(ApplyRule(rule))
            elif tensor[i, 1] > 0:
                # GenerateToken
                kind, value = self._token_encoder.decode(tensor[i, 1])
                retval.eval(GenerateToken(kind, value))
            elif tensor[i, 2] >= 0:
                # GenerateToken (Copy)
                index = int(tensor[i, 2].numpy())
                if index >= len(reference):
                    logger.debug("reference index is out-of-bounds")
                    return None
                token = reference[index]
                retval.eval(GenerateToken(token.kind, token.raw_value))
            else:
                logger.debug("invalid actions")
                return None

        return retval

    def encode_action(self,
                      action_sequence: ActionSequence,
                      reference: List[Token]) \
            -> Optional[torch.Tensor]:
        """
        Return the tensor encoded the action sequence

        Parameters
        ----------
        action_sequence: ActionSequence
            The action sequence to be encoded
        reference: List[Token]
            The input tokens that the copy mechanism may copy from.

        Returns
        -------
        Optional[torch.Tensor]
            The encoded tensor. The shape of the tensor is
            (len(action_sequence) + 1, 4). Each action is encoded as
            the tuple of (ID of the node type, ID of the applied rule,
            ID of the inserted token, the index of the word copied from
            the reference). The padding value is -1.
            None if the action sequence cannot be encoded.
        """
        reference_value = [token.raw_value for token in reference]
        action = \
            torch.ones(len(action_sequence.action_sequence) + 1, 4).long() \
            * -1
        for i in range(len(action_sequence.action_sequence)):
            a = action_sequence.action_sequence[i]
            parent = action_sequence.parent(i)
            if parent is not None:
                parent_action = \
                    cast(ApplyRule,
                         action_sequence.action_sequence[parent.action])
                parent_rule = cast(ExpandTreeRule, parent_action.rule)
                action[i, 0] = self._node_type_encoder.encode(
                    parent_rule.children[parent.field][1])

            if isinstance(a, ApplyRule):
                rule = a.rule
                action[i, 1] = self._rule_encoder.encode(rule)
            else:
                encoded_token = \
                    int(self._token_encoder.encode((a.kind, a.value)).numpy())

                if encoded_token != 0:
                    action[i, 2] = encoded_token

                # Copy mechanism: record the index of the token in the
                # reference (this is what makes unknown tokens recoverable)
                if a.value in reference_value:
                    # TODO use kind in reference
                    action[i, 3] = \
                        reference_value.index(cast(str, a.value))

                if encoded_token == 0 and \
                        a.value not in reference_value:
                    logger.debug("cannot encode token")
                    return None

        head = action_sequence.head
        length = len(action_sequence.action_sequence)
        if head is not None:
            head_action = \
                cast(ApplyRule,
                     action_sequence.action_sequence[head.action])
            head_rule = cast(ExpandTreeRule, head_action.rule)
            action[length, 0] = self._node_type_encoder.encode(
                head_rule.children[head.field][1])

        return action

    def encode_raw_value(self, text: str) -> List[int]:
        if text in self.value_to_idx:
            return self.value_to_idx[text]
        else:
            return [self._token_encoder.encode(Unknown()).item()]

    def batch_encode_raw_value(self, texts: List[str]) -> List[List[int]]:
        return [self.encode_raw_value(text) for text in texts]

    def encode_parent(self, action_sequence) -> torch.Tensor:
        """
        Return the tensor encoded the action sequence

        Parameters
        ----------
        action_sequence: ActionSequence
            The action sequence to be encoded

        Returns
        -------
        torch.Tensor
            The encoded tensor. The shape of the parent tensor is
            (len(action_sequence) + 1, 4). Each action is encoded as
            the tuple of (ID of the parent node type, ID of the
            parent action's rule, the index of the parent action,
            the index of the field).
            The padding value is -1.
        """
        parent_tensor = \
            torch.ones(len(action_sequence.action_sequence) + 1, 4).long() \
            * -1

        for i in range(len(action_sequence.action_sequence)):
            parent = action_sequence.parent(i)
            if parent is not None:
                parent_action = \
                    cast(ApplyRule,
                         action_sequence.action_sequence[parent.action])
                parent_rule = cast(ExpandTreeRule, parent_action.rule)
                parent_tensor[i, 0] = \
                    self._node_type_encoder.encode(parent_rule.parent)
                parent_tensor[i, 1] = self._rule_encoder.encode(parent_rule)
                parent_tensor[i, 2] = parent.action
                parent_tensor[i, 3] = parent.field

        head = action_sequence.head
        length = len(action_sequence.action_sequence)
        if head is not None:
            head_action = \
                cast(ApplyRule,
                     action_sequence.action_sequence[head.action])
            head_rule = cast(ExpandTreeRule, head_action.rule)
            parent_tensor[length, 0] = \
                self._node_type_encoder.encode(head_rule.parent)
            parent_tensor[length, 1] = self._rule_encoder.encode(head_rule)
            parent_tensor[length, 2] = head.action
            parent_tensor[length, 3] = head.field

        return parent_tensor

    def encode_tree(self, action_sequence: ActionSequence) \
            -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Return the depth and the adjacency matrix of the action sequence

        Parameters
        ----------
        action_sequence: ActionSequence
            The action sequence to be encoded

        Returns
        -------
        depth: torch.Tensor
            The depth of each action. The shape is (len(action_sequence),).
        adjacency_matrix: torch.Tensor
            The encoded tensor. The shape of the tensor is
            (len(action_sequence), len(action_sequence)). If the i-th action
            is the parent of the j-th action, the (i, j) element is 1;
            otherwise it is 0.
        """
        L = len(action_sequence.action_sequence)
        depth = torch.zeros(L)
        m = torch.zeros(L, L)

        for i in range(L):
            p = action_sequence.parent(i)
            if p is not None:
                depth[i] = depth[p.action] + 1
                m[p.action, i] = 1

        return depth, m

    def encode_each_action(self,
                           action_sequence: ActionSequence,
                           reference: List[Token],
                           max_arity: int) \
            -> torch.Tensor:
        """
        Return the tensor encoding the each action

        Parameters
        ----------
        action_sequence: ActionSequence
            The action sequence to be encoded
        reference: List[Token]
            The input tokens that the copy mechanism may copy from.
        max_arity: int
            The maximum number of children encoded per action

        Returns
        -------
        torch.Tensor
            The encoded tensor. The shape of the tensor is
            (len(action_sequence), max_arity + 1, 3).
            [:, 0, 0] encodes the parent node type. [:, i, 0] encodes
            the node type of (i - 1)-th child node. [:, i, 1] encodes
            the token of (i - 1)-th child node. [:, i, 2] encodes the reference
            index of (i - 1)-th child node.
            The padding value is -1.
        """
        L = len(action_sequence.action_sequence)
        reference_value = [token.raw_value for token in reference]
        retval = torch.ones(L, max_arity + 1, 3).long() * -1
        for i, action in enumerate(action_sequence.action_sequence):
            if isinstance(action, ApplyRule):
                if isinstance(action.rule, ExpandTreeRule):
                    # Encode parent
                    retval[i, 0, 0] = \
                        self._node_type_encoder.encode(action.rule.parent)
                    # Encode children
                    for j, (_, child) in enumerate(
                            action.rule.children[:max_arity]):
                        retval[i, j + 1, 0] = \
                            self._node_type_encoder.encode(child)
            else:
                gentoken: GenerateToken = action
                kind = gentoken.kind
                value = gentoken.value
                encoded_token = \
                    int(self._token_encoder.encode((kind, value)).numpy())

                if encoded_token != 0:
                    retval[i, 1, 1] = encoded_token

                if value in reference_value:
                    # TODO use kind in reference
                    retval[i, 1, 2] = \
                        reference_value.index(cast(str, value))

        return retval

    def encode_path(self, action_sequence: ActionSequence, max_depth: int) \
            -> torch.Tensor:
        """
        Return the tensor encoding the each action

        Parameters
        ----------
        action_sequence: action_sequence
            The action_sequence containing action sequence to be encoded
        max_depth: int

        Returns
        -------
        torch.Tensor
            The encoded tensor. The shape of the tensor is
            (len(action_sequence), max_depth).
            [i, :] encodes the path from the root node to the i-th node.
            Each node is represented by its rule ID.
            The padding value is -1.
        """
        L = len(action_sequence.action_sequence)
        retval = torch.ones(L, max_depth).long() * -1
        for i in range(L):
            parent_opt = action_sequence.parent(i)
            if parent_opt is not None:
                p = action_sequence.action_sequence[parent_opt.action]
                if isinstance(p, ApplyRule):
                    retval[i, 0] = self._rule_encoder.encode(p.rule)
                retval[i, 1:] = retval[parent_opt.action, :max_depth - 1]

        return retval
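A self-contained sketch of the depth/adjacency encoding produced by ``encode_tree`` above, with the parent relation reduced to a plain list (hypothetical data: ``parents[i]`` is the parent index of action ``i``, or ``None`` for the root):

import torch

parents = [None, 0, 0, 2]  # a hypothetical 4-action sequence
length = len(parents)
depth = torch.zeros(length)
adjacency = torch.zeros(length, length)
for i, p in enumerate(parents):
    if p is not None:
        depth[i] = depth[p] + 1  # one level deeper than the parent
        adjacency[p, i] = 1      # mark p as the parent of i
# depth is [0., 1., 1., 2.] and every other adjacency entry stays 0.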
class WikiDataset(Dataset):
    '''
    A custom dataset that encodes tokenized text and its labels with the corresponding encoders.
    '''
    def __init__(self,
                 json,
                 text_encoder=None,
                 label_encoder=None,
                 vocab=None,
                 mode='train'):
        '''
        Initialization

        Arguments:
        json: dict (loaded from a JSON file) containing the data.
            Structure:
            e.g.:
                 json: {'data': [{'id': filename,
                                   'title': title of page,
                                   'toc': [list of items in the table-of-contents section of the wiki page],
                                   'intro': introduction of the wiki page,
                                   'label': 'positive'/'negative' flag}]
                        }
            Labels are required only when mode == 'train'.
        text_encoder: encoder object that maps tokens to their unique integer ids
        label_encoder: encoder object that maps labels to their unique integer ids
        vocab: external vocabulary used to initialize the text encoder. If vocab is None, it is generated from the tokens of the provided dataset.
        mode: 'train' or 'inference'; when mode == 'inference', the dataset object skips the labels
        '''
        self.data = json
        assert 'data' in self.data

        # Define the mode in which the dataset object is to be used
        self.mode = mode

        # Define text encoder and vocabulary
        if text_encoder:
            self._text_encoder = text_encoder
            self._vocab = self._text_encoder.vocab
        elif vocab:
            self._vocab = vocab
            self._text_encoder = StaticTokenizerEncoder(self._vocab,
                                                        append_eos=False,
                                                        tokenize=self.split)
        else:
            self._vocab = self.create_vocab()
            self._text_encoder = StaticTokenizerEncoder(self._vocab,
                                                        append_eos=False,
                                                        tokenize=self.split)

        self._vocab_size = self._text_encoder.vocab_size

        # Define label encoder
        if self.mode == 'train':
            if label_encoder:
                self._label_encoder = label_encoder
            else:
                self._label_encoder = LabelEncoder(
                    [sample['label'] for sample in self.data['data']])

            self._label_size = self._label_encoder.vocab_size

        else:
            self._label_encoder = None
            self._label_size = None

    def __len__(self):
        '''
        Size of dataset
        '''
        return len(self.data['data'])

    def __getitem__(self, idx):
        '''
        Extract item corresponding to idx'th index in data
        '''
        item = self.data['data'][idx]

        intro_enc = self._text_encoder.encode(item['intro'])

        toc = item['toc']
        if toc == []:
            toc_enc = self._text_encoder.encode('.')
        else:
            toc = ' '.join(toc)
            toc_enc = self._text_encoder.encode(toc)

        title_enc = self._text_encoder.encode(item['title'])

        if self.mode == 'train':
            return title_enc, toc_enc, intro_enc, self._label_encoder.encode(
                item['label']).view(-1)
        else:
            return title_enc, toc_enc, intro_enc

    @property
    def vocab_size(self):
        return self._vocab_size

    @property
    def label_size(self):
        return self._label_size

    @property
    def text_encoder(self):
        return self._text_encoder

    @property
    def label_encoder(self):
        return self._label_encoder

    @property
    def vocab(self):
        return self._vocab

    def create_vocab(self, remove_less_freq_words=True, threshold=1):
        '''
        Creates the vocabulary from the dataset tokens.

        Arguments:
        remove_less_freq_words: if True, drop words whose count is <= threshold
        threshold: minimum occurrence count for a word to be kept

        Returns:
        List of unique tokens in the dataset
        '''
        temp_vocab = []
        for sample in self.data['data']:
            temp_vocab.extend(sample['title'].split())
            temp_vocab.extend(' '.join(sample['toc']).split())
            temp_vocab.extend(sample['intro'].split())

        vocab = []

        if remove_less_freq_words:

            count_dict = collections.Counter(temp_vocab)

            for word in count_dict.keys():
                if count_dict[word] > threshold:
                    vocab.append(word)

        else:
            vocab = sorted(list(set(temp_vocab)))

        return vocab

    def split(self, x):
        '''
        Splits the text into tokens 
        '''
        return x.split()

    def collate_fn(self, batch, padding=True):
        """
        Collate function needs to be passed to the pytorch dataloader

        Returns:
        (title,title_lengths): tuple containing padded sequence tensor for title and sequence lengths 
        (toc,toc_lengths): tuple containing padded sequence tensor for table of contents and sequence lengths 
        (intro,intro_lengths): tuple containing padded sequence tensor for introduction and sequence lengths 
        labels: tensor containing labels for the batch
        """
        if self.mode == 'train':
            title, toc, intro, labels = zip(*batch)
            labels = torch.cat(labels)
        else:
            title, toc, intro = zip(*batch)

        # ``collections.Sequence`` was removed in Python 3.10; use ``collections.abc``.
        if isinstance(intro, collections.abc.Sequence):

            if padding:
                title, title_lengths = stack_and_pad_tensors(title)
                toc, toc_lengths = stack_and_pad_tensors(toc)
                intro, intro_lengths = stack_and_pad_tensors(intro)

            if self.mode == 'train':
                return (title,
                        title_lengths), (toc,
                                         toc_lengths), (intro,
                                                        intro_lengths), labels
            else:
                return (title, title_lengths), (toc,
                                                toc_lengths), (intro,
                                                               intro_lengths)
        else:
            return batch

    @classmethod
    def fromJsonFile(cls,
                     json_file,
                     text_encoder=None,
                     label_encoder=None,
                     vocab=None,
                     mode='train'):
        '''
        Read data from json file

        Arguments:
        json_file: string specifying location to json_file
        '''
        with open(json_file, 'r') as f:
            json_data = json.load(f)

        return cls(json_data, text_encoder, label_encoder, vocab, mode)
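A minimal usage sketch with hypothetical data (the field values are placeholders; ``collate_fn`` is handed to the standard PyTorch ``DataLoader``):

from torch.utils.data import DataLoader

json_data = {'data': [
    {'id': 'page_0', 'title': 'Sample Page', 'toc': ['History', 'Usage'],
     'intro': 'An example introduction .', 'label': 'positive'},
    {'id': 'page_1', 'title': 'Another Page', 'toc': [],
     'intro': 'Another example introduction .', 'label': 'negative'},
]}
dataset = WikiDataset(json_data, mode='train')
loader = DataLoader(dataset, batch_size=2, collate_fn=dataset.collate_fn)
(title, title_lens), (toc, toc_lens), (intro, intro_lens), labels = next(iter(loader))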