Example #1
@classmethod
def from_pytt(cls, fields, *, is_grad=False) -> "Activations":
    """Create Activations from the output tuples produced by PyTorch Transformers.
    Includes converting torch tensors to xp, and handling missing values.
    """
    # lh: last_hidden_state
    # po: pooler_output
    # ah: all_hidden_states
    # aa: all_attentions
    if len(fields) != 4:
        # The model did not return all four outputs; keep the last hidden
        # state and fall back to empty placeholders for the rest.
        lh = fields[0]
        po = tuple()
        ah = []
        aa = []
    else:
        lh, po, ah, aa = fields
    # Convert last_hidden_state to xp
    lh = torch2xp(lh)
    xp = get_array_module(lh)
    # Normalize "None" value for pooler output
    if isinstance(po, tuple):
        po = xp.zeros((0,), dtype=lh.dtype)
    else:
        po = torch2xp(po)
    ah = list(map(torch2xp, ah))
    aa = list(map(torch2xp, aa))
    return cls(lh, po, ah, aa, is_grad=is_grad)
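For reference, a minimal sketch of the torch-to-numpy conversion that from_pytt relies on (torch2xp in the real library), together with a simulated PyTorch-Transformers output tuple. The helper name, shapes, and the commented call are illustrative assumptions, not the library's API:

import torch
import numpy

def torch2xp_sketch(tensor: torch.Tensor) -> numpy.ndarray:
    # Detach from the autograd graph, move to CPU, and convert to numpy.
    return tensor.detach().cpu().numpy()

# Simulated model output: (last_hidden_state, pooler_output, all_hidden_states, all_attentions)
last_hidden = torch.zeros((6, 768))
pooler_output = torch.zeros((1, 768))
fields = (last_hidden, pooler_output, [last_hidden], [])
# acts = Activations.from_pytt(fields)  # with the real Activations class in scope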
Example #2
@classmethod
def join(cls, sub_acts: List["Activations"]) -> "Activations":
    """Concatenate activations from subsequences."""
    xp = get_array_module(sub_acts[0].lh)
    lh: Array = xp.vstack([x.lh for x in sub_acts])
    po: Array = xp.vstack([x.po for x in sub_acts])
    # Transpose the lists, so that the inner list items refer
    # to the subsequences. Then we can vstack those.
    ah = list(map(xp.vstack, zip(*[x.ah for x in sub_acts])))
    # aa = list(map(xp.vstack, zip(*[x.aa for x in sub_acts])))
    aa = []
    return cls(lh, po, ah, aa, is_grad=sub_acts[0].is_grad)
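A numpy-only sketch of the transpose-then-vstack step above: zip(*...) groups the per-layer arrays of each subsequence by layer, so every layer can be stacked across subsequences. The shapes below are invented for illustration:

import numpy

sub1_ah = [numpy.ones((3, 4)), numpy.ones((3, 4)) * 2]  # 2 layers, 3 word pieces each
sub2_ah = [numpy.ones((5, 4)), numpy.ones((5, 4)) * 2]  # 2 layers, 5 word pieces each

joined_ah = [numpy.vstack(layer) for layer in zip(*[sub1_ah, sub2_ah])]
assert [a.shape for a in joined_ah] == [(8, 4), (8, 4)]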
Example #3
def pad_batch_activations(batch: List[Activations],
                          *,
                          to: int = 0) -> Activations:
    """Pad each Activations in the batch to a common length and stack them."""
    if not batch:
        return Activations.blank()
    xp = get_array_module(batch[0])
    lh = pad_batch([x.lh for x in batch], xp=xp, to=to)
    if lh.size:
        lh = lh.reshape((len(batch), -1, lh.shape[-1]))
    po = pad_batch([x.po for x in batch], xp=xp, to=to)
    if po.size:
        po = po.reshape((len(batch), -1, po.shape[-1]))
    # Transpose the lists, and then pad_batch the items
    ah = [
        pad_batch(list(seq), xp=xp, to=to)
        for seq in zip(*[x.ah for x in batch])
    ]
    aa = [
        pad_batch(list(seq), xp=xp, to=to)
        for seq in zip(*[x.aa for x in batch])
    ]
    return Activations(lh, po, ah, aa, is_grad=batch[0].is_grad)
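pad_batch itself is not shown in this file; a minimal numpy sketch of the behaviour it is assumed to provide here (pad every sequence to the longest length, or at least `to`, and concatenate row-wise so the caller can reshape to (batch, length, width)) could look like this. The helper name and shapes are assumptions:

import numpy

def pad_batch_sketch(seqs, *, xp=numpy, to=0):
    # Pad every 2D array to the same number of rows, then stack them into
    # one flat 2D array; the caller reshapes to (batch, length, width).
    max_len = max([to] + [seq.shape[0] for seq in seqs])
    width = seqs[0].shape[-1]
    padded = xp.zeros((len(seqs) * max_len, width), dtype=seqs[0].dtype)
    for i, seq in enumerate(seqs):
        padded[i * max_len : i * max_len + seq.shape[0]] = seq
    return padded

batch = [numpy.ones((3, 4), dtype="f"), numpy.ones((5, 4), dtype="f")]
lh = pad_batch_sketch(batch)
assert lh.reshape((len(batch), -1, lh.shape[-1])).shape == (2, 5, 4)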
def get_similarity_via_tensor(doc1, doc2):
    """Cosine similarity between two objects, computed from their tensor-backed vectors."""
    v1 = doc1.vector
    v2 = doc2.vector
    xp = get_array_module(v1)
    return xp.dot(v1, v2) / (doc1.vector_norm * doc2.vector_norm)
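The hook above is plain cosine similarity; the same computation on bare numpy vectors, purely for illustration:

import numpy

v1 = numpy.array([1.0, 2.0, 3.0])
v2 = numpy.array([2.0, 4.0, 6.0])
similarity = numpy.dot(v1, v2) / (numpy.linalg.norm(v1) * numpy.linalg.norm(v2))
assert abs(similarity - 1.0) < 1e-6  # parallel vectors -> similarity of 1.0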
    def set_annotations(self, docs, activations):
        """Assign the extracted features to the Doc objects and overwrite the
        vector and similarity hooks.

        docs (iterable): A batch of `Doc` objects.
        activations (iterable): A batch of activations.
        """
        for doc, doc_acts in zip(docs, activations):
            xp = get_array_module(doc_acts.lh)
            wp_tensor = doc_acts.lh
            doc.tensor = self.model.ops.allocate((len(doc), self.model.nO))
            doc._.pytt_last_hidden_state = wp_tensor
            doc._.pytt_pooler_output = doc_acts.po
            doc._.pytt_all_hidden_states = doc_acts.ah
            doc._.pytt_all_attentions = doc_acts.aa
            doc._.pytt_d_last_hidden_state = xp.zeros((0, ),
                                                      dtype=wp_tensor.dtype)
            doc._.pytt_d_pooler_output = xp.zeros((0, ), dtype=wp_tensor.dtype)
            doc._.pytt_d_all_hidden_states = []
            doc._.pytt_d_all_attentions = []
            if wp_tensor.shape != (len(doc._.pytt_word_pieces), self.model.nO):
                print("# word pieces: ", len(doc._.pytt_word_pieces))
                print("# tensor rows: ", wp_tensor.shape[0])
                for sent in doc.sents:
                    if sent._.pytt_start is None or sent._.pytt_end is None:
                        print("Text: ", sent.text)
                        print("WPs: ", sent._.pytt_word_pieces_)
                        print(sent._.pytt_start, sent._.pytt_end)
                raise ValueError(
                    "Mismatch between tensor shape and word pieces. This usually "
                    "means we did something wrong in the sentence reshaping, "
                    "or possibly finding the separator tokens.")
            # Count how often each word-piece token is represented. This allows
            # a weighted sum, so that we can make sure doc.tensor.sum()
            # equals wp_tensor.sum().
            # TODO: Obviously incrementing the rows individually is bad. Need
            # to make this more efficient. Maybe just copy to CPU, do our stuff,
            # copy back to GPU?
            align_sizes = [0 for _ in range(len(doc._.pytt_word_pieces))]
            for word_piece_slice in doc._.pytt_alignment:
                for i in word_piece_slice:
                    align_sizes[i] += 1
            for i, word_piece_slice in enumerate(doc._.pytt_alignment):
                for j in word_piece_slice:
                    doc.tensor[i] += wp_tensor[j] / align_sizes[j]
            # To make this weighting work, we "align" the boundary tokens against
            # every token in their sentence.
            if doc.tensor.sum() != wp_tensor.sum():
                for sent in doc.sents:
                    if sent._.pytt_start is not None and sent._.pytt_end is not None:
                        cls_vector = wp_tensor[sent._.pytt_start]
                        sep_vector = wp_tensor[sent._.pytt_end]
                        doc.tensor[sent.start:sent.end + 1] += cls_vector / len(sent)
                        doc.tensor[sent.start:sent.end + 1] += sep_vector / len(sent)
            doc.user_hooks["vector"] = get_doc_vector_via_tensor
            doc.user_span_hooks["vector"] = get_span_vector_via_tensor
            doc.user_token_hooks["vector"] = get_token_vector_via_tensor
            doc.user_hooks["similarity"] = get_similarity_via_tensor
            doc.user_span_hooks["similarity"] = get_similarity_via_tensor
            doc.user_token_hooks["similarity"] = get_similarity_via_tensor
@classmethod
def join(cls, sub_acts: List["Activations"]) -> "Activations":
    """Concatenate activations from subsequences."""
    xp = get_array_module(sub_acts[0].lh)
    lh: Array = xp.vstack([x.lh for x in sub_acts])
    return cls(lh, [], [], [], is_grad=sub_acts[0].is_grad)
def pad_batch_activations(batch: List[Activations]) -> Activations:
    xp = get_array_module(batch[0])
    lh = pad_batch([x.lh for x in batch], xp=xp)
    lh = lh.reshape((len(batch), -1, lh.shape[-1]))
    return Activations(lh, [], [], [], is_grad=batch[0].is_grad)