Example #1
File: bert.py Project: samsonleegh/Malaya
    def _vectorize(self, strings, method='first'):
        method = method.lower()
        if method not in ['first', 'last', 'mean', 'word']:
            raise ValueError(
                "method not supported, only support 'first', 'last', 'mean' and 'word'"
            )
        input_ids, input_masks, _, s_tokens = bert_tokenization(
            self._tokenizer, strings)
        v = self._sess.run(
            self._vectorizer,
            feed_dict={
                self._X: input_ids,
                self._input_masks: input_masks
            },
        )
        if method == 'first':
            v = v[:, 0]
        elif method == 'last':
            v = v[:, -1]
        elif method == 'mean':
            v = np.mean(v, axis=1)
        else:
            v = [
                merge_sentencepiece_tokens(
                    list(zip(s_tokens[i], v[i][:len(s_tokens[i])])),
                    weighted=False,
                    vectorize=True,
                ) for i in range(len(v))
            ]
        return v
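A minimal usage sketch for the method options above, assuming model is an already-loaded Malaya BERT object that exposes this helper; the variable names and sentences below are illustrative placeholders, not part of the source:

    # hypothetical usage; `model` and the sentences are placeholders
    strings = ['saya suka makan nasi', 'filem itu sangat bagus']
    v_first = model._vectorize(strings, method='first')  # one vector per string, from the first token position
    v_mean = model._vectorize(strings, method='mean')    # one vector per string, averaged over token positions
    v_word = model._vectorize(strings, method='word')    # per string: (word, vector) pairs merged from subword pieces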
Example #2
File: xlnet.py Project: lantip/Malaya
    def vectorize(self, string: str):
        """
        vectorize a string.

        Parameters
        ----------
        string: str

        Returns
        -------
        result: np.array
        """
        input_ids, input_masks, segment_ids, s_tokens = xlnet_tokenization(
            self._tokenizer, [string], space_after_punct=True)
        s_tokens = s_tokens[0]
        r = self._execute(
            inputs=[input_ids, segment_ids, input_masks],
            input_labels=['Placeholder', 'Placeholder_1', 'Placeholder_2'],
            output_labels=['vectorizer'],
        )
        v = r['vectorizer']
        v = v[0]
        return merge_sentencepiece_tokens(
            list(zip(s_tokens, v[:len(s_tokens)])),
            weighted=False,
            vectorize=True,
            model='xlnet',
        )
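merge_sentencepiece_tokens with vectorize=True collapses subword vectors back into word vectors. A simplified standalone sketch of that idea, assuming XLNet-style SentencePiece tokens where '▁' marks the start of a word; the real Malaya helper additionally handles model-specific special tokens and weighting:

    import numpy as np

    def merge_subword_vectors_sketch(pairs):
        # pairs: list of (sentencepiece_token, vector); '▁' starts a new word
        merged, buffer = [], []
        for token, vec in pairs:
            if token.startswith('▁') and buffer:
                toks, vecs = zip(*buffer)
                merged.append((''.join(toks).replace('▁', ''), np.mean(vecs, axis=0)))
                buffer = []
            buffer.append((token, vec))
        if buffer:
            toks, vecs = zip(*buffer)
            merged.append((''.join(toks).replace('▁', ''), np.mean(vecs, axis=0)))
        return merged

    pairs = [('▁saya', np.array([1.0, 0.0])),
             ('▁mak', np.array([0.0, 1.0])),
             ('an', np.array([0.0, 3.0]))]
    print(merge_subword_vectors_sketch(pairs))
    # [('saya', array([1., 0.])), ('makan', array([0., 2.]))]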
Example #3
    def attention(self, strings, method='last', **kwargs):
        """
        Get attention weights for string inputs from BERT attention.

        Parameters
        ----------
        strings : str / list of str
        method : str, optional (default='last')
            Attention layer supported. Allowed values:

            * ``'last'`` - attention from last layer.
            * ``'first'`` - attention from first layer.
            * ``'mean'`` - average attentions from all layers.

        Returns
        -------
        result : List[List[Tuple[str, float]]]
        """

        if isinstance(strings, list):
            if not isinstance(strings[0], str):
                raise ValueError('input must be a list of strings or a string')
        else:
            if not isinstance(strings, str):
                raise ValueError('input must be a list of strings or a string')
        if isinstance(strings, str):
            strings = [strings]

        method = method.lower()
        if method not in ['last', 'first', 'mean']:
            raise Exception(
                "method not supported, only support 'last', 'first' and 'mean'"
            )
        attentions, s_tokens = self._attention(strings)

        if method == 'first':
            cls_attn = list(attentions[0].values())[0][:, :, 0, :]

        if method == 'last':
            cls_attn = list(attentions[-1].values())[0][:, :, 0, :]

        if method == 'mean':
            combined_attentions = []
            for a in attentions:
                combined_attentions.append(list(a.values())[0])
            cls_attn = np.mean(combined_attentions, axis=0).mean(axis=2)

        cls_attn = np.mean(cls_attn, axis=1)
        total_weights = np.sum(cls_attn, axis=-1, keepdims=True)
        attn = cls_attn / total_weights
        output = []
        for i in range(attn.shape[0]):
            if '[' in self._cls:
                output.append(
                    merge_wordpiece_tokens(list(zip(s_tokens[i], attn[i]))))
            else:
                output.append(
                    merge_sentencepiece_tokens(list(zip(s_tokens[i],
                                                        attn[i]))))
        return output
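The arithmetic on cls_attn above takes, for each head, the attention paid by the first ([CLS]) position to every token, averages over heads, and normalizes each row so the weights sum to one. A standalone numpy sketch with made-up numbers:

    import numpy as np

    # hypothetical attention from one layer: (batch, heads, query_pos, key_pos) = (1, 2, 4, 4)
    attn_layer = np.random.rand(1, 2, 4, 4)
    cls_attn = attn_layer[:, :, 0, :]             # attention from the [CLS] position: (1, 2, 4)
    cls_attn = np.mean(cls_attn, axis=1)          # average over heads: (1, 4)
    weights = cls_attn / np.sum(cls_attn, axis=-1, keepdims=True)
    print(weights.sum(axis=-1))                   # each row sums to 1.0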
Example #4
    def vectorize(self, string: str):
        """
        vectorize a string.

        Parameters
        ----------
        string: str

        Returns
        -------
        result: np.array
        """
        input_ids, input_masks, segment_ids, s_tokens = xlnet_tokenization(
            self._tokenizer, [string])
        s_tokens = s_tokens[0]

        v = self._sess.run(
            self._vectorizer,
            feed_dict={
                self._X: input_ids,
                self._segment_ids: segment_ids,
                self._input_masks: input_masks,
            },
        )
        v = v[0]
        return merge_sentencepiece_tokens(
            list(zip(s_tokens, v[:len(s_tokens)])),
            weighted=False,
            vectorize=True,
            model='xlnet',
        )
Example #5
    def vectorize(self, string: str):
        """
        vectorize a string.

        Parameters
        ----------
        string: str

        Returns
        -------
        result: np.array
        """
        parsed_sequence, input_mask, bert_sequence = parse_bert_tagging(
            string, self._tokenizer)
        r = self._execute(
            inputs=[[parsed_sequence]],
            input_labels=['Placeholder'],
            output_labels=['vectorizer'],
        )
        v = r['vectorizer']
        v = v[0]
        return merge_sentencepiece_tokens(
            list(zip(bert_sequence, v[:len(bert_sequence)])),
            weighted=False,
            vectorize=True,
        )
Example #6
File: bert.py Project: samsonleegh/Malaya
    def vectorize(self,
                  strings: List[str],
                  labels: List[str],
                  method: str = 'first'):
        """
        Vectorize a list of strings against a list of labels.

        Parameters
        ----------
        strings: List[str]
        labels : List[str]
        method : str, optional (default='first')
            Vectorization layer supported. Allowed values:

            * ``'last'`` - vector from last sequence.
            * ``'first'`` - vector from first sequence.
            * ``'mean'`` - average vectors from all sequences.
            * ``'word'`` - average vectors based on tokens.


        Returns
        -------
        result: np.array
        """
        strings_left, strings_right, combined = [], [], []
        for no, string in enumerate(strings):
            for label in labels:
                strings_left.append(string)
                strings_right.append(f'teks ini adalah mengenai {label}')
                combined.append((string, label))

        input_ids, input_masks, segment_ids, s_tokens = bert_tokenization_siamese(
            self._tokenizer, strings_left, strings_right)

        v = self._sess.run(
            self._vectorizer,
            feed_dict={
                self._X: input_ids,
                self._segment_ids: segment_ids,
                self._input_masks: input_masks,
            },
        )
        if len(v.shape) == 2:
            v = v.reshape((*np.array(input_ids).shape, -1))

        if method == 'first':
            v = v[:, 0]
        elif method == 'last':
            v = v[:, -1]
        elif method == 'mean':
            v = np.mean(v, axis=1)
        else:
            v = [
                merge_sentencepiece_tokens(
                    list(zip(s_tokens[i], v[i][:len(s_tokens[i])])),
                    weighted=False,
                    vectorize=True,
                ) for i in range(len(v))
            ]
        return combined, v
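The nested loop above turns zero-shot classification into sentence-pair scoring: every input string is paired with the Malay hypothesis template 'teks ini adalah mengenai {label}' for every candidate label. A tiny sketch of the pairs it builds, with made-up inputs:

    strings = ['kerajaan umumkan bajet baru']
    labels = ['politik', 'sukan']
    combined = [(s, l) for s in strings for l in labels]
    right = [f'teks ini adalah mengenai {l}' for s in strings for l in labels]
    # combined -> [('kerajaan umumkan bajet baru', 'politik'),
    #              ('kerajaan umumkan bajet baru', 'sukan')]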
Example #7
File: xlnet.py Project: lantip/Malaya
    def _vectorize(self, strings, method='first'):
        method = method.lower()
        if method not in ['first', 'last', 'mean', 'word']:
            raise ValueError(
                "method not supported, only support 'first', 'last', 'mean' and 'word'"
            )
        input_ids, input_masks, segment_ids, s_tokens = xlnet_tokenization(
            self._tokenizer, strings)
        r = self._execute(
            inputs=[input_ids, segment_ids, input_masks],
            input_labels=['Placeholder', 'Placeholder_1', 'Placeholder_2'],
            output_labels=['vectorizer'],
        )
        v = r['vectorizer']
        if method == 'first':
            v = v[:, 0]
        elif method == 'last':
            v = v[:, -1]
        elif method == 'mean':
            v = np.mean(v, axis=1)
        else:
            v = [
                merge_sentencepiece_tokens(
                    list(zip(s_tokens[i], v[i][:len(s_tokens[i])])),
                    weighted=False,
                    vectorize=True,
                    model='xlnet',
                ) for i in range(len(v))
            ]
        return v
Example #8
File: bert.py Project: samsonleegh/Malaya
    def vectorize(self, string: str):
        """
        vectorize a string.

        Parameters
        ----------
        string: str

        Returns
        -------
        result: np.array
        """
        parsed_sequence, input_mask, bert_sequence = parse_bert_tagging(
            string, self._tokenizer)
        v = self._sess.run(
            self._vectorizer,
            feed_dict={
                self._X: [parsed_sequence],
                self._input_masks: [input_mask],
            },
        )
        v = v[0]
        return merge_sentencepiece_tokens(
            list(zip(bert_sequence, v[:len(bert_sequence)])),
            weighted=False,
            vectorize=True,
        )
Example #9
File: xlnet.py Project: lantip/Malaya
    def vectorize(self,
                  strings: List[str],
                  labels: List[str],
                  method: str = 'first'):
        """
        Vectorize a list of strings against a list of labels.

        Parameters
        ----------
        strings: List[str]
        labels : List[str]
        method : str, optional (default='first')
            Vectorization layer supported. Allowed values:

            * ``'last'`` - vector from last sequence.
            * ``'first'`` - vector from first sequence.
            * ``'mean'`` - average vectors from all sequences.
            * ``'word'`` - average vectors based on tokens.


        Returns
        -------
        result: np.array
        """

        strings_left, strings_right, combined = [], [], []
        for no, string in enumerate(strings):
            for label in labels:
                strings_left.append(string)
                strings_right.append(f'teks ini adalah mengenai {label}')
                combined.append((string, label))

        input_ids, input_masks, segment_ids, s_tokens = xlnet_tokenization_siamese(
            self._tokenizer, strings_left, strings_right)

        r = self._execute(
            inputs=[input_ids, segment_ids, input_masks],
            input_labels=['Placeholder', 'Placeholder_1', 'Placeholder_2'],
            output_labels=['vectorizer'],
        )
        v = r['vectorizer']
        v = np.transpose(v, [1, 0, 2])

        if method == 'first':
            v = v[:, 0]
        elif method == 'last':
            v = v[:, -1]
        elif method == 'mean':
            v = np.mean(v, axis=1)
        else:
            v = [
                merge_sentencepiece_tokens(
                    list(zip(s_tokens[i], v[i][:len(s_tokens[i])])),
                    weighted=False,
                    vectorize=True,
                    model='xlnet',
                ) for i in range(len(v))
            ]
        return combined, v
Example #10
    def vectorize(self, strings: List[str], method: str = 'first'):
        """
        vectorize list of strings.

        Parameters
        ----------
        strings: List[str]
        method : str, optional (default='first')
            Vectorization layer supported. Allowed values:

            * ``'last'`` - vector from last sequence.
            * ``'first'`` - vector from first sequence.
            * ``'mean'`` - average vectors from all sequences.
            * ``'word'`` - average vectors based on tokens.

        Returns
        -------
        result: np.array
        """
        method = method.lower()
        if method not in ['first', 'last', 'mean', 'word']:
            raise ValueError(
                "method not supported, only support 'first', 'last', 'mean' and 'word'"
            )

        tokenization = {'bert': bert_tokenization, 'xlnet': xlnet_tokenization}
        input_ids, input_masks, segment_ids, s_tokens = tokenization[
            self._mode](self._tokenizer, strings)
        r = self._execute(
            inputs=[input_ids, segment_ids, input_masks],
            input_labels=['Placeholder', 'Placeholder_1', 'Placeholder_2'],
            output_labels=['logits_vectorize'],
        )
        v = r['logits_vectorize']

        if method == 'first':
            v = v[:, 0]
        elif method == 'last':
            v = v[:, -1]
        elif method == 'mean':
            v = np.mean(v, axis=1)
        else:
            v = [
                merge_sentencepiece_tokens(
                    list(zip(s_tokens[i], v[i][:len(s_tokens[i])])),
                    weighted=False,
                    vectorize=True,
                    model=self._mode,
                ) for i in range(len(v))
            ]
        return v
Example #11
    def vectorize(self, strings: List[str], method: str = 'first'):
        """
        vectorize list of strings.

        Parameters
        ----------
        strings: List[str]
        method : str, optional (default='first')
            Vectorization layer supported. Allowed values:

            * ``'last'`` - vector from last sequence.
            * ``'first'`` - vector from first sequence.
            * ``'mean'`` - average vectors from all sequences.
            * ``'word'`` - average vectors based on tokens.

        Returns
        -------
        result: np.array
        """

        method = method.lower()
        if method not in ['first', 'last', 'mean', 'word']:
            raise ValueError(
                "method not supported, only support 'first', 'last', 'mean' and 'word'"
            )
        input_ids, input_masks, segment_ids, s_tokens = xlnet_tokenization(
            self._tokenizer, strings)
        v = self._sess.run(
            self._vectorizer,
            feed_dict={
                self._X: input_ids,
                self._segment_ids: segment_ids,
                self._input_masks: input_masks,
            },
        )
        if method == 'first':
            v = v[:, 0]
        elif method == 'last':
            v = v[:, -1]
        elif method == 'mean':
            v = np.mean(v, axis=1)
        else:
            v = [
                merge_sentencepiece_tokens(
                    list(zip(s_tokens[i], v[i][:len(s_tokens[i])])),
                    weighted=False,
                    vectorize=True,
                    model='xlnet',
                ) for i in range(len(v))
            ]
        return v
Example #12
    def attention(self, strings: List[str], method: str = 'last', **kwargs):
        """
        Get attention weights for string inputs.

        Parameters
        ----------
        strings : List[str]
        method : str, optional (default='last')
            Attention layer supported. Allowed values:

            * ``'last'`` - attention from last layer.
            * ``'first'`` - attention from first layer.
            * ``'mean'`` - average attentions from all layers.

        Returns
        -------
        result : List[List[Tuple[str, float]]]
        """

        method = method.lower()
        if method not in ['last', 'first', 'mean']:
            raise Exception(
                "method not supported, only support ['last', 'first', 'mean']"
            )
        attentions, s_tokens, _ = self._attention(strings)

        if method == 'first':
            cls_attn = np.transpose(attentions[0][:, 0], (1, 0, 2))

        if method == 'last':
            cls_attn = np.transpose(attentions[-1][:, 0], (1, 0, 2))

        if method == 'mean':
            cls_attn = np.transpose(
                np.mean(attentions, axis = 0).mean(axis = 1), (1, 0, 2)
            )

        cls_attn = np.mean(cls_attn, axis = 1)
        total_weights = np.sum(cls_attn, axis = -1, keepdims = True)
        attn = cls_attn / total_weights
        output = []
        for i in range(attn.shape[0]):
            output.append(
                merge_sentencepiece_tokens(
                    list(zip(s_tokens[i], attn[i])), model = 'xlnet'
                )
            )
        return output
Example #13
File: __init__.py Project: lkngin/Malaya
    def attention(self, strings: List[str], method: str = 'last', **kwargs):
        """
        Get attention weights for string inputs from BERT attention.

        Parameters
        ----------
        strings : List[str]
        method : str, optional (default='last')
            Attention layer supported. Allowed values:

            * ``'last'`` - attention from last layer.
            * ``'first'`` - attention from first layer.
            * ``'mean'`` - average attentions from all layers.

        Returns
        -------
        result : List[List[Tuple[str, float]]]
        """

        method = method.lower()
        if method not in ['last', 'first', 'mean']:
            raise Exception(
                "method not supported, only support 'last', 'first' and 'mean'"
            )
        attentions, s_tokens, _ = self._attention(strings)

        if method == 'first':
            cls_attn = list(attentions[0].values())[0][:, :, 0, :]

        if method == 'last':
            cls_attn = list(attentions[-1].values())[0][:, :, 0, :]

        if method == 'mean':
            combined_attentions = []
            for a in attentions:
                combined_attentions.append(list(a.values())[0])
            cls_attn = np.mean(combined_attentions, axis=0).mean(axis=2)

        cls_attn = np.mean(cls_attn, axis=1)
        total_weights = np.sum(cls_attn, axis=-1, keepdims=True)
        attn = cls_attn / total_weights
        output = []
        for i in range(attn.shape[0]):
            output.append(
                merge_sentencepiece_tokens(list(zip(s_tokens[i], attn[i]))))
        return output
Example #14
File: bert.py Project: samsonleegh/Malaya
    def _paraphrase(self, strings):
        batch_x, input_masks, input_segments, _ = bert_tokenization(
            self._tokenizer, strings)
        outputs = self._sess.run(
            self._logits,
            feed_dict={
                self._X: batch_x,
                self._segment_ids: input_segments,
                self._input_masks: input_masks,
            },
        )[:, 0, :].tolist()
        results = []
        for output in outputs:
            output = [i for i in output if i > 0]
            output = self._tokenizer.convert_ids_to_tokens(output)
            output = [(t, 1) for t in output]
            output = merge_sentencepiece_tokens(output)
            output = [t[0] for t in output]
            results.append(' '.join(output))
        return results
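The post-processing above keeps only positive token ids, maps them back to SentencePiece tokens, merges the pieces, and joins them into a sentence. A simplified sketch of that decode step with a stand-in vocabulary instead of the real BERT tokenizer (ids and tokens are made up, and the string replace stands in for merge_sentencepiece_tokens):

    vocab = {5: '▁ayat', 6: '▁para', 7: 'frasa'}   # stand-in for the tokenizer vocabulary
    output = [5, 6, 7, 0, 0]                       # model output, zero-padded
    output = [i for i in output if i > 0]          # drop padding
    tokens = [vocab[i] for i in output]            # ids -> subword tokens
    sentence = ''.join(tokens).replace('▁', ' ').strip()
    print(sentence)                                # 'ayat parafrasa'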
Example #15
    def vectorize(self, string: str):
        """
        vectorize a string.

        Parameters
        ----------
        string: str

        Returns
        -------
        result: np.array
        """
        s = string.split()
        sentences = [s]
        if self._mode == 'bert':
            f = constituency_bert
        elif self._mode == 'xlnet':
            f = constituency_xlnet
        else:
            raise ValueError(
                'mode not supported, only supported `bert` or `xlnet`')
        i, m, tokens = f(self._tokenizer, sentences)
        r = self._execute(
            inputs=[i, m],
            input_labels=['input_ids', 'word_end_mask'],
            output_labels=['vectorizer'],
        )
        v = r['vectorizer']
        if self._mode == 'bert':
            v = v[0]
        elif self._mode == 'xlnet':
            v = v[:, 0]

        return merge_sentencepiece_tokens(
            list(zip(tokens[0], v[:len(tokens[0])])),
            weighted=False,
            vectorize=True,
            model=self._mode,
        )
Example #16
File: xlnet.py Project: lantip/Malaya
    def predict_words(self,
                      string: str,
                      method: str = 'last',
                      visualization: bool = True):
        """
        classify words.

        Parameters
        ----------
        string : str
        method : str, optional (default='last')
            Attention layer supported. Allowed values:

            * ``'last'`` - attention from last layer.
            * ``'first'`` - attention from first layer.
            * ``'mean'`` - average attentions from all layers.
        visualization: bool, optional (default=True)
            If True, it will open the visualization dashboard.

        Returns
        -------
        result: dict
        """

        method = method.lower()
        if method not in ['last', 'first', 'mean']:
            raise ValueError(
                "method not supported, only support 'last', 'first' and 'mean'"
            )

        input_ids, input_masks, segment_ids, s_tokens = xlnet_tokenization(
            self._tokenizer, [string])
        r = self._execute(
            inputs=[input_ids, segment_ids, input_masks],
            input_labels=['Placeholder', 'Placeholder_1', 'Placeholder_2'],
            output_labels=['logits', 'attention', 'logits_seq'],
        )
        result = sigmoid(r['logits'])
        words = sigmoid(r['logits_seq'])
        attentions = r['attention']
        if method == 'first':
            cls_attn = attentions[0][:, :, 0, :]

        if method == 'last':
            cls_attn = attentions[-1][:, :, 0, :]

        if method == 'mean':
            cls_attn = np.mean(attentions, axis=0).mean(axis=2)

        cls_attn = np.mean(cls_attn, axis=1)
        total_weights = np.sum(cls_attn, axis=-1, keepdims=True)
        attn = cls_attn / total_weights
        result = result[0]
        words = words[0]
        weights = []
        merged = merge_sentencepiece_tokens(list(zip(s_tokens[0], attn[0])),
                                            model='xlnet')
        for i in range(words.shape[1]):
            m = merge_sentencepiece_tokens(
                list(zip(s_tokens[0], words[:, i])),
                weighted=False,
                model='xlnet',
            )
            _, weight = zip(*m)
            weights.append(weight)
        w, a = zip(*merged)
        words = np.array(weights).T
        distribution_words = words[:, np.argmax(words.sum(axis=0))]
        y_histogram, x_histogram = np.histogram(distribution_words,
                                                bins=np.arange(0, 1, 0.05))
        y_histogram = y_histogram / y_histogram.sum()
        x_attention = np.arange(len(w))
        left, right = np.unique(np.argmax(words, axis=1), return_counts=True)
        left = left.tolist()
        y_barplot = []
        for i in range(len(self._label)):
            if i not in left:
                y_barplot.append(i)
            else:
                y_barplot.append(right[left.index(i)])

        dict_result = {self._label[i]: result[i] for i in range(len(result))}
        dict_result['alphas'] = {w: a[no] for no, w in enumerate(w)}
        dict_result['word'] = {w: words[no] for no, w in enumerate(w)}
        dict_result['histogram'] = {'x': x_histogram, 'y': y_histogram}
        dict_result['attention'] = {'x': x_attention, 'y': np.array(a)}
        dict_result['barplot'] = {'x': self._label, 'y': y_barplot}
        dict_result['module'] = self._module
        if visualization:
            _render_toxic(dict_result)
        else:
            return dict_result
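The plotting statistics at the end are plain numpy: a normalized histogram of per-word probabilities for the label with the largest total mass, and per-label counts of words whose argmax lands on each label. A standalone sketch with fabricated probabilities:

    import numpy as np

    words = np.array([[0.9, 0.1], [0.7, 0.3], [0.2, 0.8]])   # (num_words, num_labels), made up
    dominant = np.argmax(words.sum(axis=0))                  # label with the largest total probability mass
    y_hist, x_hist = np.histogram(words[:, dominant], bins=np.arange(0, 1, 0.05))
    y_hist = y_hist / y_hist.sum()
    label_idx, counts = np.unique(np.argmax(words, axis=1), return_counts=True)
    print(dominant, dict(zip(label_idx.tolist(), counts.tolist())))   # 0 {0: 2, 1: 1}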
Example #17
File: xlnet.py Project: lantip/Malaya
    def _predict_words(self, string, method, visualization, add_neutral=False):
        method = method.lower()
        if method not in ['last', 'first', 'mean']:
            raise ValueError(
                "method not supported, only support 'last', 'first' and 'mean'"
            )
        if add_neutral:
            label = self._label + ['neutral']
        else:
            label = self._label

        input_ids, input_masks, segment_ids, s_tokens = xlnet_tokenization(
            self._tokenizer, [string])
        r = self._execute(
            inputs=[input_ids, segment_ids, input_masks],
            input_labels=['Placeholder', 'Placeholder_1', 'Placeholder_2'],
            output_labels=['logits', 'attention', 'logits_seq'],
        )
        result = softmax(r['logits'], axis=-1)
        words = softmax(r['logits_seq'], axis=-1)
        attentions = r['attention']

        if method == 'first':
            cls_attn = attentions[0][:, :, 0, :]

        if method == 'last':
            cls_attn = attentions[-1][:, :, 0, :]

        if method == 'mean':
            cls_attn = np.mean(attentions, axis=0).mean(axis=2)

        cls_attn = np.mean(cls_attn, axis=1)
        total_weights = np.sum(cls_attn, axis=-1, keepdims=True)
        attn = cls_attn / total_weights
        words = words[0]

        if add_neutral:
            result = neutral(result)
            words = neutral(words)

        result = result[0]
        weights = []
        merged = merge_sentencepiece_tokens(list(zip(s_tokens[0], attn[0])),
                                            model='xlnet')
        for i in range(words.shape[1]):
            m = merge_sentencepiece_tokens(
                list(zip(s_tokens[0], words[:, i])),
                weighted=False,
                model='xlnet',
            )
            _, weight = zip(*m)
            weights.append(weight)
        w, a = zip(*merged)
        words = np.array(weights).T
        distribution_words = words[:, np.argmax(words.sum(axis=0))]
        y_histogram, x_histogram = np.histogram(distribution_words,
                                                bins=np.arange(0, 1, 0.05))
        y_histogram = y_histogram / y_histogram.sum()
        x_attention = np.arange(len(w))
        left, right = np.unique(np.argmax(words, axis=1), return_counts=True)
        left = left.tolist()
        y_barplot = []
        for i in range(len(label)):
            if i not in left:
                y_barplot.append(i)
            else:
                y_barplot.append(right[left.index(i)])

        dict_result = {label[i]: result[i] for i in range(len(result))}
        dict_result['alphas'] = {w: a[no] for no, w in enumerate(w)}
        dict_result['word'] = {w: words[no] for no, w in enumerate(w)}
        dict_result['histogram'] = {'x': x_histogram, 'y': y_histogram}
        dict_result['attention'] = {'x': x_attention, 'y': np.array(a)}
        dict_result['barplot'] = {'x': label, 'y': y_barplot}
        dict_result['module'] = self._module

        if visualization:
            render_dict[self._module](dict_result)
        else:
            return dict_result
Example #18
File: bert.py Project: samsonleegh/Malaya
    def _predict_words(self, string, method, visualization, add_neutral=False):
        method = method.lower()
        if method not in ['last', 'first', 'mean']:
            raise ValueError(
                "method not supported, only support 'last', 'first' and 'mean'"
            )
        if add_neutral:
            label = self._label + ['neutral']
        else:
            label = self._label

        batch_x, batch_mask, _, s_tokens = bert_tokenization(
            self._tokenizer, [string])
        result, attentions, words = self._sess.run(
            [self._softmax, self._attns, self._softmax_seq],
            feed_dict={
                self._X: batch_x,
                self._input_masks: batch_mask
            },
        )
        if method == 'first':
            cls_attn = list(attentions[0].values())[0][:, :, 0, :]

        if method == 'last':
            cls_attn = list(attentions[-1].values())[0][:, :, 0, :]

        if method == 'mean':
            combined_attentions = []
            for a in attentions:
                combined_attentions.append(list(a.values())[0])
            cls_attn = np.mean(combined_attentions, axis=0).mean(axis=2)

        cls_attn = np.mean(cls_attn, axis=1)
        total_weights = np.sum(cls_attn, axis=-1, keepdims=True)
        attn = cls_attn / total_weights
        words = words[0]

        if add_neutral:
            result = neutral(result)
            words = neutral(words)

        result = result[0]
        weights = []
        merged = merge_sentencepiece_tokens(list(zip(s_tokens[0], attn[0])))
        for i in range(words.shape[1]):
            m = merge_sentencepiece_tokens(list(zip(s_tokens[0], words[:, i])),
                                           weighted=False)
            _, weight = zip(*m)
            weights.append(weight)
        w, a = zip(*merged)
        words = np.array(weights).T
        distribution_words = words[:, np.argmax(words.sum(axis=0))]
        y_histogram, x_histogram = np.histogram(distribution_words,
                                                bins=np.arange(0, 1, 0.05))
        y_histogram = y_histogram / y_histogram.sum()
        x_attention = np.arange(len(w))
        left, right = np.unique(np.argmax(words, axis=1), return_counts=True)
        left = left.tolist()
        y_barplot = []
        for i in range(len(label)):
            if i not in left:
                y_barplot.append(i)
            else:
                y_barplot.append(right[left.index(i)])

        dict_result = {label[i]: result[i] for i in range(len(result))}
        dict_result['alphas'] = {w: a[no] for no, w in enumerate(w)}
        dict_result['word'] = {w: words[no] for no, w in enumerate(w)}
        dict_result['histogram'] = {'x': x_histogram, 'y': y_histogram}
        dict_result['attention'] = {'x': x_attention, 'y': np.array(a)}
        dict_result['barplot'] = {'x': label, 'y': y_barplot}
        dict_result['class_name'] = self._class_name

        if visualization:
            render_dict[self._class_name](dict_result)
        else:
            return dict_result
Example #19
File: bert.py Project: samsonleegh/Malaya
    def predict_words(self,
                      string: str,
                      method: str = 'last',
                      visualization: bool = True):
        """
        classify words.

        Parameters
        ----------
        string : str
        method : str, optional (default='last')
            Attention layer supported. Allowed values:

            * ``'last'`` - attention from last layer.
            * ``'first'`` - attention from first layer.
            * ``'mean'`` - average attentions from all layers.
        visualization: bool, optional (default=True)
            If True, it will open the visualization dashboard.

        Returns
        -------
        result: dict
        """

        method = method.lower()
        if method not in ['last', 'first', 'mean']:
            raise ValueError(
                "method not supported, only support 'last', 'first' and 'mean'"
            )

        batch_x, input_masks, _, s_tokens = bert_tokenization(
            self._tokenizer, [string])
        result, attentions, words = self._sess.run(
            [self._sigmoid, self._attns, self._sigmoid_seq],
            feed_dict={
                self._X: batch_x,
                self._input_masks: input_masks
            },
        )
        if method == 'first':
            cls_attn = list(attentions[0].values())[0][:, :, 0, :]

        if method == 'last':
            cls_attn = list(attentions[-1].values())[0][:, :, 0, :]

        if method == 'mean':
            combined_attentions = []
            for a in attentions:
                combined_attentions.append(list(a.values())[0])
            cls_attn = np.mean(combined_attentions, axis=0).mean(axis=2)

        cls_attn = np.mean(cls_attn, axis=1)
        total_weights = np.sum(cls_attn, axis=-1, keepdims=True)
        attn = cls_attn / total_weights
        result = result[0]
        words = words[0]
        weights = []
        merged = merge_sentencepiece_tokens(list(zip(s_tokens[0], attn[0])))
        for i in range(words.shape[1]):
            m = merge_sentencepiece_tokens(list(zip(s_tokens[0], words[:, i])),
                                           weighted=False)
            _, weight = zip(*m)
            weights.append(weight)
        w, a = zip(*merged)
        words = np.array(weights).T
        distribution_words = words[:, np.argmax(words.sum(axis=0))]
        y_histogram, x_histogram = np.histogram(distribution_words,
                                                bins=np.arange(0, 1, 0.05))
        y_histogram = y_histogram / y_histogram.sum()
        x_attention = np.arange(len(w))
        left, right = np.unique(np.argmax(words, axis=1), return_counts=True)
        left = left.tolist()
        y_barplot = []
        for i in range(len(self._label)):
            if i not in left:
                y_barplot.append(i)
            else:
                y_barplot.append(right[left.index(i)])

        dict_result = {self._label[i]: result[i] for i in range(len(result))}
        dict_result['alphas'] = {w: a[no] for no, w in enumerate(w)}
        dict_result['word'] = {w: words[no] for no, w in enumerate(w)}
        dict_result['histogram'] = {'x': x_histogram, 'y': y_histogram}
        dict_result['attention'] = {'x': x_attention, 'y': np.array(a)}
        dict_result['barplot'] = {'x': self._label, 'y': y_barplot}
        dict_result['class_name'] = self._class_name
        if visualization:
            _render_toxic(dict_result)
        else:
            return dict_result