Example #1
File: xlnet.py Project: lantip/Malaya
    def _base(self, strings_left, strings_right):
        input_ids_left, input_masks_left, segment_ids_left, _ = xlnet_tokenization(
            self._tokenizer, strings_left)
        # tokenize the right-hand strings with the same tokenizer
        input_ids_right, input_masks_right, segment_ids_right, _ = xlnet_tokenization(
            self._tokenizer, strings_right)

        r = self._execute(
            inputs=[
                input_ids_left,
                segment_ids_left,
                input_masks_left,
                input_ids_right,
                input_masks_right,
                segment_ids_right,
            ],
            input_labels=[
                'Placeholder',
                'Placeholder_1',
                'Placeholder_2',
                'Placeholder_3',
                'Placeholder_4',
                'Placeholder_5',
            ],
            output_labels=['logits'],
        )
        return softmax(r['logits'], axis=-1)
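The pairwise _base above returns softmax(r['logits'], axis=-1), i.e. one probability distribution per sentence pair. A minimal sketch of how such output is typically read, assuming a binary not-similar/similar head and the scipy.special softmax; the logits values below are fabricated:

import numpy as np
from scipy.special import softmax

# Fabricated logits for a batch of 2 sentence pairs, 2 classes
# (not similar, similar); real values come from the XLNet graph.
logits = np.array([[0.3, 2.1], [1.8, -0.4]])
probs = softmax(logits, axis=-1)   # each row sums to 1
similarity = probs[:, 1]           # P(similar) per pair
print(similarity)                  # ~[0.858 0.100]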
Example #2
File: xlnet.py Project: lantip/Malaya
    def _base(self, strings, labels):

        strings_left, strings_right, mapping = [], [], defaultdict(list)
        index = 0
        for no, string in enumerate(strings):
            for label in labels:
                strings_left.append(string)
                strings_right.append(f'teks ini adalah mengenai {label}')
                mapping[no].append(index)
                index += 1

        input_ids, input_masks, segment_ids, _ = xlnet_tokenization_siamese(
            self._tokenizer, strings_left, strings_right)

        r = self._execute(
            inputs=[input_ids, segment_ids, input_masks],
            input_labels=['Placeholder', 'Placeholder_1', 'Placeholder_2'],
            output_labels=['logits'],
        )
        output = softmax(r['logits'], axis=-1)

        results = []
        for k, v in mapping.items():
            result = {}
            for no, index in enumerate(v):
                result[labels[no]] = output[index, 1]
            results.append(result)
        return results
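Example #2 turns an entailment-style model into a zero-shot classifier: every input string is paired with one Malay hypothesis per candidate label (teks ini adalah mengenai {label}, roughly "this text is about {label}"), and column 1 of the softmax output is read back as that label's probability. The pairing and demultiplexing logic is plain Python; here is a self-contained sketch with a fabricated output array in place of the graph result, and made-up strings and labels:

import numpy as np
from collections import defaultdict

strings = ['contoh ayat pertama', 'contoh ayat kedua']
labels = ['sukan', 'politik']

strings_left, strings_right, mapping = [], [], defaultdict(list)
index = 0
for no, string in enumerate(strings):
    for label in labels:
        strings_left.append(string)
        strings_right.append(f'teks ini adalah mengenai {label}')
        mapping[no].append(index)  # which output row belongs to (string, label)
        index += 1

# Fabricated (len(strings) * len(labels), 2) softmax output;
# column 1 plays the role of P(entailment).
output = np.array([[0.1, 0.9], [0.7, 0.3], [0.8, 0.2], [0.2, 0.8]])

results = []
for k, v in mapping.items():
    results.append({labels[no]: output[index, 1] for no, index in enumerate(v)})
print(results)  # [{'sukan': 0.9, 'politik': 0.3}, {'sukan': 0.2, 'politik': 0.8}]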
Example #3
File: xlnet.py Project: lantip/Malaya
 def _classify(self, strings):
     input_ids, input_masks, segment_ids, _ = xlnet_tokenization(
         self._tokenizer, strings)
     r = self._execute(
         inputs=[input_ids, segment_ids, input_masks],
         input_labels=['Placeholder', 'Placeholder_1', 'Placeholder_2'],
         output_labels=['logits'],
     )
     return softmax(r['logits'], axis=-1)
Example #4
 def _base(self, strings_left, strings_right):
     input_ids, input_masks, segment_ids, _ = bert_tokenization_siamese(
         self._tokenizer, strings_left, strings_right)
     r = self._execute(
         inputs=[input_ids, segment_ids, input_masks],
         input_labels=['Placeholder', 'Placeholder_1', 'Placeholder_2'],
         output_labels=['logits'],
     )
     return softmax(r['logits'], axis=-1)
Example #5
File: bigbird.py Project: MuzyAce/malaya
 def _classify(self, strings):
     input_ids, _, _, _ = bert_tokenization(self._tokenizer, strings)
     input_ids = tf.keras.preprocessing.sequence.pad_sequences(
         input_ids, padding='post', maxlen=self._maxlen)
     r = self._execute(
         inputs=[input_ids],
         input_labels=['Placeholder'],
         output_labels=['logits'],
     )
     return softmax(r['logits'], axis=-1)
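Example #5 pads (or truncates) every token-id list to self._maxlen because the BigBird graph expects a single fixed-length input and no mask. The call is the standard Keras utility; a standalone demonstration with maxlen=6 standing in for self._maxlen and made-up token ids:

import tensorflow as tf

input_ids = [[101, 2023, 3793, 102], [101, 102]]
padded = tf.keras.preprocessing.sequence.pad_sequences(
    input_ids, padding='post', maxlen=6)
print(padded)
# [[ 101 2023 3793  102    0    0]
#  [ 101  102    0    0    0    0]]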
Example #6
 def _classify(self, strings):
     input_ids, input_masks, _, _ = bert_tokenization(
         self._tokenizer, strings
     )
     r = self._execute(
         inputs=[input_ids, input_masks],
         input_labels=['Placeholder', 'Placeholder_1'],
         output_labels=['logits'],
     )
     if self._multilabels:
         return sigmoid(r['logits'])
     else:
         return softmax(r['logits'], axis=-1)
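The final branch is the usual multilabel/multiclass split: a sigmoid scores every label independently (several can be near 1 at once), while a softmax forces a single distribution whose entries sum to 1. A numpy illustration, assuming the scipy.special implementations of both functions:

import numpy as np
from scipy.special import expit as sigmoid, softmax

logits = np.array([[2.0, 0.5, -1.0]])

print(sigmoid(logits))           # ~[[0.881 0.622 0.269]] -- independent per label
print(softmax(logits, axis=-1))  # ~[[0.786 0.175 0.039]] -- one distribution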
Example #7
 def _classify(self, strings):
     strings = [language_detection_textcleaning(i) for i in strings]
     subs = [
         ' '.join(s)
         for s in self._bpe.bpe.encode(strings, output_type=self._bpe.mode)
     ]
     transformed = self._vectorizer.transform(subs)
     batch_x = _convert_sparse_matrix_to_sparse_tensor(transformed)
     r = self._execute(
         inputs=batch_x,
         input_labels=[
             'X_Placeholder/shape',
             'X_Placeholder/values',
             'X_Placeholder/indices',
             'W_Placeholder/shape',
             'W_Placeholder/values',
             'W_Placeholder/indices',
         ],
         output_labels=['logits'],
     )
     probs = softmax(r['logits'], axis=-1)
     return probs
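_convert_sparse_matrix_to_sparse_tensor is internal to Malaya; judging by the input labels, it produces the (shape, values, indices) components of two sparse tensors, X and W. The helper below is a hedged sketch of that kind of conversion for a single scipy.sparse matrix; the real function's return layout may differ.

import numpy as np
from scipy.sparse import csr_matrix

def sparse_components(matrix):
    # COO form exposes exactly the triples a TF SparseTensor feed needs
    coo = matrix.tocoo()
    indices = np.stack([coo.row, coo.col], axis=1).astype(np.int64)
    return np.array(coo.shape, dtype=np.int64), coo.data, indices

m = csr_matrix(np.array([[0.0, 1.5], [2.0, 0.0]]))
shape, values, indices = sparse_components(m)
print(shape)    # [2 2]
print(values)   # [1.5 2. ]
print(indices)  # [[0 1] [1 0]]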
Example #8
File: xlnet.py Project: lantip/Malaya
    def _predict_words(self, string, method, visualization, add_neutral=False):
        method = method.lower()
        if method not in ['last', 'first', 'mean']:
            raise ValueError(
                "method not supported, only support 'last', 'first' and 'mean'"
            )
        if add_neutral:
            label = self._label + ['neutral']
        else:
            label = self._label

        input_ids, input_masks, segment_ids, s_tokens = xlnet_tokenization(
            self._tokenizer, [string])
        r = self._execute(
            inputs=[input_ids, segment_ids, input_masks],
            input_labels=['Placeholder', 'Placeholder_1', 'Placeholder_2'],
            output_labels=['logits', 'attention', 'logits_seq'],
        )
        result = softmax(r['logits'], axis=-1)
        words = softmax(r['logits_seq'], axis=-1)
        attentions = r['attention']

        if method == 'first':
            cls_attn = attentions[0][:, :, 0, :]

        if method == 'last':
            cls_attn = attentions[-1][:, :, 0, :]

        if method == 'mean':
            cls_attn = np.mean(attentions, axis=0).mean(axis=2)

        cls_attn = np.mean(cls_attn, axis=1)
        total_weights = np.sum(cls_attn, axis=-1, keepdims=True)
        attn = cls_attn / total_weights
        words = words[0]

        if add_neutral:
            result = neutral(result)
            words = neutral(words)

        result = result[0]
        weights = []
        merged = merge_sentencepiece_tokens(list(zip(s_tokens[0], attn[0])),
                                            model='xlnet')
        for i in range(words.shape[1]):
            m = merge_sentencepiece_tokens(
                list(zip(s_tokens[0], words[:, i])),
                weighted=False,
                model='xlnet',
            )
            _, weight = zip(*m)
            weights.append(weight)
        w, a = zip(*merged)
        words = np.array(weights).T
        distribution_words = words[:, np.argmax(words.sum(axis=0))]
        y_histogram, x_histogram = np.histogram(distribution_words,
                                                bins=np.arange(0, 1, 0.05))
        y_histogram = y_histogram / y_histogram.sum()
        x_attention = np.arange(len(w))
        left, right = np.unique(np.argmax(words, axis=1), return_counts=True)
        left = left.tolist()
        y_barplot = []
        for i in range(len(label)):
            if i not in left:
                # label i never wins the argmax, so its token count is zero
                y_barplot.append(0)
            else:
                y_barplot.append(right[left.index(i)])

        dict_result = {label[i]: result[i] for i in range(len(result))}
        dict_result['alphas'] = {word: a[no] for no, word in enumerate(w)}
        dict_result['word'] = {word: words[no] for no, word in enumerate(w)}
        dict_result['histogram'] = {'x': x_histogram, 'y': y_histogram}
        dict_result['attention'] = {'x': x_attention, 'y': np.array(a)}
        dict_result['barplot'] = {'x': label, 'y': y_barplot}
        dict_result['module'] = self._module

        if visualization:
            render_dict[self._module](dict_result)
        else:
            return dict_result
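The attention pooling in Example #8 compresses a stack of per-layer attention maps into one weight per token: select a layer ('first'/'last') or average all layers ('mean'), take the row the code indexes at query position 0, average over heads, then renormalize so the weights sum to 1. A numpy sketch of the 'last' strategy with fabricated shapes (layers, batch, heads, seq, seq):

import numpy as np

rng = np.random.default_rng(0)
attentions = rng.random((2, 1, 4, 5, 5))  # (layers, batch, heads, seq, seq)

cls_attn = attentions[-1][:, :, 0, :]     # last layer, query position 0
cls_attn = np.mean(cls_attn, axis=1)      # average over heads -> (batch, seq)
attn = cls_attn / np.sum(cls_attn, axis=-1, keepdims=True)
print(attn.shape, attn.sum(axis=-1))      # (1, 5) [1.]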