def _base(self, strings_left, strings_right):
    # tokenize each side of the pair independently
    input_ids_left, input_masks_left, segment_ids_left, _ = xlnet_tokenization(
        self._tokenizer, strings_left
    )
    input_ids_right, input_masks_right, segment_ids_right, _ = xlnet_tokenization(
        self._tokenizer, strings_right
    )
    r = self._execute(
        inputs=[
            input_ids_left,
            segment_ids_left,
            input_masks_left,
            # right-hand feed order assumed to mirror the left-hand
            # triple (ids, segments, masks)
            input_ids_right,
            segment_ids_right,
            input_masks_right,
        ],
        input_labels=[
            'Placeholder',
            'Placeholder_1',
            'Placeholder_2',
            'Placeholder_3',
            'Placeholder_4',
            'Placeholder_5',
        ],
        output_labels=['logits'],
    )
    return softmax(r['logits'], axis=-1)
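# Hedged usage sketch for the pairwise `_base` above. `model` is a
# hypothetical instance of the enclosing class; the method scores each
# aligned left/right string pair and returns one softmax row per pair.
def _pairwise_usage_sketch(model):
    left = ['saya suka ayam goreng']
    right = ['saya gemar ayam goreng']
    probs = model._base(left, right)  # shape: (len(left), num_classes)
    return probs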
def _base(self, strings, labels):
    # build one (string, hypothesis) pair per label and remember which
    # flattened rows belong to which original string
    strings_left, strings_right, mapping = [], [], defaultdict(list)
    index = 0
    for no, string in enumerate(strings):
        for label in labels:
            strings_left.append(string)
            # Malay hypothesis template: 'this text is about {label}'
            strings_right.append(f'teks ini adalah mengenai {label}')
            mapping[no].append(index)
            index += 1
    input_ids, input_masks, segment_ids, _ = xlnet_tokenization_siamese(
        self._tokenizer, strings_left, strings_right
    )
    r = self._execute(
        inputs=[input_ids, segment_ids, input_masks],
        input_labels=['Placeholder', 'Placeholder_1', 'Placeholder_2'],
        output_labels=['logits'],
    )
    output = softmax(r['logits'], axis=-1)
    # column 1 of the softmax output is used as the score for each label
    results = []
    for k, v in mapping.items():
        result = {}
        for no, index in enumerate(v):
            result[labels[no]] = output[index, 1]
        results.append(result)
    return results
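# Minimal, model-free sketch of the flattening step in the zero-shot
# `_base` above: every (string, label) pair becomes one siamese row, and
# `mapping` records which flat rows belong to which original string. The
# example strings and labels are illustrative only.
def _zeroshot_mapping_sketch():
    from collections import defaultdict

    strings = ['contoh pertama', 'contoh kedua']
    labels = ['sukan', 'politik']
    strings_left, strings_right, mapping = [], [], defaultdict(list)
    index = 0
    for no, string in enumerate(strings):
        for label in labels:
            strings_left.append(string)
            strings_right.append(f'teks ini adalah mengenai {label}')
            mapping[no].append(index)
            index += 1
    assert dict(mapping) == {0: [0, 1], 1: [2, 3]}
    return strings_left, strings_right, mapping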
def _classify(self, strings):
    input_ids, input_masks, segment_ids, _ = xlnet_tokenization(
        self._tokenizer, strings
    )
    r = self._execute(
        inputs=[input_ids, segment_ids, input_masks],
        input_labels=['Placeholder', 'Placeholder_1', 'Placeholder_2'],
        output_labels=['logits'],
    )
    return softmax(r['logits'], axis=-1)
def _base(self, strings_left, strings_right):
    input_ids, input_masks, segment_ids, _ = bert_tokenization_siamese(
        self._tokenizer, strings_left, strings_right
    )
    r = self._execute(
        inputs=[input_ids, segment_ids, input_masks],
        input_labels=['Placeholder', 'Placeholder_1', 'Placeholder_2'],
        output_labels=['logits'],
    )
    return softmax(r['logits'], axis=-1)
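# Hedged sketch of the pair packing `bert_tokenization_siamese` above is
# assumed to perform, following the standard BERT convention:
# [CLS] left [SEP] right [SEP], with segment id 0 for the left sentence
# and 1 for the right. The token values are illustrative only.
def _siamese_segment_sketch():
    tokens = ['[CLS]', 'saya', 'suka', '[SEP]', 'awak', 'suka', '[SEP]']
    segment_ids = [0, 0, 0, 0, 1, 1, 1]
    input_masks = [1] * len(tokens)  # every position is a real token
    return list(zip(tokens, segment_ids, input_masks))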
def _classify(self, strings):
    input_ids, _, _, _ = bert_tokenization(self._tokenizer, strings)
    # pad or truncate every sequence to the model's fixed input length
    input_ids = tf.keras.preprocessing.sequence.pad_sequences(
        input_ids, padding='post', maxlen=self._maxlen
    )
    r = self._execute(
        inputs=[input_ids],
        input_labels=['Placeholder'],
        output_labels=['logits'],
    )
    return softmax(r['logits'], axis=-1)
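# Model-free sketch of the padding step in `_classify` above: shorter
# sequences are zero-padded at the end ('post'), while sequences longer
# than `maxlen` are truncated from the front (Keras' default behaviour).
def _padding_sketch():
    import tensorflow as tf

    input_ids = [[1, 2, 3], [4, 5, 6, 7, 8, 9]]
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        input_ids, padding='post', maxlen=5
    )
    # padded == [[1, 2, 3, 0, 0],
    #            [5, 6, 7, 8, 9]]
    return padded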
def _classify(self, strings):
    input_ids, input_masks, _, _ = bert_tokenization(
        self._tokenizer, strings
    )
    r = self._execute(
        inputs=[input_ids, input_masks],
        input_labels=['Placeholder', 'Placeholder_1'],
        output_labels=['logits'],
    )
    if self._multilabels:
        # independent per-label probabilities
        return sigmoid(r['logits'])
    else:
        # mutually exclusive classes
        return softmax(r['logits'], axis=-1)
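# Model-free sketch of why `_classify` above switches activations:
# softmax normalizes a row into a single distribution (one label per
# example), while sigmoid scores every label independently so several
# can be active at once. Uses scipy's implementations for illustration.
def _activation_sketch():
    import numpy as np
    from scipy.special import expit as sigmoid, softmax

    logits = np.array([[2.0, 0.5, -1.0]])
    single_label = softmax(logits, axis=-1)  # row sums to 1.0
    multi_label = sigmoid(logits)            # each entry independent in (0, 1)
    assert np.isclose(single_label.sum(), 1.0)
    return single_label, multi_label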
def _classify(self, strings):
    strings = [language_detection_textcleaning(i) for i in strings]
    # encode into subword pieces, then vectorize into a sparse matrix
    subs = [
        ' '.join(s)
        for s in self._bpe.bpe.encode(strings, output_type=self._bpe.mode)
    ]
    transformed = self._vectorizer.transform(subs)
    batch_x = _convert_sparse_matrix_to_sparse_tensor(transformed)
    r = self._execute(
        inputs=batch_x,
        input_labels=[
            'X_Placeholder/shape',
            'X_Placeholder/values',
            'X_Placeholder/indices',
            'W_Placeholder/shape',
            'W_Placeholder/values',
            'W_Placeholder/indices',
        ],
        output_labels=['logits'],
    )
    probs = softmax(r['logits'], axis=-1)
    return probs
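# Hedged sketch of what `_convert_sparse_matrix_to_sparse_tensor` above
# plausibly returns: one (shape, values, indices) triple for the feature
# ids (X) and one for their tf-idf weights (W), matching the six
# placeholder names fed to `_execute`. The real helper in this codebase
# may differ; this is an assumption for illustration.
def _sparse_parts_sketch(transformed):
    import numpy as np

    coo = transformed.tocoo()
    indices = np.array([coo.row, coo.col]).T
    return (
        coo.shape, coo.col, indices,   # X_Placeholder/{shape,values,indices}
        coo.shape, coo.data, indices,  # W_Placeholder/{shape,values,indices}
    )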
def _predict_words(self, string, method, visualization, add_neutral=False):
    method = method.lower()
    if method not in ['last', 'first', 'mean']:
        raise ValueError(
            "method not supported, only support 'last', 'first' and 'mean'"
        )
    if add_neutral:
        label = self._label + ['neutral']
    else:
        label = self._label

    input_ids, input_masks, segment_ids, s_tokens = xlnet_tokenization(
        self._tokenizer, [string]
    )
    r = self._execute(
        inputs=[input_ids, segment_ids, input_masks],
        input_labels=['Placeholder', 'Placeholder_1', 'Placeholder_2'],
        output_labels=['logits', 'attention', 'logits_seq'],
    )
    result = softmax(r['logits'], axis=-1)
    words = softmax(r['logits_seq'], axis=-1)
    attentions = r['attention']

    # pool attention over layers/heads into one weight per token
    if method == 'first':
        cls_attn = attentions[0][:, :, 0, :]
    if method == 'last':
        cls_attn = attentions[-1][:, :, 0, :]
    if method == 'mean':
        cls_attn = np.mean(attentions, axis=0).mean(axis=2)
    cls_attn = np.mean(cls_attn, axis=1)
    total_weights = np.sum(cls_attn, axis=-1, keepdims=True)
    attn = cls_attn / total_weights
    words = words[0]

    if add_neutral:
        result = neutral(result)
        words = neutral(words)
    result = result[0]

    # merge subword pieces back into whole words, for both the attention
    # weights and the per-token class distributions
    weights = []
    merged = merge_sentencepiece_tokens(
        list(zip(s_tokens[0], attn[0])), model='xlnet'
    )
    for i in range(words.shape[1]):
        m = merge_sentencepiece_tokens(
            list(zip(s_tokens[0], words[:, i])),
            weighted=False,
            model='xlnet',
        )
        _, weight = zip(*m)
        weights.append(weight)
    w, a = zip(*merged)
    words = np.array(weights).T
    distribution_words = words[:, np.argmax(words.sum(axis=0))]
    y_histogram, x_histogram = np.histogram(
        distribution_words, bins=np.arange(0, 1, 0.05)
    )
    y_histogram = y_histogram / y_histogram.sum()
    x_attention = np.arange(len(w))
    left, right = np.unique(np.argmax(words, axis=1), return_counts=True)
    left = left.tolist()
    y_barplot = []
    for i in range(len(label)):
        if i not in left:
            # labels never predicted for any word get a zero count
            y_barplot.append(0)
        else:
            y_barplot.append(right[left.index(i)])

    dict_result = {label[i]: result[i] for i in range(len(result))}
    dict_result['alphas'] = {word: a[no] for no, word in enumerate(w)}
    dict_result['word'] = {word: words[no] for no, word in enumerate(w)}
    dict_result['histogram'] = {'x': x_histogram, 'y': y_histogram}
    dict_result['attention'] = {'x': x_attention, 'y': np.array(a)}
    dict_result['barplot'] = {'x': label, 'y': y_barplot}
    dict_result['module'] = self._module
    if visualization:
        render_dict[self._module](dict_result)
    else:
        return dict_result
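# Model-free sketch of the attention pooling in `_predict_words` above,
# using random data. Attention is assumed to arrive as a list of
# per-layer arrays shaped (batch, heads, seq, seq); the 'first' strategy
# keeps layer 0's attention from query position 0, averages over heads,
# then normalizes into per-token weights that sum to 1.
def _attention_pooling_sketch():
    import numpy as np

    layers, batch, heads, seq = 2, 1, 4, 6
    attentions = [np.random.rand(batch, heads, seq, seq) for _ in range(layers)]
    cls_attn = attentions[0][:, :, 0, :]  # 'first': (batch, heads, seq)
    cls_attn = np.mean(cls_attn, axis=1)  # average over heads -> (batch, seq)
    attn = cls_attn / np.sum(cls_attn, axis=-1, keepdims=True)
    assert np.allclose(attn.sum(axis=-1), 1.0)
    return attn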