def _base(self, strings, labels):
    # Pair every string with every candidate label as an entailment-style premise/hypothesis.
    # The hypothesis template is Malay: 'teks ini adalah mengenai {label}' = 'this text is about {label}'.
    strings_left, strings_right, mapping = [], [], defaultdict(list)
    index = 0
    for no, string in enumerate(strings):
        for label in labels:
            strings_left.append(string)
            strings_right.append(f'teks ini adalah mengenai {label}')
            mapping[no].append(index)
            index += 1

    input_ids, input_masks, segment_ids, _ = xlnet_tokenization_siamese(
        self._tokenizer, strings_left, strings_right
    )
    r = self._execute(
        inputs=[input_ids, segment_ids, input_masks],
        input_labels=['Placeholder', 'Placeholder_1', 'Placeholder_2'],
        output_labels=['logits'],
    )
    output = softmax(r['logits'], axis=-1)

    # Fold the flat per-pair probabilities back into one {label: probability} dict per string.
    results = []
    for k, v in mapping.items():
        result = {}
        for no, index in enumerate(v):
            result[labels[no]] = output[index, 1]
        results.append(result)
    return results

def _base(self, strings, labels):
    # Same pairing and regrouping logic as the variant above, but executed through a
    # TensorFlow session feed_dict instead of self._execute.
    strings_left, strings_right, mapping = [], [], defaultdict(list)
    index = 0
    for no, string in enumerate(strings):
        for label in labels:
            strings_left.append(string)
            strings_right.append(f'teks ini adalah mengenai {label}')
            mapping[no].append(index)
            index += 1

    input_ids, input_masks, segment_ids, _ = xlnet_tokenization_siamese(
        self._tokenizer, strings_left, strings_right
    )
    output = self._sess.run(
        self._softmax,
        feed_dict={
            self._X: input_ids,
            self._segment_ids: segment_ids,
            self._input_masks: input_masks,
        },
    )

    results = []
    for k, v in mapping.items():
        result = {}
        for no, index in enumerate(v):
            result[labels[no]] = output[index, 1]
        results.append(result)
    return results
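
# A minimal sketch (not part of the source) of the regrouping step shared by the two
# `_base` variants above: every string is scored against every label in string-major
# order, then the flat per-pair probabilities are folded back into one
# {label: probability} dict per string. `pair_probs` stands in for `output[:, 1]`;
# the function name is hypothetical.
def _regroup_demo(strings, labels, pair_probs):
    results = []
    for no, _ in enumerate(strings):
        offset = no * len(labels)
        results.append({label: pair_probs[offset + i] for i, label in enumerate(labels)})
    return results

# _regroup_demo(['ayat pertama', 'ayat kedua'], ['politik', 'sukan'], [0.9, 0.1, 0.2, 0.7])
# -> [{'politik': 0.9, 'sukan': 0.1}, {'politik': 0.2, 'sukan': 0.7}]
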
def vectorize(self, strings: List[str], labels: List[str], method: str = 'first'):
    """
    Vectorize provided strings against every label.

    Parameters
    ----------
    strings: List[str]
    labels: List[str]
    method: str, optional (default='first')
        Vectorization layer supported. Allowed values:

        * ``'last'`` - vector of the last token.
        * ``'first'`` - vector of the first token.
        * ``'mean'`` - average vector of all tokens.
        * ``'word'`` - per-token vectors merged back into words.

    Returns
    -------
    result: Tuple[List[Tuple[str, str]], np.array]
        pairs of (string, label) and their vectors.
    """
    strings_left, strings_right, combined = [], [], []
    for no, string in enumerate(strings):
        for label in labels:
            strings_left.append(string)
            strings_right.append(f'teks ini adalah mengenai {label}')
            combined.append((string, label))

    input_ids, input_masks, segment_ids, s_tokens = xlnet_tokenization_siamese(
        self._tokenizer, strings_left, strings_right
    )
    r = self._execute(
        inputs=[input_ids, segment_ids, input_masks],
        input_labels=['Placeholder', 'Placeholder_1', 'Placeholder_2'],
        output_labels=['vectorizer'],
    )
    v = r['vectorizer']
    # Transpose to [batch, time, dim] so pooling is done over the time axis.
    v = np.transpose(v, [1, 0, 2])
    if method == 'first':
        v = v[:, 0]
    elif method == 'last':
        v = v[:, -1]
    elif method == 'mean':
        v = np.mean(v, axis=1)
    else:
        v = [
            merge_sentencepiece_tokens(
                list(zip(s_tokens[i], v[i][: len(s_tokens[i])])),
                weighted=False,
                vectorize=True,
                model='xlnet',
            )
            for i in range(len(v))
        ]
    return combined, v
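
# A minimal sketch (assumption, not part of the source) of the pooling choices in
# `vectorize` above, applied to a dummy [batch, time, dim] array. 'word' is omitted
# because it depends on the library's sentencepiece merging helper.
import numpy as np

def _pooling_demo(hidden, method='first'):
    # hidden: np.ndarray of shape [batch, time, dim], already batch-major.
    if method == 'first':
        return hidden[:, 0]            # first-token vector per pair
    if method == 'last':
        return hidden[:, -1]           # last-token vector per pair
    if method == 'mean':
        return hidden.mean(axis=1)     # average over the time axis
    raise ValueError("method must be 'first', 'last' or 'mean' in this sketch")

# _pooling_demo(np.random.rand(4, 10, 768), method='mean').shape -> (4, 768)
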
def _base(self, strings_left, strings_right):
    input_ids, input_masks, segment_ids, _ = xlnet_tokenization_siamese(
        self._tokenizer, strings_left, strings_right
    )
    r = self._execute(
        inputs=[input_ids, segment_ids, input_masks],
        input_labels=['Placeholder', 'Placeholder_1', 'Placeholder_2'],
        output_labels=['logits'],
    )
    return softmax(r['logits'], axis=-1)

def _base(self, strings_left, strings_right):
    input_ids, input_masks, segment_ids, _ = xlnet_tokenization_siamese(
        self._tokenizer, strings_left, strings_right
    )
    return self._sess.run(
        self._softmax,
        feed_dict={
            self._X: input_ids,
            self._segment_ids: segment_ids,
            self._input_masks: input_masks,
        },
    )
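
# A minimal sketch (assumption, not from the source): both similarity `_base`
# variants above return row-wise softmax over 2-class logits of shape [batch, 2],
# where column 1 is read as the probability that the paired left/right strings are
# similar. The same reduction on dummy logits; all names below are illustrative.
from scipy.special import softmax as _softmax_demo
import numpy as np

_dummy_logits = np.array([[-1.2, 2.3], [0.5, -0.4]])   # [batch, 2] logits
_dummy_probs = _softmax_demo(_dummy_logits, axis=-1)    # each row sums to 1
_similarity = _dummy_probs[:, 1]                        # e.g. array([0.97..., 0.28...])
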