Example #1
File: xlnet.py Project: lantip/Malaya
    def _base(self, strings, labels):

        # Pair every input string with every candidate label; the right-hand
        # text is the Malay hypothesis 'teks ini adalah mengenai {label}'
        # ('this text is about {label}'), and mapping[no] records the
        # flattened pair indices that belong to input string `no`.
        strings_left, strings_right, mapping = [], [], defaultdict(list)
        index = 0
        for no, string in enumerate(strings):
            for label in labels:
                strings_left.append(string)
                strings_right.append(f'teks ini adalah mengenai {label}')
                mapping[no].append(index)
                index += 1

        input_ids, input_masks, segment_ids, _ = xlnet_tokenization_siamese(
            self._tokenizer, strings_left, strings_right)

        r = self._execute(
            inputs=[input_ids, segment_ids, input_masks],
            input_labels=['Placeholder', 'Placeholder_1', 'Placeholder_2'],
            output_labels=['logits'],
        )
        output = softmax(r['logits'], axis=-1)

        # Regroup the flattened per-pair probabilities by input string;
        # column 1 is the probability that the string is about the label.
        results = []
        for k, v in mapping.items():
            result = {}
            for no, index in enumerate(v):
                result[labels[no]] = output[index, 1]
            results.append(result)
        return results
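
A minimal sketch of the regrouping step in isolation, using a made-up softmax
output for one string and three hypothetical labels; the names output, labels
and mapping mirror the method above, and all numbers are illustrative only:

    from collections import defaultdict

    import numpy as np

    # Hypothetical softmax output for 1 string x 3 labels; column 1 is the
    # probability that 'teks ini adalah mengenai {label}' holds for the string.
    output = np.array([[0.9, 0.1], [0.2, 0.8], [0.6, 0.4]])
    labels = ['politik', 'sukan', 'ekonomi']
    mapping = defaultdict(list, {0: [0, 1, 2]})

    results = []
    for k, v in mapping.items():
        results.append({labels[no]: float(output[index, 1])
                        for no, index in enumerate(v)})

    print(results)  # [{'politik': 0.1, 'sukan': 0.8, 'ekonomi': 0.4}]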
Example #2
    def _base(self, strings, labels):

        # Same pairing as Example #1, but feeds a TensorFlow session directly
        # and reads an in-graph softmax op (self._softmax) instead of going
        # through self._execute.
        strings_left, strings_right, mapping = [], [], defaultdict(list)
        index = 0
        for no, string in enumerate(strings):
            for label in labels:
                strings_left.append(string)
                strings_right.append(f'teks ini adalah mengenai {label}')
                mapping[no].append(index)
                index += 1

        input_ids, input_masks, segment_ids, _ = xlnet_tokenization_siamese(
            self._tokenizer, strings_left, strings_right)

        output = self._sess.run(
            self._softmax,
            feed_dict={
                self._X: input_ids,
                self._segment_ids: segment_ids,
                self._input_masks: input_masks,
            },
        )

        results = []
        for k, v in mapping.items():
            result = {}
            for no, index in enumerate(v):
                result[labels[no]] = output[index, 1]
            results.append(result)
        return results
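
For context, a hedged sketch of how the session variant's tensors could be
wired up from a frozen TensorFlow graph. The .pb path is hypothetical, and the
tensor names are assumptions inferred from the input_labels and output_labels
in Example #1, not confirmed by this listing:

    import tensorflow as tf

    tf.compat.v1.disable_eager_execution()

    g = tf.Graph()
    with g.as_default():
        graph_def = tf.compat.v1.GraphDef()
        # Hypothetical path to a frozen XLNet graph.
        with tf.io.gfile.GFile('xlnet-base.pb', 'rb') as f:
            graph_def.ParseFromString(f.read())
        tf.import_graph_def(graph_def, name='')
        # Assumed tensor names, mirroring Example #1's labels.
        X = g.get_tensor_by_name('Placeholder:0')
        segment_ids = g.get_tensor_by_name('Placeholder_1:0')
        input_masks = g.get_tensor_by_name('Placeholder_2:0')
        softmax_op = tf.nn.softmax(g.get_tensor_by_name('logits:0'))

    sess = tf.compat.v1.Session(graph=g)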
Example #3
File: xlnet.py Project: lantip/Malaya
    def vectorize(self,
                  strings: List[str],
                  labels: List[str],
                  method: str = 'first'):
        """
        Vectorize strings, pairing each string with every candidate label.

        Parameters
        ----------
        strings : List[str]
        labels : List[str]
        method : str, optional (default='first')
            Vectorization layer supported. Allowed values:

            * ``'last'`` - vector from last sequence.
            * ``'first'`` - vector from first sequence.
            * ``'mean'`` - average vectors from all sequences.
            * ``'word'`` - average vectors based on tokens.


        Returns
        -------
        result: np.array
        """

        strings_left, strings_right, combined = [], [], []
        for no, string in enumerate(strings):
            for label in labels:
                strings_left.append(string)
                strings_right.append(f'teks ini adalah mengenai {label}')
                combined.append((string, label))

        input_ids, input_masks, segment_ids, s_tokens = xlnet_tokenization_siamese(
            self._tokenizer, strings_left, strings_right)

        r = self._execute(
            inputs=[input_ids, segment_ids, input_masks],
            input_labels=['Placeholder', 'Placeholder_1', 'Placeholder_2'],
            output_labels=['vectorizer'],
        )
        v = r['vectorizer']
        # The vectorizer output is sequence-major; transpose to
        # [batch, seq_len, hidden] before slicing per-token vectors.
        v = np.transpose(v, [1, 0, 2])

        if method == 'first':
            v = v[:, 0]
        elif method == 'last':
            v = v[:, -1]
        elif method == 'mean':
            v = np.mean(v, axis=1)
        else:
            # method == 'word': merge SentencePiece sub-token vectors back
            # into word-level vectors.
            v = [
                merge_sentencepiece_tokens(
                    list(zip(s_tokens[i], v[i][:len(s_tokens[i])])),
                    weighted=False,
                    vectorize=True,
                    model='xlnet',
                ) for i in range(len(v))
            ]
        return combined, v
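
A hedged usage sketch, assuming model is an already-constructed instance of
the class above with its tokenizer and graph loaded (construction is outside
this listing):

    # `model` is hypothetical; the strings and labels are illustrative.
    combined, v = model.vectorize(
        strings=['kerajaan sangat prihatin dengan rakyat'],
        labels=['politik', 'sukan'],
        method='first',
    )
    # combined pairs each string with each label:
    # [('kerajaan ...', 'politik'), ('kerajaan ...', 'sukan')]
    # With method='first', v holds one vector per (string, label) pair,
    # shape (len(combined), hidden_size).
    print(len(combined), v.shape)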
Example #4
File: xlnet.py Project: lantip/Malaya
    def _base(self, strings_left, strings_right):
        # Tokenize the sentence pairs, run the graph, and return softmax
        # probabilities over the two classes for every pair.
        input_ids, input_masks, segment_ids, _ = xlnet_tokenization_siamese(
            self._tokenizer, strings_left, strings_right)

        r = self._execute(
            inputs=[input_ids, segment_ids, input_masks],
            input_labels=['Placeholder', 'Placeholder_1', 'Placeholder_2'],
            output_labels=['logits'],
        )
        return softmax(r['logits'], axis=-1)
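
An illustrative call, again with a hypothetical model instance; _base is
private, so in practice it would be reached through a public prediction
wrapper that this listing does not show:

    probs = model._base(
        strings_left=['saya suka makan ayam'],
        strings_right=['saya gemar makan ayam'],
    )
    # probs is an (n_pairs, 2) softmax array; column 1 can be read as the
    # probability that the two sentences of a pair match.
    print(probs[0, 1])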
Example #5
    def _base(self, strings_left, strings_right):
        # Same pairing as Example #4, but runs a TensorFlow session directly
        # against an in-graph softmax op (self._softmax).
        input_ids, input_masks, segment_ids, _ = xlnet_tokenization_siamese(
            self._tokenizer, strings_left, strings_right)

        return self._sess.run(
            self._softmax,
            feed_dict={
                self._X: input_ids,
                self._segment_ids: segment_ids,
                self._input_masks: input_masks,
            },
        )
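
The only difference from Example #4 is where the softmax happens: here it is
an op inside the graph (self._softmax), while Example #4 applies softmax to
the returned logits. The import behind that softmax call is not shown in this
listing; scipy.special.softmax has a matching signature and is used below for
a quick numerical sanity check on made-up logits:

    import numpy as np
    from scipy.special import softmax

    logits = np.array([[2.0, 0.5], [0.1, 1.3]])
    probs = softmax(logits, axis=-1)
    assert np.allclose(probs.sum(axis=-1), 1.0)
    print(probs[:, 1])  # match probability per pair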