Example #1
    def predict(self, content):
        """

        :param content: list of list of str. Each string is a token(word).
        :return answer: list of list of str. Each string is a tag.
        """
        if not hasattr(self, "pipeline"):
            raise ValueError("You have to load model first.")

        sentence_list = content
        # 1. Check the type of each sentence
        for sentence in sentence_list:
            if not all(isinstance(obj, str) for obj in sentence):
                raise ValueError("Input must be list of list of string.")

        # 2. Build the dataset
        dataset = DataSet()
        dataset.add_field("words", sentence_list)

        # 3. Run the pipeline
        self.pipeline(dataset)

        def merge_tag(words_list, tags_list):
            rtn = []
            for words, tags in zip(words_list, tags_list):
                rtn.append([w + "/" + t for w, t in zip(words, tags)])
            return rtn

        output = dataset.field_arrays["tag"].content
        if isinstance(content, str):
            return output[0]
        elif isinstance(content, list):
            return merge_tag(content, output)
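
Note: merge_tag above only pairs each token with its predicted tag; a minimal standalone sketch of the same logic, using made-up tokens and tags for illustration:

def merge_tag(words_list, tags_list):
    # Pair each token with its tag as "word/tag", sentence by sentence.
    return [[w + "/" + t for w, t in zip(words, tags)]
            for words, tags in zip(words_list, tags_list)]

print(merge_tag([["I", "love", "NLP"]], [["PN", "VV", "NN"]]))
# [['I/PN', 'love/VV', 'NLP/NN']]
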
Example #2
    def test_delete_field(self):
        dd = DataSet()
        dd.add_field("x", [[1, 2, 3]] * 10)
        dd.add_field("y", [[1, 2, 3, 4]] * 10)
        dd.delete_field("x")
        self.assertNotIn("x", dd.field_arrays)
        self.assertIn("y", dd.field_arrays)
Example #3
    def test_add_field(self):
        ds = DataSet({"x": [3, 4]})
        ds.add_field('y', [['hello', 'world'], ['this', 'is', 'a', 'test']],
                     is_input=True,
                     is_target=True)
        # ds.apply(lambda x: [x['x']] * 3, is_input=True, is_target=True, new_field_name='y')
        print(ds)
Example #4
    def predict(self, content):
        """
        分词接口。

        :param content: str或List[str], 例如: "中文分词很重要!", 返回的结果是"中文 分词 很 重要 !"。 如果传入的为List[str],比如
            [ "中文分词很重要!", ...], 返回的结果["中文 分词 很 重要 !", ...]。
        :return: str或List[str], 根据输入的的类型决定。
        """
        if not hasattr(self, 'pipeline'):
            raise ValueError("You have to load model first.")

        sentence_list = []
        # 1. Check the input type
        if isinstance(content, str):
            sentence_list.append(content)
        elif isinstance(content, list):
            sentence_list = content

        # 2. Build the dataset
        dataset = DataSet()
        dataset.add_field('raw_sentence', sentence_list)

        # 3. Run the pipeline
        self.pipeline(dataset)

        output = dataset.get_field('output').content
        if isinstance(content, str):
            return output[0]
        elif isinstance(content, list):
            return output
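
The wrap-then-unwrap pattern above (a bare str becomes a one-element list, and the first element is returned for str input) recurs in most predict methods in this listing; a minimal sketch of just that dispatch, with a stand-in segmenter instead of the real pipeline:

def predict(content, segment=lambda s: " ".join(s)):
    # Normalize a single sentence into a one-element list.
    sentence_list = [content] if isinstance(content, str) else content
    output = [segment(s) for s in sentence_list]
    # Mirror the input type: bare result for str, list for list.
    return output[0] if isinstance(content, str) else output

print(predict("abc"))           # 'a b c'
print(predict(["abc", "def"]))  # ['a b c', 'd e f']
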
Example #5
    def predict(self, content):
        if not hasattr(self, 'pipeline'):
            raise ValueError("You have to load model first.")

        # 1. Run the POS tagger to get segmentation and POS-tagging results
        pos_out = self.pos_tagger.predict(content)
        # pos_out = ['这里/NN 是/VB 分词/NN 结果/NN'.split()]

        # 2. Build the dataset
        dataset = DataSet()
        dataset.add_field('wp', pos_out)
        dataset.apply(lambda x: ['<BOS>'] + [w.split('/')[0] for w in x['wp']],
                      new_field_name='words')
        dataset.apply(lambda x: ['<BOS>'] + [w.split('/')[1] for w in x['wp']],
                      new_field_name='pos')
        dataset.rename_field("words", "raw_words")

        # 3. Run the pipeline
        self.pipeline(dataset)
        dataset.apply(lambda x: [str(arc) for arc in x['arc_pred']],
                      new_field_name='arc_pred')
        dataset.apply(lambda x: [
            arc + '/' + label
            for arc, label in zip(x['arc_pred'], x['label_pred_seq'])
        ][1:],
                      new_field_name='output')
        # output like: [['2/top', '0/root', '4/nn', '2/dep']]
        return dataset.field_arrays['output'].content
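
The two apply calls above split each 'word/tag' token on the slash and prepend a '<BOS>' marker; the same transformation in plain Python, using the sample from the comment:

wp = '这里/NN 是/VB 分词/NN 结果/NN'.split()
words = ['<BOS>'] + [w.split('/')[0] for w in wp]
pos = ['<BOS>'] + [w.split('/')[1] for w in wp]
print(words)  # ['<BOS>', '这里', '是', '分词', '结果']
print(pos)    # ['<BOS>', 'NN', 'VB', 'NN', 'NN']
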
Example #6
    def predict(self, content):
        """

        :param content: list of list of str. Each string is a token(word).
        :return answer: list of list of str. Each string is a tag.
        """
        if not hasattr(self, 'pipeline'):
            raise ValueError("You have to load model first.")

        sentence_list = []
        # 1. Check the input type
        if isinstance(content, str):
            sentence_list.append(content)
        elif isinstance(content, list):
            sentence_list = content

        # 2. Build the dataset
        dataset = DataSet()
        dataset.add_field('words', sentence_list)

        # 3. Run the pipeline
        self.pipeline(dataset)

        output = dataset['word_pos_output'].content
        if isinstance(content, str):
            return output[0]
        elif isinstance(content, list):
            return output
Example #7
File: api.py, Project: huziye/fastNLP_fork
    def predict(self, content):
        """

        :param content: list of list of str. Each string is a token(word).
        :return answer: list of list of str. Each string is a tag.
        """
        if not hasattr(self, "pipeline"):
            raise ValueError("You have to load model first.")

        sentence_list = []
        # 1. Check the input type
        if isinstance(content, str):
            sentence_list.append(content)
        elif isinstance(content, list):
            sentence_list = content

        # 2. Build the dataset
        dataset = DataSet()
        dataset.add_field("words", sentence_list)

        # 3. Run the pipeline
        self.pipeline(dataset)

        def decode_tags(ins):
            pred_tags = ins["tag"]
            chars = ins["words"]
            words = []
            start_idx = 0
            for idx, tag in enumerate(pred_tags):
                if tag[0] == "S":
                    words.append(chars[start_idx:idx + 1] + "/" + tag[2:])
                    start_idx = idx + 1
                elif tag[0] == "E":
                    words.append("".join(chars[start_idx:idx + 1]) + "/" +
                                 tag[2:])
                    start_idx = idx + 1
            return words

        dataset.apply(decode_tags, new_field_name="tag_output")

        output = dataset.field_arrays["tag_output"].content
        if isinstance(content, str):
            return output[0]
        elif isinstance(content, list):
            return output
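
decode_tags reassembles words from character-level BMES-style tags, where the first letter marks the position ('B'egin/'M'iddle/'E'nd of a word, or 'S'ingle-character word) and the part after the hyphen is the label; a standalone run with hand-made tags (illustrative, not real model output):

def decode_tags(chars, pred_tags):
    words = []
    start_idx = 0
    for idx, tag in enumerate(pred_tags):
        # 'S' closes a one-character word, 'E' closes a multi-character one.
        if tag[0] in ('S', 'E'):
            words.append(''.join(chars[start_idx:idx + 1]) + '/' + tag[2:])
            start_idx = idx + 1
    return words

chars = list('中文分词很重要!')
tags = ['B-NN', 'E-NN', 'B-NN', 'E-NN', 'S-AD', 'B-VA', 'E-VA', 'S-PU']
print(decode_tags(chars, tags))
# ['中文/NN', '分词/NN', '很/AD', '重要/VA', '!/PU']
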
Example #8
    def test_add_append(self):
        dd = DataSet()
        dd.add_field("x", [[1, 2, 3]] * 10)
        dd.add_field("y", [[1, 2, 3, 4]] * 10)
        dd.add_field("z", [[5, 6]] * 10)
        self.assertEqual(len(dd), 10)
        self.assertEqual(dd.field_arrays["x"].content, [[1, 2, 3]] * 10)
        self.assertEqual(dd.field_arrays["y"].content, [[1, 2, 3, 4]] * 10)
        self.assertEqual(dd.field_arrays["z"].content, [[5, 6]] * 10)

        with self.assertRaises(RuntimeError):
            dd.add_field("??", [[1, 2]] * 40)
Example #9
    def predict(self, content):
        if not hasattr(self, 'pipeline'):
            raise ValueError("You have to load model first.")

        sentence_list = []
        # 1. Check the input type
        if isinstance(content, str):
            sentence_list.append(content)
        elif isinstance(content, list):
            sentence_list = content

        # 2. Build the dataset
        dataset = DataSet()
        dataset.add_field('words', sentence_list)
        # dataset.add_field('tag', sentence_list)

        # 3. Run the pipeline
        self.pipeline(dataset)
        for ins in dataset:
            ins['heads'] = ins['heads'].tolist()

        return dataset['heads'], dataset['labels']
Example #10
    def predict(self, content):
        if not hasattr(self, 'pipeline'):
            raise ValueError("You have to load model first.")

        sentence_list = []
        # 1. Check the input type
        if isinstance(content, str):
            sentence_list.append(content)
        elif isinstance(content, list):
            sentence_list = content

        # 2. Build the dataset
        dataset = DataSet()
        dataset.add_field('raw_sentence', sentence_list)

        # 3. Run the pipeline
        self.pipeline(dataset)

        output = dataset['output'].content
        if isinstance(content, str):
            return output[0]
        elif isinstance(content, list):
            return output