def test_serializer_1(self):
    '''Simple test to tag 2 words.'''
    input_string = 'A gene or protein.'
    prediction = torch.Tensor([[
        #A g e n e   o r   p r o t e i n .
        [0, 0, 0.99, 0.99, 0.99, 0.99, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0]
    ]])
    # args: text_examples, prediction, output_semantics
    b = Decoder([input_string], prediction,
                Catalogue.from_list(['gene', 'small_molecule', 'tissue', 'protein']))
    token_list = tokenize(input_string)
    b.binarize_with_token([token_list])
    serializer = Serializer(tag="sd-tag", format="xml")
    predicted_xml_string = serializer.serialize(b)[0]
    expected_xml_string = '<smtag>A <sd-tag type="gene" type_score="99">gene</sd-tag> or <sd-tag type="protein" type_score="99">protein</sd-tag>.</smtag>'
    self.assertEqual(predicted_xml_string, expected_xml_string)
def test_detokenize(self):
    for example in self.examples:
        token_list = tokenize(example[0])['token_list']
        # reassemble the original string from each token and its left spacer
        humpty_dumpty = ''.join(["{}{}".format(t.left_spacer, t.text) for t in token_list])
        self.assertEqual(example[0], humpty_dumpty)
def test_serializer_updatexml(self):
    '''Test the update of a pretagged xml object.'''
    xml_string = '<sd-panel>A <sd-tag type="geneprod">ge ne</sd-tag> or <sd-tag type="protein">others</sd-tag></sd-panel>'
    xml = fromstring(xml_string)
    expected_xml_string = tostring(fromstring('<sd-panel>A <sd-tag type="geneprod">ge ne</sd-tag> or <sd-tag role="intervention" type="protein" role_score="99">others</sd-tag></sd-panel>'))
    input_string = 'A ge ne or others'
    prediction = torch.Tensor([[
        #A g e n e o r o t h e r s
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99]
    ]])
    # args: text_examples, prediction, output_semantics
    b = Binarized([input_string], prediction, Catalogue.from_list(['intervention']))
    token_list = tokenize(input_string)
    b.binarize_with_token([token_list])
    b.fuse_adjascent()
    updatexml_(xml, b)
    resulting_xml_string = tostring(xml)
    print(resulting_xml_string)
    self.assertEqual(expected_xml_string, resulting_xml_string)
def test_serializer_3(self):
    '''Testing tagging with staggered features and xml escaping.'''
    input_string = 'A gene or oth>rs'
    prediction = torch.Tensor([[
        #A g e n e   o r   o t h > r s
        [0, 0, 0.99, 0.99, 0.99, 0.99, 0, 0.99, 0.99, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0.99, 0.99, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99]
    ]])
    # args: text_examples, prediction, output_semantics
    b = Binarized([input_string], prediction,
                  Catalogue.from_list(['geneprod', 'assayed', 'intervention', 'protein']))
    token_list = tokenize(input_string)
    b.binarize_with_token([token_list])
    b.fuse_adjascent()
    expected_xml_string = '<smtag>A <sd-tag type="geneprod" type_score="99">gene</sd-tag> <sd-tag type="geneprod" role="assayed" type_score="99" role_score="99">or</sd-tag> <sd-tag role="intervention" type="protein" role_score="99" type_score="99">oth>rs</sd-tag></smtag>'
    # serialization must be stable: repeat it and check that the output is identical every time
    for _ in range(100):
        serializer = Serializer(tag="sd-tag", format="xml")
        predicted_xml_string = serializer.serialize(b)[0]
        self.assertEqual(predicted_xml_string, expected_xml_string)
def test_fuse_adjascent_2(self):
    '''Testing the fusion of two terms at the end of the string.'''
    input_string = 'A ge n'
    prediction = torch.Tensor([[
        #A g e   n
        [0, 0, 0.99, 0.99, 0.6, 0.99]
    ]])
    expected_start = torch.Tensor([[[0., 0., 1., 0., 0., 0.]]])
    expected_stop = torch.Tensor([[[0., 0., 0., 0., 0., 1.]]])
    expected_marks = torch.Tensor([[[0., 0., 1., 1., 1., 1.]]])
    b = Binarized([input_string], prediction, [Catalogue.GENEPROD])
    token_list = tokenize(input_string)
    b.binarize_with_token([token_list])
    b.fuse_adjascent()
    # diagnostic dump of start/stop/marks/score
    print("\n fuse at the end")
    print("".join([str(int(x)) for x in list(b.start.view(b.start.numel()))]))
    print("".join([str(int(x)) for x in list(b.stop.view(b.stop.numel()))]))
    print("".join([str(int(x)) for x in list(b.marks.view(b.marks.numel()))]))
    print(",".join([str(int(x)) for x in list(b.score.view(b.marks.numel()))]))
    self.assertTensorEqual(expected_start, b.start)
    self.assertTensorEqual(expected_stop, b.stop)
    self.assertTensorEqual(expected_marks, b.marks)
def test_binarize(self):
    '''Testing the binarization without fusion.'''
    input_string = 'A ge ne or others'
    prediction = torch.Tensor([[
        #A g e   n e   o r   o t h e e r s
        [0, 0, 0.99, 0.99, 0, 0.99, 0.99, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    ]])
    expected_start = torch.Tensor([[[0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]])
    expected_stop = torch.Tensor([[[0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]])
    expected_marks = torch.Tensor([[[0., 0., 1., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]])
    b = Binarized([input_string], prediction, [Catalogue.GENEPROD])
    token_list = tokenize(input_string)
    b.binarize_with_token([token_list])
    # diagnostic dump of start/stop/marks/score
    print("\n")
    print("".join([str(int(x)) for x in list(b.start.view(b.start.numel()))]))
    print("".join([str(int(x)) for x in list(b.stop.view(b.stop.numel()))]))
    print("".join([str(int(x)) for x in list(b.marks.view(b.marks.numel()))]))
    print(",".join([str(int(x)) for x in list(b.score.view(b.marks.numel()))]))
    self.assertTensorEqual(expected_start, b.start)
    self.assertTensorEqual(expected_stop, b.stop)
    self.assertTensorEqual(expected_marks, b.marks)
def test_serializer_4(self):
    '''
    Testing tagging of the ambiguous prediction "others", scored as both
    intervention and assayed (with a lower score for assayed).
    '''
    input_string = 'A ge ne or others'
    prediction = torch.Tensor([[
        #A g e   n e   o r   o t h e r s
        [0, 0, 0.99, 0.99, 0.6, 0.99, 0.99, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.98, 0.98, 0.98, 0.98, 0.98, 0.98]
    ]])
    # args: text_examples, prediction, output_semantics
    b = Binarized([input_string], prediction,
                  Catalogue.from_list(['geneprod', 'small_molecule', 'intervention', 'assayed']))
    token_list = tokenize(input_string)
    b.binarize_with_token([token_list])
    b.fuse_adjascent()
    serializer = Serializer(tag="sd-tag", format="xml")
    predicted_xml_string = serializer.serialize(b)[0]
    expected_xml_string = '<smtag>A <sd-tag type="geneprod" type_score="99">ge ne</sd-tag> or <sd-tag type="small_molecule" role="intervention" type_score="99" role_score="99">others</sd-tag></smtag>'
    self.assertEqual(predicted_xml_string, expected_xml_string)
def test_fuse_adjascent_1(self):
    '''Testing the fusion between two similarly labeled terms separated by a tab.'''
    input_string = 'A\tge\tne\tor\tothers'
    prediction = torch.Tensor([[
        #A g e   n e   o r   o t h e r s
        [0, 0, 0.99, 0.99, 0.6, 0.99, 0.99, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    ]])
    expected_start = torch.Tensor([[[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]])
    expected_stop = torch.Tensor([[[0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]])
    expected_marks = torch.Tensor([[[0., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]])
    b = Binarized([input_string], prediction, [Catalogue.GENEPROD])
    token_list = tokenize(input_string)
    b.binarize_with_token([token_list])
    b.fuse_adjascent()
    # diagnostic dump of start/stop/marks/score
    print("\nFuse with spacer")
    print("".join([str(int(x)) for x in list(b.start.view(b.start.numel()))]))
    print("".join([str(int(x)) for x in list(b.stop.view(b.stop.numel()))]))
    print("".join([str(int(x)) for x in list(b.marks.view(b.marks.numel()))]))
    print(",".join([str(int(x)) for x in list(b.score.view(b.marks.numel()))]))
    self.assertTensorEqual(expected_start, b.start)
    self.assertTensorEqual(expected_stop, b.stop)
    self.assertTensorEqual(expected_marks, b.marks)
def test_serializer_2(self):
    '''
    Testing tagging of multiple tokens ("ge ne" as type="geneprod") and of
    multiple attributes for one term ("others" as role="intervention" type="protein").
    '''
    input_string = 'A ge ne or others'
    prediction = torch.Tensor([[
        #A g e   n e   o r   o t h e r s
        [0, 0, 0.99, 0.99, 0.6, 0.99, 0.99, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99]
    ]])
    # args: text_examples, prediction, output_semantics
    b = Binarized([input_string], prediction,
                  Catalogue.from_list(['geneprod', 'small_molecule', 'intervention', 'protein']))
    token_list = tokenize(input_string)
    b.binarize_with_token([token_list])
    b.fuse_adjascent()
    serializer = Serializer(tag="sd-tag", format="xml")
    predicted_xml_string = serializer.serialize(b)[0]
    expected_xml_string = '<smtag>A <sd-tag type="geneprod" type_score="99">ge ne</sd-tag> or <sd-tag role="intervention" type="protein" role_score="99" type_score="99">others</sd-tag></smtag>'
    self.assertEqual(predicted_xml_string, expected_xml_string)
def test_speed(self):
    # crude speed check: tokenize the same example 10,000 times
    for _ in range(10000):
        tokenize(self.examples[0][0])
def test_tokenize(self):
    # each example is a (text, expected_token_terms) pair
    for example in self.examples:
        token_list = tokenize(example[0])['token_list']
        token_terms = [t.text for t in token_list]
        self.assertEqual(example[1], token_terms)