def test_serializer_1(self):
    '''Simple test to tag 2 words.'''
    input_string = 'A gene or protein.'
    prediction = torch.Tensor([[
        #A g e n e   o r   p r o t e i n .
        [0, 0, 0.99, 0.99, 0.99, 0.99, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99, 0]
    ]])
    # args: text_examples, prediction, output_semantics
    b = Decoder([input_string], prediction,
                Catalogue.from_list(['gene', 'small_molecule', 'tissue', 'protein']))
    token_list = tokenize(input_string)
    b.binarize_with_token([token_list])
    serializer = Serializer(tag="sd-tag", format="xml")
    predicted_xml_string = serializer.serialize(b)[0]
    expected_xml_string = '<smtag>A <sd-tag type="gene" type_score="99">gene</sd-tag> or <sd-tag type="protein" type_score="99">protein</sd-tag>.</smtag>'
    self.assertEqual(predicted_xml_string, expected_xml_string)
def test_detokenize(self):
    for example in self.examples:
        token_list = tokenize(example[0])['token_list']
        # reassemble the original string from each token and its left spacer
        humpty_dumpty = ''.join(["{}{}".format(t.left_spacer, t.text) for t in token_list])
        self.assertEqual(example[0], humpty_dumpty)
def test_serializer_updatexml(self):
    '''Test the update of a pretagged xml object.'''
    xml_string = '<sd-panel>A <sd-tag type="geneprod">ge ne</sd-tag> or <sd-tag type="protein">others</sd-tag></sd-panel>'
    xml = fromstring(xml_string)
    expected_xml_string = tostring(fromstring('<sd-panel>A <sd-tag type="geneprod">ge ne</sd-tag> or <sd-tag role="intervention" type="protein" role_score="99">others</sd-tag></sd-panel>'))
    input_string = 'A ge ne or others'
    prediction = torch.Tensor([[
        #A g e n e o r o t h e r s
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99]
    ]])
    # args: text_examples, prediction, output_semantics
    b = Binarized([input_string], prediction, Catalogue.from_list(['intervention']))
    token_list = tokenize(input_string)
    b.binarize_with_token([token_list])
    b.fuse_adjascent()
    updatexml_(xml, b)
    resulting_xml_string = tostring(xml)
    print(resulting_xml_string)
    self.assertEqual(expected_xml_string, resulting_xml_string)
def test_serializer_3(self):
    '''Testing tagging with staggered features and xml escaping.'''
    input_string = 'A gene or oth>rs'
    prediction = torch.Tensor([[
        #A g e n e   o r   o t h > r s
        [0, 0, 0.99, 0.99, 0.99, 0.99, 0, 0.99, 0.99, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0.99, 0.99, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99]
    ]])
    # args: text_examples, prediction, output_semantics
    b = Binarized([input_string], prediction,
                  Catalogue.from_list(['geneprod', 'assayed', 'intervention', 'protein']))
    token_list = tokenize(input_string)
    b.binarize_with_token([token_list])
    b.fuse_adjascent()
    expected_xml_string = '<smtag>A <sd-tag type="geneprod" type_score="99">gene</sd-tag> <sd-tag type="geneprod" role="assayed" type_score="99" role_score="99">or</sd-tag> <sd-tag role="intervention" type="protein" role_score="99" type_score="99">oth>rs</sd-tag></smtag>'
    # serialization must be stable: repeat it and check that the output is identical every time
    for _ in range(100):
        serializer = Serializer(tag="sd-tag", format="xml")
        predicted_xml_string = serializer.serialize(b)[0]
        self.assertEqual(predicted_xml_string, expected_xml_string)
def test_fuse_adjascent_2(self):
    '''Testing the fusion of two terms at the end of the string.'''
    input_string = 'A ge n'
    prediction = torch.Tensor([[
        #A g e   n
        [0, 0, 0.99, 0.99, 0.6, 0.99]
    ]])
    expected_start = torch.Tensor([[[0., 0., 1., 0., 0., 0.]]])
    expected_stop = torch.Tensor([[[0., 0., 0., 0., 0., 1.]]])
    expected_marks = torch.Tensor([[[0., 0., 1., 1., 1., 1.]]])
    b = Binarized([input_string], prediction, [Catalogue.GENEPROD])
    token_list = tokenize(input_string)
    b.binarize_with_token([token_list])
    b.fuse_adjascent()
    # diagnostic dump of start/stop/marks/score
    print("\n fuse at the end")
    print("".join([str(int(x)) for x in list(b.start.view(b.start.numel()))]))
    print("".join([str(int(x)) for x in list(b.stop.view(b.stop.numel()))]))
    print("".join([str(int(x)) for x in list(b.marks.view(b.marks.numel()))]))
    print(",".join([str(int(x)) for x in list(b.score.view(b.marks.numel()))]))
    self.assertTensorEqual(expected_start, b.start)
    self.assertTensorEqual(expected_stop, b.stop)
    self.assertTensorEqual(expected_marks, b.marks)
def test_binarize(self):
    '''Testing the binarization without fusion.'''
    input_string = 'A ge ne or others'
    prediction = torch.Tensor([[
        #A g e   n e   o r   o t h e e r s
        [0, 0, 0.99, 0.99, 0, 0.99, 0.99, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    ]])
    expected_start = torch.Tensor([[[0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]])
    expected_stop = torch.Tensor([[[0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]])
    expected_marks = torch.Tensor([[[0., 0., 1., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]])
    b = Binarized([input_string], prediction, [Catalogue.GENEPROD])
    token_list = tokenize(input_string)
    b.binarize_with_token([token_list])
    # diagnostic dump of start/stop/marks/score
    print("\n")
    print("".join([str(int(x)) for x in list(b.start.view(b.start.numel()))]))
    print("".join([str(int(x)) for x in list(b.stop.view(b.stop.numel()))]))
    print("".join([str(int(x)) for x in list(b.marks.view(b.marks.numel()))]))
    print(",".join([str(int(x)) for x in list(b.score.view(b.marks.numel()))]))
    self.assertTensorEqual(expected_start, b.start)
    self.assertTensorEqual(expected_stop, b.stop)
    self.assertTensorEqual(expected_marks, b.marks)
def test_serializer_4(self):
    '''
    Testing tagging of the ambiguous prediction "others", scored as both
    intervention and assayed (with a lower score for assayed).
    '''
    input_string = 'A ge ne or others'
    prediction = torch.Tensor([[
        #A g e   n e   o r   o t h e r s
        [0, 0, 0.99, 0.99, 0.6, 0.99, 0.99, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.98, 0.98, 0.98, 0.98, 0.98, 0.98]
    ]])
    # args: text_examples, prediction, output_semantics
    b = Binarized([input_string], prediction,
                  Catalogue.from_list(['geneprod', 'small_molecule', 'intervention', 'assayed']))
    token_list = tokenize(input_string)
    b.binarize_with_token([token_list])
    b.fuse_adjascent()
    serializer = Serializer(tag="sd-tag", format="xml")
    predicted_xml_string = serializer.serialize(b)[0]
    expected_xml_string = '<smtag>A <sd-tag type="geneprod" type_score="99">ge ne</sd-tag> or <sd-tag type="small_molecule" role="intervention" type_score="99" role_score="99">others</sd-tag></smtag>'
    self.assertEqual(predicted_xml_string, expected_xml_string)
def test_fuse_adjascent_1(self):
    '''Testing the fusion between two similarly labeled terms separated by a tab.'''
    input_string = 'A\tge\tne\tor\tothers'
    prediction = torch.Tensor([[
        #A g e   n e   o r   o t h e r s
        [0, 0, 0.99, 0.99, 0.6, 0.99, 0.99, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    ]])
    expected_start = torch.Tensor([[[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]])
    expected_stop = torch.Tensor([[[0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]])
    expected_marks = torch.Tensor([[[0., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]])
    b = Binarized([input_string], prediction, [Catalogue.GENEPROD])
    token_list = tokenize(input_string)
    b.binarize_with_token([token_list])
    b.fuse_adjascent()
    # diagnostic dump of start/stop/marks/score
    print("\nFuse with spacer")
    print("".join([str(int(x)) for x in list(b.start.view(b.start.numel()))]))
    print("".join([str(int(x)) for x in list(b.stop.view(b.stop.numel()))]))
    print("".join([str(int(x)) for x in list(b.marks.view(b.marks.numel()))]))
    print(",".join([str(int(x)) for x in list(b.score.view(b.marks.numel()))]))
    self.assertTensorEqual(expected_start, b.start)
    self.assertTensorEqual(expected_stop, b.stop)
    self.assertTensorEqual(expected_marks, b.marks)
def test_serializer_2(self):
    '''
    Testing tagging of multiple tokens ("ge ne" as type="geneprod") and of
    multiple attributes for one term ("others" as role="intervention" type="protein").
    '''
    input_string = 'A ge ne or others'
    prediction = torch.Tensor([[
        #A g e   n e   o r   o t h e r s
        [0, 0, 0.99, 0.99, 0.6, 0.99, 0.99, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.99, 0.99, 0.99, 0.99, 0.99, 0.99]
    ]])
    # args: text_examples, prediction, output_semantics
    b = Binarized([input_string], prediction,
                  Catalogue.from_list(['geneprod', 'small_molecule', 'intervention', 'protein']))
    token_list = tokenize(input_string)
    b.binarize_with_token([token_list])
    b.fuse_adjascent()
    serializer = Serializer(tag="sd-tag", format="xml")
    predicted_xml_string = serializer.serialize(b)[0]
    expected_xml_string = '<smtag>A <sd-tag type="geneprod" type_score="99">ge ne</sd-tag> or <sd-tag role="intervention" type="protein" role_score="99" type_score="99">others</sd-tag></smtag>'
    self.assertEqual(predicted_xml_string, expected_xml_string)
def test_speed(self):
    # crude speed check: tokenize the same example 10,000 times
    for _ in range(10000):
        tokenize(self.examples[0][0])
def test_tokenize(self):
    # each example is a (text, expected_token_terms) pair
    for example in self.examples:
        token_list = tokenize(example[0])['token_list']
        token_terms = [t.text for t in token_list]
        self.assertEqual(example[1], token_terms)