def testGetPhraseStemsMultiPhrase(self):
    # An annotation spanning several tokens is reported as ONE mask stem.
    # (The old behaviour repeated the mask per row, i.e. ['mask'] * 5.)
    sentence = Sentence(TEST_TOKENS)
    annotation = sentence.addAnnotation('mask', 3, 8)
    phrase_stems = list(annotation.getPhraseStems())
    self.assertListEqual(phrase_stems, ['mask'])
def testAddAnnotationsAreUnique(self):
    # Registering the identical annotation twice must store it only once.
    sentence = Sentence(TEST_TOKENS)
    for _ in range(2):
        sentence.addAnnotation('type1', 2, 4)

    self.assertEqual(len(sentence.annotations), 1)
    stored = sum(len(group) for group in sentence.annotations.values())
    self.assertEqual(stored, 1)
def testGetAnnotations(self):
    # getAnnotations(type) returns the set of annotations added under type.
    sentence = Sentence(TEST_TOKENS)
    only_type1 = sentence.addAnnotation('type1', 2, 4)
    first_type2 = sentence.addAnnotation('type2', 6)
    second_type2 = sentence.addAnnotation('type2', 8, 9)

    type1_found = sentence.getAnnotations('type1')
    self.assertEqual(type1_found, {only_type1})
    type2_found = sentence.getAnnotations('type2')
    self.assertEqual(type2_found, {first_type2, second_type2})
def testGetPhraseNumber(self):
    # Check the phrase number reported by annotations at three positions.
    sentence = Sentence(TEST_TOKENS)
    at_eight = sentence.addAnnotation('t', 8)
    at_start = sentence.addAnnotation('t', 0, 2)
    at_four = sentence.addAnnotation('t', 4)

    expectations = ((at_eight, 0), (at_start, 1), (at_four, 2))
    for annotation, number in expectations:
        self.assertEqual(annotation.getPhraseNumber_(), number)
def testGetWords(self):
    # words() yields the plain words even where annotations were added.
    sentence = Sentence(TEST_TOKENS)
    for span in ((2, 4), (6,), (8, 9)):
        sentence.addAnnotation('mask', *span)

    all_words = list(sentence.words())
    self.assertListEqual(
        all_words, ['word%d' % idx for idx in range(len(sentence))])

    tail_words = list(sentence.words(7))
    self.assertListEqual(tail_words, ['word7', 'word8', 'word9'])
def testGetPhraseNumbers(self):
    # phraseNumbers() over the whole sentence and over the (1, 6) range.
    sentence = Sentence(TEST_TOKENS)

    whole = list(sentence.phraseNumbers())
    self.assertListEqual(whole, [1, 2, 3, 4])

    partial = list(sentence.phraseNumbers(1, 6))
    self.assertListEqual(partial, [1, 2])
def testIsInsidePhrase(self):
    # All three annotations below are expected to report isInsidePhrase().
    sentence = Sentence(TEST_TOKENS)
    annotations = [
        sentence.addAnnotation('true', 3),
        sentence.addAnnotation('true', 0, 2),
        sentence.addAnnotation('false', 8),
    ]
    for annotation in annotations:
        self.assertTrue(annotation.isInsidePhrase())
def testGetPhraseTags(self):
    # phraseTags() over the whole sentence and over the (1, 6) range.
    sentence = Sentence(TEST_TOKENS)

    whole = list(sentence.phraseTags())
    self.assertListEqual(whole, ['NP', 'NP', 'NP', 'NP'])

    partial = list(sentence.phraseTags(1, 6))
    self.assertListEqual(partial, ['NP', 'NP'])
def testGetPhraseTag(self):
    # Check the phrase tag reported by annotations at three positions.
    sentence = Sentence(TEST_TOKENS)
    at_two = sentence.addAnnotation('t', 2)
    at_start = sentence.addAnnotation('t', 0, 2)
    at_four = sentence.addAnnotation('t', 4)

    expectations = ((at_two, 'O'), (at_start, 'NP'), (at_four, 'NP'))
    for annotation, tag in expectations:
        self.assertEqual(annotation.getPhraseTag_(), tag)
def testPhraseDistanceIfOverlapping(self):
    # Overlapping annotations are expected to be at phrase distance 0.
    sentence = Sentence(TEST_TOKENS)
    wide_left = sentence.addAnnotation('A', 0, 2)
    inner_left = sentence.addAnnotation('B', 1)
    inner_right = sentence.addAnnotation('C', 7)
    wide_right = sentence.addAnnotation('D', 6, 8)

    for source, target in ((wide_left, inner_left), (inner_right, wide_right)):
        self.assertEqual(source.phraseDistanceTo(target), 0)
def testVerbPhraseBetweenExactOverlap(self):
    # Two annotations on the same token: verbPhraseBetween must equal None,
    # even though tokens 0-1 were re-chunked as a verb phrase.
    overrides = (
        (0, dict(chunk="B-VP")),
        (1, dict(chunk="I-VP", stem="sentinel")),
    )
    tokens = list(TEST_TOKENS)
    for index, changes in overrides:
        tokens[index] = tokens[index].replace(**changes)

    sentence = Sentence(tokens)
    this = sentence.addAnnotation('this', 3)
    other = sentence.addAnnotation('other', 3)
    result = this.verbPhraseBetween(other)
    self.assertEqual(result, None)
def testVerbPhraseBetween(self):
    # Tokens 3-4 are re-chunked as a verb phrase with sentinel stems; the
    # annotations at 0 and 6 must report exactly those stems between them.
    overrides = (
        (3, dict(chunk="B-VP", stem="sentinel1")),
        (4, dict(chunk="I-VP", stem="sentinel2")),
    )
    tokens = list(TEST_TOKENS)
    for index, changes in overrides:
        tokens[index] = tokens[index].replace(**changes)

    sentence = Sentence(tokens)
    this = sentence.addAnnotation('this', 0)
    other = sentence.addAnnotation('other', 6)
    between = list(this.verbPhraseBetween(other))
    self.assertListEqual(between, ['sentinel1', 'sentinel2'])
def testGetPrepositionedNounPhrase(self):
    # Tokens 3-4 are re-chunked as a prepositional phrase before checking
    # the prepositioned noun phrase of the annotation at token 6.
    overrides = ((3, "B-PP"), (4, "I-PP"))
    tokens = list(TEST_TOKENS)
    for index, chunk_tag in overrides:
        tokens[index] = tokens[index].replace(chunk=chunk_tag)

    sentence = Sentence(tokens)
    sentence.addAnnotation('sentinel', 0)
    annotation = sentence.addAnnotation('type', 6)
    noun_phrase = list(annotation.getPrepositionedNounPhrase_())
    self.assertListEqual(noun_phrase, ['sentinel', 'stem1'])
def testTokenDistanceIfOverlapping(self):
    # Annotation C overlaps A, B, D and E; each distance must be -1.
    spans = {
        'A': (2, 4),
        'B': (2, 3),
        'C': (1, 3),
        'D': (0, 2),
        'E': (1, 2),
    }
    sentence = Sentence(TEST_TOKENS)
    added = {
        label: sentence.addAnnotation(label, *span)
        for label, span in spans.items()
    }

    center = added['C']
    for label in ('A', 'B', 'D', 'E'):
        self.assertEqual(center.tokenDistanceTo(added[label]), -1)
def testGetMaskedStems(self):
    # maskedStems(3, 7) shows annotation types in place of annotated stems.
    sentence = Sentence(TEST_TOKENS)
    for args in (('type1', 2, 4), ('type2', 6), ('type2', 8, 9)):
        sentence.addAnnotation(*args)

    masked = list(sentence.maskedStems(3, 7))
    self.assertListEqual(masked, ['type1', 'stem4', 'stem5', 'type2'])
def testPosTagsTo(self):
    # posTagsBetween() for separated, identical, adjacent and far-apart
    # annotation pairs.
    sentence = Sentence(TEST_TOKENS)
    at_five = sentence.addAnnotation('true', 5)
    head_pair = sentence.addAnnotation('true', 0, 2)
    mid_pair = sentence.addAnnotation('true', 3, 5)
    first = sentence.addAnnotation('true', 0)
    last = sentence.addAnnotation('true', 9)

    tags = list(at_five.posTagsBetween(head_pair))
    self.assertEqual(tags, ['pos2', 'pos3', 'pos4'])

    tags = list(at_five.posTagsBetween(at_five))
    self.assertEqual(tags, [])

    tags = list(head_pair.posTagsBetween(mid_pair))
    self.assertEqual(tags, ['pos2'])

    tags = list(first.posTagsBetween(last))
    self.assertEqual(tags, ['pos%d' % idx for idx in range(1, 9)])
def testPhraseTagsTo(self):
    # phraseTagsBetween() for separated, identical, adjacent and far-apart
    # annotation pairs.
    sentence = Sentence(TEST_TOKENS)
    at_five = sentence.addAnnotation('true', 5)
    head_pair = sentence.addAnnotation('true', 0, 2)
    mid_pair = sentence.addAnnotation('true', 3, 5)
    first = sentence.addAnnotation('true', 0)
    last = sentence.addAnnotation('true', 9)

    tags = list(at_five.phraseTagsBetween(head_pair))
    self.assertEqual(tags, ['NP'])

    tags = list(at_five.phraseTagsBetween(at_five))
    self.assertEqual(tags, [])

    tags = list(head_pair.phraseTagsBetween(mid_pair))
    self.assertEqual(tags, [])

    tags = list(first.phraseTagsBetween(last))
    self.assertEqual(tags, ['NP', 'NP', 'NP'])
def testGetMaskedWords(self):
    # maskedWords() shows the annotation type in place of annotated words.
    sentence = Sentence(TEST_TOKENS)
    # The two-token span below should only fetch one masked token ("type1").
    sentence.addAnnotation('type1', 2, 4)
    sentence.addAnnotation('type2', 6)
    sentence.addAnnotation('type2', 8, 9)
    self.maxDiff = None  # show complete list diffs on failure

    everything = list(sentence.maskedWords())
    self.assertListEqual(everything, [
        'word0', 'word1', 'type1', 'word4', 'word5',
        'type2', 'word7', 'type2', 'word9',
    ])

    tail = list(sentence.maskedWords(7))
    self.assertListEqual(tail, ['word7', 'type2', 'word9'])
def testGetWords(self):
    # Adding 'mask' annotations must leave the output of words() untouched.
    sentence = Sentence(TEST_TOKENS)
    sentence.addAnnotation('mask', 2, 4)
    sentence.addAnnotation('mask', 6)
    sentence.addAnnotation('mask', 8, 9)

    observed = list(sentence.words())
    expected = ['word%d' % position for position in range(len(sentence))]
    self.assertListEqual(observed, expected)

    observed = list(sentence.words(7))
    expected = ['word7', 'word8', 'word9']
    self.assertListEqual(observed, expected)
def testGetMaskedStems(self):
    # Within the (3, 7) range, annotated stems are replaced by their type.
    sentence = Sentence(TEST_TOKENS)
    sentence.addAnnotation('type1', 2, 4)
    sentence.addAnnotation('type2', 6)
    sentence.addAnnotation('type2', 8, 9)

    observed = list(sentence.maskedStems(3, 7))
    expected = ['type1', 'stem4', 'stem5', 'type2']
    self.assertListEqual(observed, expected)
def testGetMaskedWords(self):
    # Annotated words are replaced by their annotation type in maskedWords().
    annotations = (
        ('type1', 2, 4),  # should only fetch one masked token ("type1")
        ('type2', 6),
        ('type2', 8, 9),
    )
    sentence = Sentence(TEST_TOKENS)
    for args in annotations:
        sentence.addAnnotation(*args)
    self.maxDiff = None  # never truncate the diff of the long list below

    observed = list(sentence.maskedWords())
    expected = [
        'word0', 'word1', 'type1', 'word4', 'word5',
        'type2', 'word7', 'type2', 'word9',
    ]
    self.assertListEqual(observed, expected)

    observed = list(sentence.maskedWords(7))
    expected = ['word7', 'type2', 'word9']
    self.assertListEqual(observed, expected)
def asDict(sentence: Sentence, ngrams=2):
    """Convert a :class:`fnl.text.sentence.Sentence` into a feature dictionary.

    The returned dictionary contains:

    * ``'gene-count'``: the value of ``sentence.countEntity('B-gene')``;
    * one ``'<stem>/<pos>'`` key per distinct pairing of a masked stem with
      its PoS tag, mapped to the number of times that pairing occurs;
    * ``'has-all-entities'`` (value 1) if both ``"TARGET/NN"`` and
      ``"FACTOR/NN"`` are among those keys;
    * for every order from 2 up to `ngrams`, one key per distinct n-gram of
      consecutive masked stems (joined by single spaces), mapped to the
      number of times that n-gram occurs.

    :param sentence: the sentence to extract the features from
    :param ngrams: the highest n-gram order to generate; with a value below
                   2 no n-gram features are added
    :return: a ``dict`` mapping feature names to counts
    """
    d = {'gene-count': sentence.countEntity('B-gene')}
    stems = list(sentence.maskedStems())
    pos = sentence.posTags()
    d.update(Counter('{}/{}'.format(s, t) for s, t in zip(stems, pos)))

    if "TARGET/NN" in d and "FACTOR/NN" in d:
        d['has-all-entities'] = 1

    # `gram` holds the n-grams of the previous order (starting with the
    # unigrams); prefixing entry i+1 with stem i yields the next order.
    # Stop early once no longer n-gram can be formed.
    gram = stems
    while ngrams > 1 and len(gram) > 1:
        # Was `ngrams =- 1`, which assigned -1 instead of decrementing and
        # therefore never produced anything beyond bigrams.
        ngrams -= 1
        gram = ['{} {}'.format(s, g) for s, g in zip(stems, gram[1:])]
        d.update(Counter(gram))

    return d
def asDict(sentence: Sentence, ngrams=2):
    """Convert a :class:`fnl.text.sentence.Sentence` into a feature dictionary.

    The returned dictionary contains:

    * ``'gene-count'``: the value of ``sentence.countEntity('B-gene')``;
    * one ``'<stem>/<pos>'`` key per distinct pairing of a masked stem with
      its PoS tag, mapped to the number of times that pairing occurs;
    * ``'has-all-entities'`` (value 1) if both ``"TARGET/NN"`` and
      ``"FACTOR/NN"`` are among those keys;
    * for every order from 2 up to `ngrams`, one key per distinct n-gram of
      consecutive masked stems (joined by single spaces), mapped to the
      number of times that n-gram occurs.

    :param sentence: the sentence to extract the features from
    :param ngrams: the highest n-gram order to generate; with a value below
                   2 no n-gram features are added
    :return: a ``dict`` mapping feature names to counts
    """
    d = {'gene-count': sentence.countEntity('B-gene')}
    stems = list(sentence.maskedStems())
    pos = sentence.posTags()
    d.update(Counter('{}/{}'.format(s, t) for s, t in zip(stems, pos)))

    if "TARGET/NN" in d and "FACTOR/NN" in d:
        d['has-all-entities'] = 1

    # `gram` holds the n-grams of the previous order (starting with the
    # unigrams); prefixing entry i+1 with stem i yields the next order.
    # Stop early once no longer n-gram can be formed.
    gram = stems
    while ngrams > 1 and len(gram) > 1:
        # Was `ngrams = -1`, which reset the counter instead of decrementing
        # it and therefore never produced anything beyond bigrams.
        ngrams -= 1
        gram = ['{} {}'.format(s, g) for s, g in zip(stems, gram[1:])]
        d.update(Counter(gram))

    return d
def testAddAnnotation(self):
    # Three annotations of two types: annotations maps each type to a set.
    sentence = Sentence(TEST_TOKENS)
    for args in (('type1', 2, 4), ('type2', 6), ('type2', 8, 9)):
        sentence.addAnnotation(*args)

    self.assertEqual(len(sentence.annotations), 2)

    type_names = set(sentence.annotations.keys())
    self.assertEqual(type_names, {'type1', 'type2'})

    only_sets = all(
        isinstance(group, set) for group in sentence.annotations.values())
    self.assertTrue(only_sets)

    total = sum(len(group) for group in sentence.annotations.values())
    self.assertEqual(total, 3)
def testTokenDistance(self):
    # Token distances measured from annotation C to A, B, D and E.
    spans = {
        'A': (0, 2),
        'B': (3,),
        'C': (6, 8),
        'D': (8,),
        'E': (4, 6),
    }
    sentence = Sentence(TEST_TOKENS)
    added = {
        label: sentence.addAnnotation(label, *span)
        for label, span in spans.items()
    }

    center = added['C']
    expected = (('A', 4), ('B', 2), ('D', 0), ('E', 0))
    for label, distance in expected:
        self.assertEqual(center.tokenDistanceTo(added[label]), distance)
def testPhraseDistance(self):
    # Phrase distances measured from annotation C to every annotation,
    # itself included (expected -1).
    spans = {
        'A': (0, 2),
        'B': (3,),
        'C': (6, 8),
        'D': (9,),
        'E': (3, 5),
    }
    sentence = Sentence(TEST_TOKENS)
    added = {
        label: sentence.addAnnotation(label, *span)
        for label, span in spans.items()
    }

    center = added['C']
    expected = (('A', 1), ('B', 1), ('C', -1), ('D', 0), ('E', 0))
    for label, distance in expected:
        target = added[label]
        self.assertEqual(
            center.phraseDistanceTo(target), distance, msg=repr(target))
def testAddAnnotation(self):
    # Annotations are grouped by type, each group being a set.
    sentence = Sentence(TEST_TOKENS)
    sentence.addAnnotation('type1', 2, 4)
    sentence.addAnnotation('type2', 6)
    sentence.addAnnotation('type2', 8, 9)

    type_count = len(sentence.annotations)
    self.assertEqual(type_count, 2)

    self.assertEqual(set(sentence.annotations.keys()), {'type1', 'type2'})

    group_is_set = [
        isinstance(group, set) for group in sentence.annotations.values()
    ]
    self.assertTrue(all(group_is_set))

    group_sizes = [len(group) for group in sentence.annotations.values()]
    self.assertEqual(sum(group_sizes), 3)
def testGetStems(self):
    # stems(3, 7) yields the plain stems even where annotations were added.
    sentence = Sentence(TEST_TOKENS)
    for span in ((2, 4), (6,)):
        sentence.addAnnotation('mask', *span)

    observed = list(sentence.stems(3, 7))
    self.assertListEqual(observed, ['stem3', 'stem4', 'stem5', 'stem6'])
def testGetPoSTags(self):
    # posTags(3, 7) yields the plain tags even where annotations were added.
    sentence = Sentence(TEST_TOKENS)
    for span in ((2, 4), (6,)):
        sentence.addAnnotation('mask', *span)

    observed = list(sentence.posTags(3, 7))
    self.assertListEqual(observed, ['pos3', 'pos4', 'pos5', 'pos6'])
def testComparator(self):
    # Compare a pivot annotation (2, 5) against annotations with other
    # offsets using >, < and ==.
    sentence = Sentence(TEST_TOKENS)
    pivot = Annotation(sentence, 2, 5)

    # Annotations starting at 0 sort before the pivot.
    for end in range(1, 7):
        earlier = Annotation(sentence, 0, end)
        self.assertTrue(pivot > earlier, end)

    # Annotations starting at 3 or later sort after the pivot.
    for start in range(3, 7):
        later = Annotation(sentence, start, 7)
        self.assertTrue(pivot < later, start)

    enclosing = Annotation(sentence, 1, 6)
    self.assertTrue(pivot > enclosing)
    enclosed = Annotation(sentence, 3, 4)
    self.assertTrue(pivot < enclosed)
    same_span = Annotation(sentence, 2, 5)
    self.assertTrue(pivot == same_span)
def testGetPhraseNumbers(self):
    # Phrase numbers for the full sentence, then for the (1, 6) range.
    sentence = Sentence(TEST_TOKENS)

    observed = list(sentence.phraseNumbers())
    expected = [1, 2, 3, 4]
    self.assertListEqual(observed, expected)

    observed = list(sentence.phraseNumbers(1, 6))
    expected = [1, 2]
    self.assertListEqual(observed, expected)
def testGetPhraseNumber(self):
    # phraseNumber(i) is checked for every one of the ten token positions.
    sentence = Sentence(TEST_TOKENS)
    expected_numbers = (1, 1, 0, 2, 2, 0, 3, 3, 0, 4)
    for position, expected in enumerate(expected_numbers):
        self.assertEqual(sentence.phraseNumber(position), expected)
def testAddAnnotationsAreUnique(self):
    # A duplicate addAnnotation call must not create a second entry.
    sentence = Sentence(TEST_TOKENS)
    sentence.addAnnotation('type1', 2, 4)
    sentence.addAnnotation('type1', 2, 4)

    type_count = len(sentence.annotations)
    self.assertEqual(type_count, 1)

    group_sizes = [len(group) for group in sentence.annotations.values()]
    self.assertEqual(sum(group_sizes), 1)
def testEquals(self):
    # A sentence equals itself, but not a second Sentence built from it.
    original = Sentence(TEST_TOKENS)
    duplicate = Sentence(original)
    self.assertEqual(original, original)
    self.assertNotEqual(original, duplicate)
def testTokenDistanceIfEqual(self):
    # Two annotations with identical offsets are at token distance -2.
    sentence = Sentence(TEST_TOKENS)
    first = sentence.addAnnotation('A', 0, 2)
    second = sentence.addAnnotation('B', 0, 2)
    distance = first.tokenDistanceTo(second)
    self.assertEqual(distance, -2)
def testIsNotInsidePhrase(self):
    # The annotation over (1, 3) must not count as being inside a phrase.
    sentence = Sentence(TEST_TOKENS)
    annotation = sentence.addAnnotation('false', 1, 3)
    inside = annotation.isInsidePhrase()
    self.assertFalse(inside)
def testPhraseDistanceIfBothInOverlappingPhrase(self):
    # Annotations over (0, 5) and (6, 10) are at phrase distance 0.
    sentence = Sentence(TEST_TOKENS)
    left = sentence.addAnnotation('A', 0, 5)
    right = sentence.addAnnotation('B', 6, 10)
    distance = left.phraseDistanceTo(right)
    self.assertEqual(distance, 0)
def testPhraseDistanceIfBothNotInPhrase(self):
    # Annotations on tokens 2 and 5 are at phrase distance 1.
    sentence = Sentence(TEST_TOKENS)
    left = sentence.addAnnotation('A', 2)
    right = sentence.addAnnotation('B', 5)
    distance = left.phraseDistanceTo(right)
    self.assertEqual(distance, 1)
def testTokenDistanceOnDifferentSentences(self):
    # Measuring the distance between annotations that belong to two
    # separate Sentence instances must raise a ValueError.
    first_sentence = Sentence(TEST_TOKENS)
    second_sentence = Sentence(TEST_TOKENS)
    here = first_sentence.addAnnotation('type', 0, 2)
    elsewhere = second_sentence.addAnnotation('type', 6, 8)
    with self.assertRaises(ValueError):
        here.tokenDistanceTo(elsewhere)
def testGetPhraseOffset(self):
    # phraseOffsetFor(number) is checked for each of the four phrases.
    sentence = Sentence(TEST_TOKENS)
    offsets_by_number = {
        1: (0, 2),
        2: (3, 5),
        3: (6, 8),
        4: (9, 10),
    }
    for phrase_number, expected in offsets_by_number.items():
        self.assertEqual(sentence.phraseOffsetFor(phrase_number), expected)