def testManyIdenticalCountPrefixes(self):
    """
    A chain of progressively longer substrings that all carry the same
    counts as their one-character-shorter prefix must collapse to the
    shortest member: only it is returned and the rest are counted as
    inferior.
    """
    inputLines = [
        'd 10 40',
        'abc 21 21',
        'abcd 21 21',
        'abcde 21 21',
        'abcdef 21 21',
        'abcdefg 21 21',
        'abcdefgh 21 21',
    ]
    # Five of the seven inputs merely extend 'abc' without changing counts.
    expected = {
        'fractionTooLow': 0,
        'inferior': 5,
        'inputCount': 7,
        'notEnoughTruePositives': 0,
        'substrings': [
            ('abc', (21, 21, 0.5)),
            ('d', (10, 40, 0.2)),
        ],
    }
    self.assertEqual(expected, selectSubstringsForAhoCorasick(inputLines))
def testMaxSubstringsNonZero(self):
    """
    When a non-zero maxSubstrings is given, no more than that many
    substrings may be returned.
    """
    inputLines = [
        'abc 20 20',
        'hijkl 20 60',
        'defg 20 60',
        'worst 1 9',
        'best 10 0',
        'stuvwx 20 60',
        'mnopqr 20 60',
        'm12345 20 60',
    ]
    # All eight inputs are counted, but only four substrings are returned.
    expected = {
        'fractionTooLow': 0,
        'inferior': 0,
        'inputCount': 8,
        'notEnoughTruePositives': 0,
        'substrings': [
            ('best', (10, 0, 1.0)),
            ('abc', (20, 20, 0.5)),
            ('defg', (20, 60, 0.25)),
            ('hijkl', (20, 60, 0.25)),
        ],
    }
    observed = selectSubstringsForAhoCorasick(inputLines, maxSubstrings=4)
    self.assertEqual(expected, observed)
def testEmpty(self):
    """
    With no input at all, every counter must be zero and the list of
    selected substrings must be empty.
    """
    expected = {
        'fractionTooLow': 0,
        'inferior': 0,
        'inputCount': 0,
        'notEnoughTruePositives': 0,
        'substrings': [],
    }
    observed = selectSubstringsForAhoCorasick([])
    self.assertEqual(expected, observed)
def testIdenticalFractionSubstringTwoShorter(self):
    """
    A substring whose true positive fraction equals that of a
    subsubstring two characters shorter must be left out of the results
    and counted as inferior.
    """
    inputLines = [
        'abc 21 21',
        'abcde 21 21',
    ]
    expected = {
        'fractionTooLow': 0,
        'inferior': 1,
        'inputCount': 2,
        'notEnoughTruePositives': 0,
        'substrings': [
            ('abc', (21, 21, 0.5)),
        ],
    }
    self.assertEqual(expected, selectSubstringsForAhoCorasick(inputLines))
def testAllowAll(self):
    """
    Without any limit on the true positive count or fraction, every
    input substring must be returned.
    """
    inputLines = [
        'abc 21 21',
        'def 20 80',
    ]
    expected = {
        'fractionTooLow': 0,
        'inferior': 0,
        'inputCount': 2,
        'notEnoughTruePositives': 0,
        'substrings': [
            ('abc', (21, 21, 0.5)),
            ('def', (20, 80, 0.2)),
        ],
    }
    self.assertEqual(expected, selectSubstringsForAhoCorasick(inputLines))
def testNonIdenticalFractionSubstringOneShorter(self):
    """
    A substring that has a better true positive fraction than its
    one-character-shorter subsubstring must be kept in the results
    alongside the shorter one.
    """
    inputLines = [
        'abc 20 60',
        'abcd 21 21',
    ]
    # The longer 'abcd' has the higher fraction and so is listed first.
    expected = {
        'fractionTooLow': 0,
        'inferior': 0,
        'inputCount': 2,
        'notEnoughTruePositives': 0,
        'substrings': [
            ('abcd', (21, 21, 0.5)),
            ('abc', (20, 60, 0.25)),
        ],
    }
    self.assertEqual(expected, selectSubstringsForAhoCorasick(inputLines))
def testTruePositiveFraction(self):
    """
    With a minimum true positive fraction in force, substrings below it
    must be dropped and tallied under 'fractionTooLow'.
    """
    inputLines = [
        'abc 21 21',
        'def 20 80',
        'ghi 10 80',
    ]
    # Only 'abc' (fraction 0.5) meets the 0.3 threshold.
    expected = {
        'fractionTooLow': 2,
        'inferior': 0,
        'inputCount': 3,
        'notEnoughTruePositives': 0,
        'substrings': [
            ('abc', (21, 21, 0.5)),
        ],
    }
    observed = selectSubstringsForAhoCorasick(
        inputLines, minTruePositiveFraction=0.3)
    self.assertEqual(expected, observed)
def testMaxSubstringsZero(self):
    """
    A maxSubstrings value of zero must be honoured: no substrings are
    returned even though all inputs are counted.
    """
    inputLines = [
        'abc 20 20',
        'hijkl 20 60',
        'defg 20 60',
        'worst 1 9',
        'best 10 0',
        'stuvwx 20 60',
        'mnopqr 20 60',
        'm12345 20 60',
    ]
    expected = {
        'fractionTooLow': 0,
        'inferior': 0,
        'inputCount': 8,
        'notEnoughTruePositives': 0,
        'substrings': [],
    }
    observed = selectSubstringsForAhoCorasick(inputLines, maxSubstrings=0)
    self.assertEqual(expected, observed)
def testTruePositiveCountAndFraction(self):
    """
    With both a minimum true positive count and a minimum true positive
    fraction in force, each rejected substring must be tallied under the
    matching counter and only the survivors returned.
    """
    inputLines = [
        'abc 21 21',
        'def 20 80',
        'ghi 10 80',
        'jkl 30 10',
    ]
    # Expected: two rejected on count, one on fraction, 'jkl' survives.
    expected = {
        'fractionTooLow': 1,
        'inferior': 0,
        'inputCount': 4,
        'notEnoughTruePositives': 2,
        'substrings': [
            ('jkl', (30, 10, 0.75)),
        ],
    }
    observed = selectSubstringsForAhoCorasick(
        inputLines, minTruePositives=21, minTruePositiveFraction=0.7)
    self.assertEqual(expected, observed)
def testSort(self):
    """
    The returned substrings must be ordered by true positive fraction
    (highest first), then by length (shortest first), then
    alphabetically.
    """
    inputLines = [
        'abc 20 20',
        'hijkl 20 60',
        'defg 20 60',
        'worst 1 9',
        'best 10 0',
        'stuvwx 20 60',
        'mnopqr 20 60',
        'm12345 20 60',
    ]
    # The five entries at fraction 0.25 exercise the length and
    # alphabetical tie-breakers.
    expected = {
        'fractionTooLow': 0,
        'inferior': 0,
        'inputCount': 8,
        'notEnoughTruePositives': 0,
        'substrings': [
            ('best', (10, 0, 1.0)),
            ('abc', (20, 20, 0.5)),
            ('defg', (20, 60, 0.25)),
            ('hijkl', (20, 60, 0.25)),
            ('m12345', (20, 60, 0.25)),
            ('mnopqr', (20, 60, 0.25)),
            ('stuvwx', (20, 60, 0.25)),
            ('worst', (1, 9, 0.1)),
        ],
    }
    self.assertEqual(expected, selectSubstringsForAhoCorasick(inputLines))
# Optional reporting switches; both stay off unless given on the command line.
parser.add_argument(
    '--printCounts', action='store_true', default=False,
    help=('If True, the true positive count, false positive count, and true '
          'positive count / (false positive count + true positive count) '
          'fraction will be printed after each substring.'))

parser.add_argument(
    '--printSummary', action='store_true', default=False,
    help=('If True, print a summary of substring processing to show how many '
          'substrings were considered and what their fates were.'))

args = parser.parse_args()

# Substring lines are read from standard input.
selection = selectSubstringsForAhoCorasick(
    sys.stdin, args.minTruePositives, args.minTruePositiveFraction,
    args.maxSubstrings)

chosen = selection['substrings']

if not args.printCounts:
    # Plain output: one substring per line.
    for substring, _ in chosen:
        print(substring)
else:
    # Verbose output: substring followed by its three count values.
    for substring, counts in chosen:
        print('%s %d %d %f' % (substring, counts[0], counts[1], counts[2]))

if args.printSummary:
    printSummary(selection)