def testNoApply(self):
    """A rule whose pattern does not match must not touch the timex.

    The rule's pattern expects the token 'February' while the body holds
    'January', so the first element of apply()'s result must be falsy and
    the timex value must be None afterwards.
    """
    rule = NormalisationRule(
        r'<(\d+)~.+><th~.+><February~.+><(\d{4})~.+>',
        'date',
        'testNoApply',
        r'{#2} + "01" + {#1}',
    )
    t = Timex(type='date')
    body = [
        ('06', 'POS', {t}),
        ('th', 'POS', {t}),
        ('January', 'POS', {t}),
        ('1996', 'POS', {t}),
    ]
    self.assertFalse(rule.apply(t, '', '', body, [], [])[0])
    # assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertIsNone(t.value)
def testApplyCorrectType(self):
    """A rule declared for type 'date' must not apply to a 'time' timex.

    The body tokens match the rule's pattern, so the only mismatch between
    the rule and the timex is the type.
    """
    pattern = r'<(\d+)~.+><th~.+><January~.+><(\d{4})~.+>'
    rule = NormalisationRule(pattern, 'date', 'testApplyCorrectType',
                             r'{#2} + "01" + {#1}')
    t = Timex(type='time')
    # Every body token is tagged with the timex under test.
    body = [(word, 'POS', {t}) for word in ('06', 'th', 'January', '1996')]
    outcome = rule.apply(t, '', '', body, [], [])
    self.assertFalse(outcome[0])
def testRaiseError(self):
    """Building a rule block with 'invalid' as third argument must fail.

    Two otherwise valid rules are supplied, so the RuleLoadError is expected
    to come from the 'invalid' argument rather than from the rules.
    """
    pattern = r'<(\d+)~.+><th~.+><January~.+><(\d{4})~.+>'
    first = NormalisationRule(pattern, 'date', 'testRaiseError1',
                              r'{#2} + "01" + {#1}')
    second = NormalisationRule(pattern, 'date', 'testRaiseError2',
                               r'{#2} + "02" + {#1}')
    rules = [first, second]
    self.assertRaises(RuleLoadError, NormalisationRuleBlock,
                      None, [], 'invalid', rules)
def testApplyInsensitive(self):
    """Rule matching must ignore case.

    The pattern spells 'january' in lower case while the body token is
    'January'; the rule must still apply and produce '19960106'.
    """
    rule = NormalisationRule(
        r'<(\d+)~.+><th~.+><january~.+><(\d{4})~.+>',
        'date',
        'testApplyInsensitive',
        r'{#2} + "01" + {#1}',
    )
    t = Timex(type='date')
    body = [
        ('06', 'POS', {t}),
        ('th', 'POS', {t}),
        ('January', 'POS', {t}),
        ('1996', 'POS', {t}),
    ]
    self.assertTrue(rule.apply(t, '', '', body, [], [])[0])
    # assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(t.value, '19960106')
def testApplyFreq(self):
    """A matching rule with a freq expression must set the timex's freq.

    The rule is built with freq=r'"1D"' and no value expression; after a
    successful apply the timex's freq attribute must equal '1D'.
    """
    rule = NormalisationRule(
        r'<(\d+)~.+><th~.+><January~.+><(\d{4})~.+>',
        'date',
        'testApplyFreq',
        freq=r'"1D"',
    )
    t = Timex(type='date')
    body = [
        ('06', 'POS', {t}),
        ('th', 'POS', {t}),
        ('January', 'POS', {t}),
        ('1996', 'POS', {t}),
    ]
    self.assertTrue(rule.apply(t, '', '', body, [], [])[0])
    # assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(t.freq, '1D')
def testApplyQuant(self):
    """A matching rule with a quant expression must set the timex's quant.

    The rule is built with quant=r'"EVERY"' and no value expression; after
    a successful apply the timex's quant attribute must equal 'EVERY'.
    """
    rule = NormalisationRule(
        r'<(\d+)~.+><th~.+><January~.+><(\d{4})~.+>',
        'date',
        'testApplyQuant',
        quant=r'"EVERY"',
    )
    t = Timex(type='date')
    body = [
        ('06', 'POS', {t}),
        ('th', 'POS', {t}),
        ('January', 'POS', {t}),
        ('1996', 'POS', {t}),
    ]
    self.assertTrue(rule.apply(t, '', '', body, [], [])[0])
    # assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(t.quant, 'EVERY')
def testApplyChangeType(self):
    """A matching rule with a change_type expression must retype the timex.

    The timex starts with type 'date'; the rule is built with
    change_type=r'"non-date"', so after a successful apply the timex's
    type must equal 'non-date'.
    """
    rule = NormalisationRule(
        r'<(\d+)~.+><th~.+><January~.+><(\d{4})~.+>',
        'date',
        'testApplyChangeType',
        change_type=r'"non-date"',
    )
    t = Timex(type='date')
    body = [
        ('06', 'POS', {t}),
        ('th', 'POS', {t}),
        ('January', 'POS', {t}),
        ('1996', 'POS', {t}),
    ]
    self.assertTrue(rule.apply(t, '', '', body, [], [])[0])
    # assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(t.type, 'non-date')
def testApplyAll(self):
    """An 'all' rule block must leave the second rule's value on the timex.

    Both rules share a pattern that matches the body. The asserted value,
    '19960206', is the one built by the second rule's expression (which
    inserts "02"), not the first rule's (which inserts "01").
    """
    pattern = r'<(\d+)~.+><th~.+><January~.+><(\d{4})~.+>'
    rules = [
        NormalisationRule(pattern, 'date', 'testApplyAll1',
                          r'{#2} + "01" + {#1}'),
        NormalisationRule(pattern, 'date', 'testApplyAll2',
                          r'{#2} + "02" + {#1}'),
    ]
    b = NormalisationRuleBlock(None, [], 'all', rules)
    t = Timex(type='date')
    body = [
        ('06', 'POS', {t}),
        ('th', 'POS', {t}),
        ('January', 'POS', {t}),
        ('1996', 'POS', {t}),
    ]
    self.assertTrue(b.apply(t, '', '', body, [], [])[0])
    # assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(t.value, '19960206')
def testApplyUntilSuccess2(self):
    """An 'until-success' block must fall through to a later matching rule.

    The first rule expects 'February' and so cannot match the body, which
    holds 'January'. The second rule matches, and its value '19960106'
    must be the one on the timex.
    """
    rules = [
        NormalisationRule(r'<(\d+)~.+><th~.+><February~.+><(\d{4})~.+>',
                          'date', 'testApplyUntilSuccess2A',
                          r'{#2} + "02" + {#1}'),
        NormalisationRule(r'<(\d+)~.+><th~.+><January~.+><(\d{4})~.+>',
                          'date', 'testApplyUntilSuccess2B',
                          r'{#2} + "01" + {#1}'),
    ]
    b = NormalisationRuleBlock(None, [], 'until-success', rules)
    t = Timex(type='date')
    body = [
        ('06', 'POS', {t}),
        ('th', 'POS', {t}),
        ('January', 'POS', {t}),
        ('1996', 'POS', {t}),
    ]
    self.assertTrue(b.apply(t, '', '', body, [], [])[0])
    # assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(t.value, '19960106')
def testNegAfterBlocks(self):
    """A '!'-prefixed after-guard must block the rule when its pattern occurs.

    The guard pattern names the tokens 'to' and 'Atlanta', which are exactly
    the tokens passed as the text after the body, so the rule must not apply.
    """
    rule = NormalisationRule(
        r'<(\d+)~.+><th~.+><January~.+><(\d{4})~.+>',
        'date',
        'testNegAfterBlocks',
        r'{#2} + "01" + {#1}',
        after_guards=[r'!<to~.+><Atlanta~.+>'],
    )
    t = Timex(type='date')

    # Each token gets its own empty set of attached timexes.
    before = [(word, 'POS', set())
              for word in ('We', 'took', 'a', 'plane', 'on', 'the')]
    body = [(word, 'POS', set())
            for word in ('06', 'th', 'January', '1996')]
    after = [(word, 'POS', set()) for word in ('to', 'Atlanta')]

    outcome = rule.apply(t, '', '', body, before, after)
    self.assertFalse(outcome[0])
def testPosGuardBlocks(self):
    """A guard whose pattern is absent from the body must block the rule.

    The guard requires 'th' followed by 'February', while the body holds
    'th' followed by 'January', so the rule must not apply.
    """
    rule = NormalisationRule(
        r'<(\d+)~.+><th~.+><January~.+><(\d{4})~.+>',
        'date',
        'testPosGuardBlocks',
        r'{#2} + "01" + {#1}',
        guards=[r'<th~.+><February~.+>'],
    )
    t = Timex(type='date')

    # Each token gets its own empty set of attached timexes.
    before = [(word, 'POS', set())
              for word in ('We', 'took', 'a', 'plane', 'on', 'the')]
    body = [(word, 'POS', set())
            for word in ('06', 'th', 'January', '1996')]
    after = [(word, 'POS', set()) for word in ('to', 'Atlanta')]

    outcome = rule.apply(t, '', '', body, before, after)
    self.assertFalse(outcome[0])
def testPosGuardAllows(self):
    """A guard whose pattern is present in the body must let the rule apply.

    The guard requires 'th' followed by 'January', which the body contains,
    so the rule must apply and set the timex value to '19960106'.
    """
    rule = NormalisationRule(
        r'<(\d+)~.+><th~.+><January~.+><(\d{4})~.+>',
        'date',
        'testPosGuardAllows',
        r'{#2} + "01" + {#1}',
        guards=[r'<th~.+><January~.+>'],
    )
    t = Timex(type='date')
    before = [
        ('We', 'POS', set()),
        ('took', 'POS', set()),
        ('a', 'POS', set()),
        ('plane', 'POS', set()),
        ('on', 'POS', set()),
        ('the', 'POS', set()),
    ]
    body = [
        ('06', 'POS', set()),
        ('th', 'POS', set()),
        ('January', 'POS', set()),
        ('1996', 'POS', set()),
    ]
    after = [
        ('to', 'POS', set()),
        ('Atlanta', 'POS', set()),
    ]
    self.assertTrue(rule.apply(t, '', '', body, before, after)[0])
    # assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(t.value, '19960106')
def testNegAfterAllows(self):
    """A '!'-prefixed after-guard must not block when its pattern is absent.

    The guard pattern names 'a' followed by 'plane'. Those tokens appear
    only in the text before the body, not after it, so the rule must apply
    and set the timex value to '19960106'.
    """
    rule = NormalisationRule(
        r'<(\d+)~.+><th~.+><January~.+><(\d{4})~.+>',
        'date',
        'testNegAfterAllows',
        r'{#2} + "01" + {#1}',
        after_guards=[r'!<a~.+><plane~.+>'],
    )
    t = Timex(type='date')
    before = [
        ('We', 'POS', set()),
        ('took', 'POS', set()),
        ('a', 'POS', set()),
        ('plane', 'POS', set()),
        ('on', 'POS', set()),
        ('the', 'POS', set()),
    ]
    body = [
        ('06', 'POS', set()),
        ('th', 'POS', set()),
        ('January', 'POS', set()),
        ('1996', 'POS', set()),
    ]
    after = [
        ('to', 'POS', set()),
        ('Atlanta', 'POS', set()),
    ]
    self.assertTrue(rule.apply(t, '', '', body, before, after)[0])
    # assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(t.value, '19960106')
def _load_rule(self, filename, rulelines):
    """
    Load a 'simple' normalisation rule.

    The rule lines are parsed with self._parse_rule into a mapping from
    field name to a list of values. Each field is validated and the result
    is used to build a NormalisationRule.

    Raises RuleLoadError when:
      * 'type' or 'match' does not have exactly one value;
      * 'id', 'value', 'change-type', 'freq', 'quant', 'mod', 'tokenise' or
        'deliminate-numbers' has more than one value;
      * 'deliminate-numbers' is neither 'true' nor 'false'
        (case-insensitive);
      * an unknown field is present;
      * no 'match' field is given;
      * deliminate-numbers is enabled while tokenise is falsy;
      * constructing the rule raises re.error.
    """

    fields = self._parse_rule(filename, rulelines)

    # Fields allowed at most once, with the error raised on duplicates.
    too_many = {
        'id': "Too many 'ID' fields",
        'value': "Too many 'Value' fields",
        'change-type': "Too many 'Change-Type' fields",
        'freq': "Too many 'Freq' fields",
        'quant': "Too many 'Quant' fields",
        'mod': "Too many 'Mod' fields",
    }

    # Defaults for the at-most-once fields; the ID defaults to the filename.
    single = {
        'id': filename,
        'value': None,
        'change-type': None,
        'freq': None,
        'quant': None,
        'mod': None,
    }

    # Fields that may repeat; the parsed list of values is used as-is.
    repeated = {
        'guard': [],
        'after': [],
        'before-guard': [],
        'after-guard': [],
        'sent-guard': [],
    }

    rule_type = None
    match = None
    tokenise = True
    deliminate_numbers = False

    for key in fields:
        entries = fields[key]

        if key == 'type':
            # Exactly one 'Type' field is required when the key is present
            if len(entries) != 1:
                raise RuleLoadError(filename, "Too many 'Type' field")
            rule_type = entries[0]

        elif key == 'match':
            # Exactly one 'Match' field is required when the key is present
            if len(entries) != 1:
                raise RuleLoadError(filename, "There must be exactly 1 'Match' field")
            match = entries[0]

        elif key in too_many:
            if len(entries) > 1:
                raise RuleLoadError(filename, too_many[key])
            if len(entries) == 1:
                single[key] = entries[0]

        elif key in repeated:
            repeated[key] = entries

        elif key == 'tokenise':
            if len(entries) > 1:
                raise RuleLoadError(filename, "Too many 'Tokenise' fields")
            if len(entries) == 1:
                lowered = entries[0].lower()
                # 'true', 'space' and 'null' are keywords; any other value
                # is kept as the lower-cased string.
                keywords = {'true': True, 'space': ' ', 'null': ''}
                tokenise = keywords.get(lowered, lowered)

        elif key == 'deliminate-numbers':
            # Optional, defaults to False, accepts only true/false
            # (case-insensitive).
            if len(entries) > 1:
                raise RuleLoadError(filename, "Too many 'Deliminate-Numbers' fields")
            if len(entries) == 1:
                flag = entries[0].lower()
                if flag not in ('true', 'false'):
                    raise RuleLoadError(filename, "Deliminate-Numbers must be either 'True' or 'False'")
                deliminate_numbers = (flag == 'true')

        else:
            # error on unknown fields
            raise RuleLoadError(filename, "Unknown field '" + key + "'")

    if match is None:
        raise RuleLoadError(filename, "'Match' is a compulsory field")

    # NOTE(review): this message looks truncated ("... if Tokenise is");
    # the check fires when tokenise is falsy, e.g. the 'null' setting.
    if deliminate_numbers and not tokenise:
        raise RuleLoadError(filename, "'Deliminate-Numbers' can not be set if Tokenise is")

    # Guard against any RE errors
    try:
        return NormalisationRule(
            match,
            rule_type,
            single['id'],
            single['value'],
            single['change-type'],
            single['freq'],
            single['quant'],
            single['mod'],
            repeated['guard'],
            repeated['after-guard'],
            repeated['before-guard'],
            repeated['sent-guard'],
            repeated['after'],
            tokenise,
            deliminate_numbers,
        )
    except re.error as e:
        raise RuleLoadError(filename, "Malformed regular expression: " + str(e))
def testApplyChangeType(self):
    """A matching rule with a change_type expression must retype the timex.

    The timex starts with type 'date'; the rule is built with
    change_type=r'"non-date"', so after a successful apply the timex's
    type must equal 'non-date'.
    """
    rule = NormalisationRule(
        r'<(\d+)~.+><th~.+><January~.+><(\d{4})~.+>',
        'date',
        'testApplyChangeType',
        change_type=r'"non-date"',
    )
    t = Timex(type='date')
    body = [(word, 'POS', {t}) for word in ('06', 'th', 'January', '1996')]
    applied = rule.apply(t, '', '', body, [], [])[0]
    self.assertTrue(applied)
    # assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(t.type, 'non-date')
def testApplyFreq(self):
    """A matching rule with a freq expression must set the timex's freq.

    The rule is built with freq=r'"1D"' and no value expression; after a
    successful apply the timex's freq attribute must equal '1D'.
    """
    rule = NormalisationRule(
        r'<(\d+)~.+><th~.+><January~.+><(\d{4})~.+>',
        'date',
        'testApplyFreq',
        freq=r'"1D"',
    )
    t = Timex(type='date')
    body = [(word, 'POS', {t}) for word in ('06', 'th', 'January', '1996')]
    applied = rule.apply(t, '', '', body, [], [])[0]
    self.assertTrue(applied)
    # assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(t.freq, '1D')
def testApplyQuant(self):
    """A matching rule with a quant expression must set the timex's quant.

    The rule is built with quant=r'"EVERY"' and no value expression; after
    a successful apply the timex's quant attribute must equal 'EVERY'.
    """
    rule = NormalisationRule(
        r'<(\d+)~.+><th~.+><January~.+><(\d{4})~.+>',
        'date',
        'testApplyQuant',
        quant=r'"EVERY"',
    )
    t = Timex(type='date')
    body = [(word, 'POS', {t}) for word in ('06', 'th', 'January', '1996')]
    applied = rule.apply(t, '', '', body, [], [])[0]
    self.assertTrue(applied)
    # assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(t.quant, 'EVERY')
def testApplyCorrectType(self):
    """A rule declared for type 'date' must not apply to a 'time' timex.

    The body tokens match the rule's pattern, so the only mismatch between
    the rule and the timex is the type.
    """
    rule = NormalisationRule(
        r'<(\d+)~.+><th~.+><January~.+><(\d{4})~.+>',
        'date',
        'testApplyCorrectType',
        r'{#2} + "01" + {#1}',
    )
    t = Timex(type='time')
    # Every body token is tagged with the timex under test.
    body = [
        ('06', 'POS', {t}),
        ('th', 'POS', {t}),
        ('January', 'POS', {t}),
        ('1996', 'POS', {t}),
    ]
    result = rule.apply(t, '', '', body, [], [])
    self.assertFalse(result[0])
def testApplyValue(self):
    """A matching rule must set the timex value from its value expression.

    The value expression joins the second captured group, the literal "01"
    and the first captured group; for the body '06 th January 1996' the
    asserted result is '19960106'.
    """
    rule = NormalisationRule(
        r'<(\d+)~.+><th~.+><January~.+><(\d{4})~.+>',
        'date',
        'testApplyValue',
        r'{#2} + "01" + {#1}',
    )
    t = Timex(type='date')
    body = [(word, 'POS', {t}) for word in ('06', 'th', 'January', '1996')]
    applied = rule.apply(t, '', '', body, [], [])[0]
    self.assertTrue(applied)
    # assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(t.value, '19960106')