class TestRetagger(unittest.TestCase):
    """Checks the tags Retagger.retag returns for sample (surface, POS) pairs."""

    def setUp(self):
        # Every test method gets its own Retagger instance.
        self.r = Retagger()

    def tearDown(self):
        self.r = None

    def test(self):
        # assertIn is used where only one expected tag is checked for
        # membership; assertEqual is used where the complete result is pinned.
        # On failure assertIn reports both the tag and the returned container.
        ag = self.r.retag("ag", "Sa")
        self.assertIn("ASP", ag)
        self.assertEqual(self.r.retag("a'", "Sa"), ["ASP"])

        agus = self.r.retag("agus", "Cc")
        self.assertIn("CONJ", agus)

        air = self.r.retag("air", "Sp")
        self.assertIn("PP", air)
        self.assertIn("P", air)

        self.assertEqual(self.r.retag("droch", "Ar"), ["DET"])  # not really

        comma = self.r.retag(",", "Fi")
        self.assertIn("PUNC", comma)
        fullstop = self.r.retag(".", "Fe")
        self.assertIn("PUNC", fullstop)

        self.assertEqual(self.r.retag("le", "Sp"), ["P"])
        self.assertEqual(self.r.retag("gun", "Qa"), ["GU"])
        self.assertEqual(self.r.retag("dìreach", "Rg"), ["ADV"])
        self.assertEqual(self.r.retag("Comhairle", "Ncsdf"), ["N"])
        self.assertEqual(self.r.retag("galain", "Ncsfn"), ["N"])
        self.assertEqual(self.r.retag("an", "Tdsf"), ["DET"])
        self.assertEqual(self.r.retag("na", "Tdsfg"), ["DETNMOD"])
        self.assertEqual(self.r.retag("[1]", "Xsc"), ["ADVPRE"])

        radh = self.r.retag("ràdh", "Nv")
        self.assertIn("VPROP", radh)

        rinn = self.r.retag("rinn", "V-s")
        self.assertIn("TRANS", rinn)

        tha = self.r.retag("tha", "V-p")
        self.assertIn("BIPP", tha)
        self.assertIn("BIPROG", tha)
# Fixture hook: stores a newly constructed Retagger on self.r.
# NOTE(review): this line duplicates the setUp shown in TestRetagger — confirm
# which class (if any) it is meant to belong to.
def setUp(self): self.r = Retagger()
else: return s.replace("&", "&") brownfile = open(sys.argv[1], 'rb') corpus = pickle.load(brownfile) brownfile.close() output = open(sys.argv[2], 'w') # features with open("resources/features.txt") as f: for line in f: output.write(line) # type-changing and type-raising rules with open("resources/rules.txt") as r: for line in r: output.write(line) retagger = Retagger() typer = Typer() families = set() words = set() # assumes a single list rather than a list of lists (need to think about this) for surface, pos in corpus: if pos != "": tags = retagger.retag(surface, pos) for tag in tags: newtagtype = typer.type(surface, pos, tag) newtag = newtagtype[0] type = newtagtype[1] families.add("family %s { entry: %s; }" % (newtag, type)) words.add('word "%s_%s":%s; # %s' % (tidyword(surface), newtag, newtag, pos)) for family in sorted(families):