class TestSplitter(TestCase):
    def setUp(self):
        super(TestSplitter, self).setUp()
        self.testModel = Model(depth=3, skip=1)
        testcases = resource_filename("sandhisplitter.tests",
                                      "resources/samples.txt")
        self.entries = open(testcases, "r", encoding='utf-8')

    def test_load(self):
        count = 0
        firstline = None
        for line in self.entries:
            count += 1
            if count == 1:
                firstline = line
            (word, splits, locs) = extract(line)
            self.testModel.add_entry(word, splits, locs)
        m = self.testModel.serialize()
        self.testModel.load(m)
        self.splitter = Splitter(m)
        # Test probable splits
        (word, splits, locs) = extract(firstline)
        locs = list(locs)
        sps = self.splitter.splits(word)
        self.assertEqual(sps, locs)
Example #2
def setUp(self):
    super(TestSandhisplitter, self).setUp()
    self.model = Model(depth=3, skip=1)
    self.SS = Sandhisplitter()
    testcases = resource_filename("sandhisplitter.tests",
                                  "resources/samples.txt")
    self.entries = open(testcases, "r", encoding='utf-8')
Example #3
class TestSandhisplitter(TestCase):
    def setUp(self):
        super(TestSandhisplitter, self).setUp()
        self.model = Model(depth=3, skip=1)
        self.SS = Sandhisplitter()
        testcases = resource_filename("sandhisplitter.tests",
                                      "resources/samples.txt")
        self.entries = open(testcases, "r", encoding='utf-8')

    def test_splits(self):
        count = 0
        # Materialize the stripped lines as a list; a map iterator would be
        # exhausted after the first pass, leaving the second loop with nothing.
        entries = [line.strip() for line in self.entries]
        for line in entries:
            count += 1
            (word, splits, locs) = extract(line)
            self.model.add_entry(word, splits, locs)
        m = self.model.serialize()
        self.SS.set_model(m)
        for line in entries:
            (word, splits, locs) = extract(line)
            obtained, pos = self.SS.split(word)
            self.assertEqual(locs, pos)
            self.assertEqual(splits, obtained)

    def test_details(self):
        self.assertEqual(self.SS.get_module_name(), "Sandhi-Splitter")
        self.assertEqual(self.SS.get_info(), "Sandhi-splitter for malayalam")

    def test_instance(self):
        self.assertIsInstance(getInstance(), Sandhisplitter)
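
Taken together, test_splits above shows the intended calling sequence for the Sandhisplitter API: train a Model, serialize it, hand the result to set_model, and query split(), which returns the split parts together with their positions. A minimal usage sketch along the same lines (the import paths and the training data are assumptions, not taken from the listing):

# Sketch only: the import paths below are guesses at the package layout.
from sandhisplitter import Sandhisplitter, getInstance
from sandhisplitter.model import Model

model = Model(depth=3, skip=1)
# ... call model.add_entry(word, splits, locs) for each training entry,
# e.g. parsed from samples.txt with extract() ...

ss = getInstance()                # returns a Sandhisplitter instance
ss.set_model(model.serialize())   # attach the serialized model
some_word = "..."                 # placeholder for an input word
parts, positions = ss.split(some_word)  # split parts and their positions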
Example #4
def main():
    # if __name__ == '__main__':  # pragma: no cover
    parser = argparse.ArgumentParser(description="Train a model")
    arguments = [
        ["-k", "--depth", "depth of the trie", int, "depth"],
        ["-s", "--skip", "initial skip", int, "skip"],
        ["-i", "--trainfile", "path to training file", str, "trainfile"],
        ["-o", "--outputfile", "path to store model", str, "modelfile"],
    ]

    # Add options
    for arg in arguments:
        unix, gnu, desc, typename, dest = arg
        parser.add_argument(unix,
                            gnu,
                            help=desc,
                            type=typename,
                            required=True,
                            dest=dest)

    args = parser.parse_args()

    # Load training file and add entries to model
    line_number = 0
    model = Model(depth=args.depth, skip=args.skip)
    with open(args.trainfile, "r", encoding="utf-8") as data:
        try:
            for line in data:
                line = line.strip()
                line_number += 1
                word, splits, locs = extract(line)
                model.add_entry(word, splits, locs)
        except Exception:
            print("Input file syntax error in line %d" % line_number)
            raise

    # Serialize the model and export it to file as JSON
    exported = model.serialize()
    result = json.dumps(exported, ensure_ascii=False)
    with open(args.modelfile, "w", encoding="utf-8") as output_file:
        output_file.write(result)
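
The exported file is plain JSON of the serialized model (ensure_ascii=False keeps the Malayalam strings readable), so it can be inspected or reloaded directly. A small sketch, where "model.json" stands in for whatever path was passed via -o/--outputfile:

import json

# Placeholder path; use the file written by the trainer above.
with open("model.json", "r", encoding="utf-8") as f:
    m = json.load(f)

# m has the same structure Model.serialize() produced; TestModel.test_load
# below asserts it carries the "k" and "initial_skip" parameters.
print(m["k"], m["initial_skip"])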
Example #5
class TestModel(TestCase):
    def setUp(self):
        super(TestModel, self).setUp()
        self.testModel = Model(depth=3, skip=1)
        testcases = resource_filename("sandhisplitter.tests",
                                      "resources/samples.txt")
        self.entries = open(testcases, "r", encoding='utf-8')

    def test_load(self):
        count = 0
        firstline = None
        for line in self.entries:
            count += 1
            if count == 1:
                firstline = line
            (word, splits, locs) = extract(line)
            self.testModel.add_entry(word, splits, locs)
        m = self.testModel.serialize()
        self.testModel.load(m)
        self.assertEqual(self.testModel.k, 3)
        self.assertEqual(self.testModel.initial_skip, 1)
        self.assertEqual(self.testModel.k, m["k"])
        self.assertEqual(self.testModel.initial_skip, m["initial_skip"])
        # Test probable splits
        (word, splits, locs) = extract(firstline)
        locs = list(locs)
        sps = self.testModel.probable_splits(word)
        self.assertEqual(sps, locs)

    def test_error(self):
        self.assertRaises(ValueError, Model, "what")
Example #6
def main():
    # if __name__ == '__main__':  # pragma: no cover
    parser = argparse.ArgumentParser(description="Train a model")
    arguments = [
        ["-k", "--depth", "depth of the trie", int, "depth"],
        ["-s", "--skip", "initial skip", int, "skip"],
        ["-i", "--trainfile", "path to training file",
            str, "trainfile"],
        ["-o", "--outputfile", "path to store model",
            str, "modelfile"],
    ]

    # Add options
    for arg in arguments:
        unix, gnu, desc, typename, dest = arg
        parser.add_argument(unix, gnu, help=desc, type=typename,
                            required=True, dest=dest)

    args = parser.parse_args()

    # Load training file and add entries to model
    line_number = 0
    model = Model(depth=args.depth, skip=args.skip)
    with open(args.trainfile, "r", encoding="utf-8") as data:
        try:
            for line in data:
                line = line.strip()
                line_number += 1
                word, splits, locs = extract(line)
                model.add_entry(word, splits, locs)
        except Exception:
            print("Input file syntax error in line %d" % line_number)
            raise

    # Serialize the model and export it to file as JSON
    exported = model.serialize()
    result = json.dumps(exported, ensure_ascii=False)
    with open(args.modelfile, "w", encoding="utf-8") as output_file:
        output_file.write(result)
Example #7
def setUp(self):
    super(TestModel, self).setUp()
    self.testModel = Model(depth=3, skip=1)
    testcases = resource_filename("sandhisplitter.tests",
                                  "resources/samples.txt")
    self.entries = open(testcases, "r", encoding='utf-8')
Example #8
def __init__(self, model):
    self.M = Model(model=model)
Example #9
class Splitter:
    def __init__(self, model):
        self.M = Model(model=model)

    def splits(self, word):
        return self.M.probable_splits(word)
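
Putting the last two snippets together: Splitter is a thin front end over a trained Model, so anything produced by Model.serialize() (or read back from the trainer's JSON output) is enough to answer split-position queries. A minimal sketch, with import paths assumed rather than taken from the listing:

# Sketch only: the import paths are assumptions about the package layout.
from sandhisplitter.model import Model
from sandhisplitter.splitter import Splitter

model = Model(depth=3, skip=1)
# ... model.add_entry(word, splits, locs) for each training entry ...

splitter = Splitter(model.serialize())  # wraps Model(model=...), as above
some_word = "..."                       # placeholder for a word to analyse
print(splitter.splits(some_word))       # probable split positions, as in TestSplitter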