class TestSandhisplitter(TestCase):
    """Tests for Sandhisplitter driven by the bundled sample entries."""

    def setUp(self):
        """Create a fresh model and splitter and open the sample file."""
        super(TestSandhisplitter, self).setUp()
        self.model = Model(depth=3, skip=1)
        self.SS = Sandhisplitter()
        testcases = resource_filename("sandhisplitter.tests",
                                      "resources/samples.txt")
        self.entries = open(testcases, "r", encoding='utf-8')
        # Make sure the file handle is released even when a test fails.
        self.addCleanup(self.entries.close)

    def test_splits(self):
        """Train the model on every sample, then split each sample again."""
        # Build a real list: a lazy map() object would be exhausted by the
        # training loop, leaving the verification loop with nothing to
        # check (the assertions would silently never run).
        entries = [line.strip() for line in self.entries.readlines()]

        # Training pass: feed every sample into the model.
        for line in entries:
            (word, splits, locs) = extract(line)
            self.model.add_entry(word, splits, locs)

        m = self.model.serialize()
        self.SS.set_model(m)

        # Verification pass: the splitter must reproduce each sample.
        for line in entries:
            (word, splits, locs) = extract(line)
            obtained, pos = self.SS.split(word)
            self.assertEqual(locs, pos)
            self.assertEqual(splits, obtained)

    def test_details(self):
        """The splitter reports its module name and description."""
        self.assertEqual(self.SS.get_module_name(), "Sandhi-Splitter")
        self.assertEqual(self.SS.get_info(), "Sandhi-splitter for malayalam")

    def test_instance(self):
        """getInstance() hands back a Sandhisplitter."""
        self.assertIsInstance(getInstance(), Sandhisplitter)
def sandhi_split(token_words):
    """Run every word in *token_words* through a new Sandhisplitter.

    For each word, the first element of the splitter's result is
    collected and printed. The collected values are returned as a list,
    in the same order as the input words.
    """
    splitter = Sandhisplitter()
    collected = []
    for token in token_words:
        # Only the first element of the split result is kept.
        first_part = splitter.split(token)[0]
        collected.append(first_part)
        print(first_part)
    return collected
class Malayalam(BaseMalayalam, object):
    """Spellchecker that augments BaseMalayalam with a Sandhisplitter.

    A word the base checker rejects is split into components, and the
    components are checked (or corrected) individually.
    """

    def __init__(self):
        super(Malayalam, self).__init__()
        # Let's give the spellchecker a boost.
        self.sandhi = Sandhisplitter()

    def check(self, word):
        """Return True if *word*, or every component of its split, is valid.

        The base checker is consulted first; only when it rejects the
        whole word is the word split and each component checked.
        """
        base = super(Malayalam, self)
        # Trivial case, word is in corpus
        if base.check(word):
            return True

        # Sandhisplitter additions: accept the word when every split
        # component is accepted by the base checker.
        words, _ = self.sandhi.split(word)
        return all(base.check(w) for w in words)

    def suggest(self, word, n=5):
        """Return up to *n* suggestions for *word*, closest first.

        Candidates are the base class suggestions plus every joined
        combination of per-component corrections. They are ranked by
        Levenshtein distance to *word*, with ties broken by the
        suggestion text itself.
        """
        base = super(Malayalam, self)

        # Start with the base suggestions. Copy them so the list handed
        # back by the base class is never mutated here.
        suggestions = list(base.suggest(word, n))

        # Sandhisplitter additions: a component already in the dictionary
        # stands for itself, any other is replaced by its suggestions.
        words, _ = self.sandhi.split(word)
        corrections = [
            [w] if base.check(w) else base.suggest(w, n)
            for w in words
        ]

        # Cross product to get all possibilities, then apply the joiner
        # on each possible tuple.
        for group in product(*corrections):
            suggestions.append(self.sandhi.join(group))

        # Drop duplicates so repeated candidates cannot crowd distinct
        # ones out of the top n. NOTE(review): joined candidates
        # presumably coincide with base suggestions when the word is not
        # actually split -- confirm against Sandhisplitter.join.
        unique = set(suggestions)

        # Rank by Levenshtein distance, then by text for a stable order.
        ranked = sorted(
            unique,
            key=lambda s: (base.levenshtein_distance(s, word), s),
        )

        # Trim off to match n
        return ranked[:n]
class Malayalam(BaseMalayalam, object):
    """Spellchecker that augments BaseMalayalam with a Sandhisplitter.

    A word the base checker rejects is split into components, and the
    components are checked (or corrected) individually.
    """

    def __init__(self):
        super(Malayalam, self).__init__()
        # Let's give the spellchecker a boost.
        self.sandhi = Sandhisplitter()

    def check(self, word):
        """Return True if *word*, or every component of its split, is valid.

        The base checker is consulted first; only when it rejects the
        whole word is the word split and each component checked.
        """
        base = super(Malayalam, self)
        # Trivial case, word is in corpus
        if base.check(word):
            return True

        # Sandhisplitter additions: accept the word when every split
        # component is accepted by the base checker.
        words, _ = self.sandhi.split(word)
        return all(base.check(w) for w in words)

    def suggest(self, word, n=5):
        """Return up to *n* suggestions for *word*, closest first.

        Candidates are the base class suggestions plus every joined
        combination of per-component corrections. They are ranked by
        Levenshtein distance to *word*, with ties broken by the
        suggestion text itself.
        """
        base = super(Malayalam, self)

        # Start with the base suggestions. Copy them so the list handed
        # back by the base class is never mutated here.
        suggestions = list(base.suggest(word, n))

        # Sandhisplitter additions: a component already in the dictionary
        # stands for itself, any other is replaced by its suggestions.
        words, _ = self.sandhi.split(word)
        corrections = [
            [w] if base.check(w) else base.suggest(w, n)
            for w in words
        ]

        # Cross product to get all possibilities, then apply the joiner
        # on each possible tuple.
        for group in product(*corrections):
            suggestions.append(self.sandhi.join(group))

        # Drop duplicates so repeated candidates cannot crowd distinct
        # ones out of the top n. NOTE(review): joined candidates
        # presumably coincide with base suggestions when the word is not
        # actually split -- confirm against Sandhisplitter.join.
        unique = set(suggestions)

        # Rank by Levenshtein distance, then by text for a stable order.
        ranked = sorted(
            unique,
            key=lambda s: (base.levenshtein_distance(s, word), s),
        )

        # Trim off to match n
        return ranked[:n]