def main():
    """Invoke a simple CLI analyser or generator.

    Reads lines from ``--input`` (or stdin), and for each non-empty line
    prints the analyses and/or generations as tab-separated columns.
    Exits with status 0.
    """
    a = ArgumentParser()
    # type=open means argparse opens the file itself; it is never explicitly
    # closed, which is acceptable for a short-lived CLI process.
    a.add_argument('-i', '--input', metavar="INFILE", type=open, dest="infile",
                   help="source of analysis data")
    a.add_argument('-a', '--analyse', action='store_true',
                   help="Analyse the input file strings")
    a.add_argument('-g', '--generate', action='store_true',
                   help="Generate the input file strings")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    options = a.parse_args()
    # Fall back to standard input when no input file was given.
    if not options.infile:
        options.infile = stdin
    if options.verbose:
        print("reading from", options.infile.name)
    analyser = Analyser()
    generator = Generator()
    for line in options.infile:
        line = line.strip()
        # Fixed: the original tested `not line or line == ''`; the second
        # comparison was unreachable (an empty string is already falsy).
        if not line:
            continue
        if options.analyse:
            anals = analyser.analyse(line, True)
            if not anals:
                print(line, "\t?")
            for anal in anals:
                print(line, "\t", anal[0], "\t", anal[1])
        if options.generate:
            gens = generator.generate(line, True)
            if not gens:
                print(line, "\t?")
            for gen in gens:
                print(line, "\t", gen[0], "\t", gen[1])
        print()
    exit(0)
class CoverageTests(unittest.TestCase):
    """Per-file and total analyser coverage over the corpus in coverage/*.txt."""

    analyser = Analyser()

    def test_total_coverage(self):
        """Assert that overall analysis coverage stays at or above MIN_COVERAGE.

        Prints a per-file table of word counts, analysed counts and
        percentages, followed by a total row and the elapsed time.
        """
        total_tokens_count = 0
        total_analysed_tokens_count = 0
        # NOTE(review): `clock` looks like time.clock, which was removed in
        # Python 3.8 -- confirm the import and prefer time.perf_counter().
        start = clock()
        print("%40s\t%8s\t%8s\t%s" %
              ('File name', 'Words', 'Analysed', 'Percentage'))
        for filename in glob.glob(os.path.join(CURR_DIR, "coverage", "*.txt")):
            with open(filename, 'r') as file:
                tokens_count = 0
                analysed_tokens_count = 0
                for line in file:
                    for word in tokenize(line):
                        if not is_valid_malayalam_word(word):
                            continue
                        tokens_count += 1
                        analysis = self.analyser.analyse(word, False)
                        if len(analysis) > 0:
                            analysed_tokens_count += 1
                # Guard: an empty or all-invalid file previously raised
                # ZeroDivisionError here.
                percentage = ((analysed_tokens_count / tokens_count) * 100
                              if tokens_count else 0.0)
                total_tokens_count += tokens_count
                total_analysed_tokens_count += analysed_tokens_count
                print("%40s\t%8d\t%8d\t%3.2f%%" %
                      (os.path.basename(filename), tokens_count,
                       analysed_tokens_count, percentage))
                # Removed a redundant file.close(); the with-statement
                # already closes the file.
        percentage = ((total_analysed_tokens_count / total_tokens_count) * 100
                      if total_tokens_count else 0.0)
        time_taken = clock() - start
        print('%40s\t%8d\t%8d\t%3.2f%%' %
              ('Total', total_tokens_count, total_analysed_tokens_count,
               percentage))
        print('Time taken: %5.3f seconds' % (time_taken))
        self.assertTrue(percentage >= MIN_COVERAGE,
                        'Coverage decreased from %3.2f to %3.2f' %
                        (MIN_COVERAGE, percentage))
class AnalyserGeneratorTests(unittest.TestCase):
    """Round-trip tests: every fixture word must analyse to, and be generated
    from, its expected analysis string."""

    generator = Generator()
    analyser = Analyser()

    def setUp(self):
        """Load the shared fixtures from tests.json as attribute structs."""
        self.testFile = open(os.path.join(CURR_DIR, 'tests.json'))
        self.tests = json.load(self.testFile, object_hook=Struct)

    def tearDown(self):
        self.testFile.close()

    def test_analyse(self):
        print('\t**** Analyse tests ****\t')
        # enumerate replaces the hand-maintained line counter.
        for line, test in enumerate(self.tests, start=1):
            with self.subTest(test.word):
                anals = self.analyser.analyse(test.word)
                # Fixtures marked skip are only logged, never asserted.
                if hasattr(test, 'skip') and test.skip:
                    continue
                self.assertTrue(len(anals) != 0,
                                'Analysis failed for ' + test.word)
                print('%3d %s\t<--\t%s' % (line, test.word, anals))
                # any() replaces the index loop + match flag; the redundant
                # second skip check is gone (skipped cases already continued).
                match = any(test.analysis == anal[0] for anal in anals)
                self.assertEqual(match, True, 'Analysis for ' + test.analysis)

    def test_generate(self):
        print('\t**** Generate tests ****\t')
        for line, test in enumerate(self.tests, start=1):
            with self.subTest(test.word):
                gens = self.generator.generate(test.analysis, True)
                if hasattr(test, 'skip') and test.skip:
                    continue
                self.assertTrue(len(gens) != 0,
                                'Generate failed for ' + test.analysis)
                print('%3d %s\t<--\t%s' % (line, test.analysis, gens))
                match = any(test.word == gen[0] for gen in gens)
                self.assertEqual(match, True, 'Generate for ' + test.analysis)
class CoverageTests(unittest.TestCase):
    """Analyser coverage over coverage/*.txt, with reports written to disk."""

    analyser = Analyser()

    def test_total_coverage(self):
        """Measure coverage and write two report files.

        Every unanalysed word goes to missed_words.txt; a summary plus the
        500 most frequent misses goes to freq_analysis.txt.
        """
        tokens_count = 0
        analysed_tokens_count = 0
        missed_words = []
        # Fixed: both report files were opened without context managers and
        # freq_analysis.txt leaked if an assertion or I/O error occurred.
        with open('missed_words.txt', 'w') as missed_words_file, \
                open('freq_analysis.txt', 'w') as freq_analysis_file:
            print('\t**** Coverage tests ****\t\n', end='\n',
                  file=freq_analysis_file)
            # NOTE(review): `clock` looks like time.clock, removed in
            # Python 3.8 -- confirm the import; prefer time.perf_counter().
            start = clock()
            for filename in glob.glob(
                    os.path.join(CURR_DIR, "coverage", "*.txt")):
                with open(filename, 'r') as file:
                    for line in file:
                        # Split on dots/whitespace, keeping separators
                        # (capturing group); invalid tokens are filtered next.
                        for word in regex.split(r'([\.\s]+)', line):
                            if not is_valid_malayalam_word(word):
                                continue
                            tokens_count += 1
                            analysis = self.analyser.analyse(word, False)
                            if len(analysis) > 0:
                                analysed_tokens_count += 1
                            else:
                                missed_words.append(word)
                                missed_words_file.write(word + '\n')
            # Guard: avoids ZeroDivisionError when no valid tokens were seen.
            percentage = ((analysed_tokens_count / tokens_count) * 100
                          if tokens_count else 0.0)
            time_taken = clock() - start
            print('Total words: %d \nAnalysed words: %d \nCoverage: %3.2f %% '
                  % (tokens_count, analysed_tokens_count, percentage),
                  end='\n', file=freq_analysis_file)
            print('Time taken: %5.3f seconds' % (time_taken), end='\n',
                  file=freq_analysis_file)
            most_common = Counter(missed_words).most_common(500)
            print('Top 500 missed words are:', end='\n',
                  file=freq_analysis_file)
            for word, freq in most_common:
                print("%4d %s" % (freq, word), end='\n',
                      file=freq_analysis_file)
class CoverageTests(unittest.TestCase):
    """Analyser coverage over coverage/*.txt, printing stats to stdout."""

    analyser = Analyser()

    def test_total_coverage(self):
        """Measure analyser coverage and print the 250 most missed words."""
        print('\t**** Coverage tests ****\t\n')
        # NOTE(review): time.clock() was removed in Python 3.8 -- prefer
        # time.perf_counter() once the minimum Python version allows it.
        start = time.clock()
        tokens_count = 0
        analysed_tokens_count = 0
        missed_words = []
        # Fixed: the original called re.compile() on both patterns for every
        # word inside the innermost loop; compile once, up front.
        non_malayalam = re.compile(r'[a-zA-Z0-9\(\)=\':]')
        single_ligature = re.compile(r'^[അ-ഹൺ-ൿ\.][ാിീുെേൊോൗ്ം]*[.?:]*$')
        for filename in glob.glob(os.path.join(CURR_DIR, "coverage", "*.txt")):
            with open(filename, 'r') as file:
                for line in file:
                    for word in line.split():
                        # Ignore all non-Malayalam words
                        if non_malayalam.match(word):
                            continue
                        # Ignore all single ligatures.
                        if single_ligature.match(word):
                            continue
                        tokens_count += 1
                        analysis = self.analyser.analyse(word, False)
                        if len(analysis) > 0:
                            analysed_tokens_count += 1
                        else:
                            missed_words.append(word)
        # Guard: avoids ZeroDivisionError when no valid tokens were seen.
        percentage = ((analysed_tokens_count / tokens_count) * 100
                      if tokens_count else 0.0)
        time_taken = time.clock() - start
        print('Total words: %d \nAnalysed words: %d \nCoverage: %3.2f %% '
              % (tokens_count, analysed_tokens_count, percentage))
        print('Time taken: %5.3f seconds' % (time_taken))
        most_common = Counter(missed_words).most_common(250)
        print('Top 250 missed words are:')
        # Prints (word, frequency) tuples, matching the original output.
        for word in most_common:
            print(word)
def __init__(self):
    """Create the morphology analyser and load the common-mistakes data."""
    # Analyser() presumably loads the mlmorph transducer -- TODO confirm cost.
    self.analyser = Analyser()
    # read_common_mistakes() is defined elsewhere; appears to return a
    # container of commonly misspelled words -- verify its shape there.
    self.common_mistakes = read_common_mistakes()
class SpellChecker(object):
    """
    The SpellChecker class implements a spelling checker based on the
    Malayalam morphology analyser (mlmorph).

    A word is accepted when it is not a known common mistake and the
    morphological analyser can analyse it.
    """

    def __init__(self):
        self.analyser = Analyser()
        self.common_mistakes = read_common_mistakes()

    def strategies(self):
        """Return candidate-generation strategy class names, in priority order."""
        return [
            "ChilluNormalization",
            "Ykkuka",
            "NtaCorrection",
            "MpaCorrection",
            "VisualSimilarity",
            "PhoneticSimilarity",
            "GeminateConsonants",
            "ViramaInsertion",
            "VowelElongation",
            "VowelShortening",
            "ChilluToConsonantVirama",
            "ConsonantViramaToChillu",
        ]

    def candidates_from_strategies(self, word: str) -> list:
        """
        Generate possible spelling corrections for the provided word
        using different strategies

        Args:
            word (str): The word for which to calculate candidate spellings
        Returns:
            list: The list of words that are possible candidates. \
                The list is sorted in descending order of candidate scores. \
                Best candidates are the first candidates in the list.
        """
        # Order of the items in STRATEGIES is important
        STRATEGIES = self.strategies()
        weighted_suggestions = {}
        for class_name in STRATEGIES:
            # import_module() caches after the first call, so this loop does
            # not re-import the strategies module each iteration.
            strategy = getattr(
                importlib.import_module("mlmorph_spellchecker.strategies"),
                class_name)()
            candidates = Suggestion(strategy).suggest(word)
            for candidate in candidates:
                if candidate in weighted_suggestions:
                    continue
                weighted_analysis = self.analyser.analyse(
                    candidate, True, False)
                if len(weighted_analysis) > 0:
                    # Keep the weight of the candidate's best analysis.
                    weighted_suggestions[candidate] = weighted_analysis[0][1]
        # Sort by the increasing order of weights
        suggestions = sorted(weighted_suggestions.items(),
                             key=lambda t: t[1])
        if len(suggestions) == 0:
            # No suggestions. Try splitting the word after 3rd letter
            for index in range(3, len(word) - 3):
                left_word = word[:index]
                right_word = word[index:]
                if (len(self.analyser.analyse(left_word, False)) > 0
                        and len(self.analyser.analyse(right_word, False)) > 0):
                    suggestions.append([left_word + " " + right_word])
                    break
        # Return the words array
        return [suggestion[0] for suggestion in suggestions]

    def is_known_to_analyser(self, word: str) -> bool:
        """
        Check if the given word is known for the mlmorph analyser

        Args:
            word (str): The word for which to calculate candidate spellings
        Returns:
            boolean: Whether the word is known
        """
        analysis = self.analyser.analyse(word, False, True)
        return len(analysis) > 0

    def is_common_mistake(self, word: str) -> bool:
        """
        Check if the given word is a commonly mistaken word based on
        our database of such words

        Args:
            word (str): The word for which to calculate candidate spellings
        Returns:
            boolean: Whether the word is known
        """
        return word in self.common_mistakes

    def spellcheck(self, word: str) -> bool:
        """
        Spellcheck the given word

        Args:
            word (str): The word to spell check
        Returns:
            boolean: True if words is spelled correctly. False, otherwise.
        """
        # A known common mistake is wrong even if the analyser accepts it.
        common_mistake = self.is_common_mistake(word)
        if common_mistake:
            return False
        return self.is_known_to_analyser(word)

    def candidates(self, word: str) -> list:
        """
        Generate possible spelling corrections for the provided word

        Args:
            word (str): The word for which to calculate candidate spellings
        Returns:
            list: The list of words that are possible candidates. \
                The list is sorted in descending order of candidate scores.\
                Best candidates are the first candidates in the list
        """
        if self.spellcheck(word):
            # Word is spelled correctly
            return []
        common_mistake = self.is_common_mistake(word)
        if common_mistake:
            # Curated corrections take precedence over strategy search.
            return [self.common_mistakes.get(word)]
        return self.candidates_from_strategies(word)
import regex
from flask import Flask, jsonify, request, send_from_directory
from mlmorph import Generator, Analyser
from mlmorph_spellchecker import SpellChecker

# Flask app serving the static single-page UI from ./public
app = Flask(__name__,
            static_folder="./public/",
            static_url_path='',
            template_folder="./public")

# Module-level singletons shared across requests; Analyser/Generator
# presumably load their transducers once at startup -- TODO confirm.
generator = Generator()
analyser = Analyser()
spellchecker = SpellChecker()


@app.route("/<path:path>", defaults={'path': 'index.html'})
def index(path):
    # Static file passthrough; serves index.html by default.
    return send_from_directory('./public', path)


@app.route("/api/analyse", methods=['POST', 'GET'])
def do_analyse():
    # Accept text from the JSON body (POST) or the query string (GET).
    text = None
    analyse_results = {}
    if request.method == 'POST':
        text = request.json.get('text')
    else:
        text = request.args.get('text')
    text = text.strip()
    # Split while keeping whitespace separators (capturing group), so the
    # original spacing can presumably be reconstructed -- verify downstream.
    words = regex.split(r'(\s+)', text)
    # NOTE(review): this function is truncated in the visible source -- the
    # analysis loop and the route's return statement are not shown here.