Example #1
import sys
from argparse import ArgumentParser

from mlmorph import Analyser, Generator


def main():
    """Invoke a simple CLI analyser or generator."""
    a = ArgumentParser()
    a.add_argument('-i',
                   '--input',
                   metavar="INFILE",
                   type=open,
                   dest="infile",
                   help="source of analysis data")
    a.add_argument('-a',
                   '--analyse',
                   action='store_true',
                   help="Analyse the input file strings")
    a.add_argument('-g',
                   '--generate',
                   action='store_true',
                   help="Generate the input file strings")
    a.add_argument('-v',
                   '--verbose',
                   action='store_true',
                   help="print verbosely while processing")
    options = a.parse_args()

    if not options.infile:
        options.infile = sys.stdin
    if options.verbose:
        print("reading from", options.infile.name)
    analyser = Analyser()
    generator = Generator()
    for line in options.infile:
        line = line.strip()
        if not line:
            continue
        if options.analyse:
            anals = analyser.analyse(line, True)
            if not anals:
                print(line, "\t?")
            for anal in anals:
                print(line, "\t", anal[0], "\t", anal[1])
        if options.generate:
            gens = generator.generate(line, True)
            if not gens:
                print(line, "\t?")
            for gen in gens:
                print(line, "\t", gen[0], "\t", gen[1])

    print()
    sys.exit(0)
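
# Entry point, so the example runs as a script.
if __name__ == '__main__':
    main()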
Example #2
import glob
import os
import unittest
from time import perf_counter

from mlmorph import Analyser

# CURR_DIR, MIN_COVERAGE, tokenize and is_valid_malayalam_word are assumed
# to be defined elsewhere in this test module.


class CoverageTests(unittest.TestCase):
    analyser = Analyser()

    def test_total_coverage(self):
        total_tokens_count = 0
        total_analysed_tokens_count = 0
        start = perf_counter()  # time.clock() was removed in Python 3.8
        print("%40s\t%8s\t%8s\t%s" % ('File name', 'Words', 'Analysed', 'Percentage'))
        for filename in glob.glob(os.path.join(CURR_DIR, "coverage", "*.txt")):
            with open(filename, 'r', encoding='utf-8') as file:
                tokens_count = 0
                analysed_tokens_count = 0
                for line in file:
                    for word in tokenize(line):
                        if not is_valid_malayalam_word(word):
                            continue
                        tokens_count += 1
                        analysis = self.analyser.analyse(word, False)
                        if len(analysis) > 0:
                            analysed_tokens_count += 1
                percentage = (analysed_tokens_count/tokens_count)*100
                total_tokens_count += tokens_count
                total_analysed_tokens_count += analysed_tokens_count
                print("%40s\t%8d\t%8d\t%3.2f%%" % (os.path.basename(filename), tokens_count, analysed_tokens_count, percentage))
                file.close()
        percentage = (total_analysed_tokens_count/total_tokens_count)*100
        time_taken = perf_counter() - start
        print('%40s\t%8d\t%8d\t%3.2f%%' %
              ('Total', total_tokens_count, total_analysed_tokens_count, percentage))
        print('Time taken: %5.3f seconds' % (time_taken))
        self.assertTrue(percentage >= MIN_COVERAGE,
                        'Coverage decreased from %3.2f to %3.2f' % (MIN_COVERAGE, percentage))
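
# Allows running this test module directly.
if __name__ == '__main__':
    unittest.main()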
Example #3
import json
import os
import unittest

from mlmorph import Analyser, Generator

# CURR_DIR and the Struct JSON object hook are assumed to be defined
# elsewhere in this test module.


class AnalyserGeneratorTests(unittest.TestCase):
    generator = Generator()
    analyser = Analyser()

    def setUp(self):
        self.testFile = open(
            os.path.join(CURR_DIR, 'tests.json'), encoding='utf-8')
        self.tests = json.load(self.testFile, object_hook=Struct)

    def tearDown(self):
        self.testFile.close()

    def test_analyse(self):
        print('\t**** Analyse tests ****\t')
        for line, test in enumerate(self.tests, start=1):
            with self.subTest(test.word):
                # Honour the optional per-test 'skip' flag.
                if getattr(test, 'skip', False):
                    continue
                anals = self.analyser.analyse(test.word)
                self.assertTrue(len(anals) != 0,
                                'Analysis failed for ' + test.word)
                print('%3d %s\t<--\t%s' % (line, test.word, anals))
                # Pass if any analysis matches the expected one.
                match = any(anal[0] == test.analysis for anal in anals)
                self.assertTrue(match,
                                'No matching analysis for ' + test.word)

    def test_generate(self):
        print('\t**** Generate tests ****\t')
        for line, test in enumerate(self.tests, start=1):
            with self.subTest(test.word):
                # Honour the optional per-test 'skip' flag.
                if getattr(test, 'skip', False):
                    continue
                gens = self.generator.generate(test.analysis, True)
                self.assertTrue(len(gens) != 0,
                                'Generate failed for ' + test.analysis)
                print('%3d %s\t<--\t%s' % (line, test.analysis, gens))
                # Pass if any generated form matches the expected word.
                match = any(gen[0] == test.word for gen in gens)
                self.assertTrue(match,
                                'No matching generation for ' + test.analysis)
Example #4
import glob
import os
import unittest
from collections import Counter
from time import perf_counter

import regex

from mlmorph import Analyser

# CURR_DIR and is_valid_malayalam_word are assumed to be defined elsewhere
# in this test module.


class CoverageTests(unittest.TestCase):
    analyser = Analyser()

    def test_total_coverage(self):

        tokens_count = 0
        analysed_tokens_count = 0
        missed_words = []
        missed_words_file = open('missed_words.txt', 'w', encoding='utf-8')
        freq_analysis_file = open('freq_analysis.txt', 'w', encoding='utf-8')

        print('\t**** Coverage tests ****\t\n', file=freq_analysis_file)

        start = perf_counter()  # time.clock() was removed in Python 3.8
        for filename in glob.glob(os.path.join(CURR_DIR, "coverage", "*.txt")):
            with open(filename, 'r', encoding='utf-8') as file:
                for line in file:
                    for word in regex.split(r'([\.\s]+)', line):
                        if not is_valid_malayalam_word(word):
                            continue
                        tokens_count += 1
                        analysis = self.analyser.analyse(word, False)
                        if len(analysis) > 0:
                            analysed_tokens_count += 1
                        else:
                            missed_words.append(word)
                            missed_words_file.write(word + '\n')
        missed_words_file.close()
        percentage = (analysed_tokens_count / tokens_count) * 100
        time_taken = perf_counter() - start
        print('Total words: %d \nAnalysed words: %d \nCoverage: %3.2f %% ' %
              (tokens_count, analysed_tokens_count, percentage),
              file=freq_analysis_file)
        print('Time taken: %5.3f seconds' % time_taken,
              file=freq_analysis_file)

        most_common = Counter(missed_words).most_common(500)
        print('Top 500 missed words are:', file=freq_analysis_file)
        for word, freq in most_common:
            print("%4d %s" % (freq, word), end='\n', file=freq_analysis_file)
        freq_analysis_file.close()
Example #5
import glob
import os
import re
import time
import unittest
from collections import Counter

from mlmorph import Analyser

# CURR_DIR is assumed to be defined elsewhere in this test module.


class CoverageTests(unittest.TestCase):
    analyser = Analyser()

    def test_total_coverage(self):
        print('\t**** Coverage tests ****\t\n')
        start = time.perf_counter()  # time.clock() was removed in Python 3.8
        tokens_count = 0
        analysed_tokens_count = 0
        missed_words = []
        # Compile the word filters once instead of on every token.
        non_malayalam = re.compile(r'[a-zA-Z0-9\(\)=\':]')
        single_ligature = re.compile(r'^[അ-ഹൺ-ൿ\.][ാിീുെേൊോൗ്ം]*[.?:]*$')

        for filename in glob.glob(os.path.join(CURR_DIR, "coverage", "*.txt")):
            with open(filename, 'r', encoding='utf-8') as file:
                for line in file:
                    for word in line.split():
                        # Ignore all non-Malayalam words.
                        if non_malayalam.match(word):
                            continue
                        # Ignore all single ligatures.
                        if single_ligature.match(word):
                            continue
                        tokens_count += 1
                        analysis = self.analyser.analyse(word, False)
                        if len(analysis) > 0:
                            analysed_tokens_count += 1
                        else:
                            missed_words.append(word)
        percentage = (analysed_tokens_count / tokens_count) * 100
        time_taken = time.perf_counter() - start
        print('Total words: %d \nAnalysed words: %d \nCoverage: %3.2f %% ' %
              (tokens_count, analysed_tokens_count, percentage))
        print('Time taken: %5.3f seconds' % (time_taken))

        most_common = Counter(missed_words).most_common(250)
        print('Top 250 missed words are:')
        # most_common yields (word, frequency) pairs.
        for word, freq in most_common:
            print('%4d %s' % (freq, word))
Example #6
    def __init__(self):
        self.analyser = Analyser()
        self.common_mistakes = read_common_mistakes()
Example #7
class SpellChecker(object):
    """
    The SpellChecker class implements a spelling specker based on Malayalam
    morphology analyser(mlmorph)
    """
    def __init__(self):
        self.analyser = Analyser()
        self.common_mistakes = read_common_mistakes()

    def strategies(self):
        return [
            "ChilluNormalization",
            "Ykkuka",
            "NtaCorrection",
            "MpaCorrection",
            "VisualSimilarity",
            "PhoneticSimilarity",
            "GeminateConsonants",
            "ViramaInsertion",
            "VowelElongation",
            "VowelShortening",
            "ChilluToConsonantVirama",
            "ConsonantViramaToChillu",
        ]

    def candidates_from_strategies(self, word: str) -> list:
        """
        Generate possible spelling corrections for the provided word using different strategies

        Args:
            word (str): The word for which to calculate candidate spellings
        Returns:
            list: The list of words that are possible candidates. \
                The list is sorted in descending order of candidate scrores. \
                Best candidates are the first candidates in the list.
        """
        # Order of the items in STRATEGIES is important
        STRATEGIES = self.strategies()

        weighted_suggestions = {}
        for class_name in STRATEGIES:
            strategy = getattr(
                importlib.import_module("mlmorph_spellchecker.strategies"),
                class_name)()
            candidates = Suggestion(strategy).suggest(word)
            for candidate in candidates:
                if candidate in weighted_suggestions:
                    continue
                weighted_analysis = self.analyser.analyse(
                    candidate, True, False)
                if len(weighted_analysis) > 0:
                    weighted_suggestions[candidate] = weighted_analysis[0][1]

        # Sort by increasing weight; a lower weight means a better candidate.
        suggestions = sorted(weighted_suggestions.items(), key=lambda t: t[1])
        if len(suggestions) == 0:
            # No suggestions: try splitting the word into two analysable
            # parts, each at least three letters long.
            for index in range(3, len(word) - 3):
                lWord = word[:index]
                rWord = word[index:]
                lAnalysis = self.analyser.analyse(lWord, False)
                rAnalysis = self.analyser.analyse(rWord, False)
                if len(lAnalysis) > 0 and len(rAnalysis) > 0:
                    suggestions.append([lWord + " " + rWord])
                    break
        # Return the words array
        return [suggestion[0] for suggestion in suggestions]

    def is_known_to_analyser(self, word: str) -> bool:
        """
        Check if the given word is known to the mlmorph analyser.

        Args:
            word (str): The word to check
        Returns:
            boolean: Whether the word is known
        """
        analysis = self.analyser.analyse(word, False, True)
        return len(analysis) > 0

    def is_common_mistake(self, word: str) -> bool:
        """
        Check if the given word is a commonly misspelled word, based on our
        database of such words.

        Args:
            word (str): The word to check
        Returns:
            boolean: Whether the word is a known common mistake
        """

        return word in self.common_mistakes

    def spellcheck(self, word: str) -> bool:
        """
        Spellcheck the given word.

        Args:
            word (str): The word to spell check
        Returns:
            boolean: True if the word is spelled correctly. False otherwise.
        """
        common_mistake = self.is_common_mistake(word)
        if common_mistake:
            return False
        return self.is_known_to_analyser(word)

    def candidates(self, word: str) -> list:
        """
        Generate possible spelling corrections for the provided word

        Args:
            word (str): The word for which to calculate candidate spellings
        Returns:
            list: The list of words that are possible candidates, sorted by
                increasing analysis weight, so the best candidates come
                first.
        """
        if self.spellcheck(word):
            # Word is spelled correctly
            return []
        common_mistake = self.is_common_mistake(word)
        if common_mistake:
            return [self.common_mistakes.get(word)]
        return self.candidates_from_strategies(word)
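
# A minimal usage sketch for SpellChecker (the sample word is illustrative;
# the class is importable as shown in Example #8).
if __name__ == '__main__':
    checker = SpellChecker()
    word = 'മലയാളം'
    if checker.spellcheck(word):
        print(word, 'is spelled correctly')
    else:
        print('Suggestions:', checker.candidates(word))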
Example #8
import regex
from flask import Flask, jsonify, request, send_from_directory

from mlmorph import Generator, Analyser
from mlmorph_spellchecker import SpellChecker

app = Flask(__name__,
            static_folder="./public/",
            static_url_path='',
            template_folder="./public")

generator = Generator()
analyser = Analyser()
spellchecker = SpellChecker()


@app.route("/<path:path>", defaults={'path': 'index.html'})
def index(path):
    return send_from_directory('./public', path)


@app.route("/api/analyse", methods=['POST', 'GET'])
def do_analyse():
    text = None
    analyse_results = {}
    if request.method == 'POST':
        text = request.json.get('text')
    else:
        text = request.args.get('text')
    text = (text or '').strip()  # Guard against a missing 'text' parameter.
    words = regex.split(r'(\s+)', text)