예제 #1
0
    def test_hunspell_compatibility(self):
        """
            The words rejected by the speller (check() returns None) must be
            exactly the words listed in the recorded ".bad.results" file.
        """
        from pyspell import speller
        checker = speller(self.aff_file(), self.dic_file())
        checker.init()

        words_path = self.file("hunspell_results/wiki-words.txt")
        with codecs.open(words_path, mode="r+", encoding="utf-8") as fin_words:
            candidates = (line.strip() for line in fin_words)
            # a word counts as rejected when check() yields None
            rejected = set(w for w in candidates if checker.check(w) is None)

        results_path = self.file("hunspell_results/wiki-words.txt.bad.results")
        with codecs.open(results_path, mode="r+", encoding="utf-8") as fin_results:
            rejected_expected = set(line.strip() for line in fin_results)

        # expected to be rejected, but accepted by the speller
        only_expected = rejected_expected - rejected
        print(only_expected)
        self.assertTrue(len(only_expected) == 0)

        # rejected by the speller, but not listed in the recorded results
        only_ours = rejected - rejected_expected
        print(only_ours)
        self.assertTrue(len(only_ours) == 0)
예제 #2
0
    def test_hunspell_compatibility(self):
        """
            Check the word list with the speller and require that the set of
            rejected words equals the set stored in the ".bad.results" file.
        """
        from pyspell import speller
        spell = speller(self.aff_file(), self.dic_file())
        spell.init()

        not_accepted = set()
        source = self.file("hunspell_results/wiki-words.txt")
        with codecs.open(source, mode="r+", encoding="utf-8") as fin_words:
            for raw in fin_words:
                token = raw.strip()
                # anything but None means the word was accepted
                if spell.check(token) is not None:
                    continue
                not_accepted.add(token)

        reference = set()
        recorded = self.file("hunspell_results/wiki-words.txt.bad.results")
        with codecs.open(recorded, mode="r+", encoding="utf-8") as fin_results:
            for raw in fin_results:
                reference.add(raw.strip())

        # both one-sided differences are printed and must be empty
        missing_here = reference - not_accepted
        print(missing_here)
        self.assertTrue(not len(missing_here) != 0)
        extra_here = not_accepted - reference
        print(extra_here)
        self.assertTrue(not len(extra_here) != 0)
예제 #3
0
def morpho_parse(env):
    """
        Gather most used words according to a specific definition
        Note: not tested with larger wikis!

        For each file matched by the configured glob this
          * parses it with simplemorpho.morpho,
          * logs every key of the speller dictionary whose lower-cased form is
            missing from the parsed forms and prints how many keys were missing,
          * collects the unique rule strings of the parsed entries and prints
            them sorted (at most the first 1001), framed by a summary line.

        :param env: configuration mapping; keys read here are "start_dir",
            "src_dir" and "input" ("dir", "morpho_glob", "dictionaries").
    """
    import glob
    from simplemorpho import morpho, word_forms

    input_glob = os.path.join(
        env["start_dir"],
        env["input"]["dir"],
        env["input"]["morpho_glob"]
    )
    for f in glob.glob(input_glob):
        _logger.info(u"Working on [%s]", f)
        m = morpho(f)
        # 0 disables the entry limit in the rules loop below; it is also passed
        # on as max_process (NOTE(review): presumably "no limit" there too).
        max_show = 0
        m.parse(all_forms=True, max_process=max_show)

        ## dictionary words unknown to the morphological database
        src_path = os.path.join(env["start_dir"], env["src_dir"])
        # Register the source tree only once; inserting it for every processed
        # file kept growing sys.path with duplicates.
        if src_path not in sys.path:
            sys.path.insert(0, src_path)
        dictionaries = env["input"]["dictionaries"]
        aff_file = os.path.join(env["start_dir"], env["input"]["dir"], dictionaries + ".aff")
        dic_file = os.path.join(env["start_dir"], env["input"]["dir"], dictionaries + ".dic")
        from pyspell import speller
        s = speller(aff_file, dic_file)
        s.init()
        # Fetch the forms once instead of once per dictionary key.
        known_forms = m.all_forms()
        missing = 0
        for k in s._dic._d.keys():
            if k.lower() not in known_forms:
                _logger.info(u"Word from .dic not found in ma [%s]", k)
                missing += 1
        print("Not found words [%d out of %d]" % (missing, len(s._dic._d)))

        ## unique rules
        uniq_rules = set()
        uniq_rules_right = set()
        # Count the processed entries explicitly: the summary used to print the
        # last enumerate index (one too few), or the unrelated not-found count
        # when there were no forms at all.
        processed = 0
        for idx, (k, v) in enumerate(m.forms().iteritems()):
            if 0 < max_show < idx:
                break
            processed += 1
            r_strs = word_forms.rule_strs(v.rules())
            uniq_rules |= set(r_strs)
            # keep only the part in front of "->"
            uniq_rules_right |= set(x.split("->")[0] for x in r_strs)

        msg = "All rules [%d], unique rules [%d], unique rules right [%d]" % (
            processed, len(uniq_rules), len(uniq_rules_right)
        )
        print(msg)
        for idx, r in enumerate(sorted(uniq_rules)):
            if 1000 < idx:
                break
            print(r)
        print(msg)
예제 #4
0
    def test_arbitrary_affix(self):
        """
            Both sample words must be rejected (check() returns None) when
            the "small" dictionary is used.
        """
        from pyspell import speller
        checker = speller(self.aff_file("small"), self.dic_file("small"))
        checker.init()

        # (word, second positional argument of check(), expected acceptance)
        cases = (
            ("ammm", False, False),
            ("am", False, False),
        )
        for word, flag, should_accept in cases:
            was_accepted = checker.check(word, flag) is not None
            self.assertEqual(should_accept, was_accepted)
예제 #5
0
    def test_arbitrary_affix(self):
        """
            With the "small" dictionary, check() has to return None for each
            of the two sample words.
        """
        from pyspell import speller
        aff_path = self.aff_file("small")
        dic_path = self.dic_file("small")
        spell = speller(aff_path, dic_path)
        spell.init()

        # each row: word, flag handed to check(), whether a result is expected
        table = [
            ("ammm", False, False),
            ("am", False, False),
        ]
        for row in table:
            token, flag, wanted = row
            result = spell.check(token, flag)
            self.assertEqual(wanted, result is not None)
예제 #6
0
    def test_text(self):
        """
            Every whitespace-separated word of the "mini" text must be
            accepted, and check() must return the same value for all of them.
        """
        from pyspell import speller
        checker = speller(self.aff_file("mini"), self.dic_file("mini"))
        checker.init()

        text_path = self.text_file("mini")
        with codecs.open(text_path, mode="r+", encoding="utf-8") as fin:
            # result of the first checked word; later results must equal it
            reference = None
            tokens = (tok for line in fin for tok in line.split())
            for word in tokens:
                self.log(u"Testing [%s]" % word)
                result = checker.check(word)
                if reference is None:
                    reference = result
                self.assertIsNotNone(result)
                self.log(u"+-accepted [%s]" % result)
                self.assertEqual(result, reference)
예제 #7
0
    def test_spell_origin(self):
        """
            Each word of the dictionary file itself must pass check(); the
            same word with "ehmmm" appended must not.
        """
        from pyspell import speller
        from pyspell._utils import line_strip
        checker = speller(self.aff_file(), self.dic_file())
        checker.init()

        with codecs.open(self.dic_file(), mode="r+", encoding="utf-8") as fin:
            # the first line of the file is skipped
            next(fin)
            for idx, line in enumerate(fin):
                # the word is the part in front of the first "/"
                entry = line_strip(line).split("/")[0]
                # errors in dict
                if " " not in entry:
                    self.assertTrue(checker.check(entry))
                    self.assertFalse(checker.check(entry + "ehmmm"))
                    if (idx + 1) % 10000 == 0:
                        self.log("done [%d]" % idx)
예제 #8
0
    def test_spell_origin(self):
        """
            Feed the dictionary's own entries back into the speller: every
            entry must be accepted and entry + "ehmmm" must be refused.
        """
        from pyspell import speller
        from pyspell._utils import line_strip
        aff_path = self.aff_file()
        dic_path = self.dic_file()
        spell = speller(aff_path, dic_path)
        spell.init()

        with codecs.open(self.dic_file(), mode="r+", encoding="utf-8") as fin:
            lines = iter(fin)
            # drop the first line before enumerating the entries
            next(lines)
            for number, raw in enumerate(lines, 1):
                parts = line_strip(raw).split("/")
                token = parts[0]
                # errors in dict
                if " " in token:
                    continue
                self.assertTrue(spell.check(token))
                self.assertFalse(spell.check(token + "ehmmm"))
                # "number" is the zero-based index plus one
                if number % 10000 == 0:
                    self.log("done [%d]" % (number - 1))
예제 #9
0
    def test_text(self):
        """
            Check the "mini" text word by word: no word may be rejected and
            all words must yield the same check() result as the first one.
        """
        from pyspell import speller
        aff_path = self.aff_file("mini")
        dic_path = self.dic_file("mini")
        spell = speller(aff_path, dic_path)
        spell.init()

        with codecs.open(
            self.text_file("mini"), mode="r+", encoding="utf-8"
        ) as fin:
            first_result = None
            for line in fin:
                for token in line.split():
                    self.log(u"Testing [%s]" % token)
                    current = spell.check(token)
                    # remember the first result as the value to compare with
                    first_result = current if first_result is None else first_result
                    self.assertIsNotNone(current)
                    self.log(u"+-accepted [%s]" % current)
                    self.assertEqual(current, first_result)
예제 #10
0
    def test_ignorecase_text(self):
        """
            Check capitalised and lower-cased spellings against the "small"
            dictionary, with the flag passed to check() both True and False.
        """
        from pyspell import speller
        checker = speller(self.aff_file("small"), self.dic_file("small"))
        checker.init()

        # (word, second positional argument of check(), expected acceptance)
        cases = (
            (u"Abcházska", True, True),
            (u"abcházska", True, True),
            (u"Abcházsko", True, True),
            (u"abcházsko", True, True),
            ("Bratislava", False, True),
            ("Bratislave", False, True),
            ("Bratislavy", False, True),
            ("bratislava", False, False),
            ("bratislave", False, False),
            ("bratislavy", False, False),
        )
        for word, flag, should_accept in cases:
            was_accepted = checker.check(word, flag) is not None
            self.assertEqual(should_accept, was_accepted)
예제 #11
0
    def test_ignorecase_text(self):
        """
            With the "small" dictionary: the first group of words is checked
            with the flag set, the second group with the flag unset, where
            only the capitalised spellings are expected to be accepted.
        """
        from pyspell import speller
        aff_path = self.aff_file("small")
        dic_path = self.dic_file("small")
        spell = speller(aff_path, dic_path)
        spell.init()

        # each row: word, flag handed to check(), whether a result is expected
        table = [
            (u"Abcházska", True, True),
            (u"abcházska", True, True),
            (u"Abcházsko", True, True),
            (u"abcházsko", True, True),

            ("Bratislava", False, True),
            ("Bratislave", False, True),
            ("Bratislavy", False, True),
            ("bratislava", False, False),
            ("bratislave", False, False),
            ("bratislavy", False, False),
        ]
        for row in table:
            token, flag, wanted = row
            result = spell.check(token, flag)
            self.assertEqual(wanted, result is not None)
예제 #12
0
파일: do.py 프로젝트: amitdo/pyspell
def unknown_from_wiki(env):
    """
        How many words do we know from a list of most used ones?

        Reads the wiki words file line by line, checks every stripped line
        with the speller and writes each word for which check() returns None
        to the "wiki not found" file, one word per line. Progress statistics
        are logged every env["log_every_n"] words and once more at the end.

        :param env: configuration mapping; keys read here are "start_dir",
            "src_dir", "log_every_n", "input" ("dir", "dictionaries"),
            "output" ("dir", "wiki_words") and "temp" ("dir",
            "wiki_not_found").
        :raises Exception: if the .aff, .dic or wiki words file is missing.
    """
    sys.path.insert(0, os.path.join(env["start_dir"], env["src_dir"]))

    def _progress(cnt, cnt_nf, cnt_nf_f_cap, time_arr):
        """ Format one progress line; appends the current time to time_arr. """
        time_arr.append(time.time())
        # An empty input leaves cnt at 0; divide by 1 then so the percentages
        # read 0.00 instead of raising ZeroDivisionError.
        total = cnt or 1
        # not-found words that do not start with an upper-case letter
        cnt_nf_lower = cnt_nf - cnt_nf_f_cap
        return "in [%.2fs] .. done [%8d] words ... [%5d][%.2f%%] not found ... " \
               "[%5d][%.2f%%] not found lower" % (
                   (time_arr[-1] - time_arr[-2]),
                   cnt,
                   cnt_nf, (100. * cnt_nf / total),
                   cnt_nf_lower, (100. * cnt_nf_lower / total)
               )

    def _flush(fout, words):
        """ Write the buffered words to fout, one per line. """
        # writelines() adds no separators and the words are stripped, so the
        # newline has to be appended here.
        fout.writelines(w + u"\n" for w in words)

    dictionaries = env["input"]["dictionaries"]
    aff_file = os.path.join(env["start_dir"], env["input"]["dir"], dictionaries + ".aff")
    dic_file = os.path.join(env["start_dir"], env["input"]["dir"], dictionaries + ".dic")
    wiki_words_input = os.path.join(env["start_dir"], env["output"]["dir"], env["output"]["wiki_words"])
    log_every_n = env["log_every_n"]
    wiki_not_found_output = os.path.join(env["start_dir"], env["temp"]["dir"], env["temp"]["wiki_not_found"])

    if not os.path.exists(aff_file):
        raise Exception("AFF file not found [%s]" % aff_file)
    if not os.path.exists(dic_file):
        raise Exception("DIC file not found [%s]" % dic_file)
    if not os.path.exists(wiki_words_input):
        raise Exception("Wiki words input not found [%s]" % wiki_words_input)

    from pyspell import speller
    s = speller(aff_file, dic_file)
    s.init()

    ignorecase = False

    pos = 0
    not_found = 0
    not_found_first_cap = 0
    time_arr = [time.time()]

    _logger.info("Checking words...")
    with codecs.open(wiki_not_found_output, mode="w+", encoding="utf-8") as fout:
        with codecs.open(wiki_words_input, mode="r+", encoding="utf-8") as fin:
            not_found_arr = []
            for l in fin:
                pos += 1
                l = l.strip()
                if s.check(l, ignorecase) is None:
                    not_found += 1
                    # l[:1] instead of l[0]: a blank line strips to "" and
                    # must not raise IndexError.
                    if l[:1].isupper():
                        not_found_first_cap += 1
                    not_found_arr.append(l)
                    # Flush in batches. The buffer is reset only after it has
                    # been written; resetting it unconditionally dropped every
                    # word before it could reach the output file.
                    if 10000 < len(not_found_arr):
                        _flush(fout, not_found_arr)
                        not_found_arr = []
                if 0 == pos % log_every_n:
                    _logger.info(_progress(pos, not_found, not_found_first_cap, time_arr))
            _flush(fout, not_found_arr)
    _logger.info(_progress(pos, not_found, not_found_first_cap, time_arr))
예제 #13
0
# -*- coding: utf-8 -*-
# author: jm
import codecs
import test

from pyspell import speller
from pyspell._utils import line_strip


if __name__ == "__main__":
    # Benchmark-style driver: every entry of the dictionary file is checked
    # LOOP times, once as-is and once with "ehmmm" appended.
    aff_file = test.files.aff_file()[0]
    dic_file = test.files.dic_file()[0]

    s = speller(aff_file, dic_file)
    s.init()
    LOOP = 5

    with codecs.open(dic_file, mode="r+", encoding="utf-8") as fin:
        # skip the first line of the dictionary file
        next(fin)
        for i, l in enumerate(fin):
            w = line_strip(l).split("/")[0]
            # errors in dict
            if " " in w:
                continue
            # The repeat counter must not be called `i`: that overwrote the
            # line index, so the progress condition below was never true.
            for _ in range(LOOP):
                s.check(w)
                s.check(w + "ehmmm")
            if 0 == (i + 1) % 10000:
                print("done [%d]" % i)

예제 #14
0
def unknown_from_wiki(env):
    """
        How many words do we know from a list of most used ones?

        Each stripped line of the wiki words file is checked with the speller;
        words for which check() returns None are counted and written to the
        "wiki not found" file, one word per line. A progress line is logged
        every env["log_every_n"] words and once after the last word.

        :param env: configuration mapping; keys read here are "start_dir",
            "src_dir", "log_every_n", "input" ("dir", "dictionaries"),
            "output" ("dir", "wiki_words") and "temp" ("dir",
            "wiki_not_found").
        :raises Exception: if the .aff, .dic or wiki words file is missing.
    """
    sys.path.insert(0, os.path.join(env["start_dir"], env["src_dir"]))

    def _progress(cnt, cnt_nf, cnt_nf_f_cap, time_arr):
        """ Format one progress line; appends the current time to time_arr. """
        time_arr.append(time.time())
        # Guard the percentages against an empty input (cnt == 0), which
        # used to raise ZeroDivisionError in the final report.
        total = cnt or 1
        # not-found words that do not start with an upper-case letter
        cnt_nf_lower = cnt_nf - cnt_nf_f_cap
        return "in [%.2fs] .. done [%8d] words ... [%5d][%.2f%%] not found ... " \
               "[%5d][%.2f%%] not found lower" % (
                   (time_arr[-1] - time_arr[-2]),
                   cnt,
                   cnt_nf, (100. * cnt_nf / total),
                   cnt_nf_lower, (100. * cnt_nf_lower / total)
               )

    def _flush(fout, words):
        """ Write the buffered words to fout, one per line. """
        # The words are stripped and writelines() adds no line separators,
        # so each word gets its newline here.
        fout.writelines(w + u"\n" for w in words)

    dictionaries = env["input"]["dictionaries"]
    aff_file = os.path.join(env["start_dir"], env["input"]["dir"], dictionaries + ".aff")
    dic_file = os.path.join(env["start_dir"], env["input"]["dir"], dictionaries + ".dic")
    wiki_words_input = os.path.join(env["start_dir"], env["output"]["dir"], env["output"]["wiki_words"])
    log_every_n = env["log_every_n"]
    wiki_not_found_output = os.path.join(env["start_dir"], env["temp"]["dir"], env["temp"]["wiki_not_found"])

    if not os.path.exists(aff_file):
        raise Exception("AFF file not found [%s]" % aff_file)
    if not os.path.exists(dic_file):
        raise Exception("DIC file not found [%s]" % dic_file)
    if not os.path.exists(wiki_words_input):
        raise Exception("Wiki words input not found [%s]" % wiki_words_input)

    from pyspell import speller
    s = speller(aff_file, dic_file)
    s.init()

    ignorecase = False

    pos = 0
    not_found = 0
    not_found_first_cap = 0
    time_arr = [time.time()]

    _logger.info("Checking words...")
    with codecs.open(wiki_not_found_output, mode="w+", encoding="utf-8") as fout:
        with codecs.open(wiki_words_input, mode="r+", encoding="utf-8") as fin:
            not_found_arr = []
            for l in fin:
                pos += 1
                l = l.strip()
                if s.check(l, ignorecase) is None:
                    not_found += 1
                    # Slice instead of index so an empty (blank) line does
                    # not raise IndexError.
                    if l[:1].isupper():
                        not_found_first_cap += 1
                    not_found_arr.append(l)
                    # Write the buffer out once it is large enough and only
                    # then start a new one; the previous unconditional reset
                    # threw every collected word away.
                    if 10000 < len(not_found_arr):
                        _flush(fout, not_found_arr)
                        not_found_arr = []
                if 0 == pos % log_every_n:
                    _logger.info(_progress(pos, not_found, not_found_first_cap, time_arr))
            _flush(fout, not_found_arr)
    _logger.info(_progress(pos, not_found, not_found_first_cap, time_arr))
예제 #15
0
 def test_inspect(self):
     """ Call speller.inspect() with a callback that forwards to self.log. """
     from pyspell import speller
     aff_path = self.aff_file()
     dic_path = self.dic_file()
     checker = speller(aff_path, dic_path)
     checker.init()

     def _forward(item):
         # hand whatever inspect() reports over to the test log
         return self.log(item)

     checker.inspect(_forward)
예제 #16
0
# -*- coding: utf-8 -*-
# author: jm
import codecs
import test

from pyspell import speller
from pyspell._utils import line_strip

if __name__ == "__main__":
    # Benchmark-style driver: checks each dictionary entry LOOP times, both
    # unchanged and with the suffix "ehmmm" appended.
    aff_file = test.files.aff_file()[0]
    dic_file = test.files.dic_file()[0]

    s = speller(aff_file, dic_file)
    s.init()
    LOOP = 5

    with codecs.open(dic_file, mode="r+", encoding="utf-8") as fin:
        # skip the first line of the dictionary file
        next(fin)
        for i, l in enumerate(fin):
            w = line_strip(l).split("/")[0]
            # errors in dict
            if " " in w:
                continue
            # Use a throwaway name for the repeat counter; reusing `i` reset
            # the line index to LOOP - 1 and silenced the progress output.
            for _ in range(LOOP):
                s.check(w)
                s.check(w + "ehmmm")
            if 0 == (i + 1) % 10000:
                print("done [%d]" % i)
예제 #17
0
 def test_inspect(self):
     """ Initialise a speller and pass every item inspect() reports to self.log. """
     from pyspell import speller
     spell = speller(
         self.aff_file(),
         self.dic_file(),
     )
     spell.init()

     def _to_log(entry):
         # same contract as the callback inspect() is given: log and return
         return self.log(entry)

     spell.inspect(_to_log)