Пример #1
0
def words_train(in_file, word_file, graph_file):

    # Can only use a exists word_file to make graph_file
    if in_file is not None:
        #Auto create words from in_file
        file_size = os.path.getsize(in_file)
        word_dict = None
        if file_size > 3*1024*1024:
            # A little more quick but inaccurate than WordDict1
            word_dict = WordDict2()
            print >> sys.stderr, 'please wait, getting words from file', in_file
        else:
            word_dict = WordDict1()
        with codecs.open(in_file, 'r', 'utf-8') as file:
            for line in file:
                for sentence in re_line.split(line):
                    word_dict.learn(sentence)
        word_dict.learn_flush()
        print >> sys.stderr, 'get all words, save word to file', word_file
        
        word_dict.save_to_file(word_file)
        print >> sys.stderr, 'save all words completely, create word graphp', graph_file

    words = []
    with codecs.open(word_file,'r','utf-8') as file:
        for line in file:
            tokens = line.split(" ")
            if len(tokens) >= 2:
                words.append(tokens[0].strip())
    words = sorted(words)

    whoosh_spelling.wordlist_to_graph_file(words, graph_file)
    print >> sys.stderr, 'words_train ok'
Пример #2
0
def test_find_self():
    wordlist = sorted(u("book bake bike bone").split())
    st = RamStorage()
    f = st.create_file("test")
    spelling.wordlist_to_graph_file(wordlist, f)

    gr = fst.GraphReader(st.open_file("test"))
    gc = spelling.GraphCorrector(gr)
    assert gc.suggest("book")[0] != "book"
    assert gc.suggest("bake")[0] != "bake"
    assert gc.suggest("bike")[0] != "bike"
    assert gc.suggest("bone")[0] != "bone"
Пример #3
0
def test_find_self():
    wordlist = sorted(u("book bake bike bone").split())
    st = RamStorage()
    f = st.create_file("test")
    spelling.wordlist_to_graph_file(wordlist, f)

    gr = fst.GraphReader(st.open_file("test"))
    gc = spelling.GraphCorrector(gr)
    assert gc.suggest("book")[0] != "book"
    assert gc.suggest("bake")[0] != "bake"
    assert gc.suggest("bike")[0] != "bike"
    assert gc.suggest("bone")[0] != "bone"
Пример #4
0
def words_to_corrector(words):
    st = RamStorage()
    f = st.create_file("test")
    spelling.wordlist_to_graph_file(words, f)
    f = st.open_file("test")
    return spelling.GraphCorrector(fst.GraphReader(f))
Пример #5
0
from whoosh import spelling
from whoosh.filedb.filestore import FileStorage

wordfile = open("/Users/amckenzie/Documents/data/scowl-7.1/final/english-words.60")

# Use a Storage object to get a file to write the graph into 
st = FileStorage("/Users/amckenzie/Tools/pythonScripts") 
f = st.create_file("wordgraph") 

# Write a graph of the words into the file 
spelling.wordlist_to_graph_file(wordfile, f) 

# Create a graph reader from the file and wrap it with a corrector 
f = st.open_file("wordgraph") 
gr = fst.GraphReader(f) 
cor = spelling.GraphCorrector(gr) 

# See docs for whoosh.spelling.Corrector.suggest() 
cor.suggest("aple")
Пример #6
0
def words_to_corrector(words):
    st = RamStorage()
    f = st.create_file("test")
    spelling.wordlist_to_graph_file(words, f)
    f = st.open_file("test")
    return spelling.GraphCorrector(fst.GraphReader(f))
def _create_word_graph_file(name, file_storage, word_set):
    """Create a word graph file and open it in memory."""
    word_graph_file = file_storage.create_file(name)
    spelling.wordlist_to_graph_file(sorted(list(word_set)),
                                    word_graph_file)
    return copy_to_ram(file_storage).open_file(name)