Example #1
def find_missing_dict(lang):
    import enchant  # the snippet needs pyenchant in scope
    missing = []
    try:
        enchant.request_dict(lang['aspell'])
    except Exception:
        missing.append(('Dictionary', '(none)', {
            'debian': ('aspell-%s' % lang['aspell']),
            'fedora': ('aspell-%s' % lang['aspell']),
            'gentoo': ('aspell-%s' % lang['aspell']),
            'linuxmint': ('aspell-%s' % lang['aspell']),
            'ubuntu': ('aspell-%s' % lang['aspell']),
        }))
    return missing
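A minimal way to exercise it, assuming a lang mapping shaped like paperwork's, where the 'aspell' key holds a two-letter language code (the input below is hypothetical):

# Hypothetical input; paperwork passes dicts carrying an 'aspell' key.
lang = {'aspell': 'fr'}
print(find_missing_dict(lang))
# -> [] if an aspell French dictionary is installed, otherwise a single
#    ('Dictionary', '(none)', {distro: package}) tuple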
Example #2
File: spell.py Project: prologic/kdb
    def __init__(self, *args, **kwargs):
        super(Spell, self).__init__(*args, **kwargs)

        self.language = DEFAULT_LANGUAGE
        self.dictionary = request_dict(self.language)

        Commands().register(self)
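For context: enchant.request_dict(tag) asks pyenchant's default broker for a Dict object for the given language tag, and check()/suggest() are the two calls most of these examples build on. A minimal sketch:

import enchant

d = enchant.request_dict("en_US")
print(d.check("hello"))       # True
print(d.suggest("helo")[:3])  # a few candidates; the exact list depends on the backend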
Example #3
def find_missing_dict(lang):
    if os.name == "nt":
        return []
    import enchant
    missing = []
    try:
        enchant.request_dict(lang['aspell'])
    except:  # noqa: E722
        missing.append(('Dictionary', '(none)', {
            'debian': ('aspell-%s' % lang['aspell']),
            'fedora': ('aspell-%s' % lang['aspell']),
            'gentoo': ('aspell-%s' % lang['aspell']),
            'linuxmint': ('aspell-%s' % lang['aspell']),
            'ubuntu': ('aspell-%s' % lang['aspell']),
        }))
    return missing
Example #4
File: util.py Project: rzr/paperwork
def check_spelling(spelling_lang, txt):
    """
    Check the spelling in the text, and compute a score. The score is the
    number of words correctly (or almost correctly) spelled, minus the number
    of mispelled words. Words "almost" correct remains neutral (-> are not
    included in the score)

    Returns:
        A tuple : (fixed text, score)
    """
    _ENCHANT_LOCK.acquire()
    try:
        # Maximum distance from the first suggestion from python-enchant

        words_dict = enchant.request_dict(spelling_lang)
        try:
            tknzr = enchant.tokenize.get_tokenizer(spelling_lang)
        except enchant.tokenize.TokenizerNotFoundError:
            # Fall back to default tokenization if no match for 'lang'
            tknzr = enchant.tokenize.get_tokenizer()

        score = 0
        offset = 0
        for (word, word_pos) in tknzr(txt):
            if len(word) < _MIN_WORD_LEN:
                continue
            if words_dict.check(word):
                # immediately correct words are a really good hint for
                # orientation
                score += 100
                continue
            suggestions = words_dict.suggest(word)
            if len(suggestions) <= 0:
                # this word is useless. It may even indicate a bad orientation
                score -= 10
                continue
            main_suggestion = suggestions[0]
            lv_dist = nltk.metrics.distance.edit_distance(word, main_suggestion)
            if lv_dist > _MAX_LEVENSHTEIN_DISTANCE:
                # hm, this word looks like it's in a bad shape
                continue

            logging.debug("Spell checking: Replacing: %s -> %s"
                          % (word, main_suggestion))

            # let's replace the word by its suggestion

            pre_txt = txt[:word_pos + offset]
            post_txt = txt[word_pos + len(word) + offset:]
            txt = pre_txt + main_suggestion + post_txt
            offset += (len(main_suggestion) - len(word))

            # fixed words may be a good hint for orientation
            score += 5

        return (txt, score)
    finally:
        _ENCHANT_LOCK.release()
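check_spelling() leans on module-level names from paperwork's util.py. A standalone harness would look roughly like this; the constant values are assumptions taken from the sibling examples below, where they appear inline:

import logging
import threading

import enchant
import enchant.tokenize
import nltk.metrics.distance

# Assumed module-level state (values as in Examples #6 and #18):
_ENCHANT_LOCK = threading.Lock()
_MIN_WORD_LEN = 4
_MAX_LEVENSHTEIN_DISTANCE = 1

fixed, score = check_spelling("en_US", "Thiss is a short test")
print(score, fixed)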
Example #5
def check_spelling(spelling_lang, txt):
    """
    Check the spelling in the text, and compute a score. The score is the
    number of words correctly (or almost correctly) spelled, minus the number
    of mispelled words. Words "almost" correct remains neutral (-> are not
    included in the score)

    Returns:
        A tuple : (fixed text, score)
    """
    if os.name == "nt":
        assert (not "check_spelling() not available on Windows")
        return
    with _ENCHANT_LOCK:
        # Maximum distance from the first suggestion from python-enchant

        words_dict = enchant.request_dict(spelling_lang)
        try:
            tknzr = enchant.tokenize.get_tokenizer(spelling_lang)
        except enchant.tokenize.TokenizerNotFoundError:
            # Fall back to default tokenization if no match for 'lang'
            tknzr = enchant.tokenize.get_tokenizer()

        score = 0
        offset = 0
        for (word, word_pos) in tknzr(txt):
            if len(word) < _MIN_WORD_LEN:
                continue
            if words_dict.check(word):
                # immediately correct words are a really good hint for
                # orientation
                score += 100
                continue
            suggestions = words_dict.suggest(word)
            if (len(suggestions) <= 0):
                # this word is useless. It may even indicate a bad orientation
                score -= 10
                continue
            main_suggestion = suggestions[0]
            lv_dist = Levenshtein.distance(word, main_suggestion)
            if (lv_dist > _MAX_LEVENSHTEIN_DISTANCE):
                # hm, this word looks like it's in a bad shape
                continue

            logger.debug("Spell checking: Replacing: %s -> %s" %
                         (word, main_suggestion))

            # let's replace the word by its suggestion

            pre_txt = txt[:word_pos + offset]
            post_txt = txt[word_pos + len(word) + offset:]
            txt = pre_txt + main_suggestion + post_txt
            offset += (len(main_suggestion) - len(word))

            # fixed words may be a good hint for orientation
            score += 5

        return (txt, score)
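Example #5 is the same function with two substitutions: the python-Levenshtein package instead of nltk for the edit distance, and the lock held through a with block. The two locking forms are equivalent:

_ENCHANT_LOCK.acquire()   # Example #4 style
try:
    ...                   # work while holding the lock
finally:
    _ENCHANT_LOCK.release()

with _ENCHANT_LOCK:       # Example #5 style: release is handled automatically
    ...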
Example #6
File: util.py Project: chrisz/paperwork
def check_spelling(ocr_lang, txt):
    """
    Check the spelling in the text, and compute a score. The score is the
    number of words correctly (or almost correctly) spelled, minus the number of
    mispelled words. Words "almost" correct remains neutral (-> are not included
    in the score)

    Returns:
        A tuple : (fixed text, score)
    """
    # Maximum distance from the first suggestion from python-enchant
    MAX_LEVENSHTEIN_DISTANCE = 1
    MIN_WORD_LEN = 4

    # TODO(Jflesch): We are assuming here that we can figure out the best
    # dictionary based on the 3 letters OCR lang. This is a bad assumption
    try:
        language = pycountry.languages.get(terminology=ocr_lang[:3])
    except KeyError:
        language = pycountry.languages.get(bibliographic=ocr_lang[:3])
    spelling_lang = language.alpha2

    words_dict = enchant.request_dict(spelling_lang)
    try:
        tknzr = enchant.tokenize.get_tokenizer(spelling_lang)
    except enchant.tokenize.TokenizerNotFoundError:
        # Fall back to default tokenization if no match for 'lang'
        tknzr = enchant.tokenize.get_tokenizer()

    score = 0
    offset = 0
    for (word, word_pos) in tknzr(txt):
        if words_dict.check(word):
            score += 1
            continue
        if len(word) < MIN_WORD_LEN:
            continue
        suggestions = words_dict.suggest(word)
        if (len(suggestions) <= 0):
            score -= 1
            continue
        main_suggestion = suggestions[0]
        lv_dist = Levenshtein.distance(word, main_suggestion)
        if (lv_dist > MAX_LEVENSHTEIN_DISTANCE):
            continue

        print "Spell checking: Replacing: %s -> %s" % (word, main_suggestion)

        # let's replace the word by its suggestion

        pre_txt = txt[:word_pos + offset]
        post_txt = txt[word_pos + len(word) + offset:]
        txt = pre_txt + main_suggestion + post_txt
        offset += (len(main_suggestion) - len(word))

    return (txt, score)
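The terminology=/bibliographic= keywords come from an old pycountry release. With current pycountry the same three-letter-to-two-letter lookup would look roughly like this (a sketch, not the code above):

import pycountry

language = pycountry.languages.get(alpha_3="fra")  # ISO 639-3 code from the OCR lang
print(language.alpha_2)  # 'fr', the tag handed to enchant.request_dict()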
Example #7
File: deps.py Project: jflesch/paperwork
def find_missing_dict(lang):
    if os.name == "nt":
        return []
    import enchant
    missing = []
    try:
        enchant.request_dict(lang['aspell'])
    except:  # noqa: E722
        missing.append(
            (
                'Dictionary', '(none)',
                {
                    'debian': ('aspell-%s' % lang['aspell']),
                    'fedora': ('aspell-%s' % lang['aspell']),
                    'gentoo': ('aspell-%s' % lang['aspell']),
                    'linuxmint': ('aspell-%s' % lang['aspell']),
                    'ubuntu': ('aspell-%s' % lang['aspell']),
                }
            )
        )
    return missing
Example #8
def correct_spelling(word, lang=LANG_EN):
    """
    :param word: the word requiring correction
    :param lang: language of the word
    :return: the most probable correct version of the input word
    """
    import enchant
    d = enchant.request_dict(lang)
    if not d.check(word):
        return d.suggest(word)[0]

    return word
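Hypothetical usage, assuming LANG_EN is a tag such as "en_US". Note that the function indexes suggest(word)[0] without a guard, so a misspelt word with no suggestions at all would raise IndexError:

print(correct_spelling("helo"))   # first suggestion, e.g. 'hello' (backend-dependent)
print(correct_spelling("hello"))  # 'hello' - already correct, returned unchanged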
Example #9
def runThruDictionary(words):
    # check against en_US spelling
    endict = enchant.request_dict("en_US")
    dictWords = []
    for w in words:
        if endict.check(w):
            dictWords.append(w)
        else:
            suggestions = endict.suggest(w)
            # just append the first one
            if len(suggestions) > 0:
                dictWords.append(suggestions[0])
            # else we have unrecoverable garbage?                
    return dictWords
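A quick check, for reference (the corrected output depends on the enchant backend's suggestion order):

print(runThruDictionary(["teh", "quick", "brwn", "fox"]))
# e.g. ['the', 'quick', 'brown', 'fox']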
Example #10
    def spell_check(self, post, lang):
        """ Check spellings for the given post and given language. """

        try:
            dictionary = enchant.request_dict(lang)
            checker = SpellChecker(lang, filters=[EmailFilter, URLFilter])
            checker.set_text(post.text(lang=lang, strip_html=True))
            words = [error.word for error in checker]
            words = [
                word for word in words if not dictionary.check(word)
            ]
            LOGGER.notice(
                'Mis-spelt words in %s: %s' % (
                    post.fragment_deps(lang), ', '.join(words)
                )
            )

        except enchant.DictNotFoundError:
            LOGGER.notice('No dictionary found for %s' % lang)
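The method assumes these pyenchant imports (a sketch): SpellChecker walks the text and yields one error object per misspelt word, while the filters skip e-mail addresses and URLs.

import enchant
from enchant.checker import SpellChecker
from enchant.tokenize import EmailFilter, URLFilter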
Example #11
def correct_words(word_list):
    """Takes a list of strings and tries to correct them so that they are valid words (i.e. alphabetical).
    Correction Procedure: (1) make lowercase -> (2) delete string if not alphabetical (allowed to contain an
    apostrophe or punctuation at the end) -> (3) remove trailing punctuation -> (4) break up contractions,
    split and take original word -> (5) remove possessive 's -> (6) check if word is in dictionary, if not,
    attempt to correct it to most likely correct word, and if none can be found, delete it.
    Returns list of corrected words"""
    english_dict = enchant.request_dict("en_US")  # open up the english dictionary
    index = 0
    while index <= len(word_list) - 1:
        # make lowercase
        word_list[index] = word_list[index].lower()
        # remove values if they aren't words (alphabetical, can end with punctuation)
        if not re.match("^[a-z']+[?,;.!]*$", word_list[index]):
            del word_list[index]
            index -= 1
        # it is a word - clean it up.
        else:
            # remove punctuation if it appears at the end of a word.
            word_list[index] = word_list[index].rstrip(string.punctuation)
            # break up contractions with the contractions library (picks the most likely expansion, which can be wrong)
            word_list[index] = contractions.fix(word_list[index])
            # split and remove contraction
            word_list[index] = word_list[index].split()[0]
            # remove "'s" at the ends of words (contraction library doesn't remove possessive 's)
            word_list[index] = re.sub(r"'s$", '', word_list[index])
            # spell check words
            if not english_dict.check(word_list[index]):
                suggestions = english_dict.suggest(word_list[index])
                if len(suggestions) > 0:
                    word_list[index] = suggestions[0].lower().split()[0]
                else:
                    del word_list[index]
                    index -= 1
        index += 1
    # final cleanup outside the loop: removing items from a list while
    # iterating over it would skip elements, so rebuild the list instead
    word_list = [item for item in word_list if item.isalpha()]
    return word_list
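A sketch of a call, with hedged output since suggestions vary by dictionary: "Don't" is expanded to "do not" and trimmed to "do", trailing punctuation is stripped, and unfixable strings are dropped.

print(correct_words(["Don't", "RUN!", "dogs!", "hte"]))
# e.g. ['do', 'run', 'dogs', 'the']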
Example #12
    def __init__(self, document, parent=None):
        super(Spellcheck, self).__init__(parent)

        self.createUI()
        if document is None:
            return
        else:
            self.doc = document.toPlainText()
        # copy the document text and strip out HTML, URL's and Email addresses
        tokens = get_tokenizer("en_US", chunkers=(HTMLChunker,), filters=[EmailFilter, URLFilter])
        self.editDoc = [] # tuples go into this list
        for word in tokens(self.doc):
            self.editDoc.append(word)
        self.wordsToCheck = dict((t[0], i) for i, t in enumerate(self.editDoc))
        # >>> Output self.wordsToCheck , unit Test with 10 cases
        self.wordlist = enchant.request_dict("en_GB")
        self.misspeltList = []
        for key in self.wordsToCheck.keys():
            self.checkWord(key)
        # >>> Plonk a test here
        
        self.highlightMisspelt(self.misspeltList[Spellcheck.index:])
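For reference, the tokenizer used above yields (word, offset) tuples, with markup removed by the HTML chunker and e-mail addresses and URLs filtered out. A minimal sketch:

from enchant.tokenize import get_tokenizer, HTMLChunker, EmailFilter, URLFilter

tknzr = get_tokenizer("en_US", chunkers=(HTMLChunker,),
                      filters=[EmailFilter, URLFilter])
for word, offset in tknzr("see <b>teh</b> docs at http://example.com"):
    print(word, offset)  # 'see', 'teh', 'docs', each with its position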
Example #13
import pygame
#from sys import argv
from gtts import gTTS
from enchant import request_dict
try:
    import Image
except ImportError:
    from PIL import Image
import pytesseract
#from tesseract import image_to_string
sentence = pytesseract.image_to_string(Image.open('saying.jpg'))
gb_dict = request_dict('en_gb')
us_dict = request_dict('en_us')
new = ''.join(sentence)
sen = new.split()

for word in sen:
    if gb_dict.check(word) or us_dict.check(word):
        print(word, " ")
        tts = gTTS(text=word, lang='en')
        tts.save("result.mp3")
        file = 'result.mp3'
        pygame.init()
        pygame.mixer.init()
        pygame.mixer.music.load(file)
        pygame.mixer.music.play()
        while pygame.mixer.music.get_busy():
            pygame.time.Clock().tick(10)

    else:
        print  " %s " % \
              sep=',',
              encoding='utf-8')
    print(df)
    #Summarizion
    text = ''
    for sent in df['original_sents'].values:
        text += '.' + sent

    summarize = Summarization(text, None, senti)
    final_summary1, final_summary2, final_summary3, neg_final_summary1, neg_final_summary2, neg_final_summary3, counts, eigen_explo = summarize.get_summaries(
    )
    print(final_summary1, final_summary2, final_summary3, neg_final_summary1, neg_final_summary2, neg_final_summary3, counts, eigen_explo)


if __name__ == "__main__":
    d = spell.request_dict("en_US")
    nlp = spacy.load('en')

    senti = SentimentAnalysis()

    start_time = time.time()
    filenames = [
        f for f in listdir(input_entity_files)
        if isfile(join(input_entity_files, f))
    ]
    #Parellel on CPU cores
    #Parallel(n_jobs=cpu_count() - 1, verbose=10, backend="multiprocessing", batch_size="auto")(delayed(processFiles)(fileName,input_entity_files) for fileName in filenames)
    #for (dirpath, dirnames, filenames) in walk(input_entity_files):
    for file in filenames:
        processFiles(file, input_entity_files)
Example #15
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import enchant
d = enchant.request_dict("fr_FR")
#x= "en organisation le savent — lk est d'une extrême banalité. Des person fait bonjour hello"
x = "eu l’occasion d‘évo"

# 1. Eliminate the ordinary errors. There are two kinds: errors we have to
#    delete and errors we have to replace. The first kind is deleted first.
# 2. The second kind can be corrected before checking the word with enchant.
# 3. Check the word.
# 4. Replace the word with its abbreviation.

# These are the first kind of error; they are deleted first.
error_ens = {''}

file = open("error_1")
while 1:
    line = file.readline()
    if str(line) != '':
        error_ens.add(str(line).strip())
    if not line:
        break

replace_dic = {'bonjour': 'bg'}
file = open("replace")
while 1:
    line1 = file.readline()
    if not line1:
        break
    if str(line1).strip() != '':
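The snippet is cut off inside the second loop. Assuming each line of the "replace" file holds a "word abbreviation" pair, which matches the seeded {'bonjour': 'bg'} entry but is not shown above, the loop might continue like this:

# Hypothetical reconstruction of the truncated loop; the file format is assumed.
replace_dic = {'bonjour': 'bg'}
with open("replace") as f:
    for line1 in f:
        parts = line1.split()
        if len(parts) == 2:
            replace_dic[parts[0]] = parts[1]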
Example #16
import enchant
import numpy as np
import sys
import json  # used by json.load() at the end of the snippet
from nltk import word_tokenize, pos_tag, sent_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models, similarities
from collections import Counter
from numpy import genfromtxt

questions = []
vocabQuesCount = Counter()

tokenizer = RegexpTokenizer(r'\w+')
stop = stopwords.words('english')
dictionary = enchant.request_dict("en_US")
stemmer = PorterStemmer()


def clean_ques(ques):
    ques = ques.lower()
    ques = tokenizer.tokenize(ques)
    # for i in range(len(ques)):
    # 	if not enchant.dict_exists(ques[i]):
    # 		ques[i] = dictionary.suggest(ques[i])[0]
    ques = [stemmer.stem(q) for q in ques]
    return ques


with open('../data/clean.json') as data_file:
    data = json.load(data_file)
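A quick call, for reference; the output is stemmed lowercase tokens (stems come from PorterStemmer, so longer words get truncated):

print(clean_ques("What is the best way to learn Python?"))
# e.g. ['what', 'is', 'the', 'best', 'way', 'to', 'learn', 'python']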
Example #17
import csv
import matplotlib.pyplot as plt
import enchant

enchant.request_dict("en_US")                        # get the dictionary

path_male = 'ANEW/male.csv'
path_female = 'ANEW/female.csv'
male_words = []
male_valence_mean = []
male_valence_std = []
male_arousal_mean = []
male_arousal_std = []
female_words = []
female_valence_mean = []
female_valence_std = []
female_arousal_mean = []
female_arousal_std = []

def create_data():
    """
    Adds all the data from csv files to the respective lists
    """
    global male_words
    with open(path_male) as f:
        mreader = csv.reader(f)
        for row in mreader:
            male_words.append(row[0])
            male_valence_mean.append(float(row[1]))
            male_valence_std.append(float(row[2]))
            male_arousal_mean.append(float(row[3]))
Example #18
def check_spelling(ocr_lang, txt):
    """
    Check the spelling in the text, and compute a score. The score is the
    number of words correctly (or almost correctly) spelled, minus the number of
    mispelled words. Words "almost" correct remains neutral (-> are not included
    in the score)

    Returns:
        A tuple : (fixed text, score)
    """
    global _ENCHANT_LOCK

    _ENCHANT_LOCK.acquire()
    try:
        # Maximum distance from the first suggestion from python-enchant
        MAX_LEVENSHTEIN_DISTANCE = 1
        MIN_WORD_LEN = 4

        # TODO(Jflesch): We are assuming here that we can figure out the best
        # dictionary based on the 3 letters OCR lang. This is a bad assumption
        try:
            language = pycountry.languages.get(terminology=ocr_lang[:3])
        except KeyError:
            language = pycountry.languages.get(bibliographic=ocr_lang[:3])
        spelling_lang = language.alpha2

        words_dict = enchant.request_dict(spelling_lang)
        try:
            tknzr = enchant.tokenize.get_tokenizer(spelling_lang)
        except enchant.tokenize.TokenizerNotFoundError:
            # Fall back to default tokenization if no match for 'lang'
            tknzr = enchant.tokenize.get_tokenizer()

        score = 0
        offset = 0
        for (word, word_pos) in tknzr(txt):
            if len(word) < MIN_WORD_LEN:
                continue
            if words_dict.check(word):
                # immediately correct words are a really good hint for orientation
                score += 100
                continue
            suggestions = words_dict.suggest(word)
            if (len(suggestions) <= 0):
                # this word is useless. It may even indicate a bad orientation
                score -= 10
                continue
            main_suggestion = suggestions[0]
            lv_dist = Levenshtein.distance(word, main_suggestion)
            if (lv_dist > MAX_LEVENSHTEIN_DISTANCE):
                # hm, this word looks like it's in a bad shape
                continue

            print "Spell checking: Replacing: %s -> %s" % (word,
                                                           main_suggestion)

            # let's replace the word by its suggestion

            pre_txt = txt[:word_pos + offset]
            post_txt = txt[word_pos + len(word) + offset:]
            txt = pre_txt + main_suggestion + post_txt
            offset += (len(main_suggestion) - len(word))

            # fixed words may be a good hint for orientation
            score += 5

        return (txt, score)
    finally:
        _ENCHANT_LOCK.release()
Example #20
import string
import collections
import copy

import enchant

from utils import *
from constants import *
from letter import Letter

enchant = enchant.request_dict("en_US")  # note: rebinds the name 'enchant' from the module to a Dict

alphabet = list(string.ascii_lowercase)

transDict = {'a': "_", 'b': "_", 'c': "_", 'd': "_", 'e': "_", 'f': "_", 'g': "_", 'h': "_",
             'i': "_", 'j': "_", 'k': "_", 'l': "_", 'm': "_", 'n': "_", 'o': "_", 'p': "_",
             'q': "_", 'r': "_", 's': "_", 't': "_", 'u': "_", 'v': "_", 'w': "_", 'x': "_",
             'y': "_", 'z': "_", " ": " "}

transDict_de = {'a': "_", 'b': "_", 'c': "_", 'd': "_", 'e': "_", 'f': "_", 'g': "_", 'h': "_", 'i': "_",
             'j': "_", 'k': "_", 'l': "_", 'm': "_", 'n': "_", 'o': "_", 'p': "_", 'q': "_", 'r': "_",
             's': "_", 't': "_", 'u': "_", 'v': "_", 'w': "_", 'x': "_", 'y': "_", 'z': "_", 'ä': "_",
             'ö': "_", 'ü': "_", 'ß': "_", " ": " "}


def decrypt(cipher, dictionary):
    word_text = ''

    for character in cipher:
        if character in dictionary:
            word_text += dictionary[character]
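The per-character mapping decrypt() does by hand is also what str.translate() provides. A sketch with a toy table (the values here are made up for illustration):

table = str.maketrans({'a': 'x', 'b': 'y', ' ': ' '})
print("ab ba".translate(table))  # -> 'xy yx'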
Example #21
 def fetchSuggestion(keyword):
     d = enchant.request_dict("en_US")
     return map(lambda w: (w, w), d.suggest(keyword))
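Under Python 3, map() returns a lazy iterator, so a caller has to materialize it. Treating fetchSuggestion as a plain function and assuming enchant is imported:

pairs = list(fetchSuggestion("speling"))
print(pairs[:2])  # e.g. [('spelling', 'spelling'), ('spieling', 'spieling')]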
Example #22
import nltk 
import csv, collections
from nltk import word_tokenize, pos_tag, sent_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import enchant
import numpy as np
from numpy import genfromtxt
import sys

tokenizer = RegexpTokenizer(r'\w+')
stop = stopwords.words('english')
dictionary = enchant.request_dict("en_US")
stemmer = SnowballStemmer("english")

def clean_ques(ques):
    ques = ques.lower()
    ques = tokenizer.tokenize(ques)
    for i in range(len(ques)):
        # dict_exists() tests language tags, not words; use the
        # dictionary's check() method to test a word
        if not dictionary.check(ques[i]):
            ques[i] = dictionary.suggest(ques[i])[0]
    #ques = [q for q in ques if q not in stop]
    ques = [stemmer.stem(q) for q in ques if q not in stop]
    return ques
Example #23
 def setDictionary(self):
     import dictLangDialog
     dialog = dictLangDialog.DictLangDialog()
     try:
         dialog.show()
         lang = dialog.selectedLang
         if lang == "United States" and enchant.dict_exists("en_US"):
             self.wordlist = enchant.request_dict("en_US")
         elif lang == "Chinese" and enchant.dict_exists("zh"):
             self.wordlist = enchant.request_dict("zh")
         elif lang == "Russian" and enchant.dict_exists("ru"):
             self.wordlist = enchant.request_dict("ru")
         elif lang == "German" and enchant.dict_exists("de_DE"):
             self.wordlist = enchant.request_dict("de_DE")
         elif lang == "French" and enchant.dict_exists("fr_FR"):
             self.wordlist = enchant.request_dict("fr_FR")
         elif lang == "Norwegian" and enchant.dict_exists("no"):
             self.wordlist = enchant.request_dict("no")
         elif lang == "Zulu" and enchant.dict_exists("zu"):
             self.wordlist = enchant.request_dict("zu")
         elif lang == "Arabic" and enchant.dict_exists("ar"):
             self.wordlist = enchant.request_dict("ar")
         elif lang == "Hindi" and enchant.dict_exists("hi"):
             self.wordlist = enchant.request_dict("hi")
         elif lang == "British" and enchant.dict_exists("en_GB"):
             self.wordlist = enchant.request_dict("en_GB")
     except Exception:
         self.wordlist = enchant.request_dict("en_GB")
Example #24
def main():
    global g_lang
    global g_dictionnary
    global g_tknzr
    global g_nb_total_pages
    global g_start_time

    print("Will use {} for OCR".format(OCR_TOOL.get_name()))

    print("Initializing dictionnary ...")
    g_lang = "eng"
    if len(sys.argv) > 1:
        g_lang = "fra"

    g_dictionnary = enchant.request_dict(g_lang[:2])
    try:
        g_tknzr = enchant.tokenize.get_tokenizer(g_lang[:2])
    except enchant.tokenize.TokenizerNotFoundError as exc:
        print("Warning: Falling back to default tokenizer ({})".format(exc))
        g_tknzr = enchant.tokenize.get_tokenizer()
    print("Done")

    print("Loading documents list ...")
    pconfig = config.PaperworkConfig()
    pconfig.read()
    work_dir = pconfig.settings['workdir'].value
    dsearch = docsearch.DocSearch(work_dir)
    dsearch.reload_index()
    print("Documents loaded")
    print("")

    print("Initalizing workers ...")
    manager = WorkerManager()
    manager.start()

    factory = JobFactoryImageProcessing()
    print("Done")

    g_start_time = datetime.datetime.now()

    try:
        print("Queueing jobs ...")
        nb_docs = 0
        nb_pages = 0
        for doc in dsearch.docs:
            if not doc.can_edit:  # probably not an OCR-ized doc
                continue
            nb_docs += 1
            for page in doc.pages:
                if not page.can_edit:  # probably not an OCR-ized page
                    continue
                nb_pages += 1
                g_nb_total_pages += 1
                for algos in ALGORITHMS:
                    job = factory.make(page, algos)
                    manager.schedule(job)

        print("Queued jobs : {} docs | {} pages".format(nb_docs, nb_pages))

        manager.wait_for_all()
    finally:
        manager.stop()
Example #25
 def __init__(self, languages=None):
     self.dictionaries = OrderedDict()
     for language in languages or self.languages:
         self.dictionaries[language] = request_dict(language)
     self.normalizer = PunctuationRemover()
     self.tokenizer = SpaceTokenizer()
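This fragment is the __init__ of some checker class; PunctuationRemover and SpaceTokenizer are project-specific helpers not shown here. Hypothetically, a host class for just the dictionary part could look like this (the class name and default languages are assumptions):

from collections import OrderedDict
from enchant import request_dict

class SpellingChecker:  # hypothetical host class for the __init__ above
    languages = ("en_US", "en_GB")  # assumed class-level default

    def __init__(self, languages=None):
        # one Dict per requested language, in a stable order
        self.dictionaries = OrderedDict()
        for language in languages or self.languages:
            self.dictionaries[language] = request_dict(language)

checker = SpellingChecker(["en_US"])
print(list(checker.dictionaries))  # ['en_US']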