示例#1
0
def spelling(file_name, contents, language="en_US"):
    """
    Spell-check ``contents`` and report every misspelled word on stdout.

    You pass the contents in (rather than letting this function load
    ``file_name``) because the text is typically the output of running a
    template through the render process; ``file_name`` is only used to
    label the report lines.

    It assumes you have PyEnchant installed correctly and configured
    in your config/testing.py file.  Use "salmon spell" to make sure it
    works right.

    Returns True when no misspelled word is found, or when PyEnchant cannot
    be imported (a message is printed and the check is skipped).  Returns
    False when at least one misspelled word was reported.
    """
    try:
        from enchant.checker import SpellChecker
        from enchant.tokenize import EmailFilter, URLFilter
    except ImportError:
        print("Failed to load PyEnchant.  Make sure it's installed and salmon spell works.")
        return True

    failures = 0
    chkr = SpellChecker(language, filters=[EmailFilter, URLFilter])
    chkr.set_text(contents)
    for err in chkr:
        # Clamp the window start: a negative start index would be taken
        # relative to the END of the string and produce the wrong (usually
        # empty) context for words within the first 20 characters.
        context = contents[max(0, err.wordpos - 20):err.wordpos + 20]
        print("%s: %s \t %r" % (file_name, err.word, context))
        failures += 1

    if failures:
        print("You have %d spelling errors in %s.  Run salmon spell.." % (failures, file_name))
        return False
    return True
示例#2
0
def spell_checker(text, dict_language='ru_RU', answer_type='cutted'):
    """Spell-check ``text`` token by token and substitute corrections.

    The text is split on whitespace; every token is run through an enchant
    ``SpellChecker`` and each misspelled word longer than one character is
    replaced inside its token.

    :param text: text to correct.
    :param dict_language: enchant language tag used for BOTH the suggestion
        dictionary and the tokenising checker.
    :param answer_type: ``'cutted'`` substitutes the best candidate only,
        ``'full'`` substitutes ``str()`` of the whole candidate list; any
        other value leaves the tokens untouched.
    :return: the tokens joined back together with single spaces.

    Side effects: prints the input text and the elapsed checking time.
    """
    word_checker = enchant.Dict(dict_language)
    # Use the requested language here as well; it used to be hard-coded to
    # 'ru_RU', so any other dict_language tokenised/checked with the wrong
    # dictionary.
    text_checker = SpellChecker(dict_language)
    text_proc = text.split()
    start_time = datetime.now()
    print(text)
    # Visit every token (the previous range(len - 1) skipped the last one).
    for position, token in enumerate(text_proc):
        text_checker.set_text(token)
        for err in text_checker:
            if len(err.word) <= 1:
                continue

            fixed = word_checker.suggest(err.word)
            if err.word[0].isupper():
                # Capitalised words are first looked up as names; a result
                # of 0 means "no name match", so fall back to frequencies.
                result = names.search_by_name(err.word, fixed)
                if result == 0:
                    fixed = freq.get_freq(fixed, err.word)
                else:
                    # Put the name match first without assuming that enchant
                    # returned at least one suggestion.
                    fixed = [result] + list(fixed[1:])
            else:
                fixed = freq.get_freq(fixed, err.word)

            if len(fixed) == 0:
                continue

            pattern = r'\b{}\b'.format(re.escape(err.word))
            # Callable replacements are inserted literally, so backslashes
            # in a suggestion are not treated as regex group references.
            if answer_type == 'cutted':
                best = fixed[0]
                text_proc[position] = re.sub(
                    pattern, lambda _m: best, text_proc[position])
            if answer_type == 'full':
                everything = str(fixed)
                text_proc[position] = re.sub(
                    pattern, lambda _m: everything, text_proc[position])
    print('CHECK TIME', datetime.now() - start_time)

    answer = ' '.join(text_proc)
    return answer
示例#3
0
 def __init__(self, language):
     """Create the enchant dictionary and checker for ``language``.

     When no dictionary is installed for ``language`` the attributes that
     could not be created stay ``None`` instead of being left undefined,
     so callers can test ``self.dict`` / ``self.checker`` safely.
     """
     # Pre-set both attributes: previously ``self.dict`` was never assigned
     # when enchant.Dict() raised, causing AttributeError on later access.
     self.dict = None
     self.checker = None
     try:
         self.dict = enchant.Dict(language)
         self.checker = SpellChecker(
             language, filters=[EmailFilter, URLFilter, CamelCaseFilter])
     except enchant.DictNotFoundError:
         # Spell checking is simply unavailable for this language.
         pass
示例#4
0
    def __init__(self):
        """Initialise the bot's working state and its external handles."""
        # Collected URLs and downloaded images.
        self._url_list = []
        self._image_url_list = []
        self._image_list = []

        # Reddit client configured from the 'bot1' site entry, and the
        # subreddit that is read from.
        reddit = praw.Reddit('bot1')
        self._reddit = reddit
        self._subreddit = reddit.subreddit("getmotivated")

        self._counter = 0

        # Raw and spell-checked text buffers.
        self._text_array = []
        self._spellchecked_array = []

        # SSL context pinned to the TLSv1 protocol constant.
        self._context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)

        # en_US dictionary plus a text-level checker for the same language.
        self._enchant_dict = enchant.Dict("en_US")
        self._chkr = SpellChecker("en_US")

        self._size = (1800, 3200)

        self._wrong_words = 0
        self._word_conferences = []
        self._new_average = 0
示例#5
0
文件: spell_helper.py 项目: nvhuy/LM
class EnchantSpell:
    """Thin wrapper around an en_US enchant ``SpellChecker``."""

    def __init__(self):
        """Print the dictionaries/languages enchant knows and build the checker."""
        # Single-argument print(...) is valid on both Python 2 and Python 3;
        # the bare print statement was a SyntaxError on Python 3.
        print(enchant.Broker().list_dicts())
        print(enchant.Broker().list_languages())
        self.__spell_checker__ = SpellChecker(lang='en_US')

    def check_token(self, token):
        """
        Check spelling of a single word
        :param token: input word
        :return: True/False
        """
        # A word also counts as correct when its capitalised form is known
        # (e.g. proper nouns typed in lower case).
        return self.__spell_checker__.check(
            token) or self.__spell_checker__.check(token.capitalize())

    def check_tokens(self, tokens):
        """
        Check spelling of a list of words
        :param tokens: input word list
        :return: a list of index of mis-spelled words
        """
        return [tid for tid, token in enumerate(tokens)
                if not self.check_token(token)]

    def suggest_correction(self, token):
        """
        Suggest correction to a mis-spelled word
        :param token: input word
        :return: list of suggestion
        """
        return self.__spell_checker__.suggest(token)
示例#6
0
 def __init__(self, text, dictionary):
  """Spell-check ``text`` interactively and keep the result in ``fixed_text``.

  The checker language comes from the application settings: the system
  language when the setting is "system", otherwise the language reported
  by ``languageHandler``. ``dictionary`` is only used in log messages.
  If no dictionary is found the user is notified, ``active`` is set to
  False and no dialog is created.
  """
  super(spellChecker, self).__init__()
  log.debug("Creating the SpellChecker object. Dictionary: %s" % (dictionary,))
  self.active = True
  try:
   use_system_language = config.app["app-settings"]["language"] == "system"
   if use_system_language:
    log.debug("Using the system language")
    self.checker = SpellChecker(filters=[twitterFilter.TwitterFilter, tokenize.EmailFilter, tokenize.URLFilter])
   else:
    log.debug("Using language: %s" % (languageHandler.getLanguage(),))
    self.checker = SpellChecker(languageHandler.getLanguage(), filters=[twitterFilter.TwitterFilter, tokenize.EmailFilter, tokenize.URLFilter])
   self.checker.set_text(text)
  except DictNotFoundError:
   log.exception("Dictionary for language %s not found." % (dictionary,))
   wx_ui.dict_not_found_error()
   self.active = False
  if not self.active:
   return
  log.debug("Creating dialog...")
  self.dialog = wx_ui.spellCheckerDialog()
  # Wire each dialog button to the handler of the same name.
  button_handlers = (
   (self.dialog.ignore, self.ignore),
   (self.dialog.ignoreAll, self.ignoreAll),
   (self.dialog.replace, self.replace),
   (self.dialog.replaceAll, self.replaceAll),
  )
  for button, handler in button_handlers:
   widgetUtils.connect_event(button, widgetUtils.BUTTON_PRESSED, handler)
  self.check()
  self.dialog.get_response()
  self.fixed_text = self.checker.get_text()
示例#7
0
def spell_check(request):
    """
    Answer a TinyMCE 4 spellchecker request.

    :param request: Django http request with JSON-RPC payload from TinyMCE 4
        containing a language code and a text to check for errors.
    :type request: django.http.request.HttpRequest
    :return: Django http response containing JSON-RPC payload
        with spellcheck results for TinyMCE 4
    :rtype: django.http.HttpResponse
    """
    data = json.loads(request.body.decode('utf-8'))
    output = {'id': data['id']}
    error = None
    try:
        import enchant
        from enchant.checker import SpellChecker
        params = data['params']
        lang = params['lang']
        if lang not in enchant.list_languages():
            error = 'Missing {0} dictionary!'.format(lang)
            raise RuntimeError(error)
        checker = SpellChecker(lang)
        checker.set_text(strip_tags(params['text']))
        # Map every misspelled word to its list of suggestions; the checker
        # itself exposes the current word while it is being iterated.
        suggestions = {}
        for _ in checker:
            suggestions[checker.word] = checker.suggest()
        output['result'] = suggestions
    except ImportError:
        error = 'The pyenchant package is not installed!'
        logger.exception(error)
    except RuntimeError:
        logger.exception(error)
    except Exception:
        error = 'Unknown error!'
        logger.exception(error)
    if error is not None:
        output['error'] = error
    body = json.dumps(output)
    return HttpResponse(body,
                        content_type='application/json; charset=UTF-8')
示例#8
0
def testAccuracy(inputString):
    """Return how many words of ``inputString`` the en_US checker flags."""
    checker = SpellChecker("en_US")
    checker.set_text(inputString)
    return sum(1 for _ in checker)
示例#9
0
def correctText(text):
    """Print a spelling report for ``text`` (case-folded) with close matches.

    Every word flagged by the en_US checker is printed together with up to
    five close matches taken from the keys of ``data``; a summary line
    follows. Nothing is returned.
    """
    separator = "*" * 78
    print(separator)

    folded = text.casefold()

    checker = SpellChecker("en_US", filters=[EmailFilter, URLFilter])
    checker.set_text(folded)

    mistakes = 0
    for err in checker:
        candidates = get_close_matches(err.word, data.keys(), n=5)
        if candidates:
            print("[-] " + err.word + " Is Not an English Word, Do U Mean : " + ", ".join(candidates) + " ?")
        else:
            print("[-] " + err.word + " Is Not an English Word, But I Don't Know What The F**k Do u Mean")
        # Every flagged word counts, with or without suggestions.
        mistakes += 1

    if mistakes:
        print(separator)
        print("[-] You Have " + str(mistakes) + " Mistakes !")
    else:
        print("[+] Its All Correct !")
示例#10
0
class SpellCheckHighlighter(QSyntaxHighlighter):
	"""Syntax highlighter that underlines misspelled words in red."""

	def __init__(self, parent, language):
		"""Create the checker for ``language``; stays inactive without enchant."""
		QSyntaxHighlighter.__init__(self, parent)
		self._language = language
		self._checker = None
		if not enchantAvailable:
			return
		try:
			self._checker = SpellChecker(self._language)
		except enchant.DictNotFoundError:
			Debug.info(_('SpellChecking: No dictionary available for language "%s"') % self._language)

		# Character format applied to every misspelled word.
		underline = QTextCharFormat()
		self._format = underline
		underline.setUnderlineColor(QColor(Qt.red))
		underline.setUnderlineStyle(QTextCharFormat.SpellCheckUnderline)

	def highlightBlock(self, text):
		"""Apply the misspelling format to every word the checker flags."""
		if not (enchantAvailable and self._checker):
			return

		self._checker.set_text(unicode(text))
		for error in self._checker:
			self.setFormat(error.wordpos, len(error.word), self._format)
def spellcheck(text):
    """Strip special characters from ``text`` and auto-correct it (de_CH).

    Each misspelled word is replaced by the first enchant suggestion, or by
    the marker ``spellchecker_fail`` when the replacement cannot be made
    (typically because there are no suggestions). A word equal to one of the
    two most recently handled errors is skipped, which keeps the loop from
    re-correcting the same word over and over.

    :param text: input text.
    :return: the cleaned and corrected text.
    """
    # remove all special characters apart from characterset below
    # (raw string: '\s' in a normal literal is an invalid escape sequence;
    # the compiled pattern is unchanged)
    text = re.sub(r'[^A-Za-z0-9.,!?\s\xf6\xfc\xe4\xdf]', '', text)

    # spell checking
    chkr = SpellChecker("de_CH", text)
    lasterror = ''
    lastlasterror = ''
    for err in chkr:
        if err.word in (lastlasterror, lasterror):
            continue
        lastlasterror = lasterror
        lasterror = err.word
        repl = err.suggest()
        try:
            err.replace(repl[0])
        except Exception:
            # Still best-effort, but no longer swallows KeyboardInterrupt /
            # SystemExit the way a bare ``except:`` did.
            err.replace(u"spellchecker_fail")

    return chkr.get_text()
示例#12
0
class SpellCheckThread(threading.Thread):
    """Background thread that re-tags misspelled words in a text widget.

    Once per second the content of ``root_widget.text_area`` is read, the
    "misspelled" tag is cleared from every word and re-applied to the words
    the en_US checker flags.
    """

    def __init__(self, root_widget: App, name: str = "<\'SpellCheckThread\'>"):
        """Set up the checker, the lock and the "misspelled" tag style.

        :param root_widget: application object exposing ``text_area``.
        :param name: thread name.
        """
        threading.Thread.__init__(self, name=name)
        self.thread_name = name
        self.language = "en_US"
        self.dictionary = SpellChecker(
            self.language
        )  # or import enchant and use Dict() to check individual words
        self.thread_flag = True
        # ``daemon`` attribute replaces the deprecated setDaemon() call.
        self.daemon = True
        self.root_widget = root_widget
        self.lock_object = threading.Lock()

        # tag config for misspelled words
        root_widget.text_area.tag_config("misspelled",
                                         foreground="red",
                                         underline=True)

    def run(self):
        """Loop until ``stop_spell_check`` is called, refreshing the tags."""
        text_area = self.root_widget.text_area
        while self.thread_flag:
            time.sleep(1)
            # ``with`` guarantees the lock is released even when a widget
            # call raises; the manual acquire()/release() pair left it held.
            with self.lock_object:
                input_text: str = text_area.get("1.0", "end-1c")
                self.dictionary.set_text(input_text)

                # Pass 1: clear the tag from every whitespace-separated word.
                index = "1.0"
                for word in input_text.split():
                    index = text_area.search(word,
                                             index,
                                             nocase=1,
                                             stopindex="end")
                    if not index:
                        break
                    last_index = "%s+%dc" % (index, len(word))
                    text_area.tag_remove("misspelled", index, last_index)
                    index = last_index

                # Pass 2: tag each misspelled word, searching forward from
                # the end of the previous match.
                index = "1.0"
                for error in self.dictionary:
                    index = text_area.search(error.word,
                                             index,
                                             nocase=1,
                                             stopindex="end")
                    if not index:
                        break
                    last_index = "%s+%dc" % (index, len(error.word))
                    text_area.tag_add("misspelled", index, last_index)
                    index = last_index

    def start_spell_check(self):
        """Start the background thread."""
        self.start()

    def stop_spell_check(self):
        """Ask the loop in ``run`` to finish after its current iteration."""
        self.thread_flag = False
示例#13
0
def spell(text):
    """<word/sentence> - Check spelling of a word or sentence."""
    # NOTE: the docstring above is kept verbatim on purpose (it may be read
    # at runtime via __doc__, e.g. as command help text).
    if len(text.split(" ")) <= 1:
        # input is a word
        verdict = "$(green)correct$(c)" if en_dict.check(text) else "$(red)incorrect$(c)"
        similar = ', '.join(en_dict.suggest(text)[:10])
        return '"{}" appears to be {} [div] [h1]Similar:[/h1] {}'.format(
            text, verdict, similar)

    # input is a sentence
    checker = SpellChecker(en_dict, filters=[EmailFilter, URLFilter])
    checker.set_text(text)

    found_error = False
    shift = 0  # how far earlier substitutions have moved later positions
    for err in checker:
        found_error = True
        begin = err.wordpos + shift
        end = begin + len(err.word)
        # up to three suggestions, wrapped in [h1] markup
        marked = "[h1]{}[/h1]".format('/'.join(err.suggest()[:3]))
        shift += len(marked) - len(err.word)
        text = text[:begin] + marked + text[end:]
    return text if found_error else "$(green)Correct$(c)"
示例#14
0
 def spellcheck(text):
     """Return the number of words in ``text`` flagged by the en_US checker."""
     checker = SpellChecker("en_US")
     checker.set_text(text)
     flagged = [None for _ in checker]
     return len(flagged)
示例#15
0
 def __init__(self, sc_language="en_US", bert_model="distilbert-base-uncased"):
     """Load the spell checker and the pretrained masked-language model.

     :param sc_language: language tag handed to ``SpellChecker``.
     :param bert_model: model identifier handed to both ``from_pretrained``
         calls (tokenizer and masked-LM model).
     """
     # Dictionary-based checker.
     self.sc = SpellChecker(sc_language)
     # Tokenizer and model are loaded from the same identifier.
     self.tokenizer = AutoTokenizer.from_pretrained(bert_model)
     self.model = AutoModelForMaskedLM.from_pretrained(bert_model)
     print("> BERT loaded")
 def retSpErrCount(self):
     """Count words of ``self.file_content`` that are misspelled in both
     US and British English.

     ``SpellChecker``'s second positional parameter is the text to check,
     not a second language, so the former ``SpellChecker('en_US','en_GB')``
     call only ever used the en_US dictionary. Words flagged by en_US are
     now re-checked against en_GB and only counted when both reject them.
     If no en_GB dictionary is installed the count falls back to en_US only.
     """
     import enchant  # local import: only SpellChecker is known to be in scope

     chkr = SpellChecker('en_US')
     try:
         british = enchant.Dict('en_GB')
     except enchant.DictNotFoundError:
         british = None
     chkr.set_text(self.file_content)
     count = 0
     for err in chkr:
         if british is None or not british.check(err.word):
             count += 1
     return count
示例#17
0
def control_noerrorwords(textInput, jdlist):
    """Remove misspelled words from ``textInput`` and return the result.

    Words flagged by the en_US checker are collected. Those also present in
    ``jdlist`` or in the personal word list ``Utility_MyWords.txt`` are kept
    in the text; the remaining ones are removed (each occurrence preceded by
    whitespace and followed by one whitespace character is replaced by a
    single space).

    :param textInput: text to clean.
    :param jdlist: iterable of words that must not be removed.
    :return: the cleaned text.
    """
    checker = SpellChecker('en_US')
    checker.set_text(textInput)
    words = list({errorword.word for errorword in checker})
    crossWords = set(words).difference(jdlist)
    # NOTE(review): the count is taken BEFORE the personal words are
    # subtracted, and when it is 0 every flagged word (including the ones in
    # jdlist) is removed. Kept as-is; confirm this is intended.
    count = len(crossWords)
    cvtext = textInput

    # ``with`` closes the file even if reading fails.
    with open('Utility_MyWords.txt', 'r') as pWordsFile:
        pWords = set(pWordsFile.read().replace('\n', ' ').split(' '))

    crossWords = crossWords.difference(pWords)

    targets = words if count == 0 else crossWords
    for e in targets:
        # Escape the word so regex metacharacters in it are matched literally.
        cvtext = lib_re.sub(r'(\s+' + lib_re.escape(e) + r'\s)', ' ', cvtext)
    return cvtext
示例#18
0
def spell_check(request):
    """
    Handle a spellcheck call made by the TinyMCE 4 editor.

    :param request: Django http request with JSON-RPC payload from TinyMCE 4
        containing a language code and a text to check for errors.
    :type request: django.http.request.HttpRequest
    :return: Django http response containing JSON-RPC payload
        with spellcheck results for TinyMCE 4
    :rtype: django.http.HttpResponse
    """
    data = json.loads(request.body.decode("utf-8"))
    output = {"id": data["id"]}
    error = None
    try:
        import enchant
        from enchant.checker import SpellChecker

        if data["params"]["lang"] not in enchant.list_languages():
            error = "Missing {} dictionary!".format(data["params"]["lang"])
            raise RuntimeError(error)
        checker = SpellChecker(data["params"]["lang"])
        checker.set_text(strip_tags(data["params"]["text"]))
        result = {}
        for _ in checker:
            # the checker exposes the word currently being visited
            result[checker.word] = checker.suggest()
        output["result"] = result
    except Exception as exc:
        # ImportError -> fixed message; RuntimeError -> keep whatever message
        # was set before raising; anything else -> generic message.
        if isinstance(exc, ImportError):
            error = "The pyenchant package is not installed!"
        elif not isinstance(exc, RuntimeError):
            error = "Unknown error!"
        logger.exception(error)
    if error is not None:
        output["error"] = error
    return JsonResponse(output)
示例#19
0
def process_single_language(schema_name, language):
    """Count spelling errors per forum post and store them in batches.

    For every row of ``forum_posts`` in ``schema_name`` the post text is
    checked with the dictionary for ``language`` and a record holding the
    error count is queued; queued records are written with ``batch_insert``
    whenever ``BATCH_SIZE`` of them have accumulated, and once more at the
    end for the remainder.

    :param schema_name: schema (course) to read the posts from.
    :param language: enchant language tag; if its checker cannot be created
        a warning is logged and nothing is processed.
    """
    logger.info(
        "Calculating spelling errors for schema {} using language {}".format(
            schema_name, language))

    # load spell checker for this language
    try:
        chkr = SpellChecker(language)
    except Exception:
        # Still best-effort, but KeyboardInterrupt/SystemExit now propagate.
        logger.warning("Error using dictionary {}, skipping".format(language))
        return

    query = """select id, post_text from forum_posts"""
    results = get_connection(schema_name).execute(query)
    insert_values = []
    for row in results:
        logger.debug(
            "Working on course {}, post {}, with a length of length {} using language {}"
            .format(schema_name, row[0], len(row[1]), language))
        chkr.set_text(row[1])
        cnt = sum(1 for _ in chkr)
        insert_values.append({
            "course_id": schema_name,
            "forum_posts_id": row[0],
            "language": language,
            "score": cnt
        })
        if len(insert_values) == BATCH_SIZE:
            batch_insert(conn, tbl_coursera_message_language, insert_values)
            insert_values = []
    # Flush the remainder; skip the call when nothing is left so an empty
    # parameter list is never handed to the insert.
    if insert_values:
        batch_insert(conn, tbl_coursera_message_language, insert_values)
示例#20
0
def spell(text):
    """spell <word/sentence> -- Check spelling of a word or sentence."""
    # NOTE: docstring kept verbatim (it may be read at runtime via __doc__).
    is_sentence = len(text.split(" ")) > 1
    if is_sentence:
        checker = SpellChecker(en_dict, filters=[EmailFilter, URLFilter])
        checker.set_text(text)

        # Rebuild the sentence from the untouched stretches between errors
        # and the bold suggestion strings that replace each error.
        pieces = []
        cursor = 0
        for err in checker:
            top_three = err.suggest()[:3]
            pieces.append(text[cursor:err.wordpos])
            pieces.append("\x02{}\x02".format('/'.join(top_three)))
            cursor = err.wordpos + len(err.word)
        pieces.append(text[cursor:])
        return ''.join(pieces)

    # single word: report validity together with up to ten suggestions
    is_correct = en_dict.check(text)
    s_string = ', '.join(en_dict.suggest(text)[:10])
    if not is_correct:
        return '"{}" appears to be \x02invalid\x02! ' \
               '(suggestions: {})'.format(text, s_string)
    return '"{}" appears to be \x02valid\x02! ' \
           '(suggestions: {})'.format(text, s_string)
示例#21
0
def _caesar_shift(text, key):
    """Shift every ASCII letter of ``text`` back by ``key`` places, keeping case."""
    shifted = []
    for ch in text:
        if 'a' <= ch <= 'z':
            base = 97
        elif 'A' <= ch <= 'Z':
            base = 65
        else:
            # Digits, punctuation and non-ASCII letters pass through; the
            # mod-26 arithmetic is only meaningful for A-Z / a-z.
            shifted.append(ch)
            continue
        # chr() is fine for these code points on both Python 2 and 3,
        # whereas unichr() does not exist on Python 3.
        shifted.append(chr((ord(ch) - base + 26 - key) % 26 + base))
    return ''.join(shifted)


def decipher(ciphertext):
    """Brute-force a Caesar cipher using an en_US spell checker.

    All 26 shifts are tried; each candidate plaintext is printed together
    with its number of correctly spelled words, and the shift producing the
    fewest spelling errors is reported.

    :param ciphertext: text encrypted with a Caesar shift.
    :return: a message naming the most likely key (``-1`` if no shift had
        fewer errors than the length of the ciphertext).
    """
    checker = SpellChecker("en_US")
    best_key = -1
    least_num_errors = len(ciphertext)
    for i in range(0, 26):
        plaintext = _caesar_shift(ciphertext, i)
        checker.set_text(plaintext)
        num_errors = sum(1 for _ in checker)
        # Strict '<' keeps the smallest key among equally good shifts.
        if num_errors < least_num_errors:
            least_num_errors = num_errors
            best_key = i
        en_words = len(plaintext.split()) - num_errors
        print("%i: %s English words: %i" % (i, plaintext, en_words))
    return "%s %i" % ("The key is most likely: ", best_key)
示例#22
0
def CorrectBot(interface,command,args,messagetype):
    "!correct [n=previous] [words=all of them] - Try to correct the words in the nth message before this one."
    # The docstring above is kept verbatim (it reads like help text that may
    # be shown at runtime), so the details are documented here instead.
    #
    # args: "<n> [words...]". n selects interface.LastMessages[n]; when words
    #   are given only those are spell-checked, otherwise the message body is.
    #   If args cannot be parsed, message 0 and its full body are used.
    # When the selected message is not editable the work is delegated to
    #   SpellBot(..., onlyerror=True); otherwise each misspelled word is
    #   replaced in the message body by its first suggestion.
    words = ''
    chkr = SpellChecker("en_UK", filters=[EmailFilter, URLFilter])

    try:
        a = args.split()
        n = int(a[0])
        if len(a) > 1:
            words = args.partition(" ")[2]
        else:
            words = interface.LastMessages[n].Body
    except Exception:
        # Missing/invalid argument or message index: fall back to the latest
        # message. (A bare ``except:`` also trapped KeyboardInterrupt.)
        n = 0
        words = interface.LastMessages[0].Body

    if not interface.LastMessages[n].IsEditable:
        SpellBot(interface, 'spell', words, messagetype, onlyerror=True)
        return

    text = interface.LastMessages[n].Body
    origtext = text

    chkr.set_text(words)
    for err in chkr:
        w = err.suggest()
        # Compare the best suggestion (not the whole list) with the word; the
        # old ``w != err.word`` compared a list to a string and was always
        # true.
        if w and w[0] != err.word:
            text = text.replace(err.word, w[0])

    if origtext != text:
        interface.LastMessages[n].Body = text
示例#23
0
    def read_lang(self, image_name, text):
        """OCR an image and, for English text, repair misspellings.

        When ``self.model.readLang(text)`` is not ``'en'`` the image
        ``image_name`` is OCR'd with the ``-l jpn`` config and returned with
        all spaces removed. Otherwise the file named by the model is OCR'd,
        words rejected by the en_US checker are masked and then predicted,
        using the checker's suggestions to refine the prediction.
        """
        if self.model.readLang(text) != 'en':
            japanese = pytesseract.image_to_string(Image.open(image_name), config='-l jpn')
            return re.sub(" ", "", japanese)

        filename = self.model.getFileName()
        text = image_to_string(Image.open(filename))
        text_original = str(text)
        # cleanup text
        text = text_replace(text)

        # person names and bare punctuation are never treated as errors
        persons_list = get_persons_list(text)
        ignore_words = persons_list + ["!", ",", ".", "\"", "?", '(', ')', '*', '\'']

        spell = SpellChecker("en_US")
        incorrect_words = []
        for w in text.split():
            if spell.check(w):
                continue
            if w in ignore_words:
                continue
            incorrect_words.append(w)
        # suggested replacements, one list per incorrect word
        suggested_words = [spell.suggest(w) for w in incorrect_words]

        # replace incorrect words with [MASK]
        text, text_original = replace_incorrect(incorrect_words, text, text_original)

        id_mask, model, segments_tensor, tokenizer, tokens_tensor = evaluate_tokens(text)
        # Predict all tokens (no gradients needed)
        with torch.no_grad():
            predictions = model(tokens_tensor, segments_tensor)

        # Refine prediction by matching with proposals from SpellChecker
        return predict_word(id_mask, tokenizer, suggested_words, text_original, predictions)
示例#24
0
文件: decipher.py 项目: mfaywu/lab
def decipher(ciphertext):
    """Guess the key of a Caesar cipher with the help of a spell checker.

    Every shift from 0 to 25 is applied to ``ciphertext``; the candidate is
    printed with its count of correctly spelled words, and the shift with
    the fewest en_US spelling errors wins.

    :param ciphertext: Caesar-shifted text.
    :return: message containing the most likely key (``-1`` when no shift
        produced fewer errors than ``len(ciphertext)``).
    """
    checker = SpellChecker("en_US")
    best_key = -1
    least_num_errors = len(ciphertext)
    for key in range(26):
        decoded = []
        for ch in ciphertext:
            # Only ASCII letters take part in the rotation; other alphabetic
            # characters would be mangled by the mod-26 arithmetic.
            if ch.isalpha() and ord(ch) < 128:
                base = 65 if ch.isupper() else 97
                # chr() replaces unichr(), which is undefined on Python 3.
                decoded.append(chr((ord(ch) - base + 26 - key) % 26 + base))
            else:
                decoded.append(ch)
        plaintext = ''.join(decoded)

        checker.set_text(plaintext)
        num_errors = 0
        for _ in checker:
            num_errors += 1
        if num_errors < least_num_errors:
            least_num_errors = num_errors
            best_key = key
        en_words = len(plaintext.split()) - num_errors
        print("%i: %s English words: %i" % (key, plaintext, en_words))
    return "%s %i" % ("The key is most likely: ", best_key)
	def spellcheck_text(self):
		"""Correct ``self.text`` in place using the de_DE dictionary.

		Every word flagged by the checker is replaced, wherever it occurs
		in ``self.text``, by the dictionary's first suggestion; words
		without any suggestion are left unchanged.
		"""
		checker = SpellChecker('de_DE')  # walks over the whole text
		dictionary = enchant.Dict('de_DE')  # source of the suggestions

		checker.set_text(self.text)
		misspelled = [err.word for err in checker]

		for word in misspelled:
			candidates = dictionary.suggest(word)
			if candidates:
				self.text = self.text.replace(word, candidates[0])
示例#26
0
def spell(text):
    """spell <word/sentence> -- Check spelling of a word or sentence."""
    # NOTE: docstring kept verbatim (it may be read at runtime via __doc__).
    if len(text.split(" ")) <= 1:
        # A single word: say whether it is valid and list up to ten
        # suggestions either way.
        word_ok = en_dict.check(text)
        shortlist = ', '.join(en_dict.suggest(text)[:10])
        template = ('"{}" appears to be \x02valid\x02! '
                    '(suggestions: {})') if word_ok else (
                    '"{}" appears to be \x02invalid\x02! '
                    '(suggestions: {})')
        return template.format(text, shortlist)

    # A sentence: swap each misspelled word for its top three suggestions,
    # tracking how much the text has grown or shrunk so far.
    checker = SpellChecker(en_dict, filters=[EmailFilter, URLFilter])
    checker.set_text(text)

    drift = 0
    for err in checker:
        word_len = len(err.word)
        begin = err.wordpos + drift
        replacement = "\x02{}\x02".format('/'.join(err.suggest()[:3]))
        text = text[:begin] + replacement + text[begin + word_len:]
        drift = drift + len(replacement) - word_len
    return text
示例#27
0
def filter_ngrams(terms, spelling=False, singletons=True, contains_numeric=False, contains_alpha=False, contains_non_alphanumeric=False):
    """
        Filter n-grams by a variety of features

        ``terms`` maps each n-gram to a sized collection; it is modified in
        place and also returned. The size before and after is printed.

        :param spelling: drop n-grams containing a word the en_US checker
            flags.
        :param singletons: drop n-grams whose value has length 1.
        :param contains_numeric: drop n-grams containing any character that
            is not a digit 0-9.
        :param contains_alpha: drop n-grams containing any character that is
            not a lower-case letter a-z.
        :param contains_non_alphanumeric: drop n-grams containing any
            character that is neither a letter nor a digit.
        :return: the filtered ``terms`` mapping.
    """
    print(len(terms), "n-grams before filter")
    if spelling:
        # Build the checker only when it is needed, so the other filters
        # work without a dictionary being installed.
        chkr = SpellChecker("en_US")
        for k in list(terms):
            chkr.set_text(k)
            if any(True for _ in chkr):
                del terms[k]
    if singletons:
        for k, v in list(terms.items()):
            if len(v) == 1:
                del terms[k]
    if contains_numeric:
        for k in list(terms):
            if re.search(r"[^0-9]", k):
                del terms[k]
    if contains_alpha:
        for k in list(terms):
            if re.search(r"[^a-z]", k):
                del terms[k]
    if contains_non_alphanumeric:
        for k in list(terms):
            # Python's re has no POSIX classes: "[^[:alnum:]]" was parsed as
            # a character set followed by a literal "]" and matched almost
            # nothing. [\W_] matches any character that is not alphanumeric.
            if re.search(r"[\W_]", k):
                del terms[k]
    print(len(terms), "n-grams after filter")
    return terms
def matchlocations(line):
	"""Tag probable place names in ``line`` with <loc>...</loc> and print it.

	Every word flagged by the en_US spell checker is compared against the
	global ``places`` collection: an exact (upper-cased) hit is tagged
	directly, otherwise the closest entries by ``minEditDist`` and the
	global lookup tables ``sdx``, ``sdx3`` and ``sdx4`` (keyed by the results
	of ``dm`` / ``soundex2`` -- presumably phonetic codes, TODO confirm) are
	used to pick a replacement. Carriage returns are stripped and the
	resulting line is printed; nothing is returned.
	"""
	from enchant.checker import SpellChecker
	chkr = SpellChecker("en_US")
	# Module-level lookup tables; they are only read here.
	global sdx 
	global sdx4 	
	global sdx3 
	global places

	i=0  # NOTE(review): never used below
	chkr.set_text(line)
	for err in chkr:
		#print "error",err.word
		toprint = err.word  # the word exactly as it appears in the line
		word = err.word.upper()  # upper-cased form used for all comparisons
		mind = 4  # best edit distance so far; only distances below 4 set flag
		replace = []  # candidate places at distance ``mind``
		flag = False  # True once a place closer than the current ``mind`` was seen
		soundFlag = False  # True once a candidate was chosen via dm() equality
		noFlag = False  # NOTE(review): never used below
		if word in places:
			# Exact match: tag the word itself (lower-cased).
			line = line.replace(toprint,"<loc>"+word.lower()+"</loc>")
		else:
			# Collect the place(s) with the smallest edit distance.
			for place in places:
				dist = minEditDist(word,place)
				if dist<mind:
					replace=[]
					replace.append(place)
					mind = dist
					flag = True
				else:
					if dist == mind:
						replace.append(place)
						# NOTE(review): this is a comparison, not an assignment,
						# so it has no effect -- possibly ``flag = True`` was
						# intended; confirm before changing.
						flag == True
			if flag == True and len(word) > mind:
				if mind ==1 and len(replace)==1:
					# Single candidate one edit away: accept it directly.
					line = line.replace(toprint,"<loc>"+replace[0].lower()+"</loc>")    
				else:
					# Try the lookup tables in order (sdx4, sdx3, sdx) for words
					# longer than three characters.
					if(soundex2(word,4) in sdx4 and len(word)>3):
						line = line.replace(toprint,"<loc>"+sdx4[soundex2(word,4)].lower()+"</loc>")
					elif(soundex2(word,3) in sdx3 and len(word)>3):
						line = line.replace(toprint,"<loc>"+sdx3[soundex2(word,3)].lower()+"</loc>")
					elif(dm(word)[0] in sdx and len(word)>3):
						line = line.replace(toprint,"<loc>"+sdx[dm(word)[0]].lower()+"</loc>")  
					else:
						if len(replace) == 1:
							line = line.replace(toprint,"<loc>"+replace[0].lower()+"</loc>")
						else:
							#print replace
							# Several candidates: prefer the first whose dm()
							# primary value equals that of the original word.
							for ele in replace:
								if(dm(ele)[0] == dm(toprint)[0]):
									line = line.replace(toprint,"<loc>"+ele.lower()+"</loc>")
									soundFlag = True
									break
							if soundFlag == False:
								# No such candidate: fall back to the first one.
								line = line.replace(toprint,"<loc>"+replace[0].lower()+"</loc>")
			else:
				# No close place by edit distance: use the sdx table only.
				if (dm(word)[0] in sdx and len(word)>3):
					line = line.replace(toprint,"<loc>"+sdx[dm(word)[0]].lower()+"</loc>")
	line = line.replace('\r','')
	print line
示例#29
0
def spellcheck_hints(args, packages):
    """Spell-check package hint texts and print a misspelling summary.

    Builds an en-US dictionary extended with the technical words listed in
    ``words.txt`` and with words derived from the package names, checks the
    ``sdesc``, ``ldesc`` and ``message`` hints of every package (skipping
    ``-debuginfo`` packages) and prints each unknown word with its number of
    occurrences, most frequent first.

    :param args: not used by this function.
    :param packages: mapping of package name to an object with a ``hints``
        mapping.
    """
    spelldict = DictWithPWL('en-US')
    chkr = SpellChecker(spelldict, filters=[DescFilter])
    misspellings = {}

    # add technical words not in spell-checking dictionary
    # (a set: it is only used for membership tests in the nested loop below)
    wordlist = set()
    with open('words.txt') as f:
        for w in f:
            # strip any trailing comment
            w = re.sub(r'#.*$', '', w)
            # strip any whitespace
            w = w.strip()
            # blank and comment-only lines leave nothing to add
            if not w:
                continue
            spelldict.add(w)
            wordlist.add(w.lower())
            # XXX: for the moment, to reduce the set of errors, ignore the fact
            # that words.txt gives a canonical capitalization, and accept any
            # capitalization
            spelldict.add(w.lower())
            spelldict.add(w.capitalize())

    # add all package names as valid words
    for p in packages:
        for w in re.split('[_-]', p):
            # remove punctuation characters
            w = re.sub(r'[+]', '', w)
            # strip off any trailing numbers
            w = re.sub(r'[\d.]*$', '', w)

            # both with and without any lib prefix
            for wl in [w, re.sub(r'^lib', '', w)]:
                # a name part may be empty once digits, '+' or the lib
                # prefix have been stripped
                if not wl:
                    continue
                # add the package name unless it exists in the list above, which
                # will give a canonical capitalization
                if wl.lower() not in wordlist:
                    spelldict.add(wl.lower())
                    spelldict.add(wl)
                    spelldict.add(wl.capitalize())

    # for each package
    for p in sorted(packages.keys()):
        # debuginfo packages have uninteresting, auto-generated text which
        # contains the package name
        if p.endswith('-debuginfo'):
            continue

        hints = packages[p].hints
        # spell-check the spell-checkable keys
        for k in ['sdesc', 'ldesc', 'message']:
            if k not in hints:
                continue
            chkr.set_text(hints[k])
            # XXX: this is doing all the work to generate suggestions, which
            # we then ignore, so could be written much more efficiently
            for err in chkr:
                misspellings[err.word] = misspellings.get(err.word, 0) + 1

    # summarize
    for c in sorted(misspellings, key=misspellings.get, reverse=True):
        print('%16s: %4d' % (c, misspellings[c]))
示例#30
0
def test_replace_with_empty_string():
    """Testcase for replacing with an empty string (bug #10)"""
    checker = SpellChecker("en_US", ". I Bezwaar tegen verguning.")
    seen = 0
    for error in checker:
        error.replace("")
        # the three unknown words must not be visited more than once each
        assert seen < 3
        seen += 1
    assert checker.get_text() == ". I   ."
示例#31
0
 def __init__(self, language="en_US"):
     """Create the dictionary, checker and tokenizer for ``language``.

     When enchant has no dictionary for ``language`` a warning listing the
     available languages is logged and "en_US" is used instead.
     """
     if not enchant.dict_exists(language):
         # logging interpolates its arguments with %-formatting; the former
         # "{}" placeholders made the formatting fail and the warning was
         # never emitted properly.
         logging.warning("Spelling_Corrector: Don't have %s , Please check it!!!", language)
         logging.warning("Recommend same language for you: %s", enchant.list_languages())
         language = "en_US"
     self.dict = enchant.Dict(language)
     self.check = SpellChecker(language)
     self.tokenizer = get_tokenizer(language)
def spell_checker_featurizer(feature_counter, essay, essay_set=None):
    """Record the number of spelling errors found in ``essay``.

    The count is stored under ``feature_counter['spell_checker']``; nothing
    is returned.  ``essay_set`` is accepted but not used.
    """
    # NOTE(review): the second positional argument of SpellChecker is the
    # text to check, not a second language, so "en_US" here is only initial
    # text that set_text() replaces.  "en_UK" also looks like it should be
    # "en_GB" -- confirm which dictionary is intended.
    checker = SpellChecker("en_UK", "en_US")
    checker.set_text(essay)
    feature_counter['spell_checker'] = sum(1 for _ in checker)
示例#33
0
def correct(text):
	"""Return ``text`` with each flagged word replaced by its first suggestion.

	Words for which the checker offers no suggestion are left unchanged.
	"""
	checker = SpellChecker("en_US")
	checker.set_text(text)
	for mistake in checker:
		candidates = mistake.suggest()
		if not candidates:
			continue
		mistake.replace(candidates[0])
	return checker.get_text()
示例#34
0
 def retSpErrCount(self):
     """Return the number of spelling errors found in ``self.file_content``."""
     # NOTE(review): only the 'en_US' dictionary is consulted.  The second
     # positional argument of SpellChecker is the text slot, so 'en_GB' is
     # just initial text that set_text() replaces -- confirm whether a
     # combined US/UK check was intended.
     checker = SpellChecker('en_US', 'en_GB')
     checker.set_text(self.file_content)
     return sum(1 for _ in checker)
def performSpellCorrection(featureObj):
    """Spell-correct the text of ``featureObj`` and store the result on it.

    Every word flagged by the checker is replaced with ``spell(word)``; the
    corrected text is passed to the lexical features' ``setSpellCorrection``.
    Returns the same ``featureObj``.
    """
    checker = SpellChecker("en_US", featureObj.getText())
    for error in checker:
        corrected = spell(error.word)
        error.replace(corrected)

    lexical = featureObj.getLexicalFeatures()
    lexical.setSpellCorrection(checker.get_text())
    return featureObj
示例#36
0
def test_default_language():
    """SpellChecker() uses the default language, or raises when there is none."""
    lang = get_default_language()
    if lang is not None:
        assert SpellChecker().lang == lang
        return
    # No default language available: construction must fail.
    with pytest.raises(DefaultLanguageNotFoundError):
        SpellChecker()
示例#37
0
def spell_check(response):
    """Return the number of (truthy) spelling errors reported for ``response``."""
    checker = SpellChecker("en_US")
    checker.set_text(response)
    return sum(1 for error in checker if error)
示例#38
0
def spellCheck(text):
    """Return ``text`` with each flagged word replaced by its first suggestion.

    Words without any suggestion are kept as they are.
    """
    checker = SpellChecker("en_US", text)
    for error in checker:
        candidates = checker.suggest(error.word)
        if not candidates:
            continue
        error.replace(candidates[0])
    return checker.get_text()
def spellcheck(sentence):
	"""Fix errors in ``sentence`` that differ from a suggestion only by spaces.

	For each flagged word, the first suggestion that equals the word once
	spaces are removed from both replaces it; other errors are left alone.
	"""
	checker = SpellChecker("en_US")
	checker.set_text(sentence)
	for error in checker:
		match = next(
			(
				candidate
				for candidate in error.suggest()
				if error.word.replace(' ','') == candidate.replace(' ','')
			),
			None,
		)
		if match is not None:
			error.replace(match)
	return checker.get_text()
示例#40
0
 def Errores(self):
     """Count spelling errors in the first line of ``self.archivo``.

     The count is shown, as a string, in the ``self.ui.errores`` widget.
     """
     # Use a context manager so the file handle is always closed.
     with open(self.archivo, 'r') as texto:
         # NOTE(review): only the first line is checked -- confirm that the
         # rest of the file is meant to be ignored.
         primera_linea = texto.readline()
     chkr = SpellChecker("es_ES")
     chkr.set_text(primera_linea)
     terrores = sum(1 for _ in chkr)
     self.ui.errores.setText(str(terrores))
示例#41
0
 def spellcheck(self):
     """Return ``[error_count, misspelled_words]`` for ``self.text``."""
     checker = SpellChecker("en_US")
     checker.set_text(self.text)
     flagged = [error.word for error in checker]
     return [len(flagged), flagged]
示例#42
0
def has_error(file_path):
    """Tell whether the file at ``file_path`` contains a spelling mistake.

    Returns True as soon as the checker reports a first error, else False.
    """
    with open(file_path, "r") as handle:
        contents = handle.read()
        checker = SpellChecker("en_US")
        checker.set_text(contents)
        # any() stops at the first reported error.
        return any(True for _ in checker)
示例#43
0
def user_stats(messages, usermap):
	"""Keys: (posts, likes_recieved, likes_given, wordcount, images, misspellings, kicked)

	Build a per-user statistics dict from ``messages``.

	``usermap`` maps a user id to a display name; the result is keyed by
	display name.  Each message is a dict read for the keys 'sender_id',
	'user_id', 'name', 'text', 'favorited_by' and 'attachments'.  A few
	hard-coded names are dropped from the result before it is returned.
	"""
	stats = {}
	checker = SpellChecker('en_US')
	for user_id in usermap:
		stats[usermap[user_id]] = {
			'posts': [],
			'likes_recieved': 0,
			'likes_given': 0,
			'wordcount': 0,
			'images': 0,
			'misspellings': [],
			'kicked': 0,
			'been_kicked': 0 }
	current_names = {} # map user id to alias at the time of each message
	# NOTE(review): messages are walked in reverse -- presumably they arrive
	# newest-first; confirm against the caller.
	for m in reversed(messages):
		current_names[m['sender_id']] = m['name']
		if m['user_id'] == 'system':
			if m['text'] is not None:
				if ' changed name to ' in m['text']:
					s = m['text'].split(' changed name to ')
					for uid in current_names:
						if current_names[uid] == s[0]:
							current_names[uid] = s[1]
				elif ' removed ' in m['text']:
					# presumably the 16 trailing characters are
					# " from the group." -- TODO confirm
					s = m['text'][:-16].split(' removed ')
					remover = 0
					removed = 0
					for uid in current_names:
						if current_names[uid] == s[0]: remover = uid
						if current_names[uid] == s[1]: removed = uid
					if remover != 0 and removed != 0:
						stats[usermap[remover]]['kicked'] += 1
						stats[usermap[removed]]['been_kicked'] += 1
		name = usermap[m['sender_id']]
		stats[name]['posts'].append(m)
		stats[name]['likes_recieved'] += len(m['favorited_by'])
		for liker in m['favorited_by']:
			try:
				likername = usermap[liker]
			except KeyError:
				# likes from users missing in usermap are not counted
				continue
			stats[likername]['likes_given'] += 1
		stats[name]['images'] += 1 if len(m['attachments']) > 0 else 0
		if m['text'] is not None:
			stats[name]['wordcount'] += len(m['text'].split(' '))
			checker.set_text(m['text'])
			# Read .word while iterating: the checker yields itself for each
			# error, so materialising it with list() first would make every
			# entry report the last misspelled word.
			stats[name]['misspellings'] += [error.word for error in checker]

	# Drop bot/excluded accounts; pop() tolerates names that are absent.
	for excluded in ('GroupMe', 'GroupMe Calendar', 'Annie Hughey', 'Nicole Vergara'):
		stats.pop(excluded, None)
	return stats
def very_rare_long_words(debate, longwords):
	"""Return the correctly spelled ``longwords`` seen fewer than twice.

	``debate`` is the lemmatised debate used to build the frequency
	distribution; each selected word is returned UTF-8 encoded.
	"""
	frequencies = FreqDist(debate)
	checker = SpellChecker("en_GB", debate)
	return [
		word.encode('utf-8')
		for word in longwords
		if frequencies[word] < 2 and checker.check(word)
	]
示例#45
0
def suggest_correction(file_path):
    """Return the spell-corrected content of the file at ``file_path``.

    Each flagged word is replaced by the checker's first suggestion; a word
    with no suggestions is left unchanged.
    """
    with open(file_path, "r") as file_to_check:
        data = file_to_check.read()
    checker = SpellChecker("en_US")
    checker.set_text(data)
    for err in checker:
        suggestions = checker.suggest()
        # Guard against an empty suggestion list, which previously raised
        # IndexError on suggestions[0].
        if suggestions:
            err.replace(suggestions[0])
    return checker.get_text()
示例#46
0
    def check_errors(self, langs, text):
        """Return the words flagged as misspelled in every usable language.

        Languages in ``langs`` that ``enchant.list_languages()`` does not
        list are skipped.  An empty set is returned when ``langs`` is empty
        or none of the languages is available.
        """
        if not langs:
            return set()

        # Query the installed languages once instead of once per language.
        available = enchant.list_languages()
        errors = []
        for lang in langs:
            if lang not in available:
                continue
            chkr = SpellChecker(lang)
            chkr.set_text(text)
            errors.append({err.word for err in chkr})

        # set.intersection() with no arguments raises TypeError.
        if not errors:
            return set()
        return set.intersection(*errors)
示例#47
0
def doSpelling(str):
	"""Return ``str`` with each flagged word replaced by its first suggestion.

	Every occurrence of a flagged word is replaced literally.  Words with no
	suggestion are reported on stdout and left unchanged.
	"""
	chkr = SpellChecker("en_GB")  # any english dictionary
	chkr.set_text(str)

	## we can do more similarity ratio checks as well :P right now pick the best match
	for err in chkr:
		suggestions = chkr.suggest(err.word)
		if not suggestions:
			print('no suggestions: {0}'.format(err.word))
			continue
		# Literal replacement: the word is not treated as a regex pattern.
		str = str.replace(err.word, suggestions[0])
	# The corrected copy was previously computed but never returned.
	return str
示例#48
0
    def on_data(self, text):
        """Update typo and word counters of every battle hashtag in the message.

        ``text`` is a JSON document with a 'text' field.  For each entry of
        ``self.battle_hashtags`` whose hashtag value occurs (case-insensitive)
        in that field, ``typos`` grows by the number of spelling errors,
        ``words`` by the number of space-separated tokens, and the entry is
        saved.  Always returns True.
        """
        data = json.loads(text)
        for tag in self.battle_hashtags:
            if tag.hashtag.value.lower() not in data['text'].lower():
                continue
            checker = SpellChecker('en_GB')
            checker.set_text(data['text'])
            for _ in checker:
                tag.typos += 1

            tag.words += len(data['text'].split(' '))
            tag.save()

        return True
def long_words(debate):
	"""Return the distinct, correctly spelled words of ``debate`` longer than 9 characters.

	``debate`` is the list of lemmatised words; each result is UTF-8 encoded.
	"""
	candidates = set(w for w in debate if len(w) > 9)
	checker = SpellChecker("en_GB", debate)  # spellchecking
	return [word.encode('utf-8') for word in candidates if checker.check(word)]
def average_freqdist(argument, debate):
	"""Return the average debate frequency of the distinct words in ``argument``.

	Only correctly spelled words contribute their frequency in ``debate``,
	but the sum is divided by the number of all distinct words.  Returns 0.0
	for an empty ``argument`` instead of dividing by zero.
	"""
	argument = set(argument)
	if not argument:
		return 0.0

	fdist = FreqDist(debate)
	chkr = SpellChecker("en_GB", debate)
	frequency = sum(fdist[w] for w in argument if chkr.check(w))
	return float(frequency) / len(argument)
示例#51
0
文件: JITProfile.py 项目: hobson/Util
 def __init__(self) :
     """Load the profile, from the local cache if present, else from LinkedIn.

     The raw profile is read from ``<home>/.jitresume``; when that file
     cannot be opened it is fetched through ``LinkedIn().getProfile()`` and
     written to the cache.  The decoded profile ends up in ``self.data``.
     """
     App.__init__(self)
     self.width = int(self.config.fetch('page_width'))
     # regarding adding words to the spell checker, I didn't
     # want to deal with the overhead of an user interface here
     # so I simply added a list of words to .config/enchant/en_US.dic
     # (local dictionary) I think that is what some other UI would
     # do anyway and now these words are available to other apps.
     self.chkr = SpellChecker("en_US")
     # keep a local copy cached to minimize the number of
     # requests to linkedin and to provide "offline" access
     # to the most recently downloaded version.
     # remember to delete this file after you edit your
     # linkedin profile
     cache_path = self.home + '/.jitresume'
     try :
         with open(cache_path, 'r') as f :
             profile = f.read()
     except IOError :
         conn = LinkedIn()
         profile = conn.getProfile()
         with open(cache_path, 'w') as f :
             f.write(profile)
     # Decode only after a profile was obtained; the former ``finally``
     # clause touched ``profile`` and ``f`` even when they were never bound.
     self.data = cjson.decode(profile)
示例#52
0
def get_tweet_typos(tweet):
    """
    Return the list of misspelled words found in ``tweet.text``.

    E-mail addresses and URLs are filtered out before checking.  The same
    list is also stored, JSON-encoded, on ``tweet.typos``.
    :param tweet: object with a ``text`` attribute
    :return: list of misspelled words
    """
    checker = SpellChecker("en_GB",filters=[EmailFilter,URLFilter])
    checker.set_text(tweet.text)
    typos = [error.word for error in checker]
    tweet.typos = json.dumps(typos)
    return typos
示例#53
0
	def smart_search(self, spell_check_string):
		"""File ``spell_check_string`` under its ``search_words`` result.

		A fresh checker for ``self.language`` is stored on ``self.spell_check``
		first.  Strings are grouped in ``self.decrypted_list``, keyed by the
		value ``self.search_words`` returns for them.
		"""
		self.spell_check = SpellChecker(self.language)
		errors = self.search_words(spell_check_string)
		bucket = self.decrypted_list.get(errors)
		if bucket:
			bucket.append(spell_check_string)
		else:
			# missing (or empty) entry: start a new group
			self.decrypted_list[errors] = [spell_check_string]
示例#54
0
文件: gui.py 项目: Oire/TWBlue
 def __init__(self, text, dictionary):
  """Build the spell-checking dialog for ``text`` and start checking.

  The checker uses the system default language when the configured language
  is "system", otherwise the language reported by ``languageHandler``.  When
  no dictionary is found, an error box is shown, the dialog is destroyed and
  construction stops.  ``dictionary`` is accepted but not used here.
  """
  super(spellCheckerDialog, self).__init__(None, 1)
  try:
   if config.main["general"]["language"] == "system": self.checker = SpellChecker()
   else: self.checker = SpellChecker(languageHandler.getLanguage())
   self.checker.set_text(text)
  except DictNotFoundError:
   wx.MessageDialog(None, _(u"A bug has happened. There are no dictionaries available for the selected language in TW Blue"), _(u"Error"), wx.ICON_ERROR).ShowModal()
   self.Destroy()
   # Stop here: there is no checker and the window is being destroyed, so
   # building widgets and calling self.check() below cannot work.
   return
  panel = wx.Panel(self)
  sizer = wx.BoxSizer(wx.VERTICAL)
  # Row: the word currently reported as misspelled.
  word = wx.StaticText(panel, -1, _(u"Mis-spelled word"))
  self.word = wx.TextCtrl(panel, -1)
  wordBox = wx.BoxSizer(wx.HORIZONTAL)
  wordBox.Add(word)
  wordBox.Add(self.word)
  # Row: surrounding context of that word.
  context = wx.StaticText(panel, -1, _(u"Context"))
  self.context = wx.TextCtrl(panel, -1)
  contextBox = wx.BoxSizer(wx.HORIZONTAL)
  contextBox.Add(context)
  contextBox.Add(self.context)
  # Row: single-selection list of suggestions.
  suggest = wx.StaticText(panel, -1, _(u"Suggestions"))
  self.suggestions = wx.ListBox(panel, -1, choices=[], style=wx.LB_SINGLE)
  suggestionsBox = wx.BoxSizer(wx.HORIZONTAL)
  suggestionsBox.Add(suggest)
  suggestionsBox.Add(self.suggestions)
  # Action buttons, each bound to its handler.
  ignore = wx.Button(panel, -1, _(u"Ignore"))
  self.Bind(wx.EVT_BUTTON, self.onIgnore, ignore)
  ignoreAll = wx.Button(panel, -1, _(u"Ignore all"))
  self.Bind(wx.EVT_BUTTON, self.onIgnoreAll, ignoreAll)
  replace = wx.Button(panel, -1, _(u"Replace"))
  self.Bind(wx.EVT_BUTTON, self.onReplace, replace)
  replaceAll = wx.Button(panel, -1, _(u"Replace all"))
  self.Bind(wx.EVT_BUTTON, self.onReplaceAll, replaceAll)
  close = wx.Button(panel, wx.ID_CANCEL)
  btnBox = wx.BoxSizer(wx.HORIZONTAL)
  btnBox.Add(ignore)
  btnBox.Add(ignoreAll)
  btnBox.Add(replace)
  btnBox.Add(replaceAll)
  btnBox.Add(close)
  # Stack the rows vertically and fit the panel to them.
  sizer.Add(wordBox)
  sizer.Add(contextBox)
  sizer.Add(suggestionsBox)
  sizer.Add(btnBox)
  panel.SetSizerAndFit(sizer)
  self.check()
示例#55
0
    def __init__(self, textwidget, **kw):
        """Attach a spell checker to ``textwidget``.

        Optional keywords: ``language`` (default 'en_US'), ``filters``
        (default e-mail and URL filters) and ``chunkers`` (default HTML
        chunker).  Also configures the 'sp_err' tag on the widget.
        """
        self.checker = SpellChecker(
            lang=kw.pop('language', 'en_US'),
            filters=kw.pop('filters', [EmailFilter, URLFilter]),
            chunkers=kw.pop('chunkers', (HTMLChunker,)),
        )

        self.tw = textwidget
        # tag used to highlight misspelled words
        self.tw.tag_config('sp_err', background="#CCFB5D", underline=False)
示例#56
0
def incorrect_words(request):
    """
    Check the POSTed "text" and return its misspelled words as JSON.

    Responds with a bad-request response when spell checking is disabled in
    settings or when no text was provided.  On success the body is a JSON
    object whose "data" entry holds a one-element list containing the list
    of misspelled words.
    """
    if not settings.SPELLCHECK_ENABLED:
        return HttpResponseBadRequest(_("spell checking not supported"))

    submitted = request.POST.get("text")
    if not submitted:
        return HttpResponseBadRequest(_("text not provided"))

    checker = SpellChecker(settings.LANGUAGE_CODE)
    checker.set_text(submitted)
    misspelled = [error.word for error in checker]
    payload = json.dumps({"data": [misspelled]})
    return HttpResponse(payload)
示例#57
0
def spellchecker(data):
  """Print spelling findings for the 'Finding0###' rows of ``data``.

  Each row is split on tabs; rows whose first field is 'Finding0###'
  contribute their second field (newlines stripped) as a text to check.
  For every such text each misspelled word is printed with its position
  and replaced by "SPAM", then the resulting text is printed.
  """
  selected = []
  for row in data:
    fields = row.split("\t")
    if fields[0] != 'Finding0###':
      continue
    selected.append(fields[1].strip('\n'))
  tweets = pd.DataFrame()
  tweets['text'] = list(selected)

  for text in tweets['text']:
    checker = SpellChecker("en_US", text)
    for error in checker:
        finding = error.word + " at position " + str(error.wordpos)
        print("{0}\t{1}".format("Findingres6###", finding))
        error.replace("SPAM")
    print("Text replaced by spam at  wrong spelling or words")
    replaced = checker.get_text()
    print("\n" + replaced)
def spellCheck(argument):
	"""Return the fraction of words in ``argument`` counted as spelling errors.

	A word counts as an error when the checker rejects it and it is also
	rejected in upper-case, capitalised and with a trailing period, and it
	is not one of the known abbreviations (``abb1 + abb2 + abb3``).
	Returns 0.0 for an empty ``argument`` instead of dividing by zero.
	"""
	words = len(argument)
	if words == 0:
		return 0.0

	abbr = abb1 + abb2 + abb3
	chkr = SpellChecker("en_GB", argument)

	errors = 0
	for word in argument:
		if chkr.check(word):
			continue
		# Accept alternative casings, a trailing period and abbreviations.
		if (chkr.check(word.upper())
				or chkr.check(word.capitalize())
				or chkr.check(word + ".")
				or word in abbr):
			continue
		errors += 1

	return float(errors) / words