Example #1
def printCorrect(word):
    word_new = spell(word)
    if word_new != word:
        for i in word:
            pyautogui.press('backspace')
        pyautogui.press('backspace')
        pyautogui.typewrite(word_new + " ")
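A minimal setup sketch for running Example #1 (both imports are assumptions inferred from the calls above):

# Assumed imports: pyautogui drives the keyboard, and older
# autocorrect releases exposed spell() directly.
import pyautogui
from autocorrect import spell

printCorrect("helo")  # erases the typed word and retypes the corrected one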
Example #2
 def cleanData(self, records, stemText=False):
     output = []
     recordsChecked = 0
     recordsToCheck = len(records)
     for sentence in records:
         sentence = str(sentence)
         recordsChecked += 1
         sys.stdout.write("\rRecords cleaned : %i / %i" %
                          (recordsChecked, recordsToCheck))
         cleanSentence = ''
         if len(sentence) < 200:
             words = sentence.split(' ')
             for word in words:
                 if len(word) < 12:
                     if word.isalpha():
                         if stemText:
                             cleanSentence += self.st.stem(
                                 spell(word.lower())) + ' '
                         else:
                             cleanSentence += word.lower() + ' '
         if cleanSentence:
             output.append(cleanSentence.strip())
     sys.stdout.write("\n")
     sys.stdout.flush()
     self.cleanedRecords = output
Example #3
def get_auto_corrected_words(words):
	"""Check each word in words for an autocorrect suggestion; return the corrected word list if anything was corrected, otherwise False."""
	autocorrected = False
	auto_corrected_words = []
	for word in words:
		if spell(word) != word:
			autocorrected = True
			auto_corrected_words.append(spell(word))
		else:
			auto_corrected_words.append(word)

	if not autocorrected:
		return False

	return auto_corrected_words
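A quick sketch of the return contract (again assuming the old `from autocorrect import spell` import):

# Hypothetical calls: a corrected list comes back only if spell()
# changed at least one word; otherwise the function returns False.
print(get_auto_corrected_words(['helo', 'world']))   # e.g. ['hello', 'world']
print(get_auto_corrected_words(['hello', 'world']))  # False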
Example #4
def correctList(lines):
    for i in range(len(lines)):
        #print(lines[i])
        sentence = lines[i].strip('\n')
        sentence = sentence.strip('.')
        if h1.match(sentence) or h2.match(sentence):
            continue
        else:
            # print("Sentence is: ", repr(sentence))
            words = sentence.split()
            string = ''
            for word in words:
                if word in locations:
                    string += word + ' '
                elif num.search(word):
                    string += word + ' '
                elif word in punctuations:
                    string += word + ' '
                elif word == '...':
                    string += word + ' '
                else:
                    corrected = spell(word)
                    string += corrected + ' '
        string = string.strip()
        lines[i] = string
    return lines
Example #5
def fix_typos(sentence):
	typo_list = []
	split_sent = sentence.split()
	for i, token in enumerate(split_sent):
		if not d.check(token) and str.isalnum(token):
			split_sent[i] = spell(token).lower()
	return ' '.join(split_sent)
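The `d` checker in Example #5 is undefined in the snippet; a plausible setup (the pyenchant dictionary is an assumption) looks like:

# Assumed setup: d is taken to be a pyenchant dictionary whose check()
# method flags unknown words; spell() is from autocorrect as elsewhere.
import enchant
from autocorrect import spell

d = enchant.Dict("en_US")
print(fix_typos("thsi sentence has typos"))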
Example #6
def search(query):
    if request.method == 'POST':
        query = request.form['search_field']
        query = spell(query)
        return redirect(url_for('search', query=query))
    query_ = [query]
    re_item = "^" + query + "$"
    result = [
        i for i, word in enumerate(master_df['name'])
        if re.search(re_item, word, flags=re.IGNORECASE)
    ]
    result += [
        i for i, word in enumerate(master_df['name'])
        if re.search(query, word, flags=re.IGNORECASE)
    ]
    result += [
        i for i, word in enumerate((master_df['title']))
        if re.search(re_item, str(word), flags=re.IGNORECASE)
    ]
    result += [
        i for i, word in enumerate((master_df['title']))
        if re.search(query, str(word), flags=re.IGNORECASE)
    ]
    result = pd.Series(result).drop_duplicates().tolist()
    returned = [{
        "name": master_df['name'].iat[idx],
        "time": master_df['time'].iat[idx],
        "title": (master_df['title'].iat[idx].split("https:", 1))[0],
        "url": master_df['url'].iat[idx]
    } for idx in result]
    return render_template('index.html', query=query, returned=returned)
Example #7
def clean_text(text):
    # lower text
    text = text.lower()
    # tokenize text and remove punctuation
    text = [word.strip(string.punctuation) for word in text.split(" ")]
    # remove words that contain numbers
    text = [word for word in text if not any(c.isdigit() for c in word)]
    # remove stop words
    stop = stopwords.words('english')
    text = [x for x in text if x not in stop]
    # remove empty tokens
    text = [t for t in text if len(t) > 0]
    # pos tag text
    pos_tags = pos_tag(text)
    # lemmatize text
    text = [
        WordNetLemmatizer().lemmatize(t[0], get_wordnet_pos(t[1]))
        for t in pos_tags
    ]
    # remove words with only one letter
    text = [t for t in text if len(t) > 1]
    # modify mis-spelled
    text = [spell(reduce_lengthening(t)) for t in text]
    # remove non english and mis-spelled
    text = check_english(text)
    # join all
    text = " ".join(text)
    return (text)
Example #8
def imgOcrEng(file_name):
	im = Image.open(file_name)
	text = pytesseract.image_to_string(im, lang='eng')

	fin = open('temp-extracted.txt','w')
	fin.write(text)
	fin.close()

	fhand = open('temp-extracted.txt')
	fout = open('extracted.txt','w')


	for line in fhand:
	    line = line.rstrip()
	    words=line.split()
	    for word in words:
	        word=spell(word)+' '
	        fout.write(word)
	    fout.write('\n')
	fout.close()

	f = open("extracted.txt", "r")
	text = f.read()
	f.close()

	os.remove("temp-extracted.txt")
	#os.remove("extracted.txt")
	return text
Example #9
def performSpellCorrection(featureObj):
    checker = SpellChecker("en_US", featureObj.getText())
    for word in checker:
        word.replace(spell(word.word))

    featureObj.getLexicalFeatures().setSpellCorrection(checker.get_text())

    return featureObj
Example #10
def convert_to_list(oov_string):
    oov_string = oov_string.split("'")
    oov_string = oov_string[1::2]
    for i in range(len(oov_string)):
        print(oov_string[i] + ' ', end='')
        oov_string[i] = spell(oov_string[i])
        print(oov_string[i])
    return oov_string
Example #11
 def autocorrect(self):
     line = ''
     for word in self.sentence.split(' '):
         if word.isdigit():
             line = line + ' ' + word
         else:
             line = line + ' ' + spell(word)
     self.sentence = line
Example #12
def spell_check(sentence):
    sentence2 = ""
    sentence = sentence.split()
    for i in sentence:
        sentence2 = sentence2 + spell(i) + ' '
    sentence2 = sentence2[:-1]
    print("Sentence after spell check: ", sentence2)
    return sentence2
Example #13
 def auto_correct(self, vector):
     self.auto_correct_remaining -= 1
     print('\rSpell auto-correct...',
           self.auto_correct_remaining,
           'sentences remain',
           end='    ',
           flush=True)
     return [spell(word) for word in vector]
Example #14
 def autocorrect(self):
     line = ''
     for word in self.sentence.split(' '):
         if word.isdigit():
             line = line + ' ' + word
         else:
             line = line + ' ' + spell(word)
     self.sentence = line
Example #15
def do_auto_correct(text):
    final_str = ""
    for x in text.split():
        corrected = spell(x)
        final_str += corrected+" "
    if len(final_str) > 2:
        final_str = final_str[:-1]  # drop the trailing space
    return final_str
Example #16
def correctMatch(match):
    word = str(match.group())
    try:
        int(word)
        corrected_word = word
    except ValueError:
        corrected_word = spell(word)
    return corrected_word
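correctMatch is shaped like an re.sub callback; a hedged usage sketch (the token pattern is an assumption):

import re
from autocorrect import spell

# Hypothetical pattern: hand every alphanumeric token to correctMatch;
# the int() check above leaves pure numbers untouched.
text = re.sub(r"[A-Za-z0-9]+", correctMatch, "I havv 2 catts")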
Example #17
def check_one_sentence(sentence):
	new_str = ''
	for word in sentence.split():
		if len(word) == 1 and ord(word[0]) < 65:
			new_str += word + ' '
		else:
			new_str += spell(word) + ' '
	return new_str
Example #18
def _AutocorrectAsNeeded(word: str):
    """
    Detect if word needs spell correcting and return correct word
    """
    if word in words.words():
        return word
    else:
        return spell(word)
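The `words` lookup above appears to be NLTK's word-list corpus; a sketch of the assumed setup:

# Assumed imports. Note that `word in words.words()` scans a large list
# on every call; caching the corpus in a set would be much faster.
from nltk.corpus import words
from autocorrect import spell

print(_AutocorrectAsNeeded('speling'))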
Example #19
def correct_sentence(line):
    lines = line.strip().split(' ')
    new_line = ""
    similar_word = {}
    for l in lines:
        new_line += spell(l) + " "
    # similar_word[l]=spell.candidates(l)
    return new_line
Example #20
def get_bot_response():
    query = request.args.get('msg')
    query = [spell(w) for w in (query.split())]
    question = " ".join(query)
    response = k.respond(question)
    if response:
        return str(response)
    else:
        return str(":)")
Example #21
def spell_correct(search_term):
    search_term = search_term.split()
    corrected_string = ''
    for term in search_term:
        corrected_term = term
        if term.isalpha():
            corrected_term = spell(term)
        corrected_string += corrected_term + ' '
    return corrected_string.strip()
Example #22
def autocorrect_spell(word):
    global count_spelling, count_words
    count_words = count_words + 1
    if re.match("[^A-Za-z0-9]+", word):
        if word == '\'t':
            word = 'n\'t'
            return spell(word).lower()
        else:
            return word
    else:
        if word == '\'t':
            word = 'n\'t'
            #print("n't")
        new_string = spell(word).lower()
        if new_string != word:
            count_spelling = count_spelling + 1
            incorrect_words.append(new_string + ',' + word)
        return new_string
Example #23
def spell_unrecognized(vocab, tweet):
    words = re.split(r'\s+', tweet)
    new_words = []
    for word in words:
        if word in vocab:
            new_words.append(word)
            continue
        new_words.append(spell(word))
    return ' '.join(new_words)
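A usage sketch for Example #23 with a hypothetical vocabulary:

# Hypothetical vocabulary: in-vocabulary tokens pass through unchanged,
# everything else goes through spell().
vocab = {'the', 'quick', 'brown', 'fox'}
print(spell_unrecognized(vocab, 'the quikc brown fox'))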
Example #24
File: run2.py  Project: gmurry/project2
def emphasize(word):
    if (word[:2] == '..'):
        return '...'

    new_word = re.sub(r'(.)\1{2,}', r'\1', word)
    if len(new_word) != len(word):
        return '<<' + spell(new_word) + '>>'
    else:
        return word
Example #25
 def _correct_word(self, text1):
     pattern = re.compile(r"(.)\1{2,}")
     text2 = pattern.sub(r"\1\1", text1)  # reduce lengthening
     #if text1 != text2:
     #    print(text1, text2)
     text3 = spell(text2).lower()  # spell correction
     #if text2 != text3:
     #    print(text2, text3)
     return text3
Example #26
def word_correct(tweet):
    tweet = tweet.strip().split()
    for i in range(0, len(tweet)):
        word = tweet[i]
        if not wordnet.synsets(word):
            word = spell(word)
            tweet[i] = word
    tweet = ' '.join(tweet)
    return tweet
Example #27
        def evaluate(event):
            a = spell(e.get())

            if a.isalpha():
                res.configure(text='''Meaning of:  ''' + a + " \n" +
                              str(dic.meaning(a)))
            else:
                tkMessageBox.showinfo("INVALID INPUT",
                                      ''' Please enter appropriate word!!!''')
Example #28
        def evaluate(event):
            a = spell(e.get())

            if a.isalpha():
                res.configure(text='''Synonyms of:  ''' + a + " \n" +
                              str(dic.synonym(a)))
            else:
                tkMessageBox.showinfo("INVALID INPUT",
                                      ''' Please enter a word!!!''')
Example #29
def ocr_core(filename):
    """
    This function will handle the core OCR processing of images.
    """
    config = ('-l eng --oem 1 --psm 3')
    text = pytesseract.image_to_string(
        Image.open(filename), config=config
    )  # We'll use Pillow's Image class to open the image and pytesseract to detect the string in the image
    return spell(text.replace('\n', ''))
Example #30
def preprocesstext(text, keyword, hashtag_remove = False, at_usr_remove = False, remove_url = False, replaceSpecialChars = True,
                   replaceNumbers = True, convertToLowerCase = True, removeDefaultStopWords = True, removeGivenWords = False,
                   stemWords = False, lemmatize = False, spellcorrect = False, word_list = []):
    assert isinstance(hashtag_remove,bool)
    assert isinstance(at_usr_remove,bool)
    assert isinstance(replaceSpecialChars,bool)
    assert isinstance(replaceNumbers,bool)
    assert isinstance(convertToLowerCase,bool)
    assert isinstance(removeDefaultStopWords,bool)
    assert isinstance(removeGivenWords,bool)
    assert isinstance(stemWords,bool)
    assert isinstance(lemmatize,bool)
    assert isinstance(spellcorrect,bool)
    assert isinstance(word_list,list) | isinstance(word_list,str)
    if hashtag_remove:
       hashtag = re.findall(r'#([A-Za-z0-9]+)', text)
       if len(hashtag) != 0:
           for i in hashtag:
               rep = '#' + i
               if keyword in i:
                   text = text.replace(rep,' ')
               else:
                   text = text.replace(rep,i)
    if at_usr_remove:
       at_usr = re.findall(r'@[^\s]+', text)
       for i in at_usr:
           text = text.replace(i,'')
    if remove_url:
       text = re.sub(r'((www\.[^\s]+)|(https://[^\s]+))','',text)
       text = re.sub("http\S+", "", text)
       text = re.sub("https\S+", "", text)
    text = re.sub(r"[^a-zA-Z0-9\s%s]" % re.escape(string.punctuation)," ",text)         # removing characters other than punctuations, numbers and letters
    if replaceSpecialChars:
       text = re.sub(r"[^0-9A-Za-z']", " ", text)
    if replaceNumbers:
       text = re.sub(r"[0-9]", " ",text)
    text = re.sub(r'([A-Za-z])\1+', r'\1\1', text)   # collapse runs of a repeated letter down to two occurrences
    if convertToLowerCase:
       text = str(text).lower()
    if removeDefaultStopWords:
       stopwords = "(^|\\s)(" + '|'.join(nltk.corpus.stopwords.words('english')) + ")(\\s|$)"
       text = re.sub(stopwords, " ", str(text))
    if removeGivenWords and len(word_list) != 0:
       if type(word_list) == str:
           text = re.sub(word_list, " ", str(text))
       else:
           otherwords = "(^|\\s)(" + str('|'.join(word_list)) + ")(\\s|$)"
           text = re.sub(otherwords, " ", str(text))
    text = re.sub(r"\s+", " ", str(text))    # multiple whitespace characters collapsed to a single blank
    text = str(text).strip()
    if stemWords:
       text = " ".join([stem(y) for y in str(text).split() if y not in nltk.corpus.stopwords.words('english')])
    if lemmatize:
       text = " ".join([lemmatizer.lemmatize(y) for y in str(text).split() if y not in nltk.corpus.stopwords.words('english')])
    if spellcorrect:
       text = " ".join([spell(y) for y in str(text).split() if y not in nltk.corpus.stopwords.words('english')])
    return(text)
Example #31
def num_of_missplling(tweet):
    words = tweet.split()
    counter = 0
    # words = speller.unknown(words)
    for word in words:
        if (word.isalpha()):
            if (spell(word) != word):
                counter += 1
    return counter
Example #32
def claim_r_claims_present(s1):
    try:
        s1_res = s1.split(' ')
        claim_ind = 0
        claims_ind = 0
        for s in s1_res:
            if spell(str(s)).lower() == "claim":
                claim_ind = 1
            if spell(str(s)).lower() == "claims":
                claims_ind = 1
        if claims_ind == 1:
            return "claims"
        elif claim_ind == 1:
            return "claim"
        else:
            return "NOA"
    except Exception as e:
        print("error in presence of claim or claims function" + str(e))
Example #33
def spell_correct(text):
    text = text.split(' ')
    c_text = []
    for word in text:
        if bool(re.match(r'\W', word)) is False:
            word = autocorrect.spell(word)
        c_text.append(word)
    c_text = ' '.join(c_text)
    return c_text
Example #34
def spell_check(query):
    """
    Takes an n-gram and fixes spelling
    """
    query_list = query.split()
    for index, q in enumerate(query_list):
        query_list[index] = spell(q.strip())
    suggestion = " ".join(query_list)
    return suggestion.strip()
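A usage sketch (assuming the same `from autocorrect import spell` import as the other examples):

print(spell_check('machne lerning'))  # e.g. 'machine learning'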
Example #35
def runSpellChecker(reviews):
    stemmer = PorterStemmer()
    index = 0
    for review in reviews:
        for i in range(len(review)):
            review[i] = stemmer.stem(spell(review[i]))
        reviews[index] = review
        index += 1
    return reviews
Example #36
def spelltest(tests, verbose=False):
    n, bad, unknown, start = 0, 0, 0, time.perf_counter()
    for target, incorrect_spellings in tests.items():
        for incorrect_spelling in incorrect_spellings.split():
            n += 1
            w = spell(incorrect_spelling)
            if w != target:
                bad += 1
                if not known([target]):
                    unknown += 1
                if verbose:
                    print(MSG.format(incorrect_spelling, w, NLP_COUNTS[w],
                                     target, NLP_COUNTS[target]))
    return RESULT.format(bad, n, int(100. - 100. * bad / n),
                         unknown, int(time.perf_counter() - start))
Example #37
def sample(scores, topics_file='lifelog_qrels/lifelogging_topics_formal.xml', max=50):

    xmldoc = minidom.parse(topics_file)
    topic_nodes = xmldoc.getElementsByTagName('topic')

    queries = ""

    for node in topic_nodes:
        topic = {}
        for tag in node.childNodes:
            if tag.nodeType == tag.ELEMENT_NODE:
                name, value = tag.tagName, tag.childNodes[0].nodeValue
                if name == 'narrative':
                    queries += value

    query_terms = set(string_to_concepts(queries))
    topics = []

    concepts = {}
    terms = []

    with open(os.path.dirname(os.path.abspath(__file__)) + '/data/stopwords.txt') as f:
        stopwords = f.read().split('\n')

    for term in query_terms:
        if term not in stopwords:
            topics.append(term)

    for term in scores:
        if term not in stopwords and term in topics:
            concepts[term] = scores[term]

    sorted_concepts = sorted(concepts.items(), key=lambda x: x[1], reverse=True)

    sorted_terms = [x[0] for x in sorted_concepts]

    indices = gen_log_space(len(sorted_terms), max)
    for i in indices:
        terms.append(sorted_terms[i])

    for i in range(len(terms)):
        terms[i] = spell(terms[i]).lower()
    print(terms, len(terms))
Example #38
def phraseSentence(msg):
    msg = msg.split(' ')
    msg_new = []
    for i in msg:
        msg_new.append(autocorrect.spell(i).lower())
    return msg_new
Example #39
def spell_stem(txt):
    word_list = [stem(spell(word)) for word in txt]
    word_list = [word for word in word_list if word not in stops]
    return " ".join(word_list)