def correctSpelling(element):
    # Normalize common punctuation separators, then spell-correct each piece.
    word = str(element).lower()
    # Replace every separator with a space. (The original elif chain only
    # handled the first separator type found, so mixed punctuation like
    # "a,b/c" was split incompletely.)
    replacedPunc = word
    for sep in (',', '/', '\\', '.', '-', '_'):
        replacedPunc = replacedPunc.replace(sep, ' ')
    splits = replacedPunc.split()
    if len(splits) == 1:
        corrWord = spell.correction(splits[0])
    elif len(splits) > 1:
        corrWord = ' '.join(spell.correction(wrd) for wrd in splits)
    else:
        raise SpellingError(f'No correctable words in input: {element!r}')
    return corrWord
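# A minimal wiring sketch for the snippet above: `spell` and SpellingError
# are never defined there, so both stand-ins below are assumptions (here
# pyspellchecker's SpellChecker provides the .correction() method).
from spellchecker import SpellChecker

spell = SpellChecker()

class SpellingError(Exception):
    """Raised when the input contains no correctable words."""

# correctSpelling('speling,mistaek')  # -> 'spelling mistake'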
def unit_tests():
    assert correction('speling') == 'spelling'              # insert
    assert correction('korrectud') == 'corrected'           # replace 2
    assert correction('bycycle') == 'bicycle'               # replace
    assert correction('inconvient') == 'inconvenient'       # insert 2
    assert correction('arrainged') == 'arranged'            # delete
    assert correction('peotry') == 'poetry'                 # transpose
    assert correction('peotryy') == 'poetry'                # transpose + delete
    assert correction('word') == 'word'                     # known
    assert correction('quintessential') == 'quintessential' # unknown
    assert words('This is a TEST.') == ['this', 'is', 'a', 'test']
    assert Counter(words('This is a test. 123; A TEST this is.')) == Counter(
        {'123': 1, 'a': 2, 'is': 2, 'test': 2, 'this': 2})
    # assert len(WORDS) == 32192
    # assert sum(WORDS.values()) == 1115504
    assert WORDS.most_common(10) == [
        ('the', 79808), ('of', 40024), ('and', 38311), ('to', 28765),
        ('in', 22020), ('a', 21124), ('that', 12512), ('he', 12401),
        ('was', 11410), ('it', 10681)]
    assert WORDS['the'] == 79808
    assert P('quintessential') == 0
    assert 0.07 < P('the') < 0.08
    return 'unit_tests pass'
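# For reference, a minimal sketch of the Norvig-style model these tests
# exercise (words/WORDS/P/correction). This follows Norvig's published
# spell.py; 'big.txt' is his training corpus and is an assumption here.
import re
from collections import Counter

def words(text):
    return re.findall(r'\w+', text.lower())

WORDS = Counter(words(open('big.txt').read()))

def P(word, N=sum(WORDS.values())):
    "Probability of `word`."
    return WORDS[word] / N

def known(wds):
    "The subset of `wds` that appear in WORDS."
    return set(w for w in wds if w in WORDS)

def edits1(word):
    "All edits that are one edit away from `word`."
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word):
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))

def candidates(word):
    "Generate possible spelling corrections for `word`."
    return known([word]) or known(edits1(word)) or known(edits2(word)) or [word]

def correction(word):
    "Most probable spelling correction for `word`."
    return max(candidates(word), key=P)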
def options2(opt, f):
    if not opt.isdigit():
        opt = spell.correction(opt.lower())
    token = opt.split(' ')
    checkFlag = 0
    print('token is', token)
    print('f is', f)
    for i in token:
        # The original iterated `for k, j in enumerate(range(len(f)))`,
        # where k and j are always equal; a single index suffices.
        for j in range(len(f)):
            if len(i) == 1 and i.isdigit():  # guard: int() on a letter would raise
                opt = str(i)
                if int(opt) - 1 >= len(f):  # out-of-range menu index: treat as an amount
                    i = 'Rs ' + i
                    print('i from here', i)
                else:
                    checkFlag = 1
                    break
            if len(i.lower()) >= 2 and i.lower() in f[j].lower():
                opt = str(j + 1)
                checkFlag = 1
                break
        if checkFlag == 1:
            break
    print('opt is', opt)
    return opt, checkFlag
def tokenize(text):
    """
    Tokenizes the given text after normalizing it.

    Normalizing steps (3-4 and 6-8 are currently disabled below):
    1. Make lowercase
    2. Remove links
    3. Remove stop words
    4. Remove punctuation
    5. Lemmatize
    6. Make localization
    7. Remove non-alphanumeric characters
    8. Remove digits
    """
    text = Tokenizer.make_lower_case(text)
    text = Tokenizer.remove_links(text)
    # text = Tokenizer.remove_stop_words(text)
    # text = Tokenizer.remove_punctuation(text)
    tokens = text.split()
    if config.SPELL_CORRECTION:
        tokens = [spell.correction(token) for token in tokens]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # tokens = [Tokenizer.make_localization(token) for token in tokens]
    # tokens = [Tokenizer.remove_nonalphanumeric(token) for token in tokens]
    # tokens = [Tokenizer.remove_digits(token) for token in tokens]
    return tokens
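# Minimal wiring sketch for the names tokenize() assumes; Tokenizer,
# config, and lemmatizer are undefined above, so these stand-ins are
# assumptions rather than the project's real implementations.
import re
from nltk.stem import WordNetLemmatizer

class Tokenizer:
    @staticmethod
    def make_lower_case(text):
        return text.lower()

    @staticmethod
    def remove_links(text):
        return re.sub(r'https?://\S+', ' ', text)

class config:
    SPELL_CORRECTION = False  # enable once a `spell` object is wired in

lemmatizer = WordNetLemmatizer()

# tokenize('Cats were HERE http://example.com')  # -> ['cat', 'were', 'here']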
def cmd_app():
    query = prompt.query("Type query: ", validators=[])
    query_terms = tokenize(query)
    updated_query = []
    corrected = False
    for term in query_terms:
        new_term = correction(term, my_index.WORDS)
        if new_term != term:
            corrected = True
        updated_query.append(new_term)
    new_query = ' '.join(updated_query)
    if corrected:
        print('Search results for ' + colored.green(new_query))
    doc_ids = my_index.search(new_query)
    for doc_id, score in doc_ids[:100]:
        print('*' * 50)
        print(colored.blue(my_index.files[doc_id]))
        with open(dirname + my_index.files[doc_id], 'r') as f:
            text = f.read()
        sent = sent_tokenize(text)
        with indent(4, quote=''):
            for s in sent:
                for q in updated_query:
                    if q in s:
                        puts(s)
        print()
def extract_for_image(extra_path):
    predicted_list = refactored_main.extract_result(opt, crnn, converter,
                                                    extra_path)
    match = np.array([
        1 if prediction.target.lower() == prediction.pred.lower() else 0
        for prediction in predicted_list
    ])
    print("Model Prediction : ")
    model_prediction = " ".join(
        prediction.pred.lower() for prediction in predicted_list)
    print(model_prediction)
    print('Accuracy : ' + str(match.mean()))
    corrected_text = [
        correction(prediction.pred.lower()) for prediction in predicted_list
    ]  # TODO: optimize this.
    corrected_match = np.array([
        1 if corrected.lower() == prediction.target.lower() else 0
        for corrected, prediction in zip(corrected_text, predicted_list)
    ])
    print("Corrected Prediction : ")
    corrected_prediction = " ".join(corrected_text)
    print(corrected_prediction)
    print('Accuracy : ' + str(corrected_match.mean()))
    # TODO: also report accuracy for entries with at least 5 terms.
    return "\n\n".join(["Original: " + model_prediction, corrected_prediction])
def extract_result(image_index):
    # Get the predicted list.
    predicted_list = refactored_main.extract_result(opt, crnn, converter,
                                                    image_mapping[image_index])
    match = np.array([
        1 if prediction.target.lower() == prediction.pred.lower() else 0
        for prediction in predicted_list
    ])
    print("Model Prediction : ")
    print(" ".join(prediction.pred.lower() for prediction in predicted_list))
    print('Accuracy : ' + str(match.mean()))
    corrected_text = [
        correction(prediction.pred.lower()) for prediction in predicted_list
    ]  # TODO: optimize this.
    corrected_match = np.array([
        1 if corrected.lower() == prediction.target.lower() else 0
        for corrected, prediction in zip(corrected_text, predicted_list)
    ])
    print("Corrected Prediction : ")
    print(" ".join(corrected_text))
    print('Accuracy : ' + str(corrected_match.mean()))
    # TODO: also report accuracy for entries with at least 5 terms.
    return " ".join(corrected_text)
def tagNtokenize(strInput, isInput=False):
    current_dir = dirname(__file__)
    file_path = join(current_dir, 'sent_tokenizer.pickle')
    with open(file_path, 'rb') as f:
        custom_sent_tokenizer = pickle.load(f)
    file_path = join(current_dir, 'root_dict.pickle')
    with open(file_path, 'rb') as f:
        root_dict = pickle.load(f)
    tokenized = custom_sent_tokenizer.tokenize(strInput)
    tagged = []
    try:
        for t in tokenized:
            if isInput and "PERSON" not in NER([(t, "NN")]):
                t = spell.correction(t)
            words2 = nltk.word_tokenize(t)
            # The original range(0, len(words2) - 1) skipped the last word.
            for i in range(len(words2)):
                if words2[i] in root_dict:
                    words2[i] = root_dict[words2[i]]
            tagged = nltk.pos_tag(words2)
    except Exception as e:
        print(str(e))
    return tagged
def correctSpell():
    results = []
    if 'query' in request.args:
        query = request.args['query']
    else:
        return jsonify({"results": results})
    item = spell.correction(query)
    if item == '':
        results.append({"id": 0, "text": ''})
    else:
        results.append({"id": 0, "text": 'Did you mean: ' + str(item) + '?'})
    return jsonify({"results": results})
def getpersonbyid():
    if not request.json:
        return "no json received"
    mydata = request.json
    jsontestdata = json.dumps(mydata)
    # yaml is used to coerce unicode values to str.
    yamldata = yaml.safe_load(jsontestdata)
    domainreturn = yamldata['fetchDomain']
    urlreturn = yamldata['urlName']
    DomainOnlyreturn = yamldata['DomainOnly']
    domainURL = urlreturn
    domainfetch = domainreturn
    a = correction(domainfetch) + '.com'
    print(a)
    saveImage('http://' + a)
    saveImageUrl(domainURL)
    NormValue = main('app/static/test1.png', 'app/static/test2.png')
    print(NormValue)
    if NormValue == 0:
        # Guess the domain that is being phished.
        TargetSimValue = DomSimhashTarget('www.' + a)
        print(TargetSimValue)
        # The original (suspected phishing) URL we are trying to detect.
        AgainstSimValue = DomSimhashAgainst(DomainOnlyreturn)
        print(AgainstSimValue)
        if TargetSimValue != AgainstSimValue:
            print("Phishing Detected")
            return "Phishing detected. Please be careful with each action on this page"
        else:
            print("Phishing Not Detected")
    print("Exit time")
    return "Wait Bro"  # TODO: replace this placeholder response
def main():
    miss_spelled_words = []
    input_line = sys.argv[1]
    dict_words_list = load_pickle_file(pkl_file_path)
    ip_words = input_line.strip().split(' ')
    sorted_uniq_ip_words = sorted(set(ip_words))
    # extend, not append: spellcheck() returns a list of words, and the
    # original append() made the loop below iterate over a nested list.
    miss_spelled_words.extend(spellcheck(sorted_uniq_ip_words, dict_words_list))
    for word in miss_spelled_words:
        print("misspelled word found: " + word +
              ", possible correct word: " + str(spell.correction(word)))
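# A hedged sketch of the spellcheck() helper main() relies on (it is not
# defined in the snippet above); assumed to return the input words that do
# not appear in the dictionary list.
def spellcheck(words_to_check, dict_words_list):
    known_words = set(dict_words_list)
    return [w for w in words_to_check if w not in known_words]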
def correction(self, word):
    tmp = str(word)
    word = correction(word, MSR=self.MSR, distance_weight=self.dw)
    if word is None:
        word = self.lex.leven_fit(
            tmp,
            area=(self.lex.index(len(tmp)), self.lex.index(len(tmp) + 2)))
    return word
def checkSynG(user_response):
    '''
    Check for "goodbye" synonyms, since "bye" is our exit word,
    applying spell check to each word.
    '''
    for word in user_response.split():
        word = correction(word)
        s = getSynyms(word)
        for i in range(len(s)):
            if s[i] in GOODBYE_KEYWORDS:
                word = s[i]
                return word
    return user_response
def greeting(sentence):
    """If user's input is a greeting, return a greeting response."""
    for word in sentence.split():
        word = correction(word)
        if word.lower() in GREETING_INPUTS:
            return random.choice(GREETING_RESPONSES)
        else:
            # Check whether any synonym appears in the sample greetings.
            s = getSynyms(word)
            for i in range(len(s)):
                if s[i] in GREETING_INPUTS:
                    return random.choice(GREETING_RESPONSES)
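# A hedged sketch of the getSynyms() helper used by checkSynG() and
# greeting() above (it is undefined in these snippets); this WordNet-based
# version is an assumption about its behavior.
from nltk.corpus import wordnet

def getSynyms(word):
    synonyms = []
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.append(lemma.name().lower())
    return synonyms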
def unit_tests():
    assert correction('speling') == 'spelling'              # insert
    assert correction('korrectud') == 'corrected'           # replace 2
    assert correction('bycycle') == 'bicycle'               # replace
    assert correction('inconvient') == 'inconvenient'       # insert 2
    assert correction('arrainged') == 'arranged'            # delete
    assert correction('peotry') == 'poetry'                 # transpose
    assert correction('peotryy') == 'poetry'                # transpose + delete
    assert correction('word') == 'word'                     # known
    assert correction('quintessential') == 'quintessential' # unknown
    assert words('This is a TEST.') == ['this', 'is', 'a', 'test']
    assert get_count(words('This is a test. 123; A TEST this is.')) == Counter(
        {'123': 1, 'a': 2, 'is': 2, 'test': 2, 'this': 2})
    return 'unit_tests pass'
def spelltest(tests, verbose=False):
    "Run correction(wrong) on all (right, wrong) pairs; report results."
    import time
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    good, unknown = 0, 0
    n = len(tests)
    for right, wrong in tests:
        w = correction(wrong)
        good += (w == right)
        if w != right:
            unknown += (right not in WORDS)
            if verbose:
                print('correction({}) => {} ({}); expected {} ({})'.format(
                    wrong, w, WORDS[w], right, WORDS[right]))
    dt = time.perf_counter() - start
    print('{:.0%} of {} correct ({:.0%} unknown) at {:.0f} words per second'
          .format(good / n, n, unknown / n, n / dt))
def index():
    """Process search request and results."""
    result = ("<html style='margin:20px 50px'>\n<body>"
              "<p>&nbsp;&nbsp;&nbsp;Welcome to<br/>"
              "Document Retrieval Engine</p>") + form()
    if request.method == 'POST':
        query_terms = tokenize(request.form['query'])
        updated_query = []
        corrected = False
        for term in query_terms:
            new_term = correction(term, my_index.WORDS)
            if new_term != term:
                corrected = True
            updated_query.append(new_term)
        result += results2string(my_index.search(' '.join(updated_query)),
                                 ' '.join(updated_query), corrected)
    result += "</body></html>"
    return result
def spell_test(tests, verbose=False):
    """
    Run correction(wrong) on all (right, wrong) pairs.
    No return value; just report results.
    """
    good, unknown = 0, 0
    n = len(tests)
    for right, wrong in tests:
        w = correction(wrong)
        good += (w == right)
        if w != right:
            unknown += (right not in WORDS)
            if verbose:
                print('correction({}) => {} ({}); expected {} ({})'
                      .format(wrong, w, WORDS[w], right, WORDS[right]))
    print('{:.0%} of {} correct ({:.0%} unknown)'
          .format(good / n, n, unknown / n))
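# A hedged sketch of how a test set for spell_test()/spelltest() might be
# built, following Norvig's "right: wrong1 wrong2" file format; the file
# name in the example is an assumption.
def testset(lines):
    "Parse 'right: wrong1 wrong2' lines into [('right', 'wrong1'), ...] pairs."
    return [(right, wrong)
            for (right, wrongs) in (line.split(':') for line in lines)
            for wrong in wrongs.split()]

# Example: spell_test(testset(open('spell-testset1.txt')), verbose=False)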
def normalisation(tweet):
    mention_removed = re.sub(r'(?:@[\w_]+)', '', tweet.lower())
    html_removed = re.sub(r'<[^>]+>', '', mention_removed)
    hashtag_removed = re.sub(r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", '', html_removed)
    removed_repeated_chars = re.sub(r'(.)\1+', r'\1\1', hashtag_removed)
    normalised_text1 = re.sub(' +', ' ', removed_repeated_chars)
    normalizr = Normalizr(language='en')
    normalizations = [
        ('replace_urls', {'replacement': ' '}),
        ('replace_punctuation', {'replacement': ' '}),
        ('replace_emojis', {'replacement': ' '}),
        ('replace_hyphens', {'replacement': ' '}),
        ('replace_symbols', {'replacement': ' '}),
        'remove_accent_marks',
        'remove_stop_words',
        'remove_extra_whitespaces',
    ]
    normalised_text2 = normalizr.normalize(normalised_text1, normalizations)
    array_words = normalised_text2.split()
    normalised_text3 = [correction(word) for word in array_words]
    normalised_tweet = " ".join(normalised_text3)
    return normalised_tweet
def options(opt, f):
    opt = spell.correction(opt.lower())
    token = opt.split(' ')
    checkFlag = 0
    for i in token:
        for j in f:
            # Guard with isdigit(): the original called int(i) on any
            # single character, which raises ValueError on letters.
            if len(i) == 1 and i.isdigit() and 1 <= int(i) <= 5:
                opt = str(i)
                checkFlag = 1
                break
            if len(i.lower()) > 2 and i.lower() in j.lower():
                opt = str(j)
                checkFlag = 1
                break
        if checkFlag == 1:
            break
    return opt, checkFlag
def createMessage(input):
    '''Takes JSON Facebook input and creates the message to return to Facebook.'''
    spellingcheck = str(input).split()
    for counter, x in enumerate(spellingcheck):
        # Only correct words that are not known candidates, not food/drink
        # synonyms, and not in the MEALS or DRINKS lists.
        if (x not in candidates(x)
                and x not in Word("grub").synsets[0].lemma_names()
                and x not in Word("booze").synsets[0].lemma_names()
                and x not in MEALS and x not in DRINKS):
            spellingcheck[counter] = correction(x)
            print(spellingcheck[counter])
    spell_checked_word = " ".join(spellingcheck)
    print(spell_checked_word)
    input_msg = TextBlob(spell_checked_word)
    senderId = 0
    data = buildMessage(input_msg, senderId)
    # Return the built message back out to Messenger.
    return str(data)
    stop_words = stopwords.words('dutch')
    # Remove stop words and punctuation.
    keywords = [word for word in tokens
                if word not in stop_words and word not in string.punctuation]
    print(f'this text has {len(keywords)} words')
    return keywords

def get_unknown(keywords):
    unknown = [word for word in keywords if word not in spell.WORDS]
    print(f'{len(unknown)} words unknown')
    return unknown

def stem(keywords):
    stemmer = PorterStemmer()
    words_stemmed = [stemmer.stem(word) for word in keywords]
    return words_stemmed

def correct(unknown):
    # The original body was truncated after "corrected ="; the script code
    # below shows what it computes, so it is reconstructed here.
    corrected = [spell.correction(word) for word in unknown]
    return corrected

with open('Casestudy.txt') as f:
    text = f.read()
text = clean(text)
keywords = get_keywords(text)
unknown = get_unknown(keywords)
corrected = correct(unknown)
corpus = keywords + corrected
def get_corrected_search_query(search_query):
    search_query_terms = search_query.split(" ")
    search_query_corrected = [correction(term) for term in search_query_terms]
    return " ".join(search_query_corrected).lower()
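# Usage sketch, assuming the Norvig-style correction() sketched earlier is
# in scope.
if __name__ == '__main__':
    print(get_corrected_search_query('speling korrectud'))  # 'spelling corrected'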
def scan():
    # Get the search parameters from the request.
    query_key = request.args.get('query')
    page_rank = request.args.get('pagerank')
    solr = SolrClient('http://localhost:8983/solr')
    if page_rank == '1':
        res = solr.query('myexample', {
            'q': query_key,
            'sort': 'pageRankFile desc',
        })
    else:
        res = solr.query('myexample', {
            'q': query_key,
        })
    if res is None:
        json_result = {'query': None}
        return json.dumps(json_result, ensure_ascii=False)
    for value in res.docs:
        # Add snippets.
        snippet = get_snippet(value['id'], query_key)
        value['snippet'] = snippet
        if 'description' not in value:
            value['description'] = 'NULL'
        if 'og_url' not in value:
            with open('./mapNBCNewsDataFile.csv') as f:
                key = value['id'].split('/')[-1]
                for line in f:
                    if line.split(',')[0] == key:
                        value['og_url'] = line.split(',')[-1]
                        break
    # Use Norvig's result to replace the Solr spelling suggestion.
    correct_res = res.data['spellcheck']
    correct_word_list = []
    if correct_res.get('suggestions'):
        for word in query_key.split():
            correct_word_list.append(correction(word))
        res.data['spellcheck']['collations'][1] = ' '.join(correct_word_list)
    return json.dumps(res.data, ensure_ascii=False, indent=4)
def home():
    # Log-in credentials.
    name = None
    email = None
    user_name = email
    session = request.environ.get('beaker.session')
    if session['_id'] in local_credentials:
        # Retrieve the name and email from the Google Plus API.
        plus = build('plus', 'v1',
                     credentials=local_credentials[session['_id']])
        plus_details = plus.people().get(userId='me').execute()
        name = str(plus_details['name']['givenName']) + " " + str(
            plus_details['name']['familyName'])
        email = str(plus_details['emails'][0]['value'])
        user_name = email

    # Global state: h holds the count history of every word.
    global saved_h
    global h
    global most_recent

    # Pickle stores per-user search history, keyed by user_name:
    # {'user1': history_of_user1, 'user2': history_of_user2}
    if os.path.getsize('saved_dictionary.pickle') > 0:
        # Load the dictionary of all users and find the current one.
        with open('saved_dictionary.pickle', 'rb') as dict_file:
            saved_h = pickle.load(dict_file)
        if user_name in saved_h:
            h = saved_h[user_name]
        else:
            h = []
    else:
        h = []
        saved_h = {}

    if os.path.getsize('mostrec_dictionary.pickle') > 0:
        with open('mostrec_dictionary.pickle', 'rb') as dict_file:
            most_recent = pickle.load(dict_file)
        if user_name in most_recent:
            mr = most_recent[user_name]
        else:
            mr = []
    else:
        mr = []
        most_recent = {}

    global searchstring
    first_word = ""
    second_word = None
    third_word = None

    # Parse the input string to generate a dictionary and output results.
    if request.params.get('keywords'):
        searchstring = request.params.get('keywords').lower()
        string_to_list = searchstring.split()
        first_word = string_to_list[0]
        if len(string_to_list) > 1:
            second_word = string_to_list[1]
        if len(string_to_list) > 2:
            third_word = string_to_list[2]
    else:
        searchstring = ""

    # Create a list of [word, count] pairs for this query and the history.
    d = []
    for word in searchstring.split():
        if [x for x, y in enumerate(d) if y[0] == word]:
            index = [x for x, y in enumerate(d) if y[0] == word]
            tup = d[index[0]]
            tup[1] = tup[1] + 1
        else:
            d.append([word, 1])
        if [x for x, y in enumerate(h) if y[0] == word]:
            index = [x for x, y in enumerate(h) if y[0] == word]
            tup = h[index[0]]
            tup[1] = tup[1] + 1
        else:
            h.append([word, 1])
        # Update the most-recent list.
        try:
            if mr.index(word):
                mr.remove(word)
                mr = [word] + mr
            else:
                if len(mr) > 20:
                    mr = mr[:-1]
                mr = [word] + mr
        except ValueError:
            if len(mr) > 20:
                mr = mr[:-1]
            mr = [word] + mr

    # Display the searched string.
    html_searched_string = '<p1>Search for "<i>' + searchstring + '</i>"</p1>'

    # Display the history.
    html_top_20 = "<p1>Top 20 Searched Words</p1>"
    h_top = sorted(h, key=lambda x: x[1], reverse=True)[:20]
    # Persist the history dictionary to the pickle file.
    saved_h[user_name] = h_top
    html_most_recent = "<p1>Most Recent 20 Words</p1>"
    with open('saved_dictionary.pickle', 'wb') as dict_file:
        pickle.dump(saved_h, dict_file)
    # Persist the most-recent list.
    most_recent[user_name] = mr
    with open('mostrec_dictionary.pickle', 'wb') as dict_file:
        pickle.dump(most_recent, dict_file)

    #=====================================================================
    # Fetch from the JSON "database".
    with open('lexicon.json', 'r') as f:
        for item in f:
            item = json.loads(item)
            # Add words to words_list.
            if item["word"] not in words_list:
                words_list.append(item["word"])
            if item["word"] == first_word:
                keyword_id = item["word_id"]
                break
        else:
            keyword_id = -1

    with open('inverted_index.json', 'r') as f:
        for item in f:
            item = json.loads(item)
            if item["word_id"] == keyword_id:
                dict_url_list = item["url_list"]
                break
        else:
            dict_url_list = []

    dict_rank = []
    with open('scores.json', 'r') as f:
        for item in f:
            item = json.loads(item)
            dict_rank.append(item)

    dict_doc_id = []
    with open('document_index.json', 'r') as f:
        for item in f:
            item = json.loads(item)
            dict_doc_id.append(item)

    dict_combined = []
    '''
    mongodb.initialize()
    try:
        keyword_id = mongodb.getData("lexicon",
                                     {"word": first_word})[0]["word_id"]
        dict_url_list = mongodb.getData("inverted_index",
                                        {"word_id": keyword_id})[0]["url_list"]
        dict_rank = mongodb.getData("scores", {})
        dict_doc_id = mongodb.getData("document_index", {})
        dict_combined = []
    except:
        dict_url_list = []
        dict_rank = []
    '''
    if len(dict_url_list) > 0:
        # Using dict_rank, get a list of tuples ordered by page rank.
        dict_rank_sorted = sorted(dict_rank, key=lambda k: k['score'])
        # Using dict_rank_sorted, find the url corresponding to each doc_id;
        # dict_combined is built in INCREASING page-rank order.
        for item in dict_rank_sorted:
            for temp_dict in dict_doc_id:
                if (temp_dict['doc_id'] == item['doc_id']
                        and item['doc_id'] in dict_url_list):
                    dict_combined.append(temp_dict['url'])
        # Reverse so the highest rank comes first.
        dict_combined = list(reversed(dict_combined))

        html_pages = ''
        # If length is greater than max_url_len, implement static pagination.
        if request.params.get('page_no'):
            page_no = int(request.params.get('page_no'))
        else:
            page_no = 1
        if len(dict_url_list) > max_url_len:
            page_num, past_page_num = divmod(len(dict_combined), max_url_len)
            if past_page_num != 0:
                page_num += 1
            html_pages = ('<div id="content_bot"><div id="page_div" '
                          'align="center"><div class="pagination">')
            for page in range(page_num):
                # Print at most 10 page buttons; as page_no moves, the
                # window of buttons moves in the same direction.
                if page + 1 > page_no + 5:
                    pass
                elif page + 1 < page_no - 5:
                    pass
                elif page + 1 == page_no:
                    html_pages += ('<button class="active" name="page_no" value="'
                                   + str(page + 1) + '">' + str(page + 1)
                                   + '</button>')
                else:
                    html_pages += ('<button name="page_no" value="'
                                   + str(page + 1) + '">' + str(page + 1)
                                   + '</button>')
            html_pages += '</div></div></div>'
        else:
            html_pages = ''

        #=================================================================
        # Print the actual content of dict_combined; page_no selects which
        # urls to print.
        page, last_page_num = divmod(len(dict_combined), max_url_len)
        if last_page_num != 0:
            page += 1
        if page_no > page:
            return error404(404)
        url_print = ""
        html_url = ('<div id="content_results"><p2>' + str(len(dict_url_list))
                    + ' Results found for "' + first_word + '"</p2><br><br>')
        if page == page_no and last_page_num != 0:
            # We are on the last page, which has last_page_num elements.
            for i in range(last_page_num):
                url_print = dict_combined[max_url_len * (page_no - 1) + i]
                html_url += ('<p2><a href="' + url_print + '">' + url_print
                             + '</a></p2></br></br>')
        else:
            # We are on some other page; print max_url_len urls.
            for i in range(max_url_len):
                url_print = dict_combined[max_url_len * (page_no - 1) + i]
                html_url += ('<p2><a href="' + url_print + '">' + url_print
                             + '</a></p2></br></br>')
        html_url += '</div></div>'
    #=====================================================================
    else:
        html_url = ('<div id="content_results"><p2>No results found for "'
                    + first_word + '"</p2></div></div>')

    #=====================================================================
    # Spell correction for the whole sentence.
    search_sentence = ""
    search_sentence_bold = ""
    for one_word in searchstring.split():
        spell_print_word = spell.correction(one_word)
        if spell_print_word != one_word:
            search_sentence_bold += '<b>' + spell_print_word + ' </b>'
        else:
            search_sentence_bold += spell_print_word + " "
        search_sentence += spell_print_word + " "
    spell_print_html = '<div id="content">'
    if search_sentence.rstrip() != searchstring:
        spell_print_html += (
            '<div id="spell_correction"><div class="search_button">'
            '<form id="search_word" action="/" method="get">'
            '<p3>Did you mean:<button name="keywords" value="'
            + search_sentence.rstrip() + '">'
            + search_sentence_bold.rstrip()
            + '</button></p3></form></div></div>')

    #=====================================================================
    # Special searched words: date, time, define/definition, synonym/antonym.
    special_html = ""
    if (first_word in ("define", "definition", "meaning", "def")
            and second_word is not None):
        if (second_word == 'of' and third_word is not None
                and define(third_word) is not None):
            special_html = ('<div id="special_feature">' + define(third_word)
                            + '</div>')
        elif define(second_word) is not None:
            special_html = ('<div id="special_feature">' + define(second_word)
                            + '</div>')
    elif (second_word in ("definition", "define", "meaning", "def")
            and first_word != ''):
        if define(first_word) is not None:
            special_html = ('<div id="special_feature">' + define(first_word)
                            + '</div>')
    elif first_word in ("synonym", "like") and second_word is not None:
        if (second_word == 'of' and third_word is not None
                and syn(third_word) is not None):
            special_html = ('<div id="special_feature">' + syn(third_word)
                            + '</div>')
        elif define(second_word) is not None:
            special_html = ('<div id="special_feature">' + syn(second_word)
                            + '</div>')
    elif second_word in ("synonym", "like") and first_word != '':
        if define(first_word) is not None:
            special_html = ('<div id="special_feature">' + syn(first_word)
                            + '</div>')
    elif first_word in ("antonym", "unlike") and second_word is not None:
        if (second_word == 'of' and third_word is not None
                and ant(third_word) is not None):
            special_html = ('<div id="special_feature">' + ant(third_word)
                            + '</div>')
        elif define(second_word) is not None:
            special_html = ('<div id="special_feature">' + ant(second_word)
                            + '</div>')
    elif second_word in ("antonym", "unlike") and first_word != '':
        if define(first_word) is not None:
            special_html = ('<div id="special_feature">' + ant(first_word)
                            + '</div>')
    elif first_word in ("date", "today"):
        special_html = ('<div id="special_feature"><p4>' + date()
                        + '</p4></div>')
    elif first_word in ("time", "now"):
        special_html = ('<div id="special_feature"><p4>' + time()
                        + '<p5><br>' + date() + '</p5></p4></div>')

    #=====================================================================
    # If there is a searched string, output the page with tables.
    if len(searchstring):
        #=================================================================
        # The searched & history tables are temporarily removed for lab 3;
        # since the sign-in feature is not implemented in lab 4, the
        # history table stays commented out.
        #=================================================================
        # For more than one page, use pagination to display pages.
        if len(dict_rank) > 0 and len(dict_combined) > max_url_len:
            return (template('index_search.tpl', user_name=name,
                             user_email=email, searched_string=searchstring,
                             page_no=page_no)
                    + html_pages
                    + template('index_search_end.tpl', user_name=name,
                               user_email=email, searched_string=searchstring,
                               page_no=page_no)
                    + spell_print_html + special_html + html_url)
        else:
            # For just one page, pagination is not needed.
            return (template('index_search_nopage.tpl', user_name=name,
                             user_email=email, searched_string=searchstring)
                    + spell_print_html + special_html + html_url)
    else:
        # Output the start page if there is no table to be displayed.
        return template('index_initial.tpl', user_name=name, user_email=email)
def corrector(self):
    # Spell-correct every token except those starting with a digit
    # (re.match only anchors at the start of the token).
    self.corrected = [
        spell.correction(token) if not re.match('[0-9]', token) else token
        for token in self.tokens
    ]
def index():
    keywords = request.query.keywords
    r_server = redis.Redis(host="localhost", port=6379)
    if keywords:
        url = "?" + request.query_string
        page = request.query.page
        if not page:
            # Redirect to page 1.
            url += "&page=1"
            redirect(url)
        words = keywords.split()
        corrected_words = []

        # Run spellcheck on the query.
        for w in words:
            corrected = spell.correction(w)
            if corrected and corrected != w:
                corrected_words.append((corrected, True))
            else:
                corrected_words.append((w, False))

        # Multiword search: aggregate all the documents and reorder based
        # on the number of appearances.
        doc_scores = {}
        for w in words:
            w_id = r_server.get("word:%s:word_id" % w)
            doc_ids = r_server.zrevrange("word_id:%s:doc_ids" % w_id, 0, -1)
            for doc in doc_ids:
                score = r_server.get("doc_id:%s:score" % doc)
                # Note the number of searched words that appear in this doc.
                if doc in doc_scores:
                    doc_scores[doc] = (doc_scores[doc][0],
                                       doc_scores[doc][1] + 1)
                else:
                    doc_scores[doc] = (score, 1)

        # Sort first by number of appearances, then by pagerank score.
        sorted_doc_ids = sorted(doc_scores.items(),
                                key=lambda x: (x[1][1], x[1][0]),
                                reverse=True)

        # The page number exists; strip it from the url.
        url = url.split('&')[0]
        # Get five results, with the start offset by page number.
        page = int(page)
        p_start = (page - 1) * 5
        p_end = p_start + 4
        zlen = len(sorted_doc_ids)
        if p_end > zlen:
            p_end = zlen
        doc_ids = sorted_doc_ids[p_start:p_end + 1]
        docs = [r_server.get("doc_id:%s:doc" % doc_id[0])
                for doc_id in doc_ids]
        titles = [r_server.get("doc_id:%s:title" % doc_id[0])
                  for doc_id in doc_ids]
        return template('query_results', page=page, url=url, docs=docs,
                        titles=titles, zlen=zlen, query=keywords,
                        corrected=corrected_words)
    else:
        return template('query_page')
def response(user_response, num_drinks):
    # Spell check: convert the sentence to a list of words and correct each.
    input_words = user_response.split()
    word_list = []
    for word in input_words:
        word_list.append(correction(word))
    corrected_input = ' '.join(word_list)
    user_response = corrected_input

    # Break the message into parts and check if an auxiliary verb was used.
    pronoun, noun, adjective, verb = getSpeechParts(user_response)
    if checkAux(user_response):
        return "Enjoy"

    # Search for a drink in the user input and respond as well as we can.
    drink = noun
    if drink not in DRINKS:
        # Check whether the candidate noun is a drink.
        drink = searchForDrink(user_response)
    if len(user_response) == 1:
        drink = user_response
    if drink in DRINKS:
        if num_drinks > 4:
            return ("You are too drunk; I am unable to serve you any more "
                    "drinks. You can type 'clear' to tell me that you're "
                    "sober again")
        # Increment the drink counter.
        num_drinks = num_drinks + 1
        if num_drinks <= 1:
            return "One {0} coming right up!".format(drink)
        if num_drinks == 2:
            return "{0} for you, enjoy.".format(drink)
        else:
            return "Here is your {0}! Wow you've already had {1} drinks!".format(
                drink, num_drinks)

    # Reset the drink level for when the user is sober again.
    if user_response == "clear":
        num_drinks = 0

    # Look for yes or no responses and respond with a weak hedge.
    if checkForYes(user_response):
        return random.choice(YES_RESPONSES)
    if checkForNo(user_response):
        return random.choice(NO_RESPONSES)

    # If someone doesn't want anything.
    if noun == "nothing":
        return ("There isn't anything I can get for you? I am a master bar "
                "tender. You won't find any better.")
    if pronoun == "what":
        checkForDrinkRep(user_response)

    # If we have a noun but no drink, we don't know what they want, so we
    # answer with a question.
    if noun:
        resp = []
        if verb:
            v = verb[0]
            if v is not None:
                resp.append(v)
        if startsWithVowel(noun):
            noun_pronoun = "an"
        else:
            noun_pronoun = "a"
        resp.append(noun_pronoun + " " + noun + "?")
        return " ".join(resp)

    # If nothing is caught, return a hedge.
    return "I am sorry! I don't understand you " + random.choice(
        HEDGE_RESPONSES)
post = re.sub('[!@#$.,?"-]', '', post)
post = re.sub('[/]', ' ', post)
# Remove some common patterns. The original pattern "\quot" is an invalid
# regex escape on Python 3.7+; pre-3.7 it matched the literal "quot".
post = re.sub('quot', '', post)
post = re.sub("<a(.*?)</a>", '', post)
post = re.sub(r'\s+', ' ', post).strip()
post = post.lower()
words = post.split()
# Remove stop words.
words = [w for w in words if w not in stops]
# Run spell check on each word.
for i in range(len(words)):
    words[i] = spellcheck.correction(words[i])
words = ' '.join(words)
# Protect against anything crazy happening; otherwise, just write to file.
try:
    fTarget.write(words + '\n')
except UnicodeEncodeError as e:
    print("Unexpected characters! Skipping!")
    print(e)

# Cleanup.
fSource.close()
fTarget.close()
print("Done!")