Example #1
def correctSpelling(element):  # TODO: handle multiple punctuation marks with a single regex

    word = str(element).lower()

    if ',' in word:
        replacedPunc = word.replace(',', ' ')
    elif '/' in word:
        replacedPunc = word.replace('/', ' ')
    elif '\\' in word:
        replacedPunc = word.replace('\\', ' ')
    elif '.' in word:
        replacedPunc = word.replace('.', ' ')
    elif '-' in word:
        replacedPunc = word.replace('-', ' ')
    elif '_' in word:
        replacedPunc = word.replace('_', ' ')
    else:
        replacedPunc = word

    splits = replacedPunc.split()

    if len(splits) == 1:
        corrWord = spell.correction(splits[0])  # correct the cleaned token rather than the raw word
    elif len(splits) > 1:
        corrWord = [spell.correction(wrd) for wrd in splits]
        corrWord = ' '.join(corrWord)
    else:
        raise SpellingError(
            'Something went terribly wrong here, please investigate')
    return corrWord
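
A minimal way to exercise correctSpelling, assuming spell is a pyspellchecker SpellChecker instance and SpellingError is a project-defined exception (neither is shown in the snippet, so the setup below is an illustrative sketch rather than the original project's code):

# Illustrative setup only -- the original imports are not shown above.
from spellchecker import SpellChecker  # pip install pyspellchecker

spell = SpellChecker()

class SpellingError(Exception):
    """Raised when a token unexpectedly yields no words after splitting."""

print(correctSpelling('speling,mistakee'))  # corrects each comma-separated token and rejoins with spaces
print(correctSpelling('Colour'))            # single word: corrected (or returned unchanged) directly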
Example #2
def unit_tests():
    assert correction('speling') == 'spelling'  # insert
    assert correction('korrectud') == 'corrected'  # replace 2
    assert correction('bycycle') == 'bicycle'  # replace
    assert correction('inconvient') == 'inconvenient'  # insert 2
    assert correction('arrainged') == 'arranged'  # delete
    assert correction('peotry') == 'poetry'  # transpose
    assert correction('peotryy') == 'poetry'  # transpose + delete
    assert correction('word') == 'word'  # known
    assert correction('quintessential') == 'quintessential'  # unknown
    assert words('This is a TEST.') == ['this', 'is', 'a', 'test']
    assert Counter(words('This is a test. 123; A TEST this is.')) == Counter(
        {'123': 1, 'a': 2, 'is': 2, 'test': 2, 'this': 2})
    # assert len(WORDS) == 32192
    # assert sum(WORDS.values()) == 1115504
    assert WORDS.most_common(10) == [('the', 79808), ('of', 40024),
                                     ('and', 38311), ('to', 28765),
                                     ('in', 22020), ('a', 21124),
                                     ('that', 12512), ('he', 12401),
                                     ('was', 11410), ('it', 10681)]
    assert WORDS['the'] == 79808
    assert P('quintessential') == 0
    assert 0.07 < P('the') < 0.08
    return 'unit_tests pass'
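
These asserts track Peter Norvig's classic spelling corrector (norvig.com/spell-correct.html). For context, here is a condensed sketch of the functions they exercise, reproduced from that essay rather than from the project this example was taken from; it assumes a big.txt corpus file is available:

import re
from collections import Counter

def words(text): return re.findall(r'\w+', text.lower())

WORDS = Counter(words(open('big.txt').read()))

def P(word, N=sum(WORDS.values())):
    "Probability of `word`."
    return WORDS[word] / N

def correction(word):
    "Most probable spelling correction for `word`."
    return max(candidates(word), key=P)

def candidates(word):
    "Generate possible spelling corrections for `word`."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])

def known(words):
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if w in WORDS)

def edits1(word):
    "All edits that are one edit away from `word`."
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word):
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))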
def options2(opt, f):
    if opt.isdigit():
        token = opt.split(' ')
    else:
        opt = spell.correction(opt.lower())
        token = opt.split(' ')

    checkFlag = 0
    print('token is ', token)
    print('f is ', f)
    for i in token:
        for k, j in enumerate(range(len(f))):
            print('j is ', j)
            print('i is ', i)
            print('k is ', k)
            if len(i) == 1 and i.isdigit():  # guard: int() below would raise on a non-digit character
                opt = str(i)
                # print ('f j ',f[int(opt)-1])
                if int(opt) - 1 > len(f):
                    i = 'Rs ' + i
                    print('i from here ', i)
                else:
                    checkFlag = 1
                    break
            if len(i.lower()) >= 2 and i.lower() in f[j].lower():
                opt = str(k + 1)
                checkFlag = 1
                break
        if checkFlag == 1:
            break
    print('opt is ', opt)
    return opt, checkFlag
Example #4
    def tokenize(text):
        """
            Tokenizes given text after normalizing
            Normalizing steps:
                1. Make lowercase
                2. Remove links
                3. Remove stop words
                4. Remove punctuation
                5. Lemmatize
                6. Make localization
                7. Remove non alpha numeric characters
                8. Remove digits
        """

        text = Tokenizer.make_lower_case(text)
        text = Tokenizer.remove_links(text)
        # text = Tokenizer.remove_stop_words(text)
        # text = Tokenizer.remove_punctuation(text)

        tokens = text.split()
        if config.SPELL_CORRECTION:
            tokens = [spell.correction(token) for token in tokens]
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
        # tokens = [Tokenizer.make_localization(token) for token in tokens]
        # tokens = [Tokenizer.remove_nonalphanumeric(token) for token in tokens]
        # tokens = [Tokenizer.remove_digits(token) for token in tokens]

        return tokens
Example #5
def cmd_app():
    query = prompt.query("Type query: ", validators=[])
    query_terms = tokenize(query)
    updated_query = []
    corrected = False
    for term in query_terms:
        new_term = correction(term, my_index.WORDS)
        if new_term != term:
            corrected = True
        updated_query.append(new_term)

    new_query = ' '.join(updated_query)
    if corrected:
        print 'Search results for ' + colored.green(new_query)

    doc_ids = my_index.search(new_query)
    for doc_id, score in doc_ids[:100]:
        print '*' * 50
        print colored.blue(my_index.files[doc_id])
        f = open(dirname + my_index.files[doc_id], 'r')
        text = f.read()
        f.close()
        sent = sent_tokenize(text)
        with indent(4, quote=''):
            for s in sent:
                for q in updated_query:
                    if q in s:
                        puts(s)
        print
Example #6
def extract_for_image(extra_path):
    predicted_list = refactored_main.extract_result(opt, crnn, converter,
                                                    extra_path)

    match = np.array([
        1 if prediction.target.lower() == prediction.pred.lower() else 0
        for prediction in predicted_list
    ])
    print("Model Prediction : ")
    model_prediction = " ".join(
        [prediction.pred.lower() for prediction in predicted_list])
    print(model_prediction)
    print('Accuracy : ' + str(match.mean()))

    corrected_text = [
        correction(prediction.pred.lower()) for prediction in predicted_list
    ]

    corrected_match = np.array([
        1 if corrected.lower() == prediction.target.lower() else 0
        for corrected, prediction in zip(corrected_text, predicted_list)
    ])
    print("Corrected Prediction : ")
    corrected_prediction = " ".join(corrected_text)
    print(corrected_prediction)
    print('Accuracy : ' + str(corrected_match.mean()))

    return "\n\n".join(["Original: " + model_prediction, corrected_prediction])
Example #7
def extract_result(image_index):

    # get the Predicted List
    predicted_list = refactored_main.extract_result(opt, crnn, converter,
                                                    image_mapping[image_index])

    match = np.array([
        1 if prediction.target.lower() == prediction.pred.lower() else 0
        for prediction in predicted_list
    ])
    print("Model Prediction : ")
    print(" ".join([prediction.pred.lower() for prediction in predicted_list]))
    print('Accuracy : ' + str(match.mean()))

    corrected_text = [
        correction(prediction.pred.lower()) for prediction in predicted_list
    ]

    # Optimize this.

    corrected_match = np.array([
        1 if corrected.lower() == prediction.target.lower() else 0
        for corrected, prediction in zip(corrected_text, predicted_list)
    ])
    print("Corrected Prediction : ")
    print(" ".join(corrected_text))
    print('Accuracy : ' + str(corrected_match.mean()))

    # TODO: also compute accuracy for entries with at least 5 terms

    return " ".join(corrected_text)
Example #8
def extract_for_image(extra_path):
    predicted_list = refactored_main.extract_result(opt, crnn, converter,
                                                    extra_path)

    match = np.array([
        1 if prediction.target.lower() == prediction.pred.lower() else 0
        for prediction in predicted_list
    ])
    print("Model Prediction : ")
    model_prediction = " ".join(
        [prediction.pred.lower() for prediction in predicted_list])
    print(model_prediction)
    print('Accuracy : ' + str(match.mean()))

    corrected_text = [
        correction(prediction.pred.lower()) for prediction in predicted_list
    ]

    # Optimize this.

    corrected_match = np.array([
        1 if corrected.lower() == prediction.target.lower() else 0
        for corrected, prediction in zip(corrected_text, predicted_list)
    ])
    print("Corrected Prediction : ")
    corrected_prediction = " ".join(corrected_text)
    print(corrected_prediction)
    print('Accuracy : ' + str(corrected_match.mean()))

    # TODO: also compute accuracy for entries with at least 5 terms

    return "\n\n".join(["Original: " + model_prediction, corrected_prediction])
Example #9
def extract_result(image_index):
    predicted_list = refactored_main.extract_result(opt, crnn, converter,
                                                    image_mapping[image_index])

    match = np.array([
        1 if prediction.target.lower() == prediction.pred.lower() else 0
        for prediction in predicted_list
    ])
    print("Model Prediction : ")
    print(" ".join([prediction.pred.lower() for prediction in predicted_list]))
    print('Accuracy : ' + str(match.mean()))

    corrected_text = [
        correction(prediction.pred.lower()) for prediction in predicted_list
    ]

    corrected_match = np.array([
        1 if corrected.lower() == prediction.target.lower() else 0
        for corrected, prediction in zip(corrected_text, predicted_list)
    ])
    print("Corrected Prediction : ")
    print(" ".join(corrected_text))
    print('Accuracy : ' + str(corrected_match.mean()))

    return " ".join(corrected_text)
Example #10
def tagNtokenize(strInput, isInput=False):
    current_dir = dirname(__file__)
    file_path = join(current_dir, 'sent_tokenizer.pickle')
    with open(file_path, 'rb') as f:
        custom_sent_tokenizer = pickle.load(f)

    file_path = join(current_dir, 'root_dict.pickle')
    with open(file_path, 'rb') as f:
        root_dict = pickle.load(f)

    tokenized = custom_sent_tokenizer.tokenize(strInput)
    tagged = []
    try:
        for t in tokenized:
            if (isInput and "PERSON" not in NER([(t, "NN")])):
                t = spell.correction(t)

            words2 = nltk.word_tokenize(t)

            for i in range(0, len(words2) - 1):  # note: the last token is never mapped through root_dict
                if (words2[i] in root_dict):
                    words2[i] = root_dict[words2[i]]
            tagged = nltk.pos_tag(words2)
    except Exception as e:
        print(str(e))

    return tagged
Example #11
File: app.py Project: Vagus30/spellCheck
def correctSpell():
    results = []
    if 'query' in request.args:
        query = request.args['query']
    else:
        res = {
            "results": results
        }
        return jsonify(res)

    item = spell.correction(query)
    if item == '':
        results.append({
            "id": 0,
            "text": '',
        })
    else:
        results.append({
            "id": 0,
            "text": 'Did you mean: ' + str(item) + '?',
        })
    res = {
        "results": results
    }
    return jsonify(res)
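
A quick way to poke at this endpoint, assuming the function is registered on a Flask app named app at a route such as /correct (the decorator and route path are not shown above, so both names are assumptions):

# Hypothetical smoke test using Flask's test client; app name and route path are assumed.
with app.test_client() as client:
    resp = client.get('/correct', query_string={'query': 'speling'})
    print(resp.get_json())  # e.g. {'results': [{'id': 0, 'text': 'Did you mean: spelling?'}]}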
Example #12
def getpersonbyid():

    #app.logger.debug("JSON received...")
    #app.logger.debug(request.json)

    if request.json:
        mydata = request.json  # will be
        jsontestdata = json.dumps(mydata)
        #print jsontestdata
        yamldata = yaml.safe_load(jsontestdata)  # yaml used to unicode to str
        #print yamldata['personId']
        #print yamldata
        #return "Thanks. Your age is %s" % yamldata.get['personId']
        domainreturn = yamldata['fetchDomain']
        #print domainreturn
        #return domainreturn
        #return "json received"
        urlreturn = yamldata['urlName']
        DomainOnlyreturn = yamldata['DomainOnly']
        domainURL = urlreturn
        #domainURL = 'http://facebook.com'
        domainfetch = domainreturn
        #print domainfetch
        a = correction(domainfetch) + '.com'

        print a

        saveImage('http://' + a)

        saveImageUrl(domainURL)

        NormValue = main('app/static/test1.png', 'app/static/test2.png')

        print NormValue
        #print "First"

        if NormValue == 0:
            #print "Second"
            TargetSimValue = DomSimhashTarget(
                'www.' + a)  # Guess domain that trying to be phished
            print TargetSimValue
            AgainstSimValue = DomSimhashAgainst(
                DomainOnlyreturn
            )  # Original Phishing website URL. We are trying to detect it
            print AgainstSimValue
            if TargetSimValue != AgainstSimValue:
                print "Phishing Detected"
                return "Phishing Detected. Please becareful of each action in this page"
            else:
                print "Phishing Not Detected"
                #return "Phishing not detected. Chill"

        print "Exit time"
        return "Wait Bro"  # Need to Change

    else:
        return "no json received"
Example #13
def main():
    miss_spelled_words = []
    input_line = sys.argv[1]
    dict_words_list = load_pickle_file(pkl_file_path)
    ip_words = input_line.strip().split(' ')
    uniq_ip_words = set(ip_words)
    sorted_uniq_ip_words = sorted(uniq_ip_words)
    # assuming spellcheck() returns a list of misspelled words, extend (not append)
    # so the loop below iterates over individual words
    miss_spelled_words.extend(spellcheck(sorted_uniq_ip_words, dict_words_list))
    for word in miss_spelled_words:
        print("misspelled word found: " + word + ", possible correct word: " + str(spell.correction(word)))
Example #14
    def correction(self, word):
        tmp = str(word)
        word = correction(word, MSR=self.MSR, distance_weight=self.dw)

        if word is None:
            word = self.lex.leven_fit(tmp,
                                      area=(self.lex.index(len(tmp)),
                                            self.lex.index(len(tmp) + 2)))

        return word
def checkSynG(user_response):
    ''' Used to check for goodbye synonyms, as bye is our exit word with spell check 
    '''
    for word in user_response.split():
        word = correction(word)
        s = getSynyms(word)
        for i in range(len(s)):
            if s[i] in GOODBYE_KEYWORDS: 
                word = s[i]
                return word 
    return user_response 
def greeting(sentence):
    """If user's input is a greeting, return a greeting response"""
    for word in sentence.split():
        word = correction(word)
        if word.lower() in GREETING_INPUTS:
            return random.choice(GREETING_RESPONSES)
        else:
            #check for synonyms 
            #check if synonym present in sample responses 
            s = getSynyms(word)
            for i in range(len(s)):
                if s[i] in GREETING_INPUTS: 
                    return random.choice(GREETING_RESPONSES)
Example #17
def unit_tests():
    assert correction('speling') == 'spelling'  # insert
    assert correction('korrectud') == 'corrected'  # replace 2
    assert correction('bycycle') == 'bicycle'  # replace
    assert correction('inconvient') == 'inconvenient'  # insert 2
    assert correction('arrainged') == 'arranged'  # delete
    assert correction('peotry') == 'poetry'  # transpose
    assert correction('peotryy') == 'poetry'  # transpose + delete
    assert correction('word') == 'word'  # known
    assert correction('quintessential') == 'quintessential'  # unknown
    assert words('This is a TEST.') == ['this', 'is', 'a', 'test']
    assert get_count(words('This is a test. 123; A TEST this is.')) == Counter(
        {'123': 1, 'a': 2, 'is': 2, 'test': 2, 'this': 2})
    return 'unit_tests pass'
Example #18
def spelltest(tests, verbose=False):
    "Run correction(wrong) on all (right, wrong) pairs; report results."
    import time
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    good, unknown = 0, 0
    n = len(tests)
    for right, wrong in tests:
        w = correction(wrong)
        good += (w == right)
        if w != right:
            unknown += (right not in WORDS)
            if verbose:
                print('correction({}) => {} ({}); expected {} ({})'.format(
                    wrong, w, WORDS[w], right, WORDS[right]))
    dt = time.perf_counter() - start
    print('{:.0%} of {} correct ({:.0%} unknown) at {:.0f} words per second '.
          format(good / n, n, unknown / n, n / dt))
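
In Norvig's essay this harness is driven by a Testset helper over test files such as spell-testset1.txt; the wiring below is reproduced from the essay (the project this example comes from may drive it differently):

def Testset(lines):
    "Parse 'right: wrong1 wrong2' lines into [('right', 'wrong1'), ('right', 'wrong2')] pairs."
    return [(right, wrong)
            for (right, wrongs) in (line.split(':') for line in lines)
            for wrong in wrongs.split()]

spelltest(Testset(open('spell-testset1.txt')))  # development set
spelltest(Testset(open('spell-testset2.txt')))  # final test set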
Example #19
def index():
    """ Process search request and results. """
    result = "<html style='margin:20px 50px'>\n<body><p>&nbsp&nbsp&nbspWelcome to<br/>Document Retreival Engine</p>" + form(
    )
    if request.method == 'POST':
        query_terms = tokenize(request.form['query'])
        updated_query = []
        corrected = False
        for term in query_terms:
            new_term = correction(term, my_index.WORDS)
            if new_term != term:
                corrected = True
            updated_query.append(new_term)
        result += results2string(my_index.search(' '.join(updated_query)),
                                 ' '.join(updated_query), corrected)
    result += "<body></html>"
    return result
Example #20
def spell_test(tests, verbose=False):
    """
        Run correction(wrong) on all (right, wrong) pairs.
        No return statement; just report results.
    """        
    good, unknown = 0, 0
    n = len(tests)
    for right, wrong in tests:
        w = correction(wrong)
        good += (w == right)
        if w != right:
            unknown += (right not in WORDS)
            if verbose:
                print('correction({}) => {} ({}); expected {} ({})'
                      .format(wrong, w, WORDS[w], right, WORDS[right]))    
    print('{:.0%} of {} correct ({:.0%} unknown)'
          .format(good / n, n, unknown / n))
def normalisation(tweet):
    mention_removed = re.sub(r'(?:@[\w_]+)', '', tweet.lower())
    html_removed = re.sub(r'<[^>]+>', '', mention_removed)
    hashtag_removed = re.sub(r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", '',
                             html_removed)
    removed_repeated_chars = re.sub(r'(.)\1+', r'\1\1', hashtag_removed)
    normalised_text1 = re.sub(' +', ' ', removed_repeated_chars)

    normalizr = Normalizr(language='en')

    normalizations = [
        ('replace_urls', {'replacement': ' '}),
        ('replace_punctuation', {'replacement': ' '}),
        ('replace_emojis', {'replacement': ' '}),
        ('replace_hyphens', {'replacement': ' '}),
        ('replace_symbols', {'replacement': ' '}),
        'remove_accent_marks',
        'remove_stop_words',
        'remove_extra_whitespaces',
    ]

    normalised_text2 = normalizr.normalize(normalised_text1, normalizations)
    array_words = normalised_text2.split()
    #print (array_words)

    normalised_text3 = [correction(word) for word in array_words]
    normalised_tweet = " ".join(normalised_text3)

    return normalised_tweet
def options(opt, f):
    opt = spell.correction(opt.lower())
    token = opt.split(' ')
    checkFlag = 0

    # print ('f is ',f)
    for i in token:
        for j in f:
            # print ('j is ',j)
            # print ('i is ',i)
            if len(i) == 1 and i.isdigit() and int(i) in (1, 2, 3, 4, 5):
                opt = str(i)
                checkFlag = 1
                break
            if len(i.lower()) > 2 and i.lower() in j.lower():
                opt = str(j)
                checkFlag = 1
                break
        if checkFlag == 1:
            break
    return opt, checkFlag
Example #23
def createMessage(input):
    '''Takes json facebook input and creates the message to return to facebook'''
    #input_msg = TextBlob(input['text'])

    spellingcheck = str(input).split()
    counter = 0
    for x in spellingcheck:

        if x not in candidates(x) and x not in Word(
                "grub").synsets[0].lemma_names() and x not in Word(
                    "booze").synsets[0].lemma_names(
                    ) and x not in MEALS and x not in DRINKS:
            spellingcheck[counter] = correction(x)
            print(spellingcheck[counter])
        counter += 1
    spell_checked_word = " ".join(spellingcheck)
    print(spell_checked_word)
    input_msg = TextBlob(spell_checked_word)

    senderId = 0
    data = buildMessage(input_msg, senderId)

    # return the constructed message back out to Messenger
    return str(data)
Example #24
    stop_words = stopwords.words('dutch') 
    #remove stopwords and punctuation
    keywords = [word for word in tokens if not word in stop_words and not word in string.punctuation]
    print(f'deze tekst heeft {len(keywords)} aantal woorden')
    return keywords

def get_unknown(keywords):
    unknown = [word for word in keywords if not word in spell.WORDS]
    print(f'{len(unknown)} aantal onbekend')
    return unknown

def stem(keywords):
    stemmer = PorterStemmer() 
    words_stemmed = [stemmer.stem(word) for word in keywords]
    return words_stemmed

def correct(unknown):
    corrected = [spell.correction(word) for word in unknown]
    return corrected


with open('Casestudy.txt') as f:
    text = f.read()
    text = clean(text)
    keywords = get_keywords(text)
    unknown = get_unknown(keywords)
    corrected = [spell.correction(word) for word in unknown]
    corpus = keywords + corrected



Example #25
File: app.py Project: saketkc/hatex
def get_corrected_search_query(search_query):
    search_query_terms = search_query.split(" ")
    search_query_corrected = list(map(correction, search_query_terms))
    return " ".join(search_query_corrected).lower()
Example #26
def scan():
    # Get search result
    #print(parameters)
    # query_key, page_rank = parameters.split('&')
    # query_key = query_key.split('=')[1]
    # page_rank = page_rank.split('=')[1]
    query_key = request.args.get('query')
    page_rank = request.args.get('pagerank')
    # print(query_key)
    # print(page_rank)
    solr = SolrClient('http://localhost:8983/solr')
    if page_rank == '1':
        #print('exe1')
        res = solr.query('myexample', {
            'q': query_key,
            'sort': 'pageRankFile desc',
        })
    else:
        #print('exe0')
        res = solr.query('myexample', {
            'q': query_key,
        })
    if res is None:
        json_result = {'query':None}
        return json.dumps(json_result, ensure_ascii=False)
    else:
        #print(res)
        for value in res.docs:
            #print(value['id'])

            # Add snippets
            snippet = get_snippet(value['id'], query_key)
            value['snippet'] = snippet

            if 'description' not in value:
                value['description']='NULL'
            if 'og_url' not in value:
                with open('./mapNBCNewsDataFile.csv') as f:
                    key = value['id'].split('/')[-1]
                    for line in f:
                        if line.split(',')[0] == key:
                            value['og_url'] = (line.split(',')[-1])
                            break

        # Use Norvig's result to replace the Solr suggestion
        # correct_res = res.data['spellcheck']
        # if correct_res.get('suggestions'):
        #     correct_word = correction(query_key)
        #     res.data['spellcheck']['suggestions'][1]['suggestion'][0]=correct_word
        correct_res = res.data['spellcheck']
        correct_word_list = []
        if correct_res.get('suggestions'):
            query_key_list = query_key.split()

            for i in query_key_list:
                # correct_word = correction(query_key)
                correct_word_list.append(correction(i))
            res.data['spellcheck']['collations'][1] = ' '.join(correct_word_list)

        return json.dumps(res.data, ensure_ascii=False, indent=4)
Example #27
def home():
    # log in credentials
    name = None
    email = None
    user_name = email
    session = request.environ.get('beaker.session')
    if session['_id'] in local_credentials:
        # retrieve the name and email from the Google Plus API
        plus = build('plus',
                     'v1',
                     credentials=local_credentials[session['_id']])
        plus_details = plus.people().get(userId='me').execute()
        name = str(plus_details['name']['givenName']) + " " + str(
            plus_details['name']['familyName'])
        email = str(plus_details['emails'][0]['value'])
        user_name = email

    # global containers: saved_h (per-user history), h (current user's [word, count] pairs), most_recent
    global saved_h
    global h
    global most_recent

    # this part of code simply uses pickle to store search history for unique users
    # it stores this: {'user1' : 'history_of_user1', 'user2' : 'history_of_user2'}
    # change user_name for unique user id
    if os.path.getsize('saved_dictionary.pickle') > 0:
        # load the current dictionary of all users and find the current one
        with open('saved_dictionary.pickle', 'rb') as dict_file:
            saved_h = pickle.load(dict_file)

        if user_name in saved_h:
            h = saved_h[user_name]
        else:
            h = []
    else:
        h = []
        saved_h = {}

    if os.path.getsize('mostrec_dictionary.pickle') > 0:
        # load the current dictionary of all users and find the current one
        with open('mostrec_dictionary.pickle', 'rb') as dict_file:
            most_recent = pickle.load(dict_file)

        if user_name in most_recent:
            mr = most_recent[user_name]
        else:
            mr = []
    else:
        mr = []
        most_recent = {}

    global searchstring
    first_word = ""
    second_word = None
    third_word = None

    # parses the input string to generate a dictionary and output results
    # request to get the value 'keywords' from HTML
    if (request.params.get('keywords')):
        searchstring = request.params.get('keywords')

        # change all input to lowercase
        searchstring = searchstring.lower()
        string_to_list = searchstring.split()
        first_word = string_to_list[0]
        if len(string_to_list) > 1:
            second_word = string_to_list[1]
            if len(string_to_list) > 2:
                third_word = string_to_list[2]
    else:
        searchstring = ""

    # build d, a list of [word, count] pairs for the current query
    d = []

    for word in searchstring.split():
        if [x for x, y in enumerate(d) if y[0] == word]:
            index = [x for x, y in enumerate(d) if y[0] == word]
            tup = d[index[0]]
            tup[1] = tup[1] + 1
        else:
            d.append([word, 1])
        if [x for x, y in enumerate(h) if y[0] == word]:
            index = [x for x, y in enumerate(h) if y[0] == word]
            tup = h[index[0]]
            tup[1] = tup[1] + 1
        else:
            h.append([word, 1])
        # update most recent: move the word to the front of mr and keep the list bounded
        if word in mr:
            mr.remove(word)
        elif len(mr) > 20:
            mr = mr[:-1]
        mr = [word] + mr

    # display the searched string
    html_searched_string = "<p1>Search for \"<i>" + searchstring + "</i>\"</p1>"

    # display the history
    html_top_20 = "<p1>Top 20 Searched Words</p1>"
    h_top = sorted(h, key=lambda x: x[1], reverse=True)[:20]

    # update history dictionary to pickle file
    saved_h[user_name] = h_top
    html_most_recent = "<p1>Most Recent 20 Words</p1>"

    with open('saved_dictionary.pickle', 'wb') as dict_file:
        pickle.dump(saved_h, dict_file)

    # update most recent
    most_recent[user_name] = mr

    with open('mostrec_dictionary.pickle', 'wb') as dict_file:
        pickle.dump(most_recent, dict_file)

    #=====================================================================================================
    # Fetching from Database implementation
    with open('lexicon.json', 'r') as f:
        for item in f:
            item = json.loads(item)

            #add words to words_list
            if not item["word"] in words_list:
                words_list.append(item["word"])

            if item["word"] == first_word:
                # print first_word
                keyword_id = item["word_id"]
                break
            else:
                keyword_id = -1
    # print keyword_id

    with open('inverted_index.json', 'r') as f:
        for item in f:
            item = json.loads(item)
            # print type(keyword_id)
            if item["word_id"] == keyword_id:
                dict_url_list = item["url_list"]
                # print dict_url_list
                break
            else:
                dict_url_list = []
    # print dict_url_list

    dict_rank = []
    with open('scores.json', 'r') as f:
        for item in f:
            item = json.loads(item)
            dict_rank.append(item)
    # print dict_rank

    dict_doc_id = []
    with open('document_index.json', 'r') as f:
        for item in f:
            item = json.loads(item)
            dict_doc_id.append(item)
    # print dict_doc_id

    dict_combined = []
    '''
    mongodb.initialize()
    try:
        keyword_id = mongodb.getData("lexicon", { "word": first_word })[0]["word_id"]
        dict_url_list = mongodb.getData("inverted_index", { "word_id": keyword_id })[0]["url_list"]
        dict_rank = mongodb.getData("scores", {})
        dict_doc_id = mongodb.getData("document_index", {})
        dict_combined = []
        print dict_rank
    except:
        dict_url_list = []
        dict_rank = []
    '''

    if len(dict_url_list) > 0:
        # using dict_rank, get a list of tuples ordered by page rank
        dict_rank_sorted = sorted(dict_rank, key=lambda k: k['score'])
        # using dict_rank_sorted, find the url name corresponding to the doc_id
        # dict_combined is list with INCREASING page rank order
        for item in dict_rank_sorted:
            for temp_dict in dict_doc_id:
                if temp_dict['doc_id'] == item['doc_id'] and item[
                        'doc_id'] in dict_url_list:
                    dict_combined.append(temp_dict['url'])

        # reverse the array
        dict_combined = list(reversed(dict_combined))

        html_pages = ''

        # if length is greater than max_url_len
        # implement static pagination
        if request.params.get('page_no'):
            page_no = int(request.params.get('page_no'))
        else:
            page_no = 1

        if (len(dict_url_list) > max_url_len):
            page_num, past_page_num = divmod(len(dict_combined), max_url_len)
            if past_page_num != 0:
                page_num += 1

            html_pages = '<div id="content_bot"><div id="page_div" align="center"><div class="pagination">'
            for page in range(page_num):
                # this section of code prints maximum 10 pages and when page_no is moved, it moves the pages in the same direction
                if page + 1 > page_no + 5:
                    pass
                elif page + 1 < page_no - 5:
                    pass
                elif page + 1 == page_no:
                    html_pages += '<button class="active" name="page_no" value="' + str(
                        page + 1) + '">' + str(page + 1) + '</button>'
                else:
                    html_pages += '<button name="page_no" value="' + str(
                        page + 1) + '">' + str(page + 1) + '</button>'
            html_pages += '</div></div></div>'
        else:
            html_pages = ''
            pass

        #=====================================================================================================
        # print the actual content of dict_combined
        # page_no is given and is used to print the correct urls
        page, last_page_num = divmod(len(dict_combined), max_url_len)

        if last_page_num != 0:
            page += 1

        if page_no > page:
            return error404(404)

        url_print = ""
        html_url = '<div id="content_results"><p2>' + str(
            len(dict_url_list
                )) + ' Results found for "' + first_word + '"</p2><br><br>'
        if page == page_no and last_page_num != 0:
            # we are on the last page with last_page_num elements
            for i in range(last_page_num):
                url_print = dict_combined[max_url_len * (page_no - 1) + i]
                html_url += '<p2><a href="' + url_print + '">' + url_print + '</a></p2></br></br>'
        else:
            # we are on some other page; print max_url_len many urls
            for i in range(max_url_len):
                url_print = dict_combined[max_url_len * (page_no - 1) + i]
                html_url += '<p2><a href="' + url_print + '">' + url_print + '</a></p2></br></br>'

        html_url += '</div></div>'
        #=====================================================================================================
    else:
        html_url = '<div id="content_results"><p2>No results found for "' + first_word + '"</p2></div></div>'
    #=====================================================================================================
    # spell correction for whole sentence
    search_sentence = ""
    search_sentence_bold = ""
    for one_word in searchstring.split():
        spell_print_word = spell.correction(one_word)
        if spell_print_word != one_word:
            search_sentence_bold += '<b>' + spell_print_word + ' </b>'
        else:
            search_sentence_bold += spell_print_word + " "
        search_sentence += spell_print_word + " "
    spell_print_html = '<div id="content">'
    if search_sentence.rstrip() != searchstring:
        spell_print_html = spell_print_html + '<div id="spell_correction"><div class="search_button"><form id="search_word" action="/" method="get"><p3>Did you mean:<button name="keywords" value="' + search_sentence.rstrip(
        ) + '">' + search_sentence_bold.rstrip(
        ) + '</button></p3></form></div></div>'
    #=====================================================================================================

    #=====================================================================================================
    # special searched words
    # date, time, define/definition, synonym/antonym
    special_html = ""
    if (first_word == "define" or first_word == "definition" or first_word
            == "meaning" or first_word == "def") and second_word != None:
        if second_word == 'of' and third_word != None and define(
                third_word) != None:
            special_html = '<div id="special_feature">' + define(
                third_word) + '</div>'
        elif define(second_word) != None:
            special_html = '<div id="special_feature">' + define(
                second_word) + '</div>'
    elif (second_word == "definition" or second_word == "define" or second_word
          == "meaning" or second_word == "def") and first_word != '':
        if define(first_word) != None:
            special_html = '<div id="special_feature">' + define(
                first_word) + '</div>'
    elif (first_word == "synonym"
          or first_word == "like") and second_word != None:
        if second_word == 'of' and third_word != None and syn(
                third_word) != None:
            special_html = '<div id="special_feature">' + syn(
                third_word) + '</div>'
        elif define(second_word) != None:
            special_html = '<div id="special_feature">' + syn(
                second_word) + '</div>'
    elif (second_word == "synonym"
          or second_word == "like") and first_word != '':
        if define(first_word) != None:
            special_html = '<div id="special_feature">' + syn(
                first_word) + '</div>'
    elif (first_word == "antonym"
          or first_word == "unlike") and second_word != None:
        if second_word == 'of' and third_word != None and ant(
                third_word) != None:
            special_html = '<div id="special_feature">' + ant(
                third_word) + '</div>'
        elif define(second_word) != None:
            special_html = '<div id="special_feature">' + ant(
                second_word) + '</div>'
    elif (second_word == "antonym"
          or second_word == "unlike") and first_word != '':
        if define(first_word) != None:
            special_html = '<div id="special_feature">' + ant(
                first_word) + '</div>'
    elif first_word == "date" or first_word == "today":
        special_html = '<div id="special_feature"><p4>' + date(
        ) + '</p4></div>'
    elif first_word == "time" or first_word == "now":
        special_html = '<div id="special_feature"><p4>' + time(
        ) + '<p5><br>' + date() + '</p5></p4></div>'
    #=====================================================================================================

    # if there is a searched string, then output the page with tables
    if len(searchstring):
        #=====================================================================================================
        # remove the searched & history table temporarily for lab 3
        # as the sign in feature is not implemented in lab 4, keep the history table commented out

        #if name is not None:
        #    return template('index_search.tpl',user_name = name, user_email = email) + "<div id=content>" + html_pages + html_searched_string + dic_to_table(d, 'results') + "<br>" + html_top_20 + dic_to_table(h_top, 'history') + "<br>" + html_most_recent + list_to_table(mr,'results') + "</div>"
        #else:
        #    return template('index_search.tpl',user_name = name, user_email = email) + "<div id=content>" + html_pages + html_searched_string + dic_to_table(d, 'results') + "<br>"
        #=====================================================================================================

        # for more than 1 pages, use pagination to display pages
        if len(dict_rank) > 0 and len(dict_combined) > max_url_len:
            return template('index_search.tpl',
                            user_name=name,
                            user_email=email,
                            searched_string=searchstring,
                            page_no=page_no) + html_pages + template(
                                'index_search_end.tpl',
                                user_name=name,
                                user_email=email,
                                searched_string=searchstring,
                                page_no=page_no
                            ) + spell_print_html + special_html + html_url
        else:
            # for just one page, don't need to use pagination
            return template('index_search_nopage.tpl',
                            user_name=name,
                            user_email=email,
                            searched_string=searchstring
                            ) + spell_print_html + special_html + html_url

    else:
        # output start page if there is no table to be displayed
        return template('index_initial.tpl', user_name=name, user_email=email)
Example #28
    def corrector(self):
        self.corrected = [
            spell.correction(token) if not re.match('[0-9]', token) else token
            for token in self.tokens
        ]
Example #29
File: frontend.py Project: lAmCarl/io
def index():
    keywords = request.query.keywords

    r_server = redis.Redis(host="localhost", port=6379)
    if keywords:
        url = "?" + request.query_string
        page = request.query.page
        if not page:
            # redirect to page 1
            url += "&page=1"
            redirect(url)

        word_ids = {}
        words = keywords.split()
        corrected_words = []
        doc_ids = {}

        # running spellcheck on the query
        for w in words:
            corrected = spell.correction(w)
            if corrected and corrected != w:
                corrected_words.append((corrected, True))
            else:
                corrected_words.append((w, False))

        # multiword search, aggregate all the documents and reorder based on number of appearances
        doc_scores = {}
        for w in words:
            w_id = r_server.get("word:%s:word_id" % w)
            doc_ids = r_server.zrevrange("word_id:%s:doc_ids" % w_id, 0, -1)
            for doc in doc_ids:
                score = r_server.get("doc_id:%s:score" % doc)
                #note the number of searched words that appear for that doc
                if doc in doc_scores:
                    doc_scores[doc] = (doc_scores[doc][0],
                                       doc_scores[doc][1] + 1)
                else:
                    doc_scores[doc] = (score, 1)

        #sort first by # appearances, then by pagerank score
        sorted_doc_ids = sorted(doc_scores.items(),
                                key=lambda x: (x[1][1], x[1][0]),
                                reverse=True)

        # else page number exists
        # strip page number from url
        url = url.split('&')[0]
        # get five results, with start offset by page number
        page = int(page)
        p_start = (page - 1) * 5
        p_end = p_start + 4
        zlen = len(sorted_doc_ids)
        if p_end > zlen:
            p_end = zlen
        doc_ids = sorted_doc_ids[p_start:p_end + 1]
        docs = [
            r_server.get("doc_id:%s:doc" % doc_id[0]) for doc_id in doc_ids
        ]
        titles = [
            r_server.get("doc_id:%s:title" % doc_id[0]) for doc_id in doc_ids
        ]
        return template('query_results',
                        page=page,
                        url=url,
                        docs=docs,
                        titles=titles,
                        zlen=zlen,
                        query=keywords,
                        corrected=corrected_words)
    else:
        return template('query_page')
def response(user_response, num_drinks):
    
    #spellCheck 
    input_words = user_response.split() # convert the sentence to a list of words
    #print(input_words)
    word_list = []
    #spell check each word of the input
    for word in input_words:
        word_list.append(correction(word))
    corrected_input = ' '.join(word for word in word_list)
    user_response = corrected_input

    # Break the message into parts and check if aux verb was used 
    pronoun, noun, adjective, verb = getSpeechParts(user_response)
    if checkAux(user_response):
        return "Enjoy"

    #num_drinks = chat_log[senderId]['drinks_served']
    # Search for a drink in the user input and respond as well as we can
    drink = noun
    if drink not in DRINKS: #checks to see if possible noun, is a drink 
        drink = searchForDrink(user_response)
    if len(user_response) == 1:
        drink = user_response
    if drink in DRINKS:
        if num_drinks > 4:
            return "You are too drunk I am unable to serve you any more drinks. You can type 'clear' to tell me that you're sober again"
        #increment drink counter
        num_drinks = num_drinks + 1
        if num_drinks <= 1:
            return "One {0} coming right up!".format(drink)
        if num_drinks == 2:
            return "{0} for you, enjoy.".format(drink)
        else:
            return "Here is your {0}! Wow you've already had {1} drinks!".format(drink, num_drinks)
 
    #reset drink level for when sober 
    if user_response == "clear":
        num_drinks = 0 
        
    # Look for yes or no responses and respond with a weak hedge
    if checkForYes(user_response):
        return random.choice(YES_RESPONSES)
    if checkForNo(user_response):
        return random.choice(NO_RESPONSES)

    # If someone doesn't want anything
    if noun == "nothing":
        return "There isn't anything I can get for you? I am a master bar tender. You won't find any better."
    
    if  pronoun == "what": 
        checkForDrinkRep(user_response)
        

    #If we have a noun but no drink, we don't know what they want, so we answer with a question
    if noun:
        resp = []
        #if pronoun:
         #   resp.append(pronoun)
        if verb:
            v = verb[0]
            if v is not None:
                resp.append(v)
        if startsWithVowel(noun):
            noun_pronoun = "an"
        else:
            noun_pronoun = "a"
        resp.append(noun_pronoun + " " + noun + "?")
        return " ".join(resp)
    #If nothing caught, return a hedge
    return "I am sorry! I don't understand you " + random.choice( HEDGE_RESPONSES)
Example #31
    post = re.sub('[!@#$.,?"--]', '', post)
    post = re.sub('[/]', ' ', post)

    #removing some common patterns
    post = re.sub(r'\\quot', '', post)  # strip literal "\quot" sequences (likely residue of escaped HTML quotes)
    post = re.sub("<a(.*?)</a>", '', post)
    post = re.sub(r'\s+', ' ', post).strip()
    post = post.lower()
    words = post.split()

    # remove stop words
    words = [w for w in words if not w in stops]

    # run spell check on each word
    for i in range(0, len(words)):
        words[i] = spellcheck.correction(words[i])

    words = ' '.join(words)

    # Protect against anything crazy happening. Otherwise, just write to file
    try:
        fTarget.write(words + '\n')
    except UnicodeEncodeError as e:
        print("Unexpected characters! Skipping!")
        print(e)

# Cleanup
fSource.close()
fTarget.close()
print("Done!")