def test_get_posterior(self):
    motor_cycles_prior = -19.400790000000001
    motor_space_cycles_prior = -20.64209
    # phrases with splits and errors
    split_str = 'picture of state trooper motor cycles'
    joined_str = 'picture of state trooper motorcycles'
    query = Suggestion(suggestion_str=joined_str)
    suggestion = Suggestion(suggestion_str=split_str)
    # edit_distance = (phrase.get_edits(''.join(joined_str.split()),
    #                                   ''.join(split_str.split()))[0]
    #                  + 2 * phrase.space_edit_cost)
    # expected = -(edit_distance / len(str(split_str)))
    expected = -0.0033333333333333335
    likelihood = phrase.get_likelihood(query, suggestion)
    self.assertAlmostEqual(likelihood, expected)
    log_split_posterior = (motor_space_cycles_prior
                           + phrase.get_likelihood(query, suggestion))
    log_joined_posterior = (motor_cycles_prior
                            + phrase.get_likelihood(query, query))
    self.assertAlmostEqual(log_split_posterior, -20.645423333333333)
    self.assertAlmostEqual(log_joined_posterior, -19.400790000000001)
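# The two posterior assertions above are just the log-space sum
# log P(s | q) = log P(s) + log P(q | s):
#     -20.64209 + -0.0033333333333333335 == -20.645423333333333   (split)
#     -19.400790000000001 + 0.0 == -19.400790000000001            (joined)
# (the joined query is compared against itself, so its likelihood is 0).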
def test_get_all_stats_corner_cases(self):
    self.spell_checker.get_posterior_fn = self.dummy_posterior_fn
    query_list = ['yo boyz i am sing song',
                  'faster and faster edits',
                  'jack in a bark floor']
    query_list = [Suggestion(suggestion_str=query) for query in query_list]
    self.spell_checker.run_spell_check(query_list)
    # pprint(self.spell_checker.get_suggestion_dict())

    # One key's dict value is empty
    human_dict = {
        query_list[0]: [],
        query_list[1]: [Suggestion(['fast', 'an', 'fast', 'edit'])],
        query_list[2]: [Suggestion(['jack', 'an', 'an', 'bar', 'foo'])],
    }
    actual_stats = self.spell_checker.get_all_stats(human_dict)
    expected_stats = [0.5, 0.66666666666666663, 0.57142857142857151]
    self.assertEqual(actual_stats, expected_stats)

    # All keys' dict values are empty
    human_dict = {
        query_list[0]: [],
        query_list[1]: [],
        query_list[2]: [],
    }
    actual_stats = self.spell_checker.get_all_stats(human_dict)
    expected_stats = [0.0, 0.0, 0.0]
    self.assertEqual(actual_stats, expected_stats)
def test_run_spell_check(self):
    # Setting this here so that we don't have to call the MS N-gram API
    self.spell_checker.get_posterior_fn = self.dummy_posterior_fn
    query_list = [Suggestion(['yo', 'boyz'])]
    self.spell_checker.run_spell_check(query_list)
    self.assertEqual(
        self.spell_checker.generate_suggestions_and_posteriors(
            Suggestion(suggestion_str='yo boyz')),
        self.spell_checker.suggestion_dict[query_list[0]])
def test_get_all_stats(self):
    self.spell_checker.get_posterior_fn = self.dummy_posterior_fn
    query_list = ['yo boyz i am sing song',
                  'faster and faster edits',
                  'jack in a bark floor']
    query_list = [Suggestion(suggestion_str=query) for query in query_list]
    self.spell_checker.run_spell_check(query_list)
    # pprint(self.spell_checker.get_suggestion_dict())
    human_dict = {
        query_list[0]: [Suggestion(['yo', 'boyz', 'am', 'am', 'sing',
                                    'song'])],
        query_list[1]: [Suggestion(['fast', 'an', 'fast', 'edit'])],
        query_list[2]: [Suggestion(['jack', 'an', 'an', 'bar', 'foo'])],
    }
    actual_stats = self.spell_checker.get_all_stats(human_dict)
    expected_stats = [0.61111111111111105, 1.0, 0.75862068965517226]
    self.assertEqual(actual_stats, expected_stats)
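# Per get_all_stats further down, each expected_stats triple is
# [utils.get_EP(*args), utils.get_ER(*args), get_EF1_measure(...)] --
# read here as expected precision, expected recall and expected F1
# (the expansion of EP/ER is an inference from the helper names).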
def get_human_suggestions(test_label, filename):
    """Return human_suggestion_dict read from filename.

    Each of the sentences is like 'Yo boyz i am sing song.' with the
    capitalization and period preserved.
    """
    f = open(filename, 'r')
    file_input = [line.strip().split('\t') for line in f]
    f.close()
    # print 'file_input', file_input
    human_suggestion_dict = dict(
        [(Suggestion(suggestion_str=line_elements[0],
                     suggestion_type=test_label[:-1]),
          [Suggestion(suggestion_str=phrase,
                      suggestion_type=test_label[:-1])
           for phrase in line_elements[1:]])
         for line_elements in file_input])
    return human_suggestion_dict
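# For reference, a sketch of the TAB-separated format this parses (the
# sentences are invented): column 0 is the query and every later column
# is one human suggestion for it. test_label is a plural like
# 'sentences', so test_label[:-1] yields the singular suggestion_type.
#
#     Yo boyz i am sing song.<TAB>Yo boys I am singing a song.
#     Jack in a bark floor.<TAB>Jack in a bar four.<TAB>Jack on a bark floor.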
def generate_candidate_suggestions(self, term_possibilities_list,
                                   suggestion_type):
    """Return list of candidate Suggestions by combining all possibilities.

    Arguments:
    - `term_possibilities_list`: list of list of possibilities for each
      term in the query phrase.
    """
    # itertools.product yields tuples, so convert each one to a list
    return [Suggestion(list(suggestion), suggestion_type=suggestion_type)
            for suggestion in itertools.product(*term_possibilities_list)]
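# A minimal sketch of the cross-product above (the possibility lists are
# invented; only itertools.product is real): every term contributes one
# of its candidates, so the output size is the product of the list sizes.

import itertools

term_possibilities_list = [['where', 'wheere'], ['are'], ['you', 'yu']]
combos = [list(combo)
          for combo in itertools.product(*term_possibilities_list)]
# combos == [['where', 'are', 'you'], ['where', 'are', 'yu'],
#            ['wheere', 'are', 'you'], ['wheere', 'are', 'yu']]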
def get_posterior(suggestion, query):
    """Return P(suggestion | query).

    Arguments:
    - `query`: Suggestion object
    - `suggestion`: Suggestion object
    """
    # TODO: Check whether it is good to call the MS API with a phrase
    # instead of a sentence with capitalized first word and a period
    # at the end.
    # NOTE: phrase_suggestion is built but not used yet; see the TODO.
    phrase_suggestion = Suggestion(suggestion.term_list,
                                   suggestion_type='phrase')
    return math.exp(get_prior(str(suggestion))
                    + get_likelihood(query, suggestion))
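# get_posterior applies the noisy-channel rule P(s | q) ∝ P(s) * P(q | s)
# with the normalizer P(q) dropped: the log-prior and log-likelihood are
# summed and exponentiated. A minimal sketch with values borrowed from
# test_get_posterior:

import math

log_prior = -20.64209                    # log P(s) from the N-gram model
log_likelihood = -0.0033333333333333335  # log P(q | s) from the error model
posterior = math.exp(log_prior + log_likelihood)  # P(s | q) up to 1/P(q)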
def get_output_from_file(test_label, filename):
    """Return output of spell check from filename.

    The file contains query TAB suggestion1 TAB posterior1 TAB ...
    Output: [query_list, suggestion_dict]
    """
    f = open(filename, 'r')
    file_input = [line.strip().split('\t') for line in f]
    f.close()
    # print 'file_input', file_input
    suggestion_dict = dict(
        (Suggestion(suggestion_str=line_elements[0],
                    suggestion_type=test_label[:-1]),
         zip([Suggestion(suggestion_str=suggestion_str,
                         suggestion_type=test_label[:-1])
              for suggestion_str in line_elements[1::2]],
             map(float, line_elements[2::2])))
        for line_elements in file_input)
    query_list = suggestion_dict.keys()
    # print 'suggestion_dict', suggestion_dict
    # print 'query_list', query_list
    return [query_list, suggestion_dict]
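# The slicing above de-interleaves the alternating columns:
# line_elements[1::2] are the suggestion strings and line_elements[2::2]
# their posteriors. A small sketch with an invented line:

line_elements = ['yo boyz', 'yo boys', '0.7', 'yo buoys', '0.3']
pairs = zip(line_elements[1::2],
            [float(p) for p in line_elements[2::2]])
# list(pairs) == [('yo boys', 0.7), ('yo buoys', 0.3)]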
def test_generate_suggestions_and_posteriors(self):
    # Note: All this is with our tiny dummy lexicon
    query = Suggestion(suggestion_str='wheere are yu going')
    suggestions = self.spell_checker.generate_suggestions_and_posteriors(
        query, get_posterior_fn=self.dummy_posterior_fn)
    expected_suggestion_list = [['wheere', 'an', 'yo', 'going'],
                                ['wheere', 'am', 'yo', 'going'],
                                ['wheere', 'bar', 'yo', 'going']]
    expected_posterior_list = [self.dummy_posterior] * 3
    actual_suggestion_list, actual_posterior_list = [
        list(produced_tuple) for produced_tuple in zip(*suggestions)]
    self.assertEqual(actual_suggestion_list, expected_suggestion_list)
    self.assertEqual(actual_posterior_list, expected_posterior_list)

    query = Suggestion(['yo', 'boyz'])
    suggestions = self.spell_checker.generate_suggestions_and_posteriors(
        query, get_posterior_fn=self.dummy_posterior_fn)
    expected_suggestion_list = [Suggestion(['yo', 'boyz'])]
    expected_posterior_list = [1.0]
    actual_suggestion_list, actual_posterior_list = [
        list(produced_tuple) for produced_tuple in zip(*suggestions)]
    self.assertEqual(actual_suggestion_list, expected_suggestion_list)
    self.assertEqual(actual_posterior_list, expected_posterior_list)
def get_inputs(test_label, filename='../data/words.input'):
    """Return list of input queries read from filename.

    Lowercase all the words. If a query is a sentence, remove the
    period at the end.
    """
    f = open(filename, 'r')
    query_list = [Suggestion(suggestion_str=line.strip(),
                             suggestion_type=test_label[:-1])
                  for line in f]
    f.close()
    print 'query_list', query_list
    return query_list
def __build_graph(self, term, depth=0):
    if depth == self.MAX_DEPTH:
        return
    suggestions = Suggestion(term).get_suggestion()
    for suggestion in suggestions:
        if " vs " not in suggestion:
            continue
        first_word, second_word = suggestion.split(" vs ", maxsplit=1)
        if second_word not in self.__nodes:
            self.__nodes.append(second_word)
        edge = (first_word, second_word)
        reversed_edge = (second_word, first_word)
        if edge not in self.__edges and reversed_edge not in self.__edges:
            self.__edges.append(edge)
            self.__build_graph(second_word, depth=depth + 1)
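# For illustration: if Suggestion("python").get_suggestion() returned
# ["python vs java", "python tutorial"] (invented output), the loop above
# would skip the second entry, add "java" as a node, record the
# undirected edge ("python", "java"), and recurse on "java" until
# MAX_DEPTH is reached.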
def get_corrected_run_on_queries(query, lexicon):
    """Correct run-on query by splitting run-on words.

    Return list of phrase/sentence queries with any run-on word split.
    Assumption: A maximum of max_num_splits words have been joined
    together.

    Arguments:
    - `query`: Suggestion object
    - `lexicon`: lexicon of the spell checker
    """
    max_num_splits = 3
    # print query
    # List of list of suggestions for each word
    term_suggestions_list = [
        list(itertools.chain(*[get_splits(word, i, lexicon)
                               for i in xrange(1, max_num_splits + 1)]))
        + [[word]]
        for word in query]
    # print 'term_suggestions_list', term_suggestions_list
    # All term_combos (considering only one word to be a run-on word
    # at a time)
    term_combos = [
        list(itertools.chain(*tuple_))
        for i in xrange(len(query))
        for tuple_ in itertools.product([query[:i]],
                                        term_suggestions_list[i],
                                        [query[i + 1:]])]
    # print 'term_combos', term_combos
    term_combos.sort()
    # Remove duplicates: itertools.groupby only groups equal items that
    # are consecutive, hence the sort above.
    term_combos = [key for key, _ in itertools.groupby(term_combos)]
    # print 'term_combos', term_combos
    term_combos.remove(query)
    # print 'term_combos', term_combos
    return [Suggestion(term_combo,
                       suggestion_type='sentence'
                       if query.suggestion_type == 'sentence'
                       else 'phrase')
            for term_combo in term_combos]
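# A hedged walk-through of the expansion above, assuming get_splits
# returns candidate splittings like [['motor', 'cycles']] for
# 'motorcycles' (the helper's exact output shape is an inference from
# how it is chained here): for query ['trooper', 'motorcycles'],
# term_suggestions_list becomes
#     [[['trooper']], [['motor', 'cycles'], ['motorcycles']]]
# and the per-position products flatten into candidates such as
# ['trooper', 'motor', 'cycles']; the untouched original
# ['trooper', 'motorcycles'] is then removed by term_combos.remove(query).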
def test_get_likelihood(self):
    query = Suggestion(['foo'])
    suggestion = Suggestion(['bar'])
    expected = phrase.error_penalization * -(
        phrase.get_edits(suggestion[0], query[0])[0] / len(str(query)))
    self.assertAlmostEqual(phrase.get_likelihood(query, suggestion),
                           expected, 2)

    # phrase with splits joined up
    split_str = 'fo o bar roc ks'
    joined_str = 'foo bar rocks'
    query = Suggestion(suggestion_str=split_str)
    suggestion = Suggestion(suggestion_str=joined_str)
    expected = phrase.error_penalization * -(
        2 * phrase.space_edit_cost / len(split_str))
    self.assertAlmostEqual(phrase.get_likelihood(query, suggestion),
                           expected)

    # phrases with joins split up
    split_str = 'fo o bar roc ks'
    joined_str = 'foo bar rocks'
    query = Suggestion(suggestion_str=joined_str)
    suggestion = Suggestion(suggestion_str=split_str)
    expected = phrase.error_penalization * -(
        2 * phrase.space_edit_cost / len(joined_str))
    self.assertAlmostEqual(phrase.get_likelihood(query, suggestion),
                           expected)

    # phrases with splits and errors
    split_str = 'fo o bcr rxc ks'
    joined_str = 'foo bar rocks'
    query = Suggestion(suggestion_str=split_str)
    suggestion = Suggestion(suggestion_str=joined_str)
    edit_distance = (phrase.get_edits(''.join(joined_str.split()),
                                      ''.join(split_str.split()))[0]
                     + 2 * phrase.space_edit_cost)
    expected = phrase.error_penalization * -(
        edit_distance / len(str(split_str)))
    self.assertAlmostEqual(phrase.get_likelihood(query, suggestion),
                           expected)
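# The error model exercised above is a length-normalized edit penalty:
#     log P(q | s) = error_penalization * -(d / len(str(query)))
# where d is the character edit distance after joining, plus
# space_edit_cost for each space inserted or removed. In the second case
# 'fo o bar roc ks' vs 'foo bar rocks' differs only in two extra spaces,
# hence d = 2 * phrase.space_edit_cost.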
def get_corrected_split_queries(query, lexicon):
    """Correct split query by joining words.

    Return list of word/phrase/sentence queries with the split words
    joined. Assumption: a word has been split only once.
    Note: The original query is NOT part of the returned list.

    Arguments:
    - `query`: Suggestion object
    - `lexicon`: lexicon of the spell checker
    """
    # TODO: Should probably check to see if the resultant suggestion
    # is a word/phrase/sentence and then set its suggestion_type,
    # e.g. 'No w.' (sentence) -> 'Now.' (word)
    joined_up_suggestion_list = [
        Suggestion(query[:i] + [query[i] + query[i + 1]] + query[i + 2:],
                   suggestion_type=query.suggestion_type)
        for i in range(len(query) - 1)
        if lexicon.is_known_word(query[i] + query[i + 1])]
    return joined_up_suggestion_list
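# A hedged sketch of the join above, with a stand-in lexicon: for the
# query ['mo', 'tor', 'cycles'], i = 0 joins 'mo' + 'tor' -> 'motor',
# giving the candidate ['motor', 'cycles'], provided
# lexicon.is_known_word('motor') is true; i = 1 would propose
# ['mo', 'torcycles'] only if 'torcycles' were a known word.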
def fetch(self):
    self.driver.get(self.url)
    self.doAfterPageLoad()
    for concept in concepts:
        for query in concept["queries"]:
            phrase = query["query"].format(concept=concept["name"])
            language = query["lang"]
            if not (self.onlyEnglish and language != "en"):
                self.doBeforeQuery()
                q = self.getSearchBox()
                q.clear()
                time.sleep(self.waitBeforeQuery)
                # Type in the query
                for letter in phrase:
                    q.send_keys(letter)
                    time.sleep(.05)
                # Extract the suggestions
                time.sleep(self.waitAfterQuery)
                suggestions = self.getSuggestions()
                # Store the suggestions in the DB
                for suggestion in suggestions.text.split("\n"):
                    Suggestion(concept=concept["name"],
                               phrase=phrase,
                               suggestion=suggestion,
                               language=language,
                               country_IP=self.country_IP,
                               service=self.service).save()
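# The letter-by-letter send_keys loop above imitates human typing so the
# service's autocomplete actually fires; the sleeps before and after the
# query give the suggestion dropdown time to render before
# getSuggestions() scrapes it.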
def get_all_stats(self, human_suggestion_dict,
                  query_list=None, suggestion_dict=None):
    """Return [EP, ER, EF1] for the spell check run.

    Arguments:
    - `human_suggestion_dict`: dict mapping each query Suggestion to
      the list of human-provided correction Suggestions.
    """
    if query_list is None:
        query_list = self.query_list
    if suggestion_dict is None:
        suggestion_dict = self.suggestion_dict
    # Use the (possibly defaulted) local arguments, not the attributes,
    # so that callers can score a different run.
    args = [query_list, suggestion_dict, human_suggestion_dict]
    return [utils.get_EP(*args),
            utils.get_ER(*args),
            self.get_EF1_measure(human_suggestion_dict)]


if __name__ == '__main__':
    spell_checker = SpellChecker()
    query_list = ['The departments of the institute offer corses, conducted by highly qualified staff.']
    query_list = [Suggestion(suggestion_str=query, suggestion_type='sentence')
                  for query in query_list]
    spell_checker.run_spell_check(query_list)
    print 'spell_checker.get_suggestion_dict()'
    pprint(spell_checker.get_suggestion_dict())
    # human_dict = { query_list[0]: [Suggestion('why this kolaveri'.split())],
    #                query_list[1]: [Suggestion('i am sing song'.split())] }
    # print 'spell_checker.get_EF1_measure(human_dict)'
    # pprint(spell_checker.get_EF1_measure(human_dict))
    # print 'spell_checker.get_all_stats(human_dict)'
    # pprint(spell_checker.get_all_stats(human_dict))
if __name__ == "__main__": # print get_edits ("belie", "belive") # print get_posterior (["belie"], ["belive"]) # for suggestion in generate_all_candidate_suggestions ("i can haz cheezburger".split()) : # print suggestion # print get_prior(suggestion) # print get_posterior(suggestion, 'cat aret gonne'.split()) # print " ".join (suggestion) # print get_edits ("sujeet", "usjeet") # pass # # Test the `accepts` decorator # @accepts(list, int, int) # @returns(float) # def average(x, y, z): # return (x[0] + y + z) / 2 # average([13], 10, 15.0) # average([3], 10, 15) s1 = 'The departments of the institute offer horses conducted by highly qualified staff.' s2 = 'The departments of the institute offer courses conducted by highly qualified staff.' print get_prior(s1) print get_prior(s2) sugg1 = Suggestion(suggestion_str = s1, suggestion_type = 'sentence') sugg2 = Suggestion(suggestion_str = s2, suggestion_type = 'sentence') q = Suggestion(suggestion_str = 'The departments of the institute offer corses, conducted by highly qualified staff.', suggestion_type = 'sentence') print get_posterior(sugg1, q) print get_posterior(sugg2, q)
def suggest(jdID, db):
    _obj = Suggestion(jdID, db)
    _suggestion = _obj.get_suggestion()
    return _suggestion
def test_get_likelihood_splits_special(self):
    # TODO
    # Every case below is the same sentence with 'corses' as the typo.
    # The query comes in three variants: fully joined, 'institute' split
    # into 'ins tit ute', and 'offer' split into 'of fer'.
    prefixes = {
        'joined': ['the', 'departments', 'of', 'the', 'institute',
                   'offer'],
        'split_institute': ['the', 'departments', 'of', 'the', 'ins',
                            'tit', 'ute', 'offer'],
        'split_offer': ['the', 'departments', 'of', 'the', 'institute',
                        'of', 'fer'],
    }
    suffix = ['conducted', 'by', 'highly', 'qualified', 'staff']

    def make_suggestion(variant, word):
        return Suggestion(prefixes[variant] + [word] + suffix)

    # (query variant, replacement for 'corses', expected log likelihood)
    cases = [
        ('joined', 'horses', 0.0),
        ('joined', 'courses', -0.0092284029735965143),
        ('split_institute', 'horses', 0.0),
        ('split_offer', 'horses', 0.0),
        ('split_institute', 'cores', -0.0015698587127158554),
        ('joined', 'cores', -0.0015887726731100226),
        ('split_offer', 'cores', -0.0015887726731100226),
        ('split_institute', 'cortes', -0.007872379721119217),
        ('joined', 'cortes', -0.0079672276695664374),
        ('split_offer', 'cortes', -0.0079672276695664374),
        ('split_institute', 'courses', -0.0091185410334346517),
        ('split_offer', 'courses', -0.0092284029735965143),
        ('split_institute', 'torses', -0.010714285714285714),
        ('joined', 'torses', -0.010843373493975905),
        ('split_offer', 'torses', -0.010843373493975905),
        ('split_institute', 'curses', -0.010989010989010988),
        ('joined', 'curses', -0.011121408711770158),
        ('split_offer', 'curses', -0.011121408711770158),
        ('split_institute', 'corsets', -0.012605042016806723),
        ('joined', 'corsets', -0.012756909992912829),
        ('split_offer', 'corsets', -0.012756909992912829),
        ('split_institute', 'corset', -0.013632791245994799),
        ('joined', 'corset', -0.013797041742934496),
        ('split_offer', 'corset', -0.013797041742934496),
        ('split_institute', 'copses', -0.030612244897959183),
        ('joined', 'copses', -0.030981067125645436),
        ('split_offer', 'copses', -0.030981067125645436),
        ('split_institute', 'corpses', -0.034985422740524783),
        ('joined', 'corpses', -0.035406933857880504),
        ('split_offer', 'corpses', -0.035406933857880504),
    ]
    suggestion_likelihood_list = [
        (make_suggestion(variant, 'corses'),
         make_suggestion(variant, word),
         likelihood)
        for variant, word, likelihood in cases]
    for i, _tuple in enumerate(suggestion_likelihood_list):
        query, suggestion, likelihood = _tuple
        # print 'suggestion number: ', i
        self.assertAlmostEqual(phrase.get_likelihood(query, suggestion),
                               likelihood, 2)
class Lookup:
    CODE_PENDING = 0
    CODE_CORRECT = 1
    CODE_NON_WORD_ERR = 2
    CODE_REAL_WORD_ERR = 3
    CODE_RWE_IGNORED = 4
    CODE_CASE_ERR = 5

    def __init__(self):
        self.preprocessing = Preprocessing()
        self.dictionary = Dictionary()
        self.cleaner = Cleaner()
        self.suggestion = Suggestion()
        self.word_val_in_dic = None
        self.word_val_in_bigram = None
        self.word_status_code = self.CODE_PENDING
        self.textual_status = {
            0: 'In_Progress',
            1: 'Correct Word',
            2: 'Non-Word Error',
            3: 'Real-Word Error',
            4: 'Real-Word Error Ignored',
            5: 'Case Error'
        }

    def load_raw_text(self, file_path=None, passage=None):
        if (file_path == None and passage == None):
            raise Exception(
                'File path or passage should be provided. File path has priority!')
        text = self.cleaner.clean_corpus(in_file=file_path, passage=passage)
        if (text):
            text = self.cleaner.bigram_preparation(passage=text)
            return [
                self.preprocessing.fetch_line_words(line, escape_symbols=False)
                for line in text.split("\n")
            ]
        return []

    def validate_word(self, word, prev_word):
        if (self.preprocessing.is_customized_word(word[0])):
            lemma = word[0]
        else:
            lemma = self.preprocessing.fetch_lemmatized_word(word[0], word[1])
        suggestions = dict()
        if (self.word_exists(word[0], lemma)):
            if (self.word_in_real_place(word, prev_word)):
                if (self.word_in_correct_case(word[0])):
                    self.word_status_code = self.CODE_CORRECT
                else:
                    self.word_status_code = self.CODE_CASE_ERR
            elif (self.word_can_be_real(word, prev_word)):
                self.word_status_code = self.CODE_RWE_IGNORED
            else:
                self.word_status_code = self.CODE_REAL_WORD_ERR
        else:
            self.word_status_code = self.CODE_NON_WORD_ERR
        if (self.word_status_code not in [self.CODE_CORRECT,
                                          self.CODE_CASE_ERR]):
            suggestions = self.suggestion.get_suggestions(
                word, prev_word, self.word_status_code)
            if (word[0].lower() in suggestions):
                self.word_status_code = self.CODE_CORRECT
        return {
            'word': word,
            'prev_word': prev_word,
            'status': self.word_status_code,
            'textual_status': self.textual_status[self.word_status_code],
            'lemma': lemma,
            'suggestions': suggestions
        }

    def word_exists(self, main_word, lemmatized_word):
        if (self.preprocessing.is_customized_word(main_word)):
            self.word_val_in_dic = self.dictionary.CASE_BOTH
            return True
        self.word_val_in_dic = self.dictionary.get_single_word_from_dic(
            lemmatized_word)
        if (self.word_val_in_dic == None
                and self.dictionary.words_really_different(
                    main_word, lemmatized_word)):
            self.word_val_in_dic = self.dictionary.add_single_word2dic(
                main_word, lemmatized_word)
        return False if self.word_val_in_dic == None else True

    def word_in_real_place(self, word, prev_word):
        if (prev_word[0] == None):
            return True
        else:
            self.word_val_in_bigram = self.dictionary.get_single_word_from_bigram(
                word[0])
            if (self.word_val_in_bigram != None and prev_word[0].lower()
                    in self.word_val_in_bigram['prev_words']):
                return True
        return False

    def word_can_be_real(self, word, prev_word):
        prev_pos = prev_word[0] if self.preprocessing.is_customized_word(
            prev_word[0]) else prev_word[1]
        if (self.word_val_in_bigram != None and prev_pos != None
                and self.word_val_in_bigram['pos'] != None
                and self.word_val_in_bigram['prev_pos'] != None
                and word[1].lower() in self.word_val_in_bigram['pos']
                and prev_pos.lower() in self.word_val_in_bigram['prev_pos']):
            return True
        return False

    def word_in_correct_case(self, main_word):
        if (re.match(r"^([A-Z][a-z]+(\-\_)?)+$", main_word)):
            main_word = main_word.lower()
        if (re.match(r"^[A-Z]+(s|es)$", main_word)):
            main_word = re.sub(r"(s|es)$", '', main_word)
            new_val = self.dictionary.get_single_word_from_dic(main_word)
            if (new_val != None):
                self.word_val_in_dic = new_val
        if (len(main_word) > 1 and not main_word[1:].islower()
                and not main_word[1:].isupper()):
            return False
        if (self.word_val_in_dic == None):
            return True
        current_case = self.dictionary.get_word_case(main_word)
        if (self.word_val_in_dic == self.dictionary.CASE_BOTH
                or current_case == self.word_val_in_dic):
            return True
        return False
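# A hedged usage sketch of Lookup (the (token, POS) tuple shape is
# inferred from how validate_word indexes word[0] and word[1]; the
# inputs below are invented):
#
#     lookup = Lookup()
#     result = lookup.validate_word(('Teh', 'nn'), (None, None))
#     result['textual_status']   # e.g. 'Non-Word Error'
#     result['suggestions']      # dict from Suggestion.get_suggestions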