Example #1
    def test_get_posterior(self):
        motor_cycles_prior = -19.400790000000001
        motor_space_cycles_prior = -20.64209
        # phrases with splits and errors
        split_str = 'picture of state trooper motor cycles'
        joined_str = 'picture of state trooper motorcycles'

        query = Suggestion(suggestion_str=joined_str)
        suggestion = Suggestion(suggestion_str=split_str)
        # edit_distance = (phrase.get_edits(''.join(joined_str.split()),
        #                                   ''.join(split_str.split()))[0]\
        #                                   + 2 * phrase.space_edit_cost)
        # expected = -(edit_distance / len(str(split_str)))

        expected = -0.0033333333333333335

        likelihood = phrase.get_likelihood(query, suggestion)
        self.assertAlmostEqual(likelihood, expected)

        log_split_posterior = motor_space_cycles_prior + phrase.get_likelihood(
            query, suggestion)

        log_joined_posterior = motor_cycles_prior + phrase.get_likelihood(
            query, query)
        self.assertAlmostEqual(log_split_posterior, -20.645423333333333)
        self.assertAlmostEqual(log_joined_posterior, -19.400790000000001)
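A quick arithmetic check of the two assertions above: each log posterior is the hard-coded log prior plus the value returned by `phrase.get_likelihood`, and the second assertion implies that the likelihood of a query against itself is 0.0. Using only the constants from the test:

motor_cycles_prior = -19.400790000000001       # prior of the joined phrase
motor_space_cycles_prior = -20.64209           # prior of the split phrase
split_likelihood = -0.0033333333333333335      # asserted value of get_likelihood

log_split_posterior = motor_space_cycles_prior + split_likelihood
log_joined_posterior = motor_cycles_prior + 0.0   # get_likelihood(query, query) == 0.0

assert abs(log_split_posterior - (-20.645423333333333)) < 1e-9
assert abs(log_joined_posterior - (-19.400790000000001)) < 1e-9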
Example #2
    def test_get_all_stats_corner_cases(self): 
        self.spell_checker.get_posterior_fn = self.dummy_posterior_fn
        query_list = ['yo boyz i am sing song',
                      'faster and faster edits',
                      'jack in a bark floor']
        query_list = [Suggestion(suggestion_str = query) for query in query_list]
        self.spell_checker.run_spell_check(query_list)
        # pprint(self.spell_checker.get_suggestion_dict())

        # key's dict value is empty
        human_dict = {
            query_list[0]: [],
            query_list[1]: [Suggestion(['fast', 'an', 'fast', 'edit'])],
            query_list[2]: [Suggestion(['jack', 'an', 'an', 'bar', 'foo'])],
            }
        actual_stats = self.spell_checker.get_all_stats(human_dict)
        expected_stats = [0.5, 0.66666666666666663, 0.57142857142857151]
        self.assertEqual(actual_stats,
                         expected_stats)

        # All keys' dict values are empty
        human_dict = {
            query_list[0]: [],
            query_list[1]: [],
            query_list[2]: [],
            }
        actual_stats = self.spell_checker.get_all_stats(human_dict)
        expected_stats = [0.0, 0.0, 0.0]
        self.assertEqual(actual_stats,
                         expected_stats)
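The three stats returned by `get_all_stats` are EP, ER, and EF1 (see Example #17), and in the first assertion above the third value is exactly the harmonic mean of the first two, consistent with EF1 being an F1-style measure. A small check using the expected values from this test:

# EF1 is the harmonic mean of EP and ER: 2 * 0.5 * (2/3) / (0.5 + 2/3) = 4/7.
ep, er, ef1 = 0.5, 0.66666666666666663, 0.57142857142857151
assert abs(2 * ep * er / (ep + er) - ef1) < 1e-9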
Example #3
    def test_run_spell_check(self):
        # Setting this here so that we don't have to call MS N-gram API
        self.spell_checker.get_posterior_fn = self.dummy_posterior_fn
        query_list = [Suggestion(['yo', 'boyz'])]
        self.spell_checker.run_spell_check(query_list)

        self.assertEqual(
            self.spell_checker.generate_suggestions_and_posteriors(
                Suggestion(suggestion_str = 'yo boyz')),
            self.spell_checker.suggestion_dict[query_list[0]])
Example #4
 def test_get_all_stats(self): 
     self.spell_checker.get_posterior_fn = self.dummy_posterior_fn
     query_list = ['yo boyz i am sing song',
                   'faster and faster edits',
                   'jack in a bark floor']
     query_list = [Suggestion(suggestion_str = query) for query in query_list]
     self.spell_checker.run_spell_check(query_list)
     # pprint(self.spell_checker.get_suggestion_dict())
     human_dict = {
         query_list[0]: [Suggestion(['yo', 'boyz', 'am', 'am', 'sing', 'song'])],
         query_list[1]: [Suggestion(['fast', 'an', 'fast', 'edit'])],
         query_list[2]: [Suggestion(['jack', 'an', 'an', 'bar', 'foo'])],
         }
     actual_stats = self.spell_checker.get_all_stats(human_dict)
     expected_stats = [0.61111111111111105, 1.0, 0.75862068965517226]
     self.assertEqual(actual_stats,
                      expected_stats)
Example #5
def get_human_suggestions(test_label, filename):
    """Return human_suggestion_dict read from filename.

    Each of the sentences is like 'Yo boyz i am sing song.' with the
    capitalization and period preserved.
    """
    f = open(filename, 'r')
    file_input = [line.strip().split('\t') for line in f]
    f.close()
    # print 'file_input', file_input
    human_suggestion_dict = dict([(Suggestion(suggestion_str = line_elements[0], 
                                              suggestion_type = test_label[:-1]), 
                                   [Suggestion(suggestion_str = phrase, 
                                               suggestion_type = test_label[:-1])
                                    for phrase in line_elements[1:]]) 
                                  for line_elements in file_input])
    return human_suggestion_dict
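`get_human_suggestions` expects a tab-separated file: the query in the first column, followed by one or more human suggestions, with `test_label[:-1]` used as the suggestion_type (e.g. a label of 'sentences' gives the 'sentence' type seen elsewhere). A sketch of preparing such a file and reading it back, assuming `Suggestion` and `get_human_suggestions` are importable from the project; the filename and line content here are made up:

# Hypothetical file: query TAB human suggestion 1 TAB human suggestion 2 ...
with open('human_suggestions.sample', 'w') as f:
    f.write('Yo boyz i am sing song.\tYo boys I am singing a song.\n')

human_dict = get_human_suggestions('sentences', 'human_suggestions.sample')
print(len(human_dict))   # 1 query Suggestion mapping to a list with 1 human Suggestion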
Example #6
    def __init__(self):
        self.preprocessing = Preprocessing()
        self.dictionary = Dictionary()
        self.cleaner = Cleaner()
        self.suggestion = Suggestion()

        self.word_val_in_dic = None
        self.word_val_in_bigram = None
        self.word_status_code = self.CODE_PENDING

        self.textual_status = {
            0: 'In_Progress',
            1: 'Correct Word',
            2: 'Non-Word Error',
            3: 'Real-Word Error',
            4: 'Real-Word Error Ignored',
            5: 'Case Error'
        }
Example #7
 def generate_candidate_suggestions(self, term_possibilities_list, suggestion_type):
     """Return list of candidate Suggestions by combining all possibilities.
     
     Arguments:
     - `term_possibilities_list`: list of list of possibilities for
       each term in the query phrase.
     """
     # suggestion is a tuple, so converting it to a list
     return [Suggestion(list(suggestion), suggestion_type = suggestion_type) 
             for suggestion in itertools.product(*term_possibilities_list)]
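The heavy lifting here is `itertools.product`, which takes the cross product of the per-term possibility lists, so the number of candidates is the product of the list lengths. A stand-alone illustration of just that combination step, using plain lists instead of Suggestion objects:

import itertools

# Two possibilities for the first term, one for the second, two for the third:
# 2 * 1 * 2 = 4 candidate term lists.
term_possibilities_list = [['fo', 'foo'], ['bar'], ['rocks', 'rock']]
candidates = [list(combo) for combo in itertools.product(*term_possibilities_list)]
print(candidates)
# [['fo', 'bar', 'rocks'], ['fo', 'bar', 'rock'],
#  ['foo', 'bar', 'rocks'], ['foo', 'bar', 'rock']]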
Example #8
def get_posterior (suggestion, query):
    """Return P(suggestion | query).

    Arguments:
    - `query`: Suggestion object
    - `suggestion`: Suggestion object
    """
    # TODO: Check whether it is good to call the MS API with a phrase
    # instead of a sentence with capitalized first word and a period
    # at the end.
    phrase_suggestion = Suggestion(suggestion.term_list, suggestion_type = 'phrase')
    return math.exp(get_prior (str(suggestion)) + get_likelihood (query, suggestion))
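Since `get_prior` and `get_likelihood` both return log values, exponentiating their sum gives P(suggestion) * P(query | suggestion), an unnormalized Bayes posterior. A minimal sketch of that identity with placeholder log values (not real outputs of either function):

import math

log_prior = -19.4          # placeholder for get_prior(str(suggestion))
log_likelihood = -0.0033   # placeholder for get_likelihood(query, suggestion)

posterior = math.exp(log_prior + log_likelihood)
assert abs(posterior - math.exp(log_prior) * math.exp(log_likelihood)) < 1e-15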
Example #9
def get_output_from_file(test_label, filename):
    """Return output of spell check from filename.

    The file contains query TAB suggestion1 TAB posterior1 TAB ...
    Output: [query_list, suggestion_dict]
    """
    f = open(filename, 'r')
    file_input = [line.strip().split('\t') for line in f]
    f.close()
    # print 'file_input', file_input
    suggestion_dict = dict((Suggestion(suggestion_str = line_elements[0], 
                                       suggestion_type = test_label[:-1]), 
                            zip ([Suggestion(suggestion_str = suggestion_str, 
                                             suggestion_type = test_label[:-1]) 
                                             for suggestion_str in line_elements[1::2]], 
                                 map(float, line_elements[2::2])))
                            for line_elements in file_input)
    query_list = suggestion_dict.keys()
    # print 'suggestion_dict', suggestion_dict
    # print 'query_list', query_list
    return [query_list, suggestion_dict]
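The slicing encodes the file layout: after the query, the columns alternate between suggestion strings (`line_elements[1::2]`) and their posteriors (`line_elements[2::2]`), which `zip` pairs back together. A tiny stand-alone check of that pairing on a made-up line:

# query TAB suggestion1 TAB posterior1 TAB suggestion2 TAB posterior2
line = 'yo boyz\tyo boys\t0.7\tyo buoys\t0.3'
line_elements = line.strip().split('\t')

pairs = list(zip(line_elements[1::2], map(float, line_elements[2::2])))
print(pairs)   # [('yo boys', 0.7), ('yo buoys', 0.3)]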
Example #10
    def test_generate_suggestions_and_posteriors(self):
        # Note: All this is with our tiny dummy lexicon
        query = Suggestion(suggestion_str = 'wheere are yu going')

        suggestions = self.spell_checker.generate_suggestions_and_posteriors(
            query,
            get_posterior_fn = self.dummy_posterior_fn)
        expected_suggestion_list = [['wheere', 'an', 'yo', 'going'], 
                                    ['wheere', 'am', 'yo', 'going'], 
                                    ['wheere', 'bar', 'yo', 'going']]
               
        expected_posterior_list = [self.dummy_posterior] * 3

        actual_suggestion_list, actual_posterior_list = [list(produced_tuple) 
                                                         for produced_tuple 
                                                         in zip(*suggestions)]
        
        self.assertEqual(actual_suggestion_list,
                         expected_suggestion_list)
        self.assertEqual(actual_posterior_list,
                         expected_posterior_list)

        query = Suggestion(['yo', 'boyz'])
        suggestions = self.spell_checker.generate_suggestions_and_posteriors(
            query,
            get_posterior_fn = self.dummy_posterior_fn)
        expected_suggestion_list = [Suggestion(['yo', 'boyz'])]
               
        expected_posterior_list = [1.0]

        actual_suggestion_list, actual_posterior_list = [list(produced_tuple) 
                                                         for produced_tuple 
                                                         in zip(*suggestions)]
        
        self.assertEqual(actual_suggestion_list,
                         expected_suggestion_list)
        self.assertEqual(actual_posterior_list,
                         expected_posterior_list)
Example #11
def get_inputs(test_label, filename = '../data/words.input'):
    """Return list of input queries read from filename.

    Lowercase all the words.
    If a query is a sentence, remove the period at the end.
    """
    f = open(filename, 'r')
    query_list = [Suggestion(suggestion_str = line.strip(), 
                             suggestion_type = test_label[:-1]) 
                             for line in f]
    f.close()
    
    print 'query_list', query_list
    return query_list
Example #12
 def __build_graph(self, term, depth=0):
     if depth == self.MAX_DEPTH:
         return
     suggestions = Suggestion(term).get_suggestion()
     for suggestion in suggestions:
         if " vs " not in suggestion:
             continue
         first_word, second_word = suggestion.split(" vs " , maxsplit=1)
         if second_word not in self.__nodes:
             self.__nodes.append(second_word)
             edge = (first_word, second_word)
             reversed_edge = (second_word, first_word)
             if edge not in self.__edges and reversed_edge not in self.__edges:
                 self.__edges.append(edge)
             self.__build_graph(second_word, depth=depth + 1)
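This `Suggestion` is a different class from the spell checker's: it returns autocomplete-style strings, and the graph step only needs to split each one at the first ' vs ' and de-duplicate undirected edges. A small illustration of that parsing with made-up suggestion strings (no recursion, no Suggestion class):

# Hypothetical autocomplete suggestions for the seed term "python".
suggestions = ["python vs java", "python tutorial", "java vs python"]

nodes, edges = ["python"], []
for suggestion in suggestions:
    if " vs " not in suggestion:
        continue
    first_word, second_word = suggestion.split(" vs ", 1)
    if second_word not in nodes:
        nodes.append(second_word)
        edge, reversed_edge = (first_word, second_word), (second_word, first_word)
        if edge not in edges and reversed_edge not in edges:
            edges.append(edge)

print(edges)   # [('python', 'java')]; "java vs python" adds no new node or edge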
Example #13
def get_corrected_run_on_queries(query, lexicon):
    """Correct run-on query by splitting run-on words.

    Return list of phrase/sentence queries with any run-on word split.

    Assumption: A maximum of max_num_splits words have been joined together.
    Arguments:
    - `query`: Suggestion object
    - `lexicon`: lexicon of the spell checker
    """
    max_num_splits = 3
    # print query

    # List of list of suggestions for each word
    term_suggestions_list = [
        list(
            itertools.chain(*[
                get_splits(word, i, lexicon)
                for i in xrange(1, max_num_splits + 1)
            ])) + [[word]] for word in query
    ]
    # print 'term_suggestions_list', term_suggestions_list

    # All term_combos (considering only one word to be a run-on word
    # at a time)
    term_combos = [
        list(itertools.chain(*tuple_)) for i in xrange(len(query))
        for tuple_ in itertools.product([query[:i]], term_suggestions_list[i],
                                        [query[i + 1:]])
    ]

    # print 'term_combos', term_combos
    term_combos.sort()
    # Remove duplicates
    # This requires that keys with same value be consecutive (hence sort).
    term_combos = [key for key, _ in itertools.groupby(term_combos)]
    # print 'term_combos', term_combos
    term_combos.remove(query)
    # print 'term_combos', term_combos
    return [
        Suggestion(term_combo,
                   suggestion_type='sentence'
                   if query.suggestion_type == 'sentence' else 'phrase')
        for term_combo in term_combos
    ]
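Only one word is treated as a run-on at a time: for each position, that word's split candidates are substituted while the rest of the query is kept verbatim, duplicates are removed, and the original query is dropped at the end. A sketch of the substitution pattern with a fake split table standing in for `get_splits` and the lexicon:

import itertools

# Stand-in for get_splits(word, i, lexicon): pretend only 'motorcycles'
# has a split into known words.
fake_splits = {'motorcycles': [['motor', 'cycles']]}

query = ['state', 'trooper', 'motorcycles']
term_suggestions_list = [fake_splits.get(word, []) + [[word]] for word in query]

term_combos = [
    list(itertools.chain(*tuple_)) for i in range(len(query))
    for tuple_ in itertools.product([query[:i]], term_suggestions_list[i],
                                    [query[i + 1:]])
]
term_combos = [key for key, _ in itertools.groupby(sorted(term_combos))]
term_combos.remove(query)
print(term_combos)   # [['state', 'trooper', 'motor', 'cycles']]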
Example #14
    def test_get_likelihood(self):
        query = Suggestion(['foo'])
        suggestion = Suggestion(['bar'])
        expected = phrase.error_penalization * -(
            phrase.get_edits(suggestion[0], query[0])[0] / len(str(query)))
        self.assertAlmostEqual(phrase.get_likelihood(query, suggestion),
                               expected, 2)

        # phrase with splits joined up
        split_str = 'fo o bar roc ks'
        joined_str = 'foo bar rocks'
        query = Suggestion(suggestion_str=split_str)
        suggestion = Suggestion(suggestion_str=joined_str)
        expected = phrase.error_penalization * -(2 * phrase.space_edit_cost /
                                                 len(split_str))
        self.assertAlmostEqual(phrase.get_likelihood(query, suggestion),
                               expected)

        # phrases with joins split up
        split_str = 'fo o bar roc ks'
        joined_str = 'foo bar rocks'
        query = Suggestion(suggestion_str=joined_str)
        suggestion = Suggestion(suggestion_str=split_str)
        expected = phrase.error_penalization * -(2 * phrase.space_edit_cost /
                                                 len(joined_str))
        self.assertAlmostEqual(phrase.get_likelihood(query, suggestion),
                               expected)

        # phrases with splits and errors
        split_str = 'fo o bcr rxc ks'
        joined_str = 'foo bar rocks'
        query = Suggestion(suggestion_str=split_str)
        suggestion = Suggestion(suggestion_str=joined_str)
        edit_distance = (phrase.get_edits(''.join(joined_str.split()),
                                          ''.join(split_str.split()))[0]\
                                          + 2 * phrase.space_edit_cost)
        expected = phrase.error_penalization * -(edit_distance /
                                                 len(str(split_str)))
        self.assertAlmostEqual(phrase.get_likelihood(query, suggestion),
                               expected)
Example #15
def get_corrected_split_queries(query, lexicon):
    """Correct split query by joining words.

    Return list of word/phrase/sentence queries with the split words joined.

    Assumption: a word has been split only once.
    Note: The original query is NOT part of the returned list.

    Arguments:
    - `query`: Suggestion object
    - `lexicon`: lexicon of the spell checker
    """
    # TODO: Should probably check to see if the resultant suggestion
    # is a word/phrase/suggestion and then set its suggestion_type.
    # eg. 'No w.' (sentence) -> 'Now.' (word)
    joined_up_suggestion_list = [
        Suggestion(query[:i] + [query[i] + query[i + 1]] + query[i + 2:],
                   suggestion_type=query.suggestion_type)
        for i in range(len(query) - 1)
        if lexicon.is_known_word(query[i] + query[i + 1])
    ]
    return joined_up_suggestion_list
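Each candidate joins exactly one adjacent pair of words, and only when the concatenation is in the lexicon. A small sketch with a minimal lexicon stub (`FakeLexicon` is hypothetical and only mimics `is_known_word`), yielding plain lists instead of Suggestion objects:

class FakeLexicon(object):
    """Minimal stand-in exposing the one method the function needs."""
    def __init__(self, words):
        self.words = set(words)
    def is_known_word(self, word):
        return word in self.words

lexicon = FakeLexicon(['foo', 'rocks'])
query = ['fo', 'o', 'bar', 'roc', 'ks']

joined = [query[:i] + [query[i] + query[i + 1]] + query[i + 2:]
          for i in range(len(query) - 1)
          if lexicon.is_known_word(query[i] + query[i + 1])]
print(joined)   # [['foo', 'bar', 'roc', 'ks'], ['fo', 'o', 'bar', 'rocks']]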
Example #16
    def fetch(self):
        self.driver.get(self.url)

        self.doAfterPageLoad()

        for concept in concepts:
            for query in concept["queries"]:
                phrase = query["query"].format(concept = concept["name"])
                language = query["lang"]

                if not (self.onlyEnglish and language != "en"):
                    self.doBeforeQuery()

                    q = self.getSearchBox()

                    q.clear()

                    time.sleep(self.waitBeforeQuery)

                    # Type in the query
                    for letter in phrase:
                        q.send_keys(letter)
                        time.sleep(.05)
                    # Extract the suggestions
                    time.sleep(self.waitAfterQuery)
                    suggestions = self.getSuggestions()

                    # Stores the suggestions in the DB
                    for suggestion in suggestions.text.split("\n"):
                        Suggestion(concept=concept["name"],
                                   phrase=phrase,
                                   suggestion=suggestion,
                                   language=language,
                                   country_IP=self.country_IP,
                                   service=self.service).save()
Example #17
    def get_all_stats(self, human_suggestion_dict, query_list = None, suggestion_dict = None):
        """Return list of stats [EP, ER, EF1] for the spell check output.

        Arguments:
        - `human_suggestion_dict`: dict mapping each query to its human suggestions
        """

        if query_list == None:
            query_list = self.query_list
        if suggestion_dict == None:
            suggestion_dict = self.suggestion_dict

        args = [self.query_list, self.suggestion_dict, human_suggestion_dict]
        return [utils.get_EP(*args), utils.get_ER(*args), 
                self.get_EF1_measure(human_suggestion_dict)]

if __name__ == '__main__':
    spell_checker = SpellChecker()
    query_list = ['The departments of the institute offer corses, conducted by highly qualified staff.']
    query_list = [Suggestion(suggestion_str = query, suggestion_type = 'sentence') 
                  for query in query_list]
    spell_checker.run_spell_check(query_list)
    print 'spell_checker.get_suggestion_dict()'
    pprint(spell_checker.get_suggestion_dict())
    # human_dict = { query_list[0]: [Suggestion('why this kolaveri'.split())], 
    #                query_list[1]: [Suggestion('i am sing song'.split())] }
    # print 'spell_checker.get_EF1_measure(human_dict)'
    # pprint(spell_checker.get_EF1_measure(human_dict))
    # print 'spell_checker.get_all_stats(human_dict)'
    # pprint(spell_checker.get_all_stats(human_dict))


Example #18
if __name__ == "__main__":
    # print get_edits ("belie", "belive")
    # print get_posterior (["belie"], ["belive"])
    # for suggestion in generate_all_candidate_suggestions ("i can haz cheezburger".split()) :
    #     print suggestion
    #     print get_prior(suggestion)
        # print get_posterior(suggestion, 'cat aret gonne'.split())
        # print " ".join (suggestion)
    # print get_edits ("sujeet", "usjeet")
    # pass

    # # Test the `accepts` decorator
    # @accepts(list, int, int)
    # @returns(float)
    # def average(x, y, z):
    #     return (x[0] + y + z) / 2
    # average([13], 10, 15.0)
    # average([3], 10, 15)

    s1 = 'The departments of the institute offer horses conducted by highly qualified staff.'
    s2 = 'The departments of the institute offer courses conducted by highly qualified staff.'
    print get_prior(s1)
    print get_prior(s2)
    sugg1 = Suggestion(suggestion_str = s1, suggestion_type = 'sentence')
    sugg2 = Suggestion(suggestion_str = s2, suggestion_type = 'sentence')
    q = Suggestion(suggestion_str = 'The departments of the institute offer corses, conducted by highly qualified staff.', suggestion_type = 'sentence')	
    print get_posterior(sugg1, q)
    print get_posterior(sugg2, q)

Example #19
def suggest(jdID, db):
	_obj = Suggestion(jdID, db)
	_suggestion = _obj.get_suggestion()

	return _suggestion
Example #20
 def test_get_likelihood_splits_special(self):
     # TODO
     suggestion_likelihood_list = [
         (Suggestion([
             'the', 'departments', 'of', 'the', 'institute', 'offer',
             'corses', 'conducted', 'by', 'highly', 'qualified', 'staff'
         ]),
          Suggestion([
              'the', 'departments', 'of', 'the', 'institute', 'offer',
              'horses', 'conducted', 'by', 'highly', 'qualified', 'staff'
          ]), 0.0),
         (Suggestion([
             'the', 'departments', 'of', 'the', 'institute', 'offer',
             'corses', 'conducted', 'by', 'highly', 'qualified', 'staff'
         ]),
          Suggestion([
              'the', 'departments', 'of', 'the', 'institute', 'offer',
              'courses', 'conducted', 'by', 'highly', 'qualified', 'staff'
          ]), -0.0092284029735965143),
         (Suggestion([
             'the', 'departments', 'of', 'the', 'ins', 'tit', 'ute',
             'offer', 'corses', 'conducted', 'by', 'highly', 'qualified',
             'staff'
         ]),
          Suggestion([
              'the', 'departments', 'of', 'the', 'ins', 'tit', 'ute',
              'offer', 'horses', 'conducted', 'by', 'highly', 'qualified',
              'staff'
          ]), 0.0),
         (Suggestion([
             'the', 'departments', 'of', 'the', 'institute', 'of', 'fer',
             'corses', 'conducted', 'by', 'highly', 'qualified', 'staff'
         ]),
          Suggestion([
              'the', 'departments', 'of', 'the', 'institute', 'of', 'fer',
              'horses', 'conducted', 'by', 'highly', 'qualified', 'staff'
          ]), 0.0),
         (Suggestion([
             'the', 'departments', 'of', 'the', 'ins', 'tit', 'ute',
             'offer', 'corses', 'conducted', 'by', 'highly', 'qualified',
             'staff'
         ]),
          Suggestion([
              'the', 'departments', 'of', 'the', 'ins', 'tit', 'ute',
              'offer', 'cores', 'conducted', 'by', 'highly', 'qualified',
              'staff'
          ]), -0.0015698587127158554),
         (Suggestion([
             'the', 'departments', 'of', 'the', 'institute', 'offer',
             'corses', 'conducted', 'by', 'highly', 'qualified', 'staff'
         ]),
          Suggestion([
              'the', 'departments', 'of', 'the', 'institute', 'offer',
              'cores', 'conducted', 'by', 'highly', 'qualified', 'staff'
          ]), -0.0015887726731100226),
         (Suggestion([
             'the', 'departments', 'of', 'the', 'institute', 'of', 'fer',
             'corses', 'conducted', 'by', 'highly', 'qualified', 'staff'
         ]),
          Suggestion([
              'the', 'departments', 'of', 'the', 'institute', 'of', 'fer',
              'cores', 'conducted', 'by', 'highly', 'qualified', 'staff'
          ]), -0.0015887726731100226),
         (Suggestion([
             'the', 'departments', 'of', 'the', 'ins', 'tit', 'ute',
             'offer', 'corses', 'conducted', 'by', 'highly', 'qualified',
             'staff'
         ]),
          Suggestion([
              'the', 'departments', 'of', 'the', 'ins', 'tit', 'ute',
              'offer', 'cortes', 'conducted', 'by', 'highly', 'qualified',
              'staff'
          ]), -0.007872379721119217),
         (Suggestion([
             'the', 'departments', 'of', 'the', 'institute', 'offer',
             'corses', 'conducted', 'by', 'highly', 'qualified', 'staff'
         ]),
          Suggestion([
              'the', 'departments', 'of', 'the', 'institute', 'offer',
              'cortes', 'conducted', 'by', 'highly', 'qualified', 'staff'
          ]), -0.0079672276695664374),
         (Suggestion([
             'the', 'departments', 'of', 'the', 'institute', 'of', 'fer',
             'corses', 'conducted', 'by', 'highly', 'qualified', 'staff'
         ]),
          Suggestion([
              'the', 'departments', 'of', 'the', 'institute', 'of', 'fer',
              'cortes', 'conducted', 'by', 'highly', 'qualified', 'staff'
          ]), -0.0079672276695664374),
         (Suggestion([
             'the', 'departments', 'of', 'the', 'ins', 'tit', 'ute',
             'offer', 'corses', 'conducted', 'by', 'highly', 'qualified',
             'staff'
         ]),
          Suggestion([
              'the', 'departments', 'of', 'the', 'ins', 'tit', 'ute',
              'offer', 'courses', 'conducted', 'by', 'highly', 'qualified',
              'staff'
          ]), -0.0091185410334346517),
         (Suggestion([
             'the', 'departments', 'of', 'the', 'institute', 'of', 'fer',
             'corses', 'conducted', 'by', 'highly', 'qualified', 'staff'
         ]),
          Suggestion([
              'the', 'departments', 'of', 'the', 'institute', 'of', 'fer',
              'courses', 'conducted', 'by', 'highly', 'qualified', 'staff'
          ]), -0.0092284029735965143),
         (Suggestion([
             'the', 'departments', 'of', 'the', 'ins', 'tit', 'ute',
             'offer', 'corses', 'conducted', 'by', 'highly', 'qualified',
             'staff'
         ]),
          Suggestion([
              'the', 'departments', 'of', 'the', 'ins', 'tit', 'ute',
              'offer', 'torses', 'conducted', 'by', 'highly', 'qualified',
              'staff'
          ]), -0.010714285714285714),
         (Suggestion([
             'the', 'departments', 'of', 'the', 'institute', 'offer',
             'corses', 'conducted', 'by', 'highly', 'qualified', 'staff'
         ]),
          Suggestion([
              'the', 'departments', 'of', 'the', 'institute', 'offer',
              'torses', 'conducted', 'by', 'highly', 'qualified', 'staff'
          ]), -0.010843373493975905),
         (Suggestion([
             'the', 'departments', 'of', 'the', 'institute', 'of', 'fer',
             'corses', 'conducted', 'by', 'highly', 'qualified', 'staff'
         ]),
          Suggestion([
              'the', 'departments', 'of', 'the', 'institute', 'of', 'fer',
              'torses', 'conducted', 'by', 'highly', 'qualified', 'staff'
          ]), -0.010843373493975905),
         (Suggestion([
             'the', 'departments', 'of', 'the', 'ins', 'tit', 'ute',
             'offer', 'corses', 'conducted', 'by', 'highly', 'qualified',
             'staff'
         ]),
          Suggestion([
              'the', 'departments', 'of', 'the', 'ins', 'tit', 'ute',
              'offer', 'curses', 'conducted', 'by', 'highly', 'qualified',
              'staff'
          ]), -0.010989010989010988),
         (Suggestion([
             'the', 'departments', 'of', 'the', 'institute', 'offer',
             'corses', 'conducted', 'by', 'highly', 'qualified', 'staff'
         ]),
          Suggestion([
              'the', 'departments', 'of', 'the', 'institute', 'offer',
              'curses', 'conducted', 'by', 'highly', 'qualified', 'staff'
          ]), -0.011121408711770158),
         (Suggestion([
             'the', 'departments', 'of', 'the', 'institute', 'of', 'fer',
             'corses', 'conducted', 'by', 'highly', 'qualified', 'staff'
         ]),
          Suggestion([
              'the', 'departments', 'of', 'the', 'institute', 'of', 'fer',
              'curses', 'conducted', 'by', 'highly', 'qualified', 'staff'
          ]), -0.011121408711770158),
         (Suggestion([
             'the', 'departments', 'of', 'the', 'ins', 'tit', 'ute',
             'offer', 'corses', 'conducted', 'by', 'highly', 'qualified',
             'staff'
         ]),
          Suggestion([
              'the', 'departments', 'of', 'the', 'ins', 'tit', 'ute',
              'offer', 'corsets', 'conducted', 'by', 'highly', 'qualified',
              'staff'
          ]), -0.012605042016806723),
         (Suggestion([
             'the', 'departments', 'of', 'the', 'institute', 'offer',
             'corses', 'conducted', 'by', 'highly', 'qualified', 'staff'
         ]),
          Suggestion([
              'the', 'departments', 'of', 'the', 'institute', 'offer',
              'corsets', 'conducted', 'by', 'highly', 'qualified', 'staff'
          ]), -0.012756909992912829),
         (Suggestion([
             'the', 'departments', 'of', 'the', 'institute', 'of', 'fer',
             'corses', 'conducted', 'by', 'highly', 'qualified', 'staff'
         ]),
          Suggestion([
              'the', 'departments', 'of', 'the', 'institute', 'of', 'fer',
              'corsets', 'conducted', 'by', 'highly', 'qualified', 'staff'
          ]), -0.012756909992912829),
         (Suggestion([
             'the', 'departments', 'of', 'the', 'ins', 'tit', 'ute',
             'offer', 'corses', 'conducted', 'by', 'highly', 'qualified',
             'staff'
         ]),
          Suggestion([
              'the', 'departments', 'of', 'the', 'ins', 'tit', 'ute',
              'offer', 'corset', 'conducted', 'by', 'highly', 'qualified',
              'staff'
          ]), -0.013632791245994799),
         (Suggestion([
             'the', 'departments', 'of', 'the', 'institute', 'offer',
             'corses', 'conducted', 'by', 'highly', 'qualified', 'staff'
         ]),
          Suggestion([
              'the', 'departments', 'of', 'the', 'institute', 'offer',
              'corset', 'conducted', 'by', 'highly', 'qualified', 'staff'
          ]), -0.013797041742934496),
         (Suggestion([
             'the', 'departments', 'of', 'the', 'institute', 'of', 'fer',
             'corses', 'conducted', 'by', 'highly', 'qualified', 'staff'
         ]),
          Suggestion([
              'the', 'departments', 'of', 'the', 'institute', 'of', 'fer',
              'corset', 'conducted', 'by', 'highly', 'qualified', 'staff'
          ]), -0.013797041742934496),
         (Suggestion([
             'the', 'departments', 'of', 'the', 'ins', 'tit', 'ute',
             'offer', 'corses', 'conducted', 'by', 'highly', 'qualified',
             'staff'
         ]),
          Suggestion([
              'the', 'departments', 'of', 'the', 'ins', 'tit', 'ute',
              'offer', 'copses', 'conducted', 'by', 'highly', 'qualified',
              'staff'
          ]), -0.030612244897959183),
         (Suggestion([
             'the', 'departments', 'of', 'the', 'institute', 'offer',
             'corses', 'conducted', 'by', 'highly', 'qualified', 'staff'
         ]),
          Suggestion([
              'the', 'departments', 'of', 'the', 'institute', 'offer',
              'copses', 'conducted', 'by', 'highly', 'qualified', 'staff'
          ]), -0.030981067125645436),
         (Suggestion([
             'the', 'departments', 'of', 'the', 'institute', 'of', 'fer',
             'corses', 'conducted', 'by', 'highly', 'qualified', 'staff'
         ]),
          Suggestion([
              'the', 'departments', 'of', 'the', 'institute', 'of', 'fer',
              'copses', 'conducted', 'by', 'highly', 'qualified', 'staff'
          ]), -0.030981067125645436),
         (Suggestion([
             'the', 'departments', 'of', 'the', 'ins', 'tit', 'ute',
             'offer', 'corses', 'conducted', 'by', 'highly', 'qualified',
             'staff'
         ]),
          Suggestion([
              'the', 'departments', 'of', 'the', 'ins', 'tit', 'ute',
              'offer', 'corpses', 'conducted', 'by', 'highly', 'qualified',
              'staff'
          ]), -0.034985422740524783),
         (Suggestion([
             'the', 'departments', 'of', 'the', 'institute', 'offer',
             'corses', 'conducted', 'by', 'highly', 'qualified', 'staff'
         ]),
          Suggestion([
              'the', 'departments', 'of', 'the', 'institute', 'offer',
              'corpses', 'conducted', 'by', 'highly', 'qualified', 'staff'
          ]), -0.035406933857880504),
         (Suggestion([
             'the', 'departments', 'of', 'the', 'institute', 'of', 'fer',
             'corses', 'conducted', 'by', 'highly', 'qualified', 'staff'
         ]),
          Suggestion([
              'the', 'departments', 'of', 'the', 'institute', 'of', 'fer',
              'corpses', 'conducted', 'by', 'highly', 'qualified', 'staff'
          ]), -0.035406933857880504)
     ]
     for i, _tuple in enumerate(suggestion_likelihood_list):
         query, suggestion, likelihood = _tuple
         # print 'suggestion number: ', i
         self.assertAlmostEqual(phrase.get_likelihood(query, suggestion),
                                likelihood, 2)
Example #21
class Lookup:

    CODE_PENDING = 0
    CODE_CORRECT = 1
    CODE_NON_WORD_ERR = 2
    CODE_REAL_WORD_ERR = 3
    CODE_RWE_IGNORED = 4
    CODE_CASE_ERR = 5

    def __init__(self):
        self.preprocessing = Preprocessing()
        self.dictionary = Dictionary()
        self.cleaner = Cleaner()
        self.suggestion = Suggestion()

        self.word_val_in_dic = None
        self.word_val_in_bigram = None
        self.word_status_code = self.CODE_PENDING

        self.textual_status = {
            0: 'In_Progress',
            1: 'Correct Word',
            2: 'Non-Word Error',
            3: 'Real-Word Error',
            4: 'Real-Word Error Ignored',
            5: 'Case Error'
        }

    def load_raw_text(self, file_path=None, passage=None):
        if (file_path == None and passage == None):
            raise Exception(
                'File path or passage should be provided. File path has priority!'
            )

        text = self.cleaner.clean_corpus(in_file=file_path, passage=passage)

        if (text):
            text = self.cleaner.bigram_preparation(passage=text)

            return [
                self.preprocessing.fetch_line_words(line, escape_symbols=False)
                for line in text.split("\n")
            ]

        return []

    def validate_word(self, word, prev_word):
        if (self.preprocessing.is_customized_word(word[0])):
            lemma = word[0]
        else:
            lemma = self.preprocessing.fetch_lemmatized_word(word[0], word[1])

        suggestions = dict()

        if (self.word_exists(word[0], lemma)):
            if (self.word_in_real_place(word, prev_word)):
                if (self.word_in_correct_case(word[0])):
                    self.word_status_code = self.CODE_CORRECT
                else:
                    self.word_status_code = self.CODE_CASE_ERR
            elif (self.word_can_be_real(word, prev_word)):
                self.word_status_code = self.CODE_RWE_IGNORED
            else:
                self.word_status_code = self.CODE_REAL_WORD_ERR
        else:
            self.word_status_code = self.CODE_NON_WORD_ERR

        if (self.word_status_code
                not in [self.CODE_CORRECT, self.CODE_CASE_ERR]):
            suggestions = self.suggestion.get_suggestions(
                word, prev_word, self.word_status_code)
            if (word[0].lower() in suggestions):
                self.word_status_code = self.CODE_CORRECT

        return {
            'word': word,
            'prev_word': prev_word,
            'status': self.word_status_code,
            'textual_status': self.textual_status[self.word_status_code],
            'lemma': lemma,
            'suggestions': suggestions
        }

    def word_exists(self, main_word, lemmatized_word):
        if (self.preprocessing.is_customized_word(main_word)):
            self.word_val_in_dic = self.dictionary.CASE_BOTH
            return True

        self.word_val_in_dic = self.dictionary.get_single_word_from_dic(
            lemmatized_word)

        if (self.word_val_in_dic == None
                and self.dictionary.words_really_different(
                    main_word, lemmatized_word)):
            self.word_val_in_dic = self.dictionary.add_single_word2dic(
                main_word, lemmatized_word)

        return False if self.word_val_in_dic == None else True

    def word_in_real_place(self, word, prev_word):
        if (prev_word[0] == None):
            return True
        else:
            self.word_val_in_bigram = self.dictionary.get_single_word_from_bigram(
                word[0])

            if (self.word_val_in_bigram != None and prev_word[0].lower()
                    in self.word_val_in_bigram['prev_words']):
                return True

        return False

    def word_can_be_real(self, word, prev_word):
        prev_pos = prev_word[0] if self.preprocessing.is_customized_word(
            prev_word[0]) else prev_word[1]

        if (self.word_val_in_bigram != None and prev_pos != None
                and self.word_val_in_bigram['pos'] != None
                and self.word_val_in_bigram['prev_pos'] != None
                and word[1].lower() in self.word_val_in_bigram['pos']
                and prev_pos.lower() in self.word_val_in_bigram['prev_pos']):

            return True

        return False

    def word_in_correct_case(self, main_word):
        if (re.match(r"^([A-Z][a-z]+(\-\_)?)+$", main_word)):
            main_word = main_word.lower()

        if (re.match(r"^[A-Z]+(s|es)$", main_word)):
            main_word = re.sub(r"(s|es)$", '', main_word)
            new_val = self.dictionary.get_single_word_from_dic(main_word)
            if (new_val != None):
                self.word_val_in_dic = new_val

        if (len(main_word) > 1 and not main_word[1:].islower()
                and not main_word[1:].isupper()):
            return False

        if (self.word_val_in_dic == None):
            return True

        current_case = self.dictionary.get_word_case(main_word)

        if (self.word_val_in_dic == self.dictionary.CASE_BOTH
                or current_case == self.word_val_in_dic):
            return True

        return False