def nameCompare(name1, name2):
    """Check whether two first names match, allowing for typos and nicknames.

    Two names are considered equal when they are identical, or when some
    one-edit-distance variant of each maps to the same name/nickname pair
    in nicknames.csv.

    :param name1: first name to compare
    :param name2: second name to compare
    :return: True if the names are considered equal, False otherwise
    """
    if name1 == name2:
        return True
    spell = SpellChecker(distance=1)
    # All candidate spellings within one edit of each input name.
    nameSet1 = spell.edit_distance_1(name1)
    nameSet2 = spell.edit_distance_1(name2)
    candidateSet1 = set()
    candidateSet2 = set()
    with open('nicknames.csv') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        for row in csv_reader:
            # Columns 1 and 2 hold a name/nickname pair; the leading
            # character of each cell is skipped (matches the file format —
            # TODO confirm against the actual CSV).
            row1 = row[1][1:].lower()
            row2 = row[2][1:].lower()
            for name in nameSet1:
                if name.lower() == row1 or name.lower() == row2:
                    candidateSet1.add(row1)
                    candidateSet1.add(row2)
            for name in nameSet2:
                if name.lower() == row1 or name.lower() == row2:
                    candidateSet2.add(row1)
                    candidateSet2.add(row2)
            # Early exit: the sets only grow, so once they share a candidate
            # the answer is final — no need to scan the rest of the file.
            if candidateSet1 & candidateSet2:
                return True
    return False
class Spell_Searcher:
    """Rewrites misspelled query terms using the corpus inverted index."""

    def __init__(self, indexer):
        """
        :param indexer: indexer object exposing ``inverted_idx`` (a mapping
            from term to its corpus frequency)
        """
        self._indexer = indexer
        self.spell = None

    def query_expansion(self, query):
        """Replace misspelled query terms with their best corrections.

        For each term absent from the inverted index (in both cases), all
        edit-distance-1 variants plus the spell checker's own candidates are
        collected; the candidate with the most appearances in the inverted
        index replaces the original term, in place inside ``query.query_dict``.

        :param query: query object exposing a ``query_dict`` mapping
        :return: None; ``query.query_dict`` is modified in place
        """
        try:
            self.spell = SpellChecker(local_dictionary='spell_dict.json',
                                      distance=1)
        except Exception:
            # Dictionary file missing or spell checker unavailable: leave
            # the query untouched instead of crashing on self.spell below
            # (the original swallowed the error and then raised
            # AttributeError on the first unknown term).
            self.spell = None
        if self.spell is None:
            return
        query_dict = query.query_dict
        # Iterate over a snapshot: keys are popped/re-inserted in the loop,
        # and mutating a dict's keys while iterating it is undefined.
        for term in list(query_dict):
            if (term.lower() in self._indexer.inverted_idx
                    or term.upper() in self._indexer.inverted_idx):
                continue
            if not self.spell.unknown([term]):
                continue
            candidates = list(self.spell.edit_distance_1(term))
            candidates.extend(self.spell.candidates(term))
            max_freq_in_corpus = 0
            max_freq_name = ''
            for candidate in candidates:
                if candidate in self._indexer.inverted_idx:
                    curr_freq = self._indexer.inverted_idx[candidate]
                elif candidate.upper() in self._indexer.inverted_idx:
                    curr_freq = self._indexer.inverted_idx[candidate.upper()]
                else:
                    continue
                if curr_freq > max_freq_in_corpus:
                    max_freq_in_corpus = curr_freq
                    max_freq_name = candidate
            if max_freq_name:
                # Keep the original term's weight under the corrected key.
                query_dict[max_freq_name] = query_dict.pop(term)
def search(self, query): """ Executes a query over an existing index and returns the number of relevant docs and an ordered list of search results. Input: query - string. Output: A tuple containing the number of relevant search results, and a list of tweet_ids where the first element is the most relavant and the last is the least relevant result. """ searcher = Searcher(self._parser, self._indexer, model=self._model) # spell checker query_as_list = self._parser.parse_sentence(query) inverted_idx = self.indexer.inverted_idx spell = SpellChecker() misspelled = spell.unknown(query_as_list) assist = [x.lower() for x in query_as_list] # all the query terms in lower case for word in misspelled: if word.upper() in inverted_idx.keys() or word.lower( ) in inverted_idx.keys() or ' ' in word: continue # if the word is in the inverted index- no correction need word_idx = assist.index(word) corrections = spell.edit_distance_1( word ) # list of all the suggested corrections with distance value 1 corrections_dict = {} # check if the suggested corrections is in inverted index and collect the frequency of each correction for correction in corrections: if correction.upper() in inverted_idx.keys(): corrections_dict[correction] = inverted_idx[ correction.upper()] if correction.lower() in inverted_idx.keys(): corrections_dict[correction] = inverted_idx[ correction.lower()] if corrections_dict: query_as_list[word_idx] = max( corrections_dict, key=corrections_dict.get ) # choose the most common correction else: query_as_list[word_idx] = spell.correction(word) new_query = ' '.join(query_as_list) relevant_docs = searcher.search(new_query) return relevant_docs
def SurNameCompare(name1, name2):
    """Check whether two surnames match via the surnames database and typos.

    Two surnames are considered equal when they are identical, or when a
    one-edit-distance variant of each matches the same entry in surnames.csv.

    :param name1: first surname to compare
    :param name2: second surname to compare
    :return: True if the surnames are considered equal, False otherwise
    """
    if name1 == name2:
        return True
    spell = SpellChecker(distance=1)
    # All candidate spellings within one edit of each input surname.
    nameSet1 = spell.edit_distance_1(name1)
    nameSet2 = spell.edit_distance_1(name2)
    candidateSet1 = set()
    candidateSet2 = set()
    with open('surnames.csv') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=' ')
        for row in csv_reader:
            for name in nameSet1:
                if name.lower() == row[0].lower():
                    candidateSet1.add(row[0].lower())
            for name in nameSet2:
                if name.lower() == row[0].lower():
                    candidateSet2.add(row[0].lower())
            # Early exit: the sets only grow, so a shared candidate is final.
            if candidateSet1 & candidateSet2:
                return True
    return False
def test_checking_odd_word(self): ''' test checking a word that is really a number ''' spell = SpellChecker() self.assertEqual(spell.edit_distance_1('12345'), {'12345'})
class Searcher:
    """Loads posting lists for a query, applies spell correction, and builds
    a per-document tf-idf vector used for ranking."""

    def __init__(self, inverted_index, number_of_documents, load_path):
        """
        :param inverted_index: dictionary of inverted index
        :param number_of_documents: number of documents in the corpus
        :param load_path: directory containing the pickled posting files
        """
        self.parser = Parse()
        self.ranker = Ranker()
        self.inverted_index = inverted_index
        self.current_file_name = ""
        self.current_posting = None
        self.term_posting_dict = {}
        self.sorted_query_dict = {}
        self.number_of_documents = number_of_documents
        self.docs_dict = {}
        self.spell = SpellChecker(local_dictionary='spell_dict.json',
                                  distance=1)
        # NOTE(review): Windows-only separator; consider os.path.join.
        self.load_path = load_path + "\\{}"

    def relevant_docs_from_posting(self, query):
        """Load the posting lists for a query and collect relevant documents.

        :param query: query object exposing ``query_dict`` and
            ``query_length``
        :return: dictionary of relevant documents (doc id -> tf-idf vector).
        """
        query_dict = query.query_dict
        query_dict = self.spell_correction(query_dict)
        # Iterate over a snapshot: keys may be re-cased (popped/re-inserted),
        # and mutating a dict's keys while iterating it is undefined.
        for term in list(query_dict):
            if term in self.inverted_index:
                continue
            # Swap the term's case when only the other case is indexed.
            if term.isupper() and term.lower() in self.inverted_index:
                query_dict[term.lower()] = query_dict.pop(term)
            elif term.islower() and term.upper() in self.inverted_index:
                query_dict[term.upper()] = query_dict.pop(term)
        self.sorted_query_dict = {k: query_dict[k] for k in sorted(query_dict)}
        for term in self.sorted_query_dict:
            if term not in self.inverted_index:
                continue
            posting_file_to_load = self.inverted_index[term][1]
            # Posting files are cached: only re-read when the name changes.
            if posting_file_to_load != self.current_file_name:
                self.current_file_name = posting_file_to_load
                self.current_posting = self.read_posting(posting_file_to_load)
            if term in self.current_posting:
                self.term_posting_dict[term] = self.current_posting[term]
        self.document_dict_init(self.term_posting_dict, query.query_length)
        return self.docs_dict

    def spell_correction(self, query_dict):
        """Replace misspelled query terms with their best corrections.

        For each term absent from the inverted index (in both cases), all
        edit-distance-1 variants plus the spell checker's candidates are
        collected; the candidate with the most appearances in the inverted
        index replaces the original term.

        :param query_dict: query dictionary (term -> frequency)
        :return: query dictionary with corrected terms as keys
        """
        # Snapshot of the keys: they are popped/re-inserted in the loop.
        for term in list(query_dict):
            if (term.lower() in self.inverted_index
                    or term.upper() in self.inverted_index):
                continue
            if not self.spell.unknown([term]):
                continue
            candidates = list(self.spell.edit_distance_1(term))
            candidates.extend(self.spell.candidates(term))
            max_freq_in_corpus = 0
            max_freq_name = ''
            for candidate in candidates:
                # inverted_index values are tuples; slot 0 holds the
                # corpus frequency (slot 1 the posting file, slot 2 the df).
                if candidate in self.inverted_index:
                    curr_freq = self.inverted_index[candidate][0]
                elif candidate.upper() in self.inverted_index:
                    curr_freq = self.inverted_index[candidate.upper()][0]
                else:
                    continue
                if curr_freq > max_freq_in_corpus:
                    max_freq_in_corpus = curr_freq
                    max_freq_name = candidate
            if max_freq_name:
                # Keep the original term's weight under the corrected key.
                query_dict[max_freq_name] = query_dict.pop(term)
        return query_dict

    def read_posting(self, posting_name):
        """Read a pickled posting file from disk.

        :param posting_name: file name
        :return: the unpickled posting dictionary
        """
        # Context manager guarantees the handle is closed even when
        # pickle.load raises (the original leaked it on failure).
        with open(self.load_path.format(posting_name), "rb") as pickle_in:
            return pickle.load(pickle_in)

    def document_dict_init(self, postings_dict, query_length):
        """Initialize per-document tf-idf vectors for the query terms.

        Each document in any posting list gets a vector of ``query_length``
        slots; slot ``idx`` holds the tf-idf weight of the idx-th (sorted)
        query term in that document.

        :param postings_dict: dictionary of term (key) -> posting list (value)
        :param query_length: query length
        :return: None; results accumulate in ``self.docs_dict``
        """
        sorted_posting_dict = {
            k: postings_dict[k] for k in sorted(postings_dict)
        }
        for idx, (term, doc_list) in enumerate(sorted_posting_dict.items()):
            # idf is per-term; compute it once, not once per document.
            try:
                dfi = self.inverted_index[term][2]
            except KeyError:
                # Term may be stored lower-cased in the index.
                dfi = self.inverted_index[term.lower()][2]
            idf = math.log(self.number_of_documents / dfi, 10)
            for doc_tuple in doc_list:
                if doc_tuple[0] not in self.docs_dict:
                    # BUG FIX: allocate a fresh vector per document. The
                    # original assigned one shared list to every new doc of
                    # a term, so writing one doc's weight clobbered all of
                    # them.
                    self.docs_dict[doc_tuple[0]] = [0] * query_length
                # doc_tuple: (doc_id, ?, tf) — slot 2 is the term frequency.
                self.docs_dict[doc_tuple[0]][idx] = idf * doc_tuple[2]

    def normalized_query(self, query):
        """Normalize each term frequency in the SORTED query dict by the
        query's maximum term frequency.

        :param query: a query object exposing ``max_freq_term``
        :return: list of normalized tf values, in sorted-term order
        """
        max_freq_term = query.max_freq_term
        return [self.sorted_query_dict[key] / max_freq_term
                for key in self.sorted_query_dict]