def search(self, query, k=None):
    """Execute *query* over an existing index.

    The parsed query is expanded via the Lin thesaurus: for each query
    term, the first of (up to) 30 noun-section synonyms that already
    exists in the inverted index is added with weight 1.

    Input:
        query - string.
        k - number of top results to return, default to everything.
    Output:
        A tuple of (number of relevant search results, list of
        tweet_ids ordered from most to least relevant).
    """
    parsed = self._parser.parse_query(query)
    # Thesaurus expansion: [1][1] is the noun (simN) synonym set.
    for term in list(parsed.keys()):
        noun_synonyms = thes.synonyms(term)[1][1]
        if not noun_synonyms:
            continue
        for candidate in list(noun_synonyms)[:30]:
            if candidate not in parsed and candidate in self._indexer.inverted_idx:
                parsed[candidate] = 1
                break
    relevant_docs = self._relevant_docs_from_posting(parsed)
    ranked_doc_ids = Ranker.rank_relevant_docs(relevant_docs)
    return len(ranked_doc_ids), ranked_doc_ids
def teasarous_(self, query):
    """Expand *query* with the first noun and first verb Lin-thesaurus
    synonym of each term.

    :param query: iterable of query terms
    :return: new list with the original terms followed by any synonyms
             found; lookups that fail are skipped (best effort)
    """
    new_query = list(query)
    for word in query:
        # Per-word try/except Exception: one unknown word must not abort
        # the whole expansion (the original bare `except` swallowed every
        # error and stopped at the first failure).
        try:
            noun_syns = list(thesaurus.synonyms(word, fileid="simN.lsp"))
            verb_syns = list(thesaurus.synonyms(word, fileid="simV.lsp"))
        except Exception:
            continue
        if noun_syns:
            new_query.append(noun_syns[0])
        if verb_syns:
            new_query.append(verb_syns[0])
    return new_query
def expand_query_theasaurus(self, query):
    """Collect expansion terms for *query*: the top simN.lsp synonym of
    each term, excluding synonyms that already occur in the query.

    :param query: iterable of query terms
    :return: list of new expansion terms (deduplicated)
    """
    additions = set()
    for word in query:
        synonyms = list(thesaurus.synonyms(word, fileid="simN.lsp"))
        if synonyms and synonyms[0] not in query:
            additions.add(synonyms[0])
    return list(additions)
def demo():
    """Demonstrate the nltk lin_thesaurus API on a sample word pair."""
    from nltk.corpus import lin_thesaurus as thes

    word1 = "business"
    word2 = "enterprise"
    print("Getting synonyms for " + word1)
    print(thes.synonyms(word1))
    print("Getting scored synonyms for " + word1)
    print(thes.scored_synonyms(word1))
    # The original printed this lookup twice (copy-paste); once is enough.
    print("Getting synonyms from simN.lsp (noun subsection) for " + word1)
    print(thes.synonyms(word1, fileid="simN.lsp"))
    print(f"Similarity score for {word1} and {word2}:")
    print(thes.similarity(word1, word2))
def demo():
    """Demonstrate the nltk lin_thesaurus API on a sample word pair."""
    from nltk.corpus import lin_thesaurus as thes

    word1 = "business"
    word2 = "enterprise"
    print("Getting synonyms for " + word1)
    print(thes.synonyms(word1))
    print("Getting scored synonyms for " + word1)
    # Bug fix: the label promised scored synonyms, but synonyms() was called.
    print(thes.scored_synonyms(word1))
    # The original printed this lookup twice (copy-paste); once is enough.
    print("Getting synonyms from simN.lsp (noun subsection) for " + word1)
    print(thes.synonyms(word1, fileid="simN.lsp"))
    print("Similarity score for %s and %s:" % (word1, word2))
    print(thes.similarity(word1, word2))
def do_thesaurus(query):
    """Expand *query* (list of words) with Lin-thesaurus noun synonyms.

    For each lower-cased query word, add up to 4 synonyms whose
    similarity score exceeds 0.21 and that are not already in the
    query.  Multi-word synonyms are split, and only the parts missing
    from the query are added.

    :param query: list of query words
    :return: list of the added expansion terms, lower-cased
    """
    lowered = [word.lower() for word in query]
    to_add = set()
    for word in lowered:
        added_for_word = 0
        # Scored noun synonyms: [1] is the simN section, [1] its entries.
        # (The original also fetched thes.synonyms(word) into an unused
        # local; that dead lookup was removed.)
        scored = dict(thes.scored_synonyms(word)[1][1])
        for key, score in scored.items():
            if added_for_word >= 4:
                break  # at most 4 expansion terms per query word
            if score > 0.21 and key not in lowered:
                added_for_word += 1
                if ' ' in key:
                    # Multi-word synonym: keep only the unseen parts.
                    for part in key.split():
                        if part not in lowered:
                            to_add.add(part)
                else:
                    to_add.add(key)
    return [term.lower() for term in to_add]
def thesaurus(terms):
    """Collect up to two Lin-thesaurus synonyms per query term.

    The term "trump" is deliberately skipped; only synonym sets holding
    more than one entry contribute.

    :param terms: iterable of query terms
    :return: deduplicated list of expansion terms
    """
    expansions = set()
    for term in terms:
        if term == "trump":
            continue
        for _fileid, synonym_set in linthesaurus.synonyms(term):
            if len(synonym_set) > 1:
                # Keep at most the first two synonyms of the set.
                expansions.update(list(synonym_set)[:2])
    return list(expansions)
def add_synonyms_to_list(tokens_list):
    """Expand a token list with at most one simN.lsp synonym per token.

    Limiting to a single synonym per token keeps the expanded query
    small (performance) and the results relevant.

    :param tokens_list: list of tokens
    :return: new list of the original tokens, each followed by its
             first noun synonym when one exists
    """
    expanded = []
    for token in tokens_list:
        expanded.append(token)
        first_synonym = next(iter(thes.synonyms(token, fileid="simN.lsp")), None)
        if first_synonym is not None:
            expanded.append(first_synonym)
    return expanded
def extend_query(self):
    """Extend self.query in place with Lin-thesaurus synonyms.

    For each query word, every thesaurus section holding more than one
    synonym contributes its first entry.

    :return: the (mutated) query list; [] when the query is empty
    """
    if not self.query:
        return []
    additions = []
    for word in self.query:
        for _fileid, synonym_set in thesaurus.synonyms(word):
            if len(synonym_set) > 1:
                additions.append(list(synonym_set)[0])
    self.query.extend(additions)
    return self.query
def thesaurus_method(self, query_list):
    """Expand *query_list* in place with at most one simN.lsp synonym
    per token (limiting to one synonym keeps the query focused).

    :param query_list: list of query tokens (mutated)
    :return: the same list, original tokens followed by found synonyms
    """
    synonyms_found = []
    for token in query_list:
        first = next(iter(thes.synonyms(token, fileid="simN.lsp")), None)
        if first is not None:
            synonyms_found.append(first)
    query_list.extend(synonyms_found)
    return query_list
def query_expansion(self, query):
    """Expand the query with Lin-thesaurus synonyms chosen by POS tag.

    Each query word is POS-tagged; synonyms are drawn from the
    thesaurus section matching that tag (ADJ -> 0, NOUN/PROPN -> 1,
    VERB -> 2).  Up to 10 single-word synonyms that already exist in
    the inverted index (and are not already in the query) are added,
    inheriting the original word's weight.  Mutates query.query_dict
    and query.query_length in place.

    :param query: query object exposing query_dict and query_length
    :return: None (query is updated in place)
    """
    original_terms = query.query_dict
    _ = query.query_length  # read kept from the original implementation
    expanded = {}
    # Coarse POS class -> index of the matching thesaurus section.
    section_by_pos = {"ADJ": 0, "NOUN": 1, "PROPN": 1, "VERB": 2}
    for word, weight in original_terms.items():
        expanded[word] = weight
        pos = self.tag(nltk.pos_tag([word])[0][1])
        sections = thesaurus.synonyms(word)
        if not sections:
            continue
        section_idx = section_by_pos.get(pos)
        candidates = sections[section_idx][1] if section_idx is not None else []
        chosen = []
        for candidate in candidates:
            if len(chosen) == 10:  # cap expansions per word
                break
            if " " in candidate:  # skip multi-word synonyms
                continue
            if candidate in self._indexer.inverted_idx and candidate not in original_terms:
                chosen.append(candidate)
        for candidate in chosen:
            expanded[str(candidate)] = weight
    query.query_length = len(expanded)
    query.query_dict = expanded
def get_synonym(word):
    """Find a term's best Lin-thesaurus synonym for its POS tag.

    :param word: (term, pos_tag) pair
    :return: first synonym from the section matching the tag
             (adjective / verb / noun fallback), or None when empty
    """
    term, pos_tag = word[0], word[1]
    sections = lt.synonyms(term)
    if pos_tag.startswith('J'):
        candidates = list(sections[0][1])  # adjectives
    elif pos_tag.startswith('V'):
        candidates = list(sections[2][1])  # verbs
    else:
        candidates = list(sections[1][1])  # nouns (default)
    return candidates[0] if candidates else None
def demo():
    """Demonstrate the nltk lin_thesaurus API on a sample word pair.

    Modernized from Python 2 print statements to Python 3 calls; the
    "scored synonyms" step now actually calls scored_synonyms(), and
    the duplicated simN.lsp lookup is printed only once (matching the
    other demo in this file).
    """
    from nltk.corpus import lin_thesaurus as thes

    word1 = "business"
    word2 = "enterprise"
    print("Getting synonyms for " + word1)
    print(thes.synonyms(word1))
    print("Getting scored synonyms for " + word1)
    print(thes.scored_synonyms(word1))
    print("Getting synonyms from simN.lsp (noun subsection) for " + word1)
    print(thes.synonyms(word1, fileid="simN.lsp"))
    print("Similarity score for %s and %s:" % (word1, word2))
    print(thes.similarity(word1, word2))
def get_term_synonym(self, tagged_term):
    """Return the first Lin-thesaurus synonym matching the term's POS.

    :param tagged_term: (term, pos_tag) pair
    :return: best synonym string, or None when the thesaurus has no
             entry or the lookup fails
    """
    try:
        sections = thesaurus.synonyms(tagged_term[0])
        part_of_speech = tagged_term[1]
        # Pick the thesaurus section matching the coarse POS class.
        # (Renamed the original local `type`, which shadowed the builtin.)
        if part_of_speech.startswith('V'):
            section = sections[2]   # verbs
        elif part_of_speech.startswith('J'):
            section = sections[0]   # adjectives
        else:
            section = sections[1]   # nouns (default)
        if section[1]:
            return list(section[1])[0]
    except Exception:
        # Best effort: unknown terms simply yield no synonym.
        # Narrowed from the original bare `except`.
        pass
    return None
def search(self, query, k=None, methods=None):
    """Execute *query* over the index with optional expansion steps.

    Input:
        query - string.
        k - number of top results to return, default to everything.
        methods - collection of enabled steps: 1 = spell correction,
            2 = WordNet expansion, 3 = Lin-thesaurus expansion.
            Defaults to no extra steps.
    Output:
        A tuple of (number of relevant search results, list of
        tweet_ids ordered from most to least relevant).
    """
    if methods is None:
        methods = ()  # bug fix: `1 in None` raised TypeError
    # Spell correction of the raw query string.
    if 1 in methods:
        spell = SpellChecker()
        query = ' '.join(spell.correction(word) for word in query.split())
    query_terms = self._parser.Tokenize(query).keys()
    extenders = set()
    # WordNet expansion.
    if 2 in methods:
        for word in query_terms:
            for ex_word in self.wordNet(word.text):
                extenders.add(self._parser.add_to_dict(ex_word))
    # Lin-thesaurus expansion: top noun-section ([1][1]) synonyms.
    if 3 in methods:
        for word in query_terms:
            for ex_word in list(thes.synonyms(word.text)[1][1])[:self._the_count]:
                extenders.add(self._parser.add_to_dict(ex_word))
    extenders = {extender for extender in extenders if extender}
    w_of_term_in_query = self.CalculateW(query_terms, extenders)
    relevant_docs = self._relevant_docs_from_posting(w_of_term_in_query.keys())
    ranked_doc_ids = self._ranker.rank_relevant_docs(relevant_docs, k, w_of_term_in_query)
    return len(ranked_doc_ids), ranked_doc_ids
def synonyms(words_to_check):
    """Expand a parsed query with one noun synonym per term.

    For every term, the Lin thesaurus is consulted and the first entry
    of the noun section (if any) is collected; the collected terms are
    merged into the query dictionary via Thesaurus.add_to_dict.

    :param words_to_check: parsed query {term: tf} dictionary
    :return: the same dictionary, updated with the added terms
    """
    additions = []
    for term in words_to_check:
        sections = thesaurus.synonyms(term)
        if not sections:
            continue
        noun_words = sections[1][1]  # noun section only
        if noun_words:
            # First entry = highest-fit noun synonym.
            additions.append(list(noun_words)[0])
    Thesaurus.add_to_dict(words_to_check, additions)
    return words_to_check
def __init__(self, indexer):
    """Store the indexer and touch the Lin thesaurus once.

    :param indexer: index object used by the search methods
    """
    self._indexer = indexer
    # Presumably a warm-up call so nltk's lazy corpus loader initializes
    # here rather than on the first real query — TODO confirm. The
    # original bound the result to an unused local `w`; that binding
    # was dropped, the call itself kept.
    thesaurus.synonyms("")
def search(self, query, k=None):
    """Execute a query over an existing index.

    Optionally applies spell correction, Lin-thesaurus expansion and
    WordNet expansion (controlled by self.spell_correction,
    self.thesaurus and self.wordnet) before scoring documents.

    Input:
        query - string.
        k - number of top results to return, default to everything.
    Output:
        A tuple of (number of relevant search results, list of
        tweet_ids ordered from most to least relevant).
    """
    # Parse query according to the same parsing rules as the corpus.
    entities = {}
    term_dict = {}
    parsed_query = self._parser.parse_sentence(query, entities, stemming=self.stemming)
    self._parser.parse_capital_letters(parsed_query, term_dict)
    processed_query = [*term_dict.keys()] + [*entities.keys()]

    if self.spell_correction:
        from spellchecker import SpellChecker
        spell_checker = SpellChecker()
        corrected_terms = []
        misspelled_terms = spell_checker.unknown([*term_dict.keys()])
        for term in misspelled_terms:
            # Only correct terms absent from the inverted dictionary;
            # indexed terms are considered correct for retrieval.
            if term not in self._indexer.inverted_idx:
                # Bug fix: candidates() returns a set, which the original
                # code tried to slice; materialize it as a list first.
                candidates = list(spell_checker.candidates(term))
                candidates = candidates[:min(Searcher.TOP_N, len(candidates))]
                # Bug fix: filter into a new list instead of removing
                # elements from the list while iterating it.
                candidates = [c for c in candidates
                              if c != term and c not in parsed_query]
                corrected_terms.extend(candidates)
        processed_query += corrected_terms  # extend query with corrections

    if self.thesaurus:
        from nltk.corpus import lin_thesaurus as thes
        candidates = []
        for term in processed_query:
            for synset in thes.synonyms(term):
                synonyms = [*synset[1]]
                if synonyms:
                    max_to_return = min(Searcher.TOP_N, len(synonyms))
                    for synonym in synonyms[:max_to_return]:
                        if (synonym != term and synonym not in processed_query
                                and synonym in self._indexer.inverted_idx):
                            candidates.append(synonym)
                    break  # only the first non-empty synset per term
        processed_query += candidates

    if self.wordnet:
        from nltk.corpus import wordnet
        candidates = []
        for term in processed_query:
            synsets = wordnet.synsets(term)
            max_to_return = min(Searcher.TOP_N, len(synsets))
            skip = False
            for synset in synsets[:max_to_return]:
                for lemma in synset.lemmas()[:max_to_return]:
                    name = lemma.name()
                    if name == term or name in processed_query:
                        continue
                    # Accept the lemma in whichever case the index knows it.
                    if (name in self._indexer.inverted_idx
                            or name.lower() in self._indexer.inverted_idx
                            or name.upper() in self._indexer.inverted_idx):
                        candidates.append(name)
                        skip = True
                        break  # one synonym per term is enough
                if skip:
                    break
        # Bug fix: the original extended parsed_query here, so WordNet
        # candidates never reached the retrieval loop below. (Stray
        # debug prints in this branch were also removed.)
        processed_query += candidates

    # Score every document containing at least one query term.
    # format: {document_id: score}
    relevant_docs = {}
    for term in processed_query:
        # Try the term in its own case, then the opposite case.
        if term in self._indexer.inverted_idx:
            self.calculate_doc_scores(term, relevant_docs)
        elif term.islower() and term.upper() in self._indexer.inverted_idx:
            self.calculate_doc_scores(term.upper(), relevant_docs)
        elif term.isupper() and term.lower() in self._indexer.inverted_idx:
            self.calculate_doc_scores(term.lower(), relevant_docs)
    n_relevant = len(relevant_docs)
    ranked_doc_ids = Ranker.rank_relevant_docs(relevant_docs)
    return n_relevant, ranked_doc_ids
def search(self, query, k=None):
    """Execute a query over an existing index.

    Exactly one expansion method is applied, chosen from the indexer's
    dict_of_method flags: 'wordnet', 'spell_correction', 'word2vec',
    'thesaurus', or plain parsing as the fallback.

    Input:
        query - string.
        k - number of top results to return, default to everything.
    Output:
        A tuple of (number of relevant search results, list of
        tweet_ids ordered from most to least relevant).
    """
    inverted_index = self._indexer.inverted_idx
    posting = self._indexer.postingDict
    documents = self._indexer.documents
    dict_of_methods = self._indexer.dict_of_method

    if dict_of_methods['wordnet']:
        # WordNet: add every synset head word the index accepts.
        doc_query_app = self.finished_dict(query, inverted_index)
        words_to_add = {}
        for word in doc_query_app.keys():
            for synset in wordnet.synsets(word):
                check_word = synset.lemmas()[0].name()
                if check_word in doc_query_app or check_word in words_to_add:
                    continue
                tested = self._indexer.check_upper_lower(inverted_index, check_word)
                if tested[1] is False or tested[0] in doc_query_app or tested[0] in words_to_add:
                    continue
                if tested[1] is True:
                    words_to_add[tested[0]] = 0.0001
                elif tested[1] == 'replace':  # bug fix: was `is 'replace'`
                    words_to_add[tested[0].upper()] = 0.0001
        doc_query_app.update(words_to_add)
    elif dict_of_methods['spell_correction']:
        spell = SpellChecker(case_sensitive=True)
        query_as_list = query.split()
        for index, word in enumerate(query_as_list):
            # Only correct words missing from the inverted index.
            if self._indexer.check_upper_lower(inverted_index, word)[1] is False:
                is_upper = word[0].isupper()
                corrected = None
                # Take the first candidate that the index knows.
                for option in list(spell.candidates(word)):
                    if self._indexer.check_upper_lower(inverted_index, option)[1] is True:
                        corrected = option
                        break
                if corrected is not None and corrected != word:
                    if is_upper:
                        # Preserve the original word's capitalization.
                        corrected = corrected.capitalize()
                    query_as_list[index] = corrected
        doc_query_app = self.finished_dict(" ".join(query_as_list), inverted_index)
    elif dict_of_methods['word2vec']:
        words_to_add = {}
        doc_query_app = self.finished_dict(query, inverted_index)
        similar_words = []
        for word in query.split():
            if word in self._model.wv.wv.vocab:
                for similar_word, score in self._model.most_similar(word.lower()):
                    if score > 0.33:
                        similar_words.append(similar_word)
        # Add the first similar word that the index accepts.
        idx = 0
        while idx < len(similar_words):
            candidate = similar_words[idx]
            if candidate in doc_query_app or candidate in words_to_add:
                idx += 1
                continue
            tested = self._indexer.check_upper_lower(inverted_index, candidate)
            if tested[1] is False or tested[0] in doc_query_app or tested[0] in words_to_add:
                idx += 1
                continue
            if tested[1] is True:
                words_to_add[tested[0]] = 0.6
                break
            elif tested[1] == 'replace':  # bug fix: was `is 'replace'`
                words_to_add[tested[0].upper()] = 0.6
                break
            idx += 1
        doc_query_app.update(words_to_add)
    elif dict_of_methods['thesaurus']:
        doc_query_app = self.finished_dict(query, inverted_index)
        words_to_add = {}
        stop = set(stopwords.words('english'))
        # Noun synonyms for every non-stopword query term.
        results = [thes.synonyms(term, fileid="simN.lsp")
                   for term in list(doc_query_app.keys()) if term not in stop]
        for synonym_group in results:
            inside_list = list(synonym_group)
            # Add the first synonym per group that the index accepts.
            idx = 0
            while idx < len(inside_list):
                candidate = inside_list[idx]
                if candidate in doc_query_app or candidate in words_to_add:
                    idx += 1
                    continue
                tested = self._indexer.check_upper_lower(inverted_index, candidate)
                if tested[1] is False or tested[0] in doc_query_app or tested[0] in words_to_add:
                    idx += 1
                    continue
                if tested[1] is True:
                    words_to_add[tested[0]] = 0.0001
                    break
                elif tested[1] == 'replace':  # bug fix: was `is 'replace'`
                    words_to_add[tested[0].upper()] = 0.0001
                    break
                idx += 1
        doc_query_app.update(words_to_add)
    else:
        # Plain parser, no expansion.
        doc_query_app = self.finished_dict(query, inverted_index)

    if not doc_query_app:
        # Bug fix: the original returned a bare [], breaking callers
        # that unpack the documented (count, ids) tuple.
        return 0, []
    dict_relevant_docs = self._relevant_docs_from_posting(doc_query_app, posting)
    ranked_doc_ids = Ranker.rank_relevant_docs(dict_relevant_docs, posting, documents, doc_query_app)
    return len(ranked_doc_ids), ranked_doc_ids