def rank_ngram(query):
    """Score indexed documents by fuzzy 2-gram / 3-gram overlap with *query*.

    The query is tokenised with ``cleaning``; word bigrams and trigrams are
    built from the resulting tokens and compared (``NGram.compare``) against
    the ``gram2`` / ``gram3`` lists of every document stored, one JSON object
    per line, in ``ngram.json``.  Each bigram comparison scoring at least 0.5
    adds 1 to the document's score and each such trigram comparison adds 3.

    Returns:
        A ``(scores, matched_grams)`` tuple: ``scores`` maps document URL to
        its score for documents scoring above zero, and ``matched_grams`` is
        a de-duplicated list (arbitrary order) of the document n-grams that
        matched.  A query that cleans to exactly one word cannot form an
        n-gram and yields ``({}, {})``.
    """
    clean_query = cleaning(query)
    length_query = len(clean_query)
    # Compare by value: ``is`` on integers only works by accident for small
    # numbers and would break the n-gram loops on long queries.
    if length_query == 1:
        return {}, {}

    with open('ngram.json', 'r') as ngram_file:
        docs = [json.loads(line) for line in ngram_file]

    rank_sum_dict = {doc['url']: 0 for doc in docs}

    # Sliding windows of two and three consecutive query words; the ranges
    # are empty when the query is too short for the window.
    query_gram2 = [
        ' '.join(clean_query[start:start + 2])
        for start in range(length_query - 1)
    ]
    query_gram3 = [
        ' '.join(clean_query[start:start + 3])
        for start in range(length_query - 2)
    ]

    matched_grams = set()
    # (document field, query n-grams, score added per match)
    passes = (
        ('gram2', query_gram2, 1),
        ('gram3', query_gram3, 3),
    )
    for field, query_grams, weight in passes:
        for doc in docs:
            for query_gram in query_grams:
                for doc_gram in doc[field]:
                    if NGram.compare(query_gram, doc_gram) >= 0.5:
                        rank_sum_dict[doc['url']] += weight
                        matched_grams.add(doc_gram)

    positive_scores = {
        url: score for url, score in rank_sum_dict.items() if score > 0
    }
    return positive_scores, list(matched_grams)
def query_structure(search_query):
    """Expand *search_query* into stemmed query, synonym and suggestion lists.

    Synonym groups are read from ``synonym1500.json`` (one JSON object per
    line, each with a ``words`` list).  For every cleaned query word, the
    whole ``words`` list of each group containing that word is collected, and
    spelling suggestions are collected from ``dictionary_spelling.suggest``.
    All three word lists are then reduced with ``stem``.

    Returns:
        A tuple ``(clean_query_root, query_synonym_root,
        query_suggestion_root)`` of lists of stemmed words.  Duplicates are
        kept, in the order they were collected.
    """
    # ``with`` closes the file; the original left the handle open.
    with open('synonym1500.json', 'r') as synonym_file:
        synonym_docs = [json.loads(line) for line in synonym_file]

    clean_query = cleaning(search_query)

    query_synonym = []
    for word in clean_query:
        for doc in synonym_docs:
            if word in doc['words']:
                query_synonym.extend(doc['words'])

    query_suggestion = []
    for word in clean_query:
        query_suggestion.extend(dictionary_spelling.suggest(word))

    clean_query_root = [stem(word) for word in clean_query]
    query_synonym_root = [stem(word) for word in query_synonym]
    query_suggestion_root = [stem(word) for word in query_suggestion]
    return clean_query_root, query_synonym_root, query_suggestion_root
def search(search_query):
    """Rank indexed page titles against *search_query*.

    The query is cleaned and expanded with synonyms (from ``synonym.json``,
    one JSON object per line with a ``words`` list) and spelling suggestions
    (``dictionary_spelling.suggest``); all words are stemmed.  Every document
    in the ``indexed_repo_title`` collection of the ``webSE`` database is then
    scored: for each stemmed title word, the word's entry in the document's
    ``idf_word_list`` is added once per equal query root, 0.75 times per
    equal synonym root and 0.5 times per equal suggestion root.

    Returns:
        A dict mapping URL to title for every document with a score above
        zero, inserted in descending score order.

    The query, the collected synonyms and the final result are printed, as
    in the original implementation.
    """
    # ``with`` closes the file; the original left the handle open.
    with open('synonym.json', 'r') as synonym_file:
        synonym_docs = [json.loads(line) for line in synonym_file]

    print(search_query)
    clean_query = cleaning(search_query)

    query_synonym = []
    for word in clean_query:
        for doc in synonym_docs:
            if word in doc['words']:
                query_synonym.extend(doc['words'])
    print(query_synonym)

    query_suggestion = []
    for word in clean_query:
        query_suggestion.extend(dictionary_spelling.suggest(word))

    # (weight, stemmed words) in the order the scores are accumulated.
    weighted_roots = (
        (1, [stem(word) for word in clean_query]),
        (0.75, [stem(word) for word in query_synonym]),
        (0.5, [stem(word) for word in query_suggestion]),
    )

    client = MongoClient()
    try:
        db = client.webSE

        title_rank = {}
        for doc in db.indexed_repo_title.find({}):
            title_match_value = 0
            for index, title_word_root in enumerate(doc['title_root']):
                for weight, roots in weighted_roots:
                    for root in roots:
                        if title_word_root == root:
                            title_match_value += (
                                weight * doc['idf_word_list'][index]
                            )
            title_rank[doc['url']] = title_match_value

        # URLs with a positive score, best first (ties keep scan order).
        ranked = sorted(
            title_rank.items(), key=operator.itemgetter(1), reverse=True
        )
        title_sorted_rank_list = [url for url, score in ranked if score > 0]

        matching_docs = db.indexed_repo_title.find(
            {'url': {'$in': title_sorted_rank_list}}
        )
        doc_dict = {doc['url']: doc['title'] for doc in matching_docs}
    finally:
        # Release the connection pool opened for this call.
        client.close()

    # The second query returns documents in arbitrary order; re-apply the
    # ranking order with a direct lookup instead of a nested scan.
    link_title_sorted_dict = {
        link: doc_dict[link]
        for link in title_sorted_rank_list
        if link in doc_dict
    }
    print(link_title_sorted_dict)
    return link_title_sorted_dict
def search(search_query):
    """Rank stored pages against *search_query* by title and keyword matches.

    The cleaned query is extended with spelling suggestions for each word
    (``enchant``) and with synonyms (``PyDictionary``) of the words the spell
    checker accepts; everything is stemmed.  Each document of the ``keyword``
    collection in the ``webSE`` database is scored by summing the
    ``title_relative`` / ``keyword_relative`` entry of every stemmed title
    word / keyword once per equal query root.  Titles are then fetched from
    the ``data`` collection.

    Returns:
        A dict mapping URL to title for documents whose combined score is
        above zero, inserted in descending score order.

    The query, the expanded query and the per-URL title scores are printed,
    as in the original implementation.
    """
    # NOTE(review): "en_UK" is kept from the original; confirm that this
    # dictionary tag is actually installed (the usual British tag is en_GB).
    spell_checker = enchant.Dict("en_UK")
    dictionary = PyDictionary()
    print(search_query)
    clean_query = cleaning(search_query)

    synonyms_final = []
    for word in clean_query:
        if spell_checker.check(word):
            # PyDictionary.synonym() returns None when nothing is found;
            # concatenating that to a list would raise TypeError.
            synonyms_final.extend(dictionary.synonym(word) or [])

    # Append suggestions for the original words only (not for suggestions).
    expanded_query = list(clean_query)
    for word in clean_query:
        expanded_query.extend(spell_checker.suggest(word))
    print(expanded_query)

    clean_query_root = [stem(word) for word in expanded_query]
    clean_query_root += [stem(word) for word in synonyms_final]

    client = MongoClient()
    try:
        db = client.webSE

        title_rank = {}
        keyword_rank = {}
        # One pass over the collection computes both partial scores.
        for doc in db.keyword.find({}):
            title_match_value = 0
            for index, title_keyword_root in enumerate(doc['title_root']):
                for query_keyword_root in clean_query_root:
                    if title_keyword_root == query_keyword_root:
                        title_match_value += doc['title_relative'][index]
            title_rank[doc['url']] = title_match_value

            keyword_match_value = 0
            for index, keyword_root in enumerate(doc['keyword_root']):
                for query_keyword_root in clean_query_root:
                    if keyword_root == query_keyword_root:
                        keyword_match_value += doc['keyword_relative'][index]
            keyword_rank[doc['url']] = keyword_match_value
        print(title_rank)

        combined_rank_dict = {
            url: keyword_rank[url] + title_value
            for url, title_value in title_rank.items()
        }

        # URLs with a positive combined score, best first.
        ranked = sorted(
            combined_rank_dict.items(),
            key=operator.itemgetter(1),
            reverse=True,
        )
        combined_rank_sorted = [url for url, score in ranked if score > 0]

        matching_docs = db.data.find({'url': {'$in': combined_rank_sorted}})
        doc_dict = {doc['url']: doc['title'] for doc in matching_docs}
    finally:
        # Release the connection pool opened for this call.
        client.close()

    # Re-apply the ranking order with a direct lookup per URL.
    final_link_title_dict = {
        link: doc_dict[link]
        for link in combined_rank_sorted
        if link in doc_dict
    }
    return final_link_title_dict
import nltk
from pymongo import MongoClient

from general import cleaning

# Print the tokens tagged as proper nouns (NNP / NNPS) in the content of a
# small slice of the documents stored in the data1500 collection.
client = MongoClient()
db = client.webSE
docs = db.data1500.find({})

for doc in docs[570:575]:
    content = doc['content']
    list_content = cleaning(content)
    str_content = ' '.join(list_content)
    text = nltk.word_tokenize(str_content)
    x = nltk.pos_tag(text)
    for i, j in x:
        # Compare tags by value; ``is`` tests object identity and is not
        # guaranteed to be true for equal strings.
        if j in ('NNP', 'NNPS'):
            print(i, j)
from pymongo import MongoClient from general import cleaning from stemming.porter2 import stem client = MongoClient() db = client.webSE docs_count = db.data1500.find({}).count() docs = db.data1500.find({}) title_combined_unique_root = [] for doc in docs: title = doc['title'] title_clean = cleaning(title) title_clean_root = [] for word in title_clean: root_word = stem(word) title_clean_root.append(root_word) title_clean_unique_root = list(set(title_clean_root)) for word in title_clean_unique_root: title_combined_unique_root.append(word) #print(title_combined_unique_root) docs = db.data1500.find({}) for doc in docs:
from ngram import NGram
from pymongo import MongoClient
from general import cleaning
from collections import Counter
from stemming.porter2 import stem

# Walk every document of the data1500 collection, stem its cleaned content
# and pick the five most frequent stems.
# NOTE(review): the statement nesting below is reconstructed from a
# whitespace-collapsed source, and the script appears to be cut off after the
# n-gram accumulators are initialised - confirm against the original file.
client = MongoClient()
db = client.webSE
docs = db.data1500.find({})
for doc in docs:
    content = doc['content']
    content_clean = cleaning(content)
    # Stem every cleaned content word.
    content_clean_root = []
    for word in content_clean:
        root_word = stem(word)
        content_clean_root.append(root_word)
    # Keep only the stems of the five most common entries.
    object_top1 = Counter(content_clean_root)
    top_root_object = object_top1.most_common(5)
    top_root_words = []
    for key, val in top_root_object:
        top_root_words.append(key)
    # Accumulators that are initialised here but not used in the visible
    # part of the script.
    gram2 = []
    gram3 = []
    len_content_clean = len(content_clean)
from pymongo import MongoClient
from collections import Counter
from general import cleaning
from stemming.porter2 import stem

# Collect every cleaned title word across the data collection, then start a
# second pass over the documents.
# NOTE(review): the statement nesting below is reconstructed from a
# whitespace-collapsed source, and the script appears to be cut off after the
# per-document lists are initialised - confirm against the original file.
client = MongoClient()
db = client.webSE
docs = db.data.find({})
title_combined = []
for doc in docs:
    title = doc['title']
    title_clean = cleaning(title)
    for word in title_clean:
        title_combined.append(word)
# Total number of cleaned title words over all documents.
total_len_title = len(title_combined)
# Query again: the first cursor has already been consumed by the loop above.
docs = db.data.find({})
for i in docs:
    content = i['content']
    title = i['title']
    # Per-document accumulators; nothing in the visible part of the script
    # fills them.
    title_clean = []
    content_clean = []
    keyword_relative = []
    title_relative = []