def show_pk_article(request, pk, keyword):
    a = Article.objects.get(pk=pk)
    tokens = string_to_tokens(a.abstract)
    keyword_stem = string_to_tokens(keyword)
    # collect every position where the stemmed keyword occurs in the abstract
    pos = []
    for w in tokens:
        if keyword_stem[0][0] == w[0]:
            pos.append(w[1])
    # newlines in the abstract break the template, so replace them with spaces
    _abstract = a.abstract.replace('\n', ' ')
    render_dict = {
        'title': a.title,
        'abstract': _abstract.split(' '),
        'keyword': keyword,
        'pos': pos
    }
    return render(request, 'search_engine/show_an_article.html', render_dict)
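# For reference, a minimal sketch of the URLconf entry this view assumes;
# the route pattern and name are assumptions, not taken from the project:
#
#     from django.urls import path
#     from . import views
#
#     urlpatterns = [
#         path('article/<int:pk>/<str:keyword>/', views.show_pk_article,
#              name='show_pk_article'),
#     ]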
def create_revindex(request):
    articles = Article.objects.all()
    # build the reverse index: one Word row per (stem, position) pair,
    # linked back to its source article through the many-to-many field
    for article in articles:
        for token in string_to_tokens(article.abstract):
            if token[0] != 'nan':
                w = Word(context=token[0], pos_in_a_article=token[1])
                w.save()
                w.position.add(article)
    return HttpResponse('Reverse index created')
def create_stem_freq(request):
    articles = Article.objects.all()
    word_freq = {}
    for article in articles:
        for token in string_to_tokens(article.abstract):
            if token[0] != 'nan':
                if token[0] in word_freq:
                    word_freq[token[0]] += 1
                else:
                    # the first occurrence counts as 1
                    word_freq[token[0]] = 1
    # sort the dict by frequency, descending
    word_freq = {
        k: v
        for k, v in sorted(
            word_freq.items(), key=lambda item: item[1], reverse=True)
    }
    # save to database
    for word, freq in word_freq.items():
        StemFreq(word=word, frequency=freq).save()
    return HttpResponse('StemFreq created')
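# A minimal sketch of the same count using collections.Counter, assuming
# string_to_tokens yields (stem, position) pairs as above; the helper name
# is hypothetical and nothing in these views calls it.
def _stem_frequencies(articles):
    from collections import Counter
    counts = Counter(
        token[0]
        for article in articles
        for token in string_to_tokens(article.abstract)
        if token[0] != 'nan')
    # most_common() already yields (word, freq) pairs sorted by frequency
    return dict(counts.most_common())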
def get_subset(keyword):
    from search_engine.parsing_utils import string_to_tokens
    corrected_keyword = keyword
    # query for the data subset
    keywords_cleaned = string_to_tokens(corrected_keyword)
    articles_pk = []
    # get the target articles' pks
    for token in keywords_cleaned:
        for w in Word.objects.filter(context=token[0]):
            article = w.position.get()
            if article.pk not in articles_pk:
                articles_pk.append(article.pk)
    words_freq = {}
    words = []
    # get the target articles' words
    for pk in articles_pk:
        for w in Word.objects.filter(position__id=pk):
            if w.context not in words_freq:
                words_freq[w.context] = 1
            else:
                words_freq[w.context] += 1
            words.append(w.context)
    # sort the dict by frequency, descending
    words_freq = {
        k: v
        for k, v in sorted(
            words_freq.items(), key=lambda item: item[1], reverse=True)
    }
    return words_freq, words
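# Usage sketch for get_subset (the keyword is hypothetical):
#
#     words_freq, words = get_subset('vaccine')
#     top10 = list(words_freq.items())[:10]  # ten most frequent stems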
def stemmed(keyword):
    from search_engine.parsing_utils import string_to_tokens
    keywords_cleaned = string_to_tokens(keyword)
    # return the stem of the first token; None if nothing was tokenized
    if keywords_cleaned:
        return keywords_cleaned[0][0]
    return None
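# Example (the output is an assumption, based on the 'vaccin' stem used in
# zipf_ct below, which suggests a Porter-style stemmer):
#
#     stemmed('vaccines')  # -> 'vaccin'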
def show_articles(request, first=False):
    # POST => process the form data from the user
    if not first and request.method == 'POST':
        form = WordForm(request.POST)
        if form.is_valid():
            # parse and clean (stemming, etc.) the keywords
            keywords_cleaned = string_to_tokens(form.cleaned_data['keywords'])
            # the doc index of an article is (pk - pk of the first article)
            article_firstpk = Article.objects.all()[0].pk
            # search the keywords in the db (Model => Word)
            all_words = []
            for token in keywords_cleaned:
                # retrieve every Word whose stem matches
                for w in Word.objects.filter(context=token[0]):
                    temp = {}
                    temp['word'] = w.context  # keyword name
                    temp['pos'] = w.pos_in_a_article  # position in the doc
                    # look up which doc it is in via the many-to-many table
                    q_art = w.position.get()
                    temp['docs'] = q_art.pk - article_firstpk
                    all_words.append(temp)

            # now we render the show page
            all_articles = Article.objects.all()
            len_article = len(all_articles)  # number of articles
            # newlines in the abstracts break the template, so replace them
            arts = [a.abstract.replace('\n', ' ').split(' ')
                    for a in all_articles]  # articles broken into words

            # count sentences
            tot_sc = 0
            sep_sc = []
            for a in all_articles:
                tot_sc += count_sent(a.abstract)
                sep_sc.append(count_sent(a.abstract))

            # count words
            tot_words = 0
            len_sep_words = []
            for art_words in arts:
                tot_words += len(art_words)
                len_sep_words.append(len(art_words))

            # which docs have keywords
            # format: [doc1, doc2, doc3] (type int)
            key_docs = []
            for w in all_words:
                if w['docs'] not in key_docs:
                    key_docs.append(w['docs'])

            # where the keywords are in the corresponding doc
            # format: {doc1: [pos1, pos2], doc2: [pos3, pos4]} (type int)
            key_pos = {}
            for w in all_words:
                if w['docs'] not in key_pos:
                    key_pos[w['docs']] = [w['pos']]
                else:
                    key_pos[w['docs']].append(w['pos'])

            form = WordForm()
            # structure of the context:
            #   'articles' : a list of articles to show on the template
            #   'keylines' : a list of which lines have words to highlight
            #   'keywords' : a dict mapping each line (int) to the list of
            #                word positions to highlight in it
            words = {
                'form': form,
                'len_article': len_article,
                'tot_sc': tot_sc,
                'sep_sc': sep_sc,
                'len_words': tot_words,
                'sep_words': len_sep_words,
                'articles': arts,
                'num_result': len(all_words),
                'keylines': key_docs,
                'keywords': key_pos
            }
            return render(request, 'search_engine/show_articles.html', words)

    # GET (or an invalid form) => show all articles with a blank form
    form = WordForm()
    all_articles = Article.objects.all()
    len_article = len(all_articles)  # number of articles
    arts = [a.abstract.split(' ') for a in all_articles]  # articles broken into words

    # count sentences
    tot_sc = 0
    sep_sc = []
    for a in all_articles:
        tot_sc += count_sent(a.abstract)
        sep_sc.append(count_sent(a.abstract))

    # count words
    tot_words = 0
    len_sep_words = []
    for art_words in arts:
        tot_words += len(art_words)
        len_sep_words.append(len(art_words))

    words = {
        'form': form,
        'articles': arts,
        'tot_sc': tot_sc,
        'sep_sc': sep_sc,
        'len_article': len_article,
        'len_words': tot_words,
        'sep_words': len_sep_words
    }
    return render(request, 'search_engine/show_articles.html', words)
def get_keywords(request):
    # POST => process the form data from the user
    if request.method == 'POST':
        form = WordForm(request.POST)
        if form.is_valid():
            # parse and clean (stemming, etc.) the keywords
            keywords_cleaned = string_to_tokens(form.cleaned_data['keywords'])
            # the doc index of an article is (pk - pk of the first article)
            article_firstpk = Article.objects.all()[0].pk
            # search the keywords in the db (Model => Word)
            all_words = []
            for token in keywords_cleaned:
                # retrieve every Word whose stem matches
                for w in Word.objects.filter(context=token[0]):
                    temp = {}
                    temp['word'] = w.context  # keyword name
                    temp['pos'] = w.pos_in_a_article  # position in the doc
                    # look up which doc it is in via the many-to-many table
                    q_art = w.position.get()
                    temp['docs'] = q_art.pk - article_firstpk
                    all_words.append(temp)

            # now we render the show page
            all_articles = Article.objects.all()

            # which docs have keywords
            # format: [doc1, doc2, doc3] (type int)
            key_docs = []
            for w in all_words:
                if w['docs'] not in key_docs:
                    key_docs.append(w['docs'])

            # where the keywords are in the corresponding doc
            # format: {doc1: [pos1, pos2], doc2: [pos3, pos4]} (type int)
            key_pos = {}
            for w in all_words:
                if w['docs'] not in key_pos:
                    key_pos[w['docs']] = [w['pos']]
                else:
                    key_pos[w['docs']].append(w['pos'])

            words = {
                'articles': [a.abstract.split(' ') for a in all_articles],
                'num_result': len(all_words),
                'keylines': key_docs,
                'keywords': key_pos
            }
            return HttpResponse(
                loader.get_template('search_engine/show_articles.html').render(
                    words, request))

    # GET (or an invalid form) => create a blank form
    form = WordForm()
    return render(request, 'search_engine/search_page.html', {'form': form})
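# The (pk - first_pk) doc-index trick above assumes article pks are
# contiguous and ascending. A sketch of the same lookup without that
# assumption (the helper name is hypothetical, nothing here calls it):
def _doc_index(article):
    pks = list(
        Article.objects.order_by('pk').values_list('pk', flat=True))
    return pks.index(article.pk)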
def zipf_ct(request):
    keyword = 'ct'
    corrected_keywords = [
        'mask', 'children', '2003', '2019', 'pneumothorax', 'pneumonia',
        'image', 'adult', 'bat', 'immune', 'wuhan', 'vaccin'
    ]
    return_dict = {}
    # build one data subset per predefined keyword
    for ck in corrected_keywords:
        keywords_cleaned = string_to_tokens(ck)
        articles_pk = []
        # get the target articles' pks
        for w in Word.objects.filter(context=keywords_cleaned[0][0]):
            article = w.position.get()
            if article.pk not in articles_pk:
                articles_pk.append(article.pk)
        words = {}
        # get the target articles' words
        for pk in articles_pk:
            for w in Word.objects.filter(position__id=pk):
                if w.context not in words:
                    words[w.context] = 1
                else:
                    words[w.context] += 1
        # sort the dict by frequency, descending
        words = {
            k: v
            for k, v in sorted(
                words.items(), key=lambda item: item[1], reverse=True)
        }
        _words = list(words.keys())
        _freq = list(words.values())
        # rank of the keyword 'ct' inside this subset (0 if not found)
        keyword_rank = 0
        for rank, stem in enumerate(_words):
            if stem == keyword:
                keyword_rank = rank
        return_dict['{}_words'.format(ck)] = _words
        return_dict['{}_freq'.format(ck)] = _freq
        # note: despite its name, this key stores the number of distinct stems
        return_dict['{}_subset_article_num'.format(ck)] = len(_words)
        return_dict['{}_rank'.format(ck)] = keyword_rank
    return render(request, 'search_engine/chart_ct.html', return_dict)
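# A minimal sketch of the Zipf's-law reference curve these charts are named
# after: the word at rank r is expected to occur roughly f(1) / r times.
# The helper name is an assumption; the templates may compute this client-side.
def _zipf_expected(freqs):
    # freqs must already be sorted in descending order
    if not freqs:
        return []
    return [freqs[0] / rank for rank in range(1, len(freqs) + 1)]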
def zipf_search(request, subset=0):
    # for spelling correction
    from spellchecker import SpellChecker
    spell = SpellChecker()
    form = WordForm()
    corrected_keyword = ''
    search_title = ''
    # user search
    if request.method == 'POST':
        form = WordForm(request.POST)
        # get the search word
        if form.is_valid():
            # parse and clean (stemming, etc.) the keywords
            origin_keyword = form.cleaned_data['keywords']
            misspelled = spell.unknown([origin_keyword])
            if misspelled:
                for w in misspelled:
                    corrected_keyword = spell.correction(w)
                    search_title = ('Did you mean "{0}"? '
                                    'Showing search results for {0}'.format(
                                        corrected_keyword))
            else:
                corrected_keyword = origin_keyword
                search_title = 'Search results for {}'.format(origin_keyword)
    # predefined subsets
    elif subset == 0:
        corrected_keyword = 'mask'
        search_title = 'Search results for mask'
    elif subset == 1:
        corrected_keyword = '2005'
        search_title = 'Search results for 2005'
    elif subset == 2:
        corrected_keyword = 'children'
        search_title = 'Search results for children'

    # query for the data subset
    keywords_cleaned = string_to_tokens(corrected_keyword)
    titles = {}  # target articles' titles, counted once per keyword hit
    articles_pk = []  # target articles' pks
    articles_raw_words = {}  # raw (unstemmed) words of the target abstracts
    for token in keywords_cleaned:
        for w in Word.objects.filter(context=token[0]):
            article = w.position.get()
            title = article.title
            # append the article pk
            if article.pk not in articles_pk:
                articles_pk.append(article.pk)
            # count the raw abstract words
            for raw in article.abstract.split(' '):
                if raw not in articles_raw_words:
                    articles_raw_words[raw] = 1
                else:
                    articles_raw_words[raw] += 1
            # count the article title
            if title not in titles:
                titles[title] = 1
            else:
                titles[title] += 1
    # sort both dicts by frequency, descending
    titles = {
        k: v
        for k, v in sorted(
            titles.items(), key=lambda item: item[1], reverse=True)
    }
    articles_raw_words = {
        k: v
        for k, v in sorted(
            articles_raw_words.items(), key=lambda item: item[1], reverse=True)
    }
    titles_name = list(titles.keys())
    titles_freq = list(titles.values())
    # rebuild articles_pk in sorted-title order
    articles_pk = []
    for name in titles_name:
        articles_pk.append(Article.objects.filter(title=name)[0].pk)

    words = {}
    # get the target articles' words
    for pk in articles_pk:
        for w in Word.objects.filter(position__id=pk):
            if w.context not in words:
                words[w.context] = 1
            else:
                words[w.context] += 1
    # sort the dict by frequency, descending
    words = {
        k: v
        for k, v in sorted(
            words.items(), key=lambda item: item[1], reverse=True)
    }

    chart_title = str(corrected_keyword)
    # stemmed words
    top100_words = list(words.keys())[:100]
    freq = list(words.values())
    # rank of each top stem in the whole stemmed COVID-19 set
    # (StemFreq rows were saved in descending frequency order, so
    # pk - first_pk is the rank)
    stem_covid_set_freq = []
    stem_covid_set_rank = []
    stem_covid_set_first_pk = StemFreq.objects.all()[0].pk
    for stem in top100_words:
        sf = StemFreq.objects.get(word=stem)
        stem_covid_set_freq.append(sf.frequency)
        stem_covid_set_rank.append(sf.pk - stem_covid_set_first_pk)
    # original (unstemmed) words
    top100_words_raw = list(articles_raw_words.keys())[:100]
    freq_raw = list(articles_raw_words.values())
    # rank of each top raw word in the whole original COVID-19 set
    origin_covid_set_freq = []
    origin_covid_set_rank = []
    origin_covid_set_first_pk = OriginFreq.objects.all()[0].pk
    for raw in top100_words_raw:
        of = OriginFreq.objects.get(word=raw)
        origin_covid_set_freq.append(of.frequency)
        origin_covid_set_rank.append(of.pk - origin_covid_set_first_pk)

    return_dict = {
        'search_title': search_title,
        'titles_name': titles_name,
        'titles_freq': titles_freq,
        'chart_title': chart_title,
        'form': form,
        'top_words': top100_words,
        'freq': freq,
        'top_words_raw': top100_words_raw,
        'freq_raw': freq_raw,
        'stem_covid_set_freq': stem_covid_set_freq,
        'stem_covid_set_rank': stem_covid_set_rank,
        'origin_covid_set_freq': origin_covid_set_freq,
        'origin_covid_set_rank': origin_covid_set_rank,
        'article_pk': articles_pk
    }
    return render(request, 'search_engine/chart_search.html', return_dict)