def search(request):
    """Handle a search request: expand the query, retrieve results, render the page.

    Query params:
        q: raw query string (defaults to "").
        p: 1-based page number (defaults to 1).
    """
    start = time.time()
    query = request.GET.get('q', '')
    original = query
    # Fall back to page 1 on a missing or non-numeric page parameter
    # instead of raising ValueError on e.g. ?p=abc.
    try:
        page = int(request.GET.get('p', '') or 1)
    except ValueError:
        page = 1

    # Query-expansion pipeline: spelling -> decomposition -> synonyms ->
    # decomposition again -> personal search history.
    query = checkSpelling(query)
    query = decompose(query, True)
    query = returnSynonyms(query)
    query = decompose(query, True)
    query = applySearchHistory(query)

    results = retrieveFromIndex(query, page)  # presumably (documents, total_hits) — matches uses below
    suggestions = createSuggestions(original, results[0])
    pages = ceil(results[1] / eventbook_settings.PAGE_SIZE)
    processtime = time.time() - start

    # Show a window of up to 11 page links centred on the current page.
    pagestart = max(1, page - 5)
    pageend = min(pages, page + 5)
    context = {
        'documents': results[0],
        'query': original,
        'extendedquery': query,
        'results': results[1],
        'page': page,
        'totalpages': pages,
        'pages': range(pagestart, pageend + 1),
        'prev': max(1, page - 1),
        'next': min(pages, page + 1),
        'processtime': round(processtime, 4),
        'suggestions': suggestions,
    }
    return render(request, 'querying/search.html', context)
def processAndSaveDoc(document):
    """Run the processing pipeline on *document* and persist it.

    No-op when *document* is falsy.
    """
    if not document:
        return
    document.description = decompose(document.description, False)
    document = multiLabelClassification(document)
    document = clusterDocument(document)
    document = findDuplicate(document)
    document.save()
def vcspace(docTexts):
    """Return pairwise similarity scores for the given texts.

    Builds a bag-of-words count vector per text and returns a flat list of
    len(docTexts)**2 scores in row-major order (text i vs text j at index
    i * len(docTexts) + j).

    NOTE(review): the "norm" used here is sqrt(sum of counts), not the
    Euclidean sqrt(sum of squared counts) — preserved from the original;
    it equals true cosine similarity only for 0/1 vectors. Confirm intent.
    Raises ZeroDivisionError when a text yields no tokens (unchanged).
    """
    doclists = []
    # token -> column index; dict lookup replaces the original's O(n)
    # tokenlist.index calls inside the counting loop.
    token_index = {}
    for text in docTexts:
        tokens = getTokensFromText(decompose(text, False))
        doclists.append(tokens)
        for token in tokens:
            if token not in token_index:
                token_index[token] = len(token_index)

    tokenNum = len(token_index)
    # One count-vector row per document.
    rowlists = [[0] * tokenNum for _ in doclists]
    for row, tokens in zip(rowlists, doclists):
        for token in tokens:
            # Every token is in token_index by construction, so no
            # membership test is needed (the original's check was always true).
            row[token_index[token]] += 1

    similarity = []
    for vec1 in rowlists:
        norm1 = sum(vec1) ** 0.5  # hoisted: invariant over the inner loop
        for vec2 in rowlists:
            norm2 = sum(vec2) ** 0.5
            dot = sum(a * b for a, b in zip(vec1, vec2))
            similarity.append(dot / norm1 / norm2)
    return similarity
def createSuggestions(query, documents):
    """Suggest query-expansion terms from the top retrieved documents.

    Args:
        query: the original query string.
        documents: iterable of tuples; each document[0] must provide
            getAllTokensAsText() (top retrieved documents).

    Returns:
        A list of at most 5 suggested terms, or None when no term scores
        high enough for expansion.
    """
    # --- build term-frequency rows: row 0 is the query, rows 1..n the docs ---
    doclists = [getTokensFromText(query)]
    # token -> column index; dict lookup replaces the original's O(n)
    # tokenlist.index calls. The vocabulary comes from the documents only.
    token_index = {}
    for document in documents:
        tokens = document[0].getAllTokensAsText()
        doclists.append(tokens)
        for token in tokens:
            if token not in token_index:
                token_index[token] = len(token_index)

    tokenlist = list(token_index)
    rowlists = [[0] * len(tokenlist) for _ in doclists]
    for row, tokens in zip(rowlists, doclists):
        for token in tokens:
            if token in token_index:  # query tokens may be absent from the docs
                row[token_index[token]] += 1

    # Rocchio-style score: full weight for query occurrences plus 0.75 per
    # occurrence in each retrieved document.  (The original loops used
    # range(..., len(...) - 1) and so skipped the last token column and the
    # last document row — off-by-one bugs, fixed here.)
    score = {}
    for i, token in enumerate(tokenlist):
        total = rowlists[0][i]
        for row in rowlists[1:]:
            total += 0.75 * row[i]
        score[token] = total
    rankwords = sorted(score.items(), key=lambda item: item[1], reverse=True)

    # --- load stopwords; `with` fixes the original's leaked file handle ---
    stop_words = []
    with open(eventbook_settings.PROJECT_ROOT + "common/SmartStoplist.txt") as stopwords:
        for line in stopwords:
            if line.strip()[0:1] != "#":
                stop_words.extend(line.split())  # may list several words per line

    # --- pick expansion candidates: score >= 1.5, not already in the original
    # query string, and not a stopword ---
    newrank = [
        word for word, word_score in rankwords
        if word_score >= 1.5 and word not in query and word not in stop_words
    ]
    # No candidates -> no expanded query.
    if not newrank:
        return None

    suggestions = []
    queryTokens = getTokensFromText(query)
    # Generate at most 5 suggestions.
    for word in newrank[:5]:
        suggestion = decompose(word, False)
        # Only keep non-empty suggestions that do not already occur inside
        # any query token.
        if suggestion and not any(suggestion in tok for tok in queryTokens):
            suggestions.append(suggestion)
    return suggestions
# NOTE(review): this is a byte-for-byte duplicate definition of vcspace above;
# being later in the module, it is the one that takes effect. Consider
# deleting one copy.
def vcspace(docTexts):
    """Return pairwise similarity scores for the given texts.

    Builds a bag-of-words count vector per text and returns a flat list of
    len(docTexts)**2 scores in row-major order (text i vs text j at index
    i * len(docTexts) + j).

    NOTE(review): the "norm" used here is sqrt(sum of counts), not the
    Euclidean sqrt(sum of squared counts) — preserved from the original;
    it equals true cosine similarity only for 0/1 vectors. Confirm intent.
    Raises ZeroDivisionError when a text yields no tokens (unchanged).
    """
    doclists = []
    # token -> column index; dict lookup replaces the original's O(n)
    # tokenlist.index calls inside the counting loop.
    token_index = {}
    for text in docTexts:
        tokens = getTokensFromText(decompose(text, False))
        doclists.append(tokens)
        for token in tokens:
            if token not in token_index:
                token_index[token] = len(token_index)

    tokenNum = len(token_index)
    # One count-vector row per document.
    rowlists = [[0] * tokenNum for _ in doclists]
    for row, tokens in zip(rowlists, doclists):
        for token in tokens:
            # Every token is in token_index by construction, so no
            # membership test is needed (the original's check was always true).
            row[token_index[token]] += 1

    similarity = []
    for vec1 in rowlists:
        norm1 = sum(vec1) ** 0.5  # hoisted: invariant over the inner loop
        for vec2 in rowlists:
            norm2 = sum(vec2) ** 0.5
            dot = sum(a * b for a, b in zip(vec1, vec2))
            similarity.append(dot / norm1 / norm2)
    return similarity