import sqlite3
import sys
import traceback

import numpy as np
import scipy.spatial.distance

# sent_vectorizer, get_all_vectors, checkUrlInDb, preprocess, get_text_content,
# modelg and kmeans are defined elsewhere in the project.


def search_by_query(query, cluster_no=-1, no_of_results=20):
    print(cluster_no)
    conn = sqlite3.connect("./server/database/web.db")
    cur = conn.cursor()
    query_vector = sent_vectorizer(query)
    urls = []
    ranks = []
    similarity = []
    # A cluster id outside 0..100 means "search the whole index".
    if cluster_no < 0 or cluster_no > 100:
        cur.execute("SELECT url,rank_d30 FROM global_data LIMIT 100000")
    else:
        cur.execute("SELECT url,rank_d30 FROM global_data WHERE cluster=?",
                    (str(cluster_no),))
    for row, vec in zip(cur, get_all_vectors(cluster_no)):
        try:
            # scipy returns cosine *distance*, so smaller means more similar.
            dist = scipy.spatial.distance.cosine(query_vector, vec)
        except Exception as e:
            # Skip malformed vectors instead of silently aborting the loop,
            # so urls/ranks/similarity stay aligned.
            print("search_by_query error:", e)
            continue
        urls.append(row[0])
        ranks.append(row[1])
        similarity.append(dist)
    print("fetching top sites...")
    rank_list = []
    # Ascending sort over cosine distance keeps the closest pages first.
    top_idx = np.argsort(similarity)[0:no_of_results]
    top_urls = [{"url": urls[i], "rank": ranks[i], "similarity": similarity[i]}
                for i in top_idx]
    for i in top_idx:
        if ranks[i] is not None:
            # Blend semantic distance and stored rank with equal weight.
            rank_list.append((0.5 * similarity[i]) + (0.5 * ranks[i]))
        else:
            # Unranked pages sink to the bottom of the final ordering.
            rank_list.append(sys.maxsize)
    final_idx = np.argsort(rank_list)
    final_rank = [{'url': top_urls[i]["url"], 'rank': top_urls[i]["rank"]}
                  for i in final_idx]
    conn.close()
    return final_rank
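
# search_by_query leans on sent_vectorizer to embed the query. The real
# implementation lives elsewhere in the project; the sketch below is only an
# assumption of what it does (mean-pooling gensim-style word vectors), and the
# _sketch-suffixed name is hypothetical.
def sent_vectorizer_sketch(tokens, model):
    # Average the vectors of tokens the model knows; unknown tokens are
    # dropped, and an all-unknown input yields a zero vector.
    known = [model[w] for w in tokens if w in model]
    if not known:
        return np.zeros(model.vector_size)
    return np.mean(known, axis=0)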
def search_by_query(query, cluster_no=None, no_of_results=20):
    # Colab variant: reads the updated database from Google Drive and ranks
    # against the one-day rank column instead of the thirty-day one.
    conn = sqlite3.connect(
        "/content/drive/My Drive/Colab Notebooks/web_update.db")
    cur = conn.cursor()
    query_vector = sent_vectorizer(query)
    urls = []
    ranks = []
    similarity = []
    embedding = get_all_vectors(cluster_no)
    if cluster_no is None:
        cur.execute("SELECT url,rank_d1 FROM global_data LIMIT 10000")
    else:
        cur.execute("SELECT url,rank_d1 FROM global_data WHERE cluster=?",
                    (str(cluster_no),))
    rows = cur.fetchall()
    # Zip rows with their vectors so the three lists stay aligned even when a
    # vector is malformed and has to be skipped.
    for row, vec in zip(rows, embedding):
        try:
            dist = scipy.spatial.distance.cosine(query_vector, vec)
        except Exception as e:
            print("error:", e)
            continue
        urls.append(row[0])
        ranks.append(row[1])
        similarity.append(dist)
    print("fetching top sites...")
    rank_list = []
    top_idx = np.argsort(similarity)[0:no_of_results]
    top_urls = [{"url": urls[i], "rank": ranks[i], "similarity": similarity[i]}
                for i in top_idx]
    for i in top_idx:
        if ranks[i] is not None:
            rank_list.append((0.5 * similarity[i]) + (0.5 * ranks[i]))
        else:
            rank_list.append(sys.maxsize)
    final_idx = np.argsort(rank_list)
    final_rank = [{'url': top_urls[i]["url"], 'rank': top_urls[i]["rank"]}
                  for i in final_idx]
    conn.close()
    return final_rank
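
def demo_search():
    # Hypothetical helper, not part of the project: shows the expected shape
    # of search_by_query's return value. The query string is illustrative,
    # and the call assumes the database file and embedding model are in place.
    results = search_by_query("open source search engines", no_of_results=10)
    for entry in results:
        print(entry["url"], entry["rank"])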
def giveUrlInfo(url):
    try:
        urlInfo = checkUrlInDb(url)
        if len(urlInfo['new_url_vector']) != 0:
            # The URL was crawled before: reuse the stored vector and cluster.
            print("url in database")
            cluster_no = urlInfo['cluster_no']
            new_url_vector = urlInfo['new_url_vector']
        else:
            # Unseen URL: fetch, vectorize and assign it to a k-means cluster.
            print("url not in database")
            content = preprocess(get_text_content(url))
            new_url_vector = sent_vectorizer(content, modelg)
            cluster_no = kmeans.predict([new_url_vector])[0]
            print("cluster", cluster_no)
        return {"cluster_no": cluster_no, "urlvector": new_url_vector}
    except Exception as e:
        # Callers must check the return type: the exception object itself is
        # returned on failure rather than re-raised.
        print('giveUrlInfo error', e)
        return e
def giveUrlInfo(url):
    # Variant that surfaces fetch failures to the caller via a status field.
    try:
        urlInfo = checkUrlInDb(url)
        if len(urlInfo['new_url_vector']) != 0:
            print("url in database")
            cluster_no = urlInfo['cluster_no']
            new_url_vector = urlInfo['new_url_vector']
        else:
            print("url not in database")
            content_dict = get_text_content(url)
            status = content_dict['status']
            print("status", status)
            if status == 'fail':
                # On a failed fetch, 'content' carries the HTTP status code.
                return {"status": "failure",
                        "status_code": content_dict['content']}
            content = content_dict['content'].split()
            new_url_vector = sent_vectorizer(content, modelg)
            cluster_no = kmeans.predict([new_url_vector])[0]
            print("cluster", cluster_no)
        return {"cluster_no": cluster_no,
                "urlvector": new_url_vector,
                "status": "200 ok"}
    except Exception as e:
        print('giveUrlInfo error', e)
        print(traceback.format_exc())
        return e
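
def demo_give_url_info():
    # Hypothetical helper, not part of the project: the URL is illustrative.
    # giveUrlInfo returns a result dict on success, a failure dict when the
    # fetch fails, or the raised exception object, so check before indexing.
    info = giveUrlInfo("https://example.com")
    if isinstance(info, dict) and info.get("status") == "200 ok":
        print("cluster:", info["cluster_no"])
    else:
        print("lookup failed:", info)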