Пример #1
0
def search_by_query(query, cluster_no=-1, no_of_results=20):
    """Return the URLs closest to *query*, re-ordered by a blended score.

    The query is vectorised with ``sent_vectorizer`` and compared, using
    cosine distance, against the vectors returned by
    ``get_all_vectors(cluster_no)``.  The ``no_of_results`` rows with the
    smallest distance are kept and then re-ordered by
    ``0.5 * distance + 0.5 * rank_d30`` (ascending).

    Args:
        query: Text passed unchanged to ``sent_vectorizer``.
        cluster_no: Cluster to restrict the search to.  Values below 0 or
            above 100 search the first 100000 rows of ``global_data``.
        no_of_results: Maximum number of candidates kept after the
            distance sort.

    Returns:
        A list of ``{'url': ..., 'rank': ...}`` dicts, best match first.
        The list is empty when nothing could be scored.

    Note:
        Rows and vectors are paired purely by position, so this assumes
        ``get_all_vectors`` yields vectors in the same order as the SQL
        result -- TODO confirm against ``get_all_vectors``.
    """
    print(cluster_no)
    query_vector = sent_vectorizer(query)
    urls = []
    ranks = []
    # Despite the name these are cosine *distances*: smaller is closer.
    similarity = []

    conn = sqlite3.connect("./server/database/web.db")
    try:
        cur = conn.cursor()
        if cluster_no < 0 or cluster_no > 100:
            cur.execute("SELECT url,rank_d30 FROM global_data LIMIT 100000")
        else:
            cur.execute(
                "SELECT url,rank_d30 FROM global_data where cluster=?",
                (str(cluster_no),),
            )

        # Scoring stays best-effort: a failure part-way through keeps the
        # rows scored so far, but the reason is reported instead of hidden.
        try:
            for row, v in zip(cur, get_all_vectors(cluster_no)):
                # Compute first so the three lists only grow together.
                distance = scipy.spatial.distance.cosine(query_vector, v)
                urls.append(row[0])
                ranks.append(row[1])
                similarity.append(distance)
        except Exception as exc:
            print("search_by_query: scoring stopped early:", exc)
    finally:
        # The connection used to be leaked on every call.
        conn.close()

    print("fetching top sites...")
    top_idx = np.argsort(similarity)[0:no_of_results]

    # Rows without a rank get the largest possible score so they sort last.
    rank_list = [
        (0.5 * similarity[i]) + (0.5 * ranks[i])
        if ranks[i] is not None
        else sys.maxsize
        for i in top_idx
    ]
    final_idx = np.argsort(rank_list)
    return [
        {'url': urls[top_idx[j]], 'rank': ranks[top_idx[j]]}
        for j in final_idx
    ]
Пример #2
0
def search_by_query(query, cluster_no=None, no_of_results=20):
    """Return the URLs closest to *query*, re-ordered by a blended score.

    The query is vectorised with ``sent_vectorizer`` and compared, using
    cosine distance, against ``get_all_vectors(cluster_no)``.  The
    ``no_of_results`` rows with the smallest distance are kept and then
    re-ordered by ``0.5 * distance + 0.5 * rank_d1`` (ascending).

    Args:
        query: Text passed unchanged to ``sent_vectorizer``.
        cluster_no: Cluster to restrict the search to, or ``None`` to use
            the first 10000 rows of ``global_data``.
        no_of_results: Maximum number of candidates kept after the
            distance sort.

    Returns:
        A list of ``{'url': ..., 'rank': ...}`` dicts, best match first.

    Note:
        Row ``k`` of the SQL result is scored against ``embedding[k]``, so
        this assumes both are in the same order -- TODO confirm against
        ``get_all_vectors``.
    """
    conn = sqlite3.connect(
        "/content/drive/My Drive/Colab Notebooks/web_update.db")
    try:
        cur = conn.cursor()
        if cluster_no is None:
            cur.execute("SELECT url,rank_d1 FROM global_data LIMIT 10000")
        else:
            cur.execute("SELECT url,rank_d1 FROM global_data where cluster=?",
                        (str(cluster_no), ))
        rows = cur.fetchall()
    finally:
        # All rows are already in memory; the connection used to be leaked.
        conn.close()

    query_vector = sent_vectorizer(query)
    embedding = get_all_vectors(cluster_no)

    urls = []
    ranks = []
    # Despite the name these are cosine *distances*: smaller is closer.
    similarity = []
    for k, row in enumerate(rows):
        # The index now advances on every row.  Previously a failure left
        # the counter stuck, so every later row retried the same vector.
        try:
            distance = scipy.spatial.distance.cosine(query_vector,
                                                     embedding[k])
        except Exception as exc:
            print("error", exc)
            continue
        # Append only scored rows so the three lists stay index-aligned.
        urls.append(row[0])
        ranks.append(row[1])
        similarity.append(distance)

    print("fetching top sites...")
    top_idx = np.argsort(similarity)[0:no_of_results]

    # Rows without a rank get the largest possible score so they sort last.
    rank_list = [
        (0.5 * similarity[i]) + (0.5 * ranks[i])
        if ranks[i] is not None
        else sys.maxsize
        for i in top_idx
    ]
    final_idx = np.argsort(rank_list)
    return [
        {'url': urls[top_idx[j]], 'rank': ranks[top_idx[j]]}
        for j in final_idx
    ]
Пример #3
0
def giveUrlInfo(url):
    """Look up or compute the cluster number and vector for *url*.

    ``checkUrlInDb(url)`` is consulted first.  If its ``'new_url_vector'``
    entry is non-empty, the stored cluster and vector are used.  Otherwise
    the page text is fetched, preprocessed and vectorised, and the cluster
    is predicted with ``kmeans``.

    Returns:
        ``{"cluster_no": ..., "urlvector": ...}`` on success.  On any
        exception the exception object itself is returned (not raised)
        after being printed.
    """
    try:
        record = checkUrlInDb(url)
        if len(record['new_url_vector']) == 0:
            # Nothing stored for this URL: derive the vector from the page.
            print("url not in database")
            page_text = preprocess(get_text_content(url))
            vector = sent_vectorizer(page_text, modelg)
            cluster = kmeans.predict([vector])[0]
            print("cluster", cluster)
        else:
            print("url in databse")
            cluster = record['cluster_no']
            vector = record['new_url_vector']
        return {"cluster_no": cluster, "urlvector": vector}
    except Exception as err:
        print('giveUrlInfo error', err)
        return err
Пример #4
0
def giveUrlInfo(url):
    """Look up or compute the cluster number and vector for *url*.

    ``checkUrlInDb(url)`` is consulted first.  If its ``'new_url_vector'``
    entry is non-empty, the stored cluster and vector are used.  Otherwise
    ``get_text_content(url)`` is called.  If its ``'status'`` is ``'fail'``
    a failure dict is returned.  If not, its ``'content'`` is split on
    whitespace and vectorised, and the cluster is predicted with ``kmeans``.

    Returns:
        * ``{"cluster_no", "urlvector", "status": "200 ok"}`` on success.
        * ``{"status": "failure", "status_code": <content>}`` when the
          fetch reports ``'fail'``.
        * The caught exception object (returned, not raised) on any other
          error, after printing it together with the traceback.
    """
    try:
        record = checkUrlInDb(url)
        if len(record['new_url_vector']) == 0:
            print("url not in database")
            fetched = get_text_content(url)
            print("check aman")
            fetch_status = fetched['status']
            print("status", fetch_status)
            # Bail out early when the page could not be retrieved.
            if fetch_status == 'fail':
                return {
                    "status": "failure",
                    "status_code": fetched['content'],
                }
            tokens = fetched['content'].split()
            vector = sent_vectorizer(tokens, modelg)
            cluster = kmeans.predict([vector])[0]
            print("cluster", cluster)
        else:
            print("url in databse")
            cluster = record['cluster_no']
            vector = record['new_url_vector']
        return {
            "cluster_no": cluster,
            "urlvector": vector,
            'status': "200 ok",
        }
    except Exception as err:
        print('giveUrlInfo error', err)
        print(traceback.format_exc())
        return err