def search_by_query(query, cluster=-1, result=20, limit=100000):
    """Return the *result* sites closest to *query*, ordered by rank.

    The query is converted to a vector with ``average_word_embedding`` and
    compared with the embedding of every row returned by
    ``site_info_by_cluster(cluster, limit=limit)``. The *result* rows whose
    embedding differs least (smallest ``norm`` of the difference) are kept
    and then re-ordered by ascending rank.

    Rows are read positionally: ``row[0]`` is used as the URL, ``row[1]`` as
    the embedding and ``row[3]`` as the rank. A rank of ``-1`` is replaced
    by ``DB_DEFAULT_RANK``.

    Returns a list of ``{'url': ..., 'rank': ...}`` dicts.
    """
    query_vector = average_word_embedding(query)

    scored = []
    for row in site_info_by_cluster(cluster, limit=limit):
        distance = norm(row[1] - query_vector)
        if row[3] != -1:
            rank = row[3]
        else:
            rank = DB_DEFAULT_RANK
        scored.append((distance, row[0], rank))
    # Stable sort on the distance only, so ties keep their fetch order.
    scored.sort(key=lambda item: item[0])

    nearest = [{'url': url, 'rank': rank} for _, url, rank in scored[:result]]
    nearest.sort(key=lambda site: site['rank'])
    return nearest
def get_similar_sites(embd, cluster=-1, top=10, improve=True, limit=11):
    """Return the *top* site identifiers whose embedding is closest to *embd*.

    When *improve* is true and *cluster* is not ``-1``, the lookup is widened
    to the first *limit* entries of ``neigh[cluster]`` followed by *cluster*
    itself; otherwise *cluster* is passed through unchanged.

    Note: do not change the default value of *limit* unless necessary.

    Rows from ``globaldata.site_info_by_cluster`` are read positionally:
    ``row[0]`` is the value returned for each site and ``row[1]`` is its
    embedding. Results are ordered by increasing ``norm(row[1] - embd)``.
    """
    if cluster != -1 and improve:
        # Search the neighbouring clusters first, then the cluster itself.
        cluster = list(neigh[cluster][:limit]) + [cluster]

    ranked = []
    for row in globaldata.site_info_by_cluster(cluster):
        ranked.append((norm(row[1] - embd), row[0]))
    # Stable sort on the distance only, so ties keep their fetch order.
    ranked.sort(key=lambda pair: pair[0])

    return [site for _, site in ranked[:top]]
def search_by_domain(query, cluster=-1, results=50, limit=100000):
    """Return up to *results* sites whose domain contains a word of *query*.

    Every row returned by ``site_info_by_cluster(cluster, limit=limit)`` is
    checked: the network location (``netloc``) of ``row[0]`` is extracted
    with ``urlparse`` and the row is kept if any element of *query* is a
    substring of it. ``row[3]`` is used as the rank; a rank of ``-1`` is
    replaced by ``DB_DEFAULT_RANK``.

    Each matching site is reported once, even when several query words
    match its domain.

    *query* is iterated element by element.
    NOTE(review): this presumably expects a list of words -- a plain string
    would be matched character by character; confirm against callers.

    Returns a list of ``{'url': ..., 'rank': ...}`` dicts sorted by
    ascending rank.
    """
    domains = []
    for row in site_info_by_cluster(cluster, limit=limit):
        name = urlparse(row[0]).netloc
        # any() stops at the first hit, so a domain that matches several
        # query words is appended once instead of once per matching word.
        if any(word in name for word in query):
            domains.append({
                'url': row[0],
                'rank': row[3] if row[3] != -1 else DB_DEFAULT_RANK,
            })
    domains.sort(key=lambda x: x['rank'])
    return domains[0:results]
def getClusterInfo(cluster_no):
    """Return the keywords and the ten most central URLs of a cluster.

    *cluster_no* is the one-based cluster number used by the front end; it
    is converted to the zero-based index used by the database before any
    lookup.

    The sites of the cluster are ordered by ``norm`` of the difference
    between their embedding (``row[1]``) and the cluster centroid taken from
    ``kmeans.cluster_centers_``. The ten closest are kept and re-ordered by
    ascending rank (``row[3]``, with ``-1`` replaced by ``DB_DEFAULT_RANK``).

    Returns ``{'keywords': ..., 'urls': [...]}`` on success.

    If a ``sqlite3.Error`` is raised, the error is printed and the JSON
    string produced by ``json.dumps([])`` is returned instead. That type
    differs from the success value and is kept unchanged for backward
    compatibility with existing callers.
    """
    try:
        # The front end numbers clusters 1-100 but the database uses 0-99.
        cluster_no -= 1
        keywords = keywords_by_cluster(cluster_no)
        # cluster_no is already zero-based here. Subtracting one again would
        # select the previous cluster's centroid (and wrap to the last one
        # for the first cluster).
        centroid = kmeans.cluster_centers_[cluster_no]
        sites = sorted(
            [
                (
                    norm(row[1] - centroid),
                    row[0],
                    row[3] if row[3] != -1 else DB_DEFAULT_RANK,
                )
                for row in site_info_by_cluster(cluster_no)
            ],
            key=lambda x: x[0],
        )
        final = sorted(
            [{'url': v[1], 'rank': v[2]} for v in sites[:10]],
            key=lambda x: x['rank'],
        )
        only_urls = [site['url'] for site in final]
        return {'keywords': keywords, 'urls': only_urls}
    except sqlite3.Error as error:
        print('error fetching data from site_info', error)
        return json.dumps([])