import requests
from bs4 import BeautifulSoup


def get_right_url():
    # For each competitor site found on Google, keep the links whose href
    # mentions "domain" (ss and check_data_duplicate are project helpers).
    list_site = ss.search_google("Hébergement Web tunisie")  # "Web hosting Tunisia"
    list_name = ss.get_name(list_site)
    datas = ss.get_data(list_site, list_name)
    list_of_urls = []
    for xx in datas:
        try:
            getpage = requests.get(xx["url"], timeout=10)
            getpage_soup = BeautifulSoup(getpage.text, "html.parser")
            for link in getpage_soup.find_all("a"):
                href = link.get("href")
                # href can be None, so guard before the substring test.
                if href and "domain" in href:
                    doc = {"name_of_url": xx["name"], "the_right_url": href}
                    if not check_data_duplicate(list_of_urls, doc):
                        list_of_urls.append(doc)
                        print("-" * 40 + xx["name"] + "-" * 40)
        except requests.RequestException:
            # Skip sites that fail to load instead of aborting the whole crawl.
            continue
    return list_of_urls
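
# check_data_duplicate is called above but not defined in this file. A minimal
# sketch of a plausible implementation, assuming a record counts as a duplicate
# when its "the_right_url" is already present (the real helper may compare
# differently):
def check_data_duplicate(list_of_urls, doc):
    # True when an equivalent record is already in the list.
    return any(d["the_right_url"] == doc["the_right_url"] for d in list_of_urls)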

async def search(context):
    # Bot command: reply with the top Google results for the query embedded in
    # the message ("... <terms...> <count>", the last token being the count).
    try:
        args = str(context.message.content).split(' ')
        query = ' '.join(args[2:-1])
        results = search_google(query, int(args[-1]))
        for link in results:
            await context.send(link)
    except Exception as e:
        print(e)
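
# The coroutine above has the shape of a discord.py command. A minimal sketch
# of wiring it onto a bot, with a hypothetical prefix and placeholder token;
# the real project may register it differently. Reading message text requires
# the message_content intent in discord.py 2.x.
import discord
from discord.ext import commands

intents = discord.Intents.default()
intents.message_content = True
bot = commands.Bot(command_prefix='!', intents=intents)

# The command re-parses context.message.content itself and skips the first two
# tokens, so invocations look like "!search <word> <terms...> <count>".
bot.command(name='search')(search)
bot.run('YOUR_BOT_TOKEN')  # placeholder token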

def search_request(search, reddit, twitter, news):
    coms = ['n']  # seed entry so the comment list is never empty
    # tweets = s.search_twitter(search)
    urls = []
    # urls = ["https://www.reddit.com/r/SFGiants/", "https://www.reddit.com/r/Politics/"]
    # ^ a good way to test when out of Google searches

    # The site flags arrive from the client as the strings "true"/"false".
    if twitter == "true":
        coms += s.search_twitter(search)
    if reddit == "true":
        urls = s.search_google(search)
        coms += s.search_reddit(urls)
    if news == "true":
        coms += s.search_all_news(search)

    avg_sentiment, sample = s.analyze_text(coms, search)
    comment_length = len(coms)
    sites_searched = [reddit, twitter, news]
    word_count = s.word_count(coms)

    # Persist this search so later requests can chart sentiment over time.
    search_db_entry = UserSearch(
        search=search,
        urls=urls,
        avg_sentiment=avg_sentiment,
        word_count=word_count,
        comments=comment_length,
        sites=sites_searched,
    )
    db.session.add(search_db_entry)
    db.session.commit()

    # Sentiment history for this query, keyed and ordered by timestamp.
    query_history = UserSearch.query.filter_by(search=search.strip()).all()
    query_history_sentiment = {q.time: q.avg_sentiment for q in query_history}
    sorted_query_history = sorted(q.time for q in query_history)

    return {
        "urls": urls,
        "avg_sentiment": avg_sentiment,
        "word_count": word_count,
        "comments": comment_length,
        "sample": sample,
        "sites": sites_searched,
        "query_history": sorted_query_history,
        "query_history_sentiment": query_history_sentiment,
    }
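
# UserSearch and db come from the app's models, which are not shown here. A
# plausible minimal sketch with Flask-SQLAlchemy, assuming the list-valued
# columns are pickled and `time` defaults to the insertion timestamp (the
# real model may differ):
from datetime import datetime

from flask_sqlalchemy import SQLAlchemy

db = SQLAlchemy()

class UserSearch(db.Model):
    id = db.Column(db.Integer, primary_key=True)
    search = db.Column(db.String(256))
    urls = db.Column(db.PickleType)        # list of result URLs
    avg_sentiment = db.Column(db.Float)
    word_count = db.Column(db.PickleType)  # output of s.word_count
    comments = db.Column(db.Integer)       # number of comments analyzed
    sites = db.Column(db.PickleType)       # the "true"/"false" site flags
    time = db.Column(db.DateTime, default=datetime.utcnow)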
print("collection deleted") """ def static_data(name_collection,name_collection_static_data): print("hello") mycol = mydb[name_collection] mycol_static_data = mydb[name_collection_static_data] for d1 in mycol.find(): if(check_data_duplicate_db_by_name(d1,"data_static")==True): mydict={"name":d1["name"],"total":0} insert_document(mydict,"data_static") """ list_site = ss.search_google("Hébergement Web tunisie") list_name = ss.get_name(list_site) datas = ss.get_data(list_site, list_name) for xx in datas: the_last_id = show_the_last_id("application_conccurent") xx['id'] = the_last_id + 1 if (check_data_duplicate_db(xx, "application_conccurent") == False): insert_document(xx, "application_conccurent") """ ##insert_document_list(datas,"application_conccurent") ##the_last_id=show_the_last_id("application_conccurent") ##delete_all_doc_in_coll("application_conccurent")

def search(keywords):
    # Thin wrapper around the scraping module: search, name, then scrape.
    list_site = ss.search_google(keywords)
    list_name = ss.get_name(list_site)
    return ss.get_data(list_site, list_name)
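
# ss.search_google is the scraping module's entry point and is not included
# here. A minimal sketch of an equivalent, assuming the googlesearch-python
# package (the real module may fetch results differently):
from googlesearch import search as google_search

def search_google(query, num_results=10):
    # First `num_results` result URLs for `query`.
    return list(google_search(query, num_results=num_results))

# Hypothetical end-to-end usage of the wrapper above:
#   for record in search("Hébergement Web tunisie"):
#       print(record)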