import bs4
from urllib.parse import quote

# ProxyStruct, GoogleConst, GoogleCom, GoogleUtility, WebRequestCommonHeader and
# LinkChecker are project-internal helpers; they are assumed to be importable in
# this module's scope.


def get_search_results(keyword: str, page_number: int, proxy: ProxyStruct=None,
                       result_per_page: int=GoogleConst.Result100, timeout=5,
                       return_domain_home_only=True, use_forbidden_filter=True,
                       days_ago=0, additional_query_parameter: str="",
                       country_code="us", use_browser=False) -> list:
    """
    Generic search: get a list of domains from one results page.
    :param keyword: search term to query for.
    :param page_number: results page to fetch, must be > 0.
    :param result_per_page: results per page; must be a value from
        GoogleCom.get_result_per_page_range().
    :param timeout: request timeout in seconds.
    :param return_domain_home_only: return the root domain name if True,
        else return the protocol prefix + domain name.
    :param use_forbidden_filter: if True, drop domains flagged by
        LinkChecker.is_domain_forbidden().
    :param days_ago: only include results indexed within this many days.
    :return: a list of domain strings, or None if the request failed.
    """
    assert page_number > 0, "page number should be greater than 0."
    page_range = GoogleCom.get_result_per_page_range()
    assert result_per_page in page_range, \
        "result per page should be one of these values: " + str(page_range)
    sub_domain = "www"
    request_link = GoogleUtility.get_local_endpoint(country_code, sub_domain) \
        + GoogleConst.CommonSearchPath.format(quote(keyword), result_per_page,
                                              (page_number - 1) * result_per_page,
                                              country_code) \
        + additional_query_parameter + GoogleUtility.get_query_for_days(days_ago)
    try:
        user_agent = WebRequestCommonHeader.webpage_agent
        if not use_browser:
            response = GoogleCom._get_response(request_link, proxy=proxy,
                                               timeout=timeout, user_agent=user_agent)
            if response.status_code != 200:
                raise ConnectionRefusedError("error getting result, status code: "
                                             + str(response.status_code))
            result = response.text
        else:
            result = GoogleCom._get_response_browser(request_link, proxy=proxy,
                                                     timeout=timeout, user_agent=user_agent)
        # The original call omitted the parser argument, which triggers a warning
        # and picks an environment-dependent parser; name one explicitly.
        soup = bs4.BeautifulSoup(result, "html.parser")
        tags = soup.select(GoogleConst.SitePath)
        domains = []
        for tag in tags:
            try:
                domain = tag.text.strip().replace(" ", "")
                if return_domain_home_only:
                    domain = LinkChecker.get_root_domain(domain, use_www=False)[2]  # root domain only
                else:
                    domain = LinkChecker.get_root_domain(domain, use_www=False)[3]  # prefix + domain
                if use_forbidden_filter and LinkChecker.is_domain_forbidden(domain):
                    continue
                if len(domain) > 0:
                    domains.append(domain)
            except Exception:
                continue  # skip malformed entries rather than aborting the whole page
        return domains
    except Exception as ex:
        print(ex)
        return None
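
# --- Self-contained sketch of the parsing step (assumptions labeled) ---
# The function above leans on GoogleConst.SitePath and LinkChecker, which are
# project-internal. The snippet below reproduces the same technique with plain
# bs4 + urllib on stand-in HTML: select the cite/domain elements of a SERP-like
# page and reduce each to its root domain. The "cite" selector and the sample
# markup are assumptions, and the naive two-label reduction does not handle
# multi-part TLDs (e.g. ".co.uk") the way LinkChecker.get_root_domain presumably does.
import bs4
from urllib.parse import urlsplit

_sample_html = """
<div class="g"><cite>https://example.com/some/page</cite></div>
<div class="g"><cite>blog.example.org/post/1</cite></div>
"""

def _root_domain(text: str) -> str:
    if "//" not in text:                    # urlsplit only fills netloc when a scheme is present
        text = "http://" + text
    host = urlsplit(text).netloc.lower()
    return ".".join(host.split(".")[-2:])   # keep the last two labels only

_soup = bs4.BeautifulSoup(_sample_html, "html.parser")
print([_root_domain(tag.text.strip()) for tag in _soup.select("cite")])
# -> ['example.com', 'example.org']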
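
# --- Hypothetical call site (not part of the original module) ---
# Assumes the project-internal dependencies above are importable; every
# argument value below is a placeholder chosen for illustration.
if __name__ == "__main__":
    domains = get_search_results(
        keyword="python web scraping",
        page_number=1,                          # first results page
        result_per_page=GoogleConst.Result100,  # 100 results per page
        timeout=10,
        days_ago=30,                            # only results indexed in the last 30 days
        country_code="us",
    )
    if domains is None:
        print("request failed")                 # the function returns None on any error
    else:
        print(len(domains), "domains found:", domains[:10])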