import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# vc (various_check), Page, and the crawler's queue are project-local
# helpers assumed to be importable alongside these methods.

def get_next_level_page(self, soup):
    '''extract links from a parsed page and enqueue the unvisited ones'''
    page_links = []
    for link in soup.find_all('a'):
        if not link.get('href'):
            continue
        # resolve relative hrefs against the current page's URL
        newlink = urljoin(self.page.url, link.get('href'))
        retlink = vc.various_check(newlink)
        if retlink:
            page_links.append(retlink)
    # deduplicate before enqueueing
    page_links = list(set(page_links))
    for link in page_links:
        if not self.check_duplication(link):
            page = Page(link, self.page.depth + 1, self.page.score,
                        ref=self.page.url)
            self.queue.en_queue(page)
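
# A minimal standalone sketch of the same link-extraction pattern, with the
# project-specific filter (vc.various_check) replaced by a simple scheme
# check. extract_links is a hypothetical name, not part of the crawler.
from urllib.parse import urlparse

def extract_links(base_url, html):
    # collect absolute, deduplicated http(s) links from raw HTML
    soup = BeautifulSoup(html, 'html.parser')
    links = set()
    for a in soup.find_all('a'):
        href = a.get('href')
        if not href:
            continue
        absolute = urljoin(base_url, href)
        if urlparse(absolute).scheme in ('http', 'https'):
            links.add(absolute)
    return list(links)

# e.g. extract_links('http://www.nyu.edu', '<a href="/about">About</a>')
# -> ['http://www.nyu.edu/about']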
def query(self):
    '''send a Bing search query and return the top 10 results as a list of URLs'''
    if self.fake:
        return ['http://engineering.nyu.edu', 'http://www.nyu.edu']
    my_referer = 'https://www.bing.com/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/44.0.2403.157 Safari/537.36',
        'referer': my_referer
    }
    query_string = self.make_query_string()
    params = {'q': query_string}
    # make the Bing HTTP request
    bing_url = "http://www.bing.com/search?"
    try:
        response = requests.get(bing_url, params=params,
                                headers=headers, timeout=0.5)
    except Exception:
        return
    # parse the page contents; Bing wraps result titles in <h2> tags
    data = response.text
    soup = BeautifulSoup(data, 'html.parser')
    ret = []
    for link in soup.find_all('h2'):
        for linka in link.find_all('a'):
            if linka.get('href'):
                newlink = urljoin(response.url, linka.get('href'))
                if newlink.startswith('http'):
                    retlink = vc.various_check(newlink)
                    if retlink:
                        ret.append(retlink)
    # return up to 10 deduplicated URLs
    return list(set(ret))[:10]
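
# A hedged sketch of the Bing parsing step in isolation: as scraped here,
# Bing wraps each organic result title in an <h2> whose child <a> carries
# the target URL. Parsing a canned snippet avoids a live request; the
# helper name and sample markup are illustrative, not captured from Bing.
def demo_parse_bing_results():
    sample = '''
    <h2><a href="http://engineering.nyu.edu">NYU Tandon</a></h2>
    <h2><a href="/maps">Maps</a></h2>
    '''
    soup = BeautifulSoup(sample, 'html.parser')
    urls = []
    for h2 in soup.find_all('h2'):
        for a in h2.find_all('a'):
            if a.get('href'):
                # resolve relative hrefs, then keep only http(s) links
                link = urljoin('http://www.bing.com/search', a.get('href'))
                if link.startswith('http'):
                    urls.append(link)
    return urls  # ['http://engineering.nyu.edu', 'http://www.bing.com/maps']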
def query(self):
    '''send a Google search query and return the top 10 results as a list of URLs'''
    if self.fake:
        return ['http://engineering.nyu.edu', 'http://www.nyu.edu']
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/44.0.2403.157 Safari/537.36',
        'referer': 'http://www.google.com'
    }
    query_string = self.make_query_string()
    params = {'q': query_string}
    # make the Google HTTP request
    google_url = "http://www.google.com/search?"
    try:
        response = requests.get(google_url, params=params,
                                headers=headers, timeout=0.5)
    except Exception:
        # flag the failure so callers can tell it apart from "no results"
        self.error = 1
        return
    # parse the page contents; Google wraps result titles in <h3> tags
    data = response.text
    soup = BeautifulSoup(data, 'html.parser')
    ret = []
    for link in soup.find_all('h3'):
        for linka in link.find_all('a'):
            # skip styled anchors, which are not organic results
            if not linka.get('style') and linka.get('href'):
                newlink = urljoin(response.url, linka.get('href'))
                retlink = vc.various_check(newlink)
                if retlink:
                    ret.append(retlink)
    # return up to 10 deduplicated URLs
    return list(set(ret))[:10]
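
# A parallel sketch for the Google parser: titles sit in <h3> tags, and
# anchors carrying an inline style attribute are skipped (presumably ads
# or widgets rather than organic results, in the Google markup this
# scraper targets). The helper name and sample HTML are assumptions.
def demo_parse_google_results():
    sample = '''
    <h3><a href="http://www.nyu.edu">NYU</a></h3>
    <h3><a href="/aclk" style="color:#777">Sponsored</a></h3>
    '''
    soup = BeautifulSoup(sample, 'html.parser')
    urls = []
    for h3 in soup.find_all('h3'):
        for a in h3.find_all('a'):
            # the style filter drops the sponsored link above
            if not a.get('style') and a.get('href'):
                urls.append(urljoin('http://www.google.com/search',
                                    a.get('href')))
    return urls  # ['http://www.nyu.edu']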