def get_plain_text(self, url):
    """Fetch *url* and return its visible text content.

    Script and style elements are stripped, runs of whitespace inside each
    line are collapsed to single spaces, and blank lines are dropped.

    :param url: URL of the page to fetch
    :return: extracted plain text, or '' when the page cannot be fetched
    """
    text = ''
    try:
        page_content = WebHelper.get_page_content_from_url(url)
        if page_content is None:
            print('[Error]', url)
            return ''
        page_content = page_content.decode('utf-8')
        soup = BeautifulSoup(page_content, 'lxml')
        # Remove all script and style elements so only human-readable
        # text remains.
        for script in soup(["script", "style"]):
            script.extract()
        text = soup.get_text()
        # Strip each line and collapse internal whitespace runs to a
        # single space.
        lines = (' '.join(line.strip().split()) for line in text.splitlines())
        text = '\n'.join(lines)
        # Drop empty lines.
        text = os.linesep.join([s for s in text.splitlines() if s])
        # Crude rate limiting between successive requests.
        time.sleep(random.randint(1, 3))
    except KeyboardInterrupt:
        # BUG FIX: KeyboardInterrupt derives from BaseException, not
        # Exception, in Python 3, so the original
        # `isinstance(e, KeyboardInterrupt)` check inside
        # `except Exception` could never fire. Catch it explicitly and
        # exit, as the original code intended (exit() raises SystemExit).
        raise SystemExit
    except Exception as e:
        # Best-effort scraping: report the failure and fall through to
        # return whatever text was collected so far (usually '').
        print(e)
    return text
def get_search_page_by_name(cls, name):
    """Download the search-engine result page for *name*.

    :param name: query to search for; spaces are encoded as '+'
    :return: html content of the search result page
    """
    query = '+'.join(str(name).split(' '))
    search_url = cls.__SEARCH_ROOT_URL__ + query
    return WebHelper.get_page_content_from_url(search_url)
__RESULT_DIR_PATH__ = '../google_result/'
__SEARCH_ROOT_URL__ = 'https://www.google.com/search?hl=en&safe=off&q='
# NOTE(review): the two assignments above look like the tail of a Google
# helper class body that was flattened during extraction — confirm their
# indentation against the original file.


class BingHelper(SearchHelper):
    # Bing-specific configuration consumed by the shared SearchHelper
    # machinery.
    __parser__ = BingPageHTMLParser
    __RESULT_DIR_PATH__ = '../bing_result/'
    __SEARCH_ROOT_URL__ = 'https://cn.bing.com/search?q='


if __name__ == '__main__':
    # Ad-hoc smoke test: fetch one search page and dump it to disk.
    content = WebHelper.get_page_content_from_url(
        'http://www.google.com/search?q=jie+tang+tsinghua+email')
    # BUG FIX: get_page_content_from_url returns bytes (it is decoded with
    # .decode('utf-8') elsewhere in this file) and may return None on
    # failure. Writing bytes to a text-mode ('w') file raises TypeError in
    # Python 3, and the handle was never closed. Guard against None, open
    # in binary mode, and use a context manager.
    if content is None:
        print('[Error] failed to fetch search page')
    else:
        with open('bing_result.html', 'wb') as result:
            result.write(content)
__parser__ = GooglePageHTMLParser
__RESULT_DIR_PATH__ = '../google_result/'
__SEARCH_ROOT_URL__ = 'https://www.google.com/search?hl=en&safe=off&q='
# NOTE(review): this entire span duplicates the preceding chunk (Google
# helper attributes, BingHelper, and the __main__ script) — likely an
# extraction artifact; confirm against the original file and deduplicate.
# The three assignments above also look like a flattened class body —
# confirm their indentation against the original file.


class BingHelper(SearchHelper):
    # Bing-specific configuration consumed by the shared SearchHelper
    # machinery.
    __parser__ = BingPageHTMLParser
    __RESULT_DIR_PATH__ = '../bing_result/'
    __SEARCH_ROOT_URL__ = 'https://cn.bing.com/search?q='


if __name__ == '__main__':
    # Ad-hoc smoke test: fetch one search page and dump it to disk.
    content = WebHelper.get_page_content_from_url(
        'http://www.google.com/search?q=jie+tang+tsinghua+email')
    # BUG FIX: get_page_content_from_url returns bytes (it is decoded with
    # .decode('utf-8') elsewhere in this file) and may return None on
    # failure. Writing bytes to a text-mode ('w') file raises TypeError in
    # Python 3, and the handle was never closed. Guard against None, open
    # in binary mode, and use a context manager.
    if content is None:
        print('[Error] failed to fetch search page')
    else:
        with open('bing_result.html', 'wb') as result:
            result.write(content)