import os
import random
import time

from bs4 import BeautifulSoup


def get_plain_text(self, url):
    text = ''
    try:
        page_content = WebHelper.get_page_content_from_url(url)
        if page_content is None:
            print('[Error]', url)
            return ''
        page_content = page_content.decode('utf-8')
        soup = BeautifulSoup(page_content, 'lxml')
        # kill all script and style elements
        for script in soup(["script", "style"]):
            script.extract()
        text = soup.get_text()
        # break into lines and collapse whitespace within each
        lines = (' '.join(line.strip().split()) for line in text.splitlines())
        text = '\n'.join(lines)
        # drop empty lines
        text = os.linesep.join([s for s in text.splitlines() if s])
        # be polite to the server between fetches
        time.sleep(random.randint(1, 3))
    except KeyboardInterrupt:
        # KeyboardInterrupt is not a subclass of Exception in Python 3, so it
        # must be caught explicitly rather than via isinstance() inside
        # `except Exception`
        raise
    except Exception as e:
        print(e)
    return text
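# Every fragment in this file leans on WebHelper.get_page_content_from_url,
# which is not shown. Below is a minimal sketch of what such a helper might
# look like, assuming a stdlib urllib implementation that returns raw bytes
# or None on failure; the real WebHelper may differ.
import urllib.request


class WebHelper(object):

    @staticmethod
    def get_page_content_from_url(url, timeout=10):
        # Return the raw response bytes, or None when the fetch fails,
        # matching the None check in get_plain_text above.
        try:
            with urllib.request.urlopen(url, timeout=timeout) as response:
                return response.read()
        except OSError:  # URLError is a subclass of OSError
            return None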
@classmethod
def get_search_page_by_name(cls, name):
    """Get the HTML content of the search result page for the given name.

    :param name: name to be searched on the search engine
    :return: HTML content of the search page
    """
    name = str(name).replace(' ', '+')
    search_url = cls.__SEARCH_ROOT_URL__ + name
    return WebHelper.get_page_content_from_url(search_url)
def get_next_level_url(self):
    # do_compute should set self._prompt or self._next_level_url
    self.do_compute()
    if self._next_level_url is not None:
        return self._next_level_url
    if self._prompt is not None:
        self._next_level_url = WebHelper.join_url(self.url, self._prompt)
        return self._next_level_url
    print('do_compute should set at least one of [prompt, next_level_url]')
    return self.url
def __init__(self, url_or_result, need_authentication=False,
             user_name='huge', pass_word='file'):
    if isinstance(url_or_result, Result):
        self.url = url_or_result.url
        self.user = url_or_result.user
        self.password = url_or_result.password
        self.file_name = url_or_result.file
        if self.user is not None and self.password is not None:
            need_authentication = True
        self.result = url_or_result
    else:
        self.url = url_or_result
        self.user = user_name
        self.password = pass_word
        self.result = Result()
    if need_authentication:
        self.url, self.web_source = WebHelper.get_auth_url_content(
            self.url, self.user, self.password)
    else:
        self.url, self.web_source = WebHelper.get_final_url_content(self.url)
    self._prompt = None
    self._next_level_url = None
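# The two fragments above assume a subclass implements do_compute() and sets
# at least one of [_prompt, _next_level_url]. A hypothetical level is sketched
# below; the base-class name, regex, and prompt format are assumptions, not
# taken from the original code.
import re


class ExampleLevel(Level):

    def do_compute(self):
        # look for a hint like "the next level is: peak" in the page source
        match = re.search(r'the next level is: (\w+)', self.web_source)
        if match:
            self._prompt = match.group(1) + '.html'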
def meet_row_requirement(self, r):
    # collect run lengths of consecutive filled cells in row r
    row = [self.table[i][r] for i in range(self.width)]
    row_filled = []
    total = 0
    for i in range(self.width):
        if row[i] == 1:
            total += 1
        elif total != 0:
            row_filled.append(total)
            total = 0
    if total != 0:
        row_filled.append(total)
    # the row is satisfied when its run lengths match the clue
    return row_filled == self.column_bar[r]

def print_result(self):
    for i in range(self.height):
        print(''.join(self.filled if self.table[j][i] == 1 else self.unfilled
                      for j in range(self.width)))


if __name__ == '__main__':
    file_web_url = 'http://www.pythonchallenge.com/pc/rock/warmup.txt'
    content = WebHelper.get_auth_web_source(file_web_url, 'kohsamui',
                                            'thailand')
    sketch = Sketch(content)
    sketch.play_game()
import logging
from datetime import datetime

from elasticsearch import ElasticsearchException


class Fetcher(object):
    """API fetcher to complete tasks."""

    def __init__(self):
        self.api = APIHelper()
        self.data = DataManager()
        self.web = WebHelper()

    def fetch_single_topic(self, topic_id):
        topic_info = self.api.get_topic_info(topic_id)
        partial_member = self.data.member_of_topic(topic_info)
        stored_topic = self.data.find_topic(topic_id)
        new_topic = self.data.handle_topic(topic_info, topic_id)
        if stored_topic:
            new_topic['web_crawled'] = stored_topic['web_crawled']
        else:
            new_topic['web_crawled'] = datetime.fromtimestamp(0)
            new_topic['click'] = 0
            new_topic['favorite'] = 0
            new_topic['thank'] = 0
        self.data.update_topic_synced_state(new_topic['id'], False)
        self.data.upsert_topic(new_topic)
        self.data.upsert_member(partial_member)

    def fetch_new_topics(self):
        max_stored_topic_id = self.data.max_stored_topic_id
        topic_count = self.api.get_topic_count()
        if max_stored_topic_id >= topic_count:
            return
        for topic_id in range(max_stored_topic_id + 1, topic_count + 1):
            self.fetch_single_topic(topic_id)

    def fetch_replies_of_topic(self, topic_id):
        replies = self.api.get_replies(topic_id)
        if replies:
            self.data.update_topic_synced_state(topic_id, False)
            for reply in replies:
                partial_member = self.data.member_of_reply(reply)
                self.data.upsert_reply(self.data.handle_reply(reply, topic_id))
                self.data.upsert_member(partial_member)

    def fetch_new_replies(self):
        max_stored_topic_id = self.data.max_stored_topic_id
        max_stored_topic_id_of_reply = self.data.max_stored_topic_id_of_reply
        if max_stored_topic_id_of_reply == 0:
            need_refetch_max_topic = False
        else:
            topic = self.data.find_topic(max_stored_topic_id_of_reply)
            replies = self.data.find_all_replies(max_stored_topic_id_of_reply)
            need_refetch_max_topic = not (replies.count() == topic['replies'])
        for topic_id in range(
                max_stored_topic_id_of_reply + 1 - int(need_refetch_max_topic),
                max_stored_topic_id + 1):
            if self.data.find_topic(topic_id=topic_id):
                self.fetch_replies_of_topic(topic_id)

    def fetch_single_topic_extras(self, topic_id):
        def upsert_counts(click, favorite, thank):
            self.data.upsert_topic_extras(topic_id, click, favorite, thank)
            self.data.update_topic_synced_state(topic_id, False)
            count = self.data.update_topic_web_crawled(topic_id,
                                                       datetime.utcnow())
            logging.info('Update topic {0} extras, count {1}'.format(
                topic_id, count))

        web_extra = self.web.get_topic_extras(topic_id)
        if not web_extra:
            count = self.data.update_topic_web_crawled(topic_id,
                                                       datetime.utcnow())
            topic = self.data.find_topic(topic_id)
            if (not topic) or ('click' not in topic):
                upsert_counts(0, 0, 0)
            logging.info('Topic {0} extras is None, count {1}'.format(
                topic_id, count))
            return
        for index, postscript in enumerate(web_extra.subtle_list):
            postscript = self.data.handle_postscript(postscript, topic_id,
                                                     index + 1)
            self.data.upsert_postscript(postscript)
        upsert_counts(web_extra.click, web_extra.favorite, web_extra.thank)

    def fetch_all_topic_extras(self):
        while True:
            min_topic_id_need_postscript = self.data.min_topic_id_need_extras
            if min_topic_id_need_postscript > 0:
                try:
                    self.fetch_single_topic_extras(
                        min_topic_id_need_postscript)
                except ElasticsearchException as es_error:
                    raise es_error
                except Exception as e:
                    logging.error('Fetch single topic extras error: ' + str(e))
            else:
                break

    def fetch_all_nodes(self):
        nodes = self.api.get_all_nodes()
        if nodes:
            for node in nodes:
                node['crawled'] = datetime.utcnow()
                self.data.upsert_node(node)
            logging.info('Fetching all node, count: ' + str(len(nodes)))

    def fetch_new_members(self):
        site_stats = self.api.get_site_stats()
        max_stored_member_id = self.data.max_stored_member_id
        if site_stats:
            member_max = site_stats['member_max']
            for member_id in range(max_stored_member_id + 1, member_max + 1):
                new_member = self.api.get_member_info(member_id=member_id)
                if not new_member:
                    continue
                new_member['crawled'] = datetime.utcnow()
                self.data.upsert_member(new_member)
                logging.info('Upsert member {0}, id {1}'.format(
                    new_member['username'], new_member['id']))
        else:
            new_member = self.api.get_member_info(
                member_id=self.data.max_stored_member_id + 1)
            while new_member:
                new_member['crawled'] = datetime.utcnow()
                self.data.upsert_member(new_member)
                logging.info('Upsert member {0}, id {1}'.format(
                    new_member['username'], new_member['id']))
                new_member = self.api.get_member_info(
                    member_id=self.data.max_stored_member_id + 1)

    def fetch_stale_topics(self):
        stale_topics = self.data.stale_topics()
        if not stale_topics:
            return
        for topic in stale_topics:
            topic_id = topic['id']
            self.fetch_single_topic(topic_id)
            if (topic['web_crawled'] and topic['recrawl']
                    and topic['web_crawled'] < topic['recrawl']):
                self.fetch_single_topic_extras(topic_id)
            self.fetch_replies_of_topic(topic_id)

    def sync_topic_to_es(self):
        waiting_topics = self.data.not_synced_topics()
        if not waiting_topics:
            return
        for topic in waiting_topics:
            self.data.update_topic_synced_state(topic['id'], True)
            self.data.es_update_assembled_topic(topic)
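# A minimal driver for Fetcher. The call order, logging setup, and the choice
# to run every stage once are assumptions, not something the class prescribes.
import logging

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    fetcher = Fetcher()
    fetcher.fetch_all_nodes()
    fetcher.fetch_new_topics()
    fetcher.fetch_new_replies()
    fetcher.fetch_new_members()
    fetcher.fetch_all_topic_extras()
    fetcher.fetch_stale_topics()
    fetcher.sync_topic_to_es()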
from io import BytesIO

from PIL import Image


def create_image_from_web(img_url, user=None, password=None):
    if user is None or password is None:
        img_data = WebHelper.get_web_source(img_url)
    else:
        img_data = WebHelper.get_auth_web_source(img_url, user, password)
    # image bytes need BytesIO; StringIO only handles text in Python 3
    return Image.open(BytesIO(img_data))
def show_image_from_web(img_url, user=None, password=None):
    if user is None or password is None:
        # no credentials given, so use the unauthenticated fetch
        # (the original mistakenly called get_auth_web_source here)
        img_data = WebHelper.get_web_source(img_url)
    else:
        img_data = WebHelper.get_auth_web_source(img_url, user, password)
    ImageHelper.show_image_from_data(img_data)
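# Example use of the two image helpers above. The URL is illustrative only;
# the credentials simply reuse the pythonchallenge pair seen elsewhere in
# this file.
img = create_image_from_web(
    'http://www.pythonchallenge.com/pc/rock/beer1.jpg',
    user='kohsamui', password='thailand')
img.show()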
class GoogleHelper(SearchHelper):
    __parser__ = GooglePageHTMLParser
    __RESULT_DIR_PATH__ = '../google_result/'
    __SEARCH_ROOT_URL__ = 'https://www.google.com/search?hl=en&safe=off&q='


class BingHelper(SearchHelper):
    __parser__ = BingPageHTMLParser
    __RESULT_DIR_PATH__ = '../bing_result/'
    __SEARCH_ROOT_URL__ = 'https://cn.bing.com/search?q='


if __name__ == '__main__':
    # bing_result = GoogleHelper.get_google_search_page_by_name('jie tang mail')
    # resultFile = open('bing_result.html', 'w')
    # resultFile.write(bing_result)
    #
    # title_url_dict = GoogleHelper.get_google_items_from_search_page(bing_result)
    # for url, title in title_url_dict:
    #     print(url, title)
    content = WebHelper.get_page_content_from_url(
        'http://www.google.com/search?q=jie+tang+tsinghua+email')
    # the content comes back as bytes, so write in binary mode
    with open('bing_result.html', 'wb') as result:
        result.write(content)
    # proxy = urllib2.ProxyHandler({'http': 'http://*****:*****@tel.lc.ignw.net:25'})
    # auth = urllib2.HTTPBasicAuthHandler()
    # opener = urllib2.build_opener(proxy, auth, urllib2.HTTPHandler)
    # urllib2.install_opener(opener)
    # print('ready to open')
    # conn = urllib2.urlopen('http://www.google.com')
    # print(conn.read())
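# Hypothetical use of the search helpers above, tying them to the
# get_search_page_by_name classmethod from earlier. It assumes the method
# returns bytes (or None), as WebHelper.get_page_content_from_url does;
# the output file name is an assumption.
import os

page = BingHelper.get_search_page_by_name('jie tang mail')
if page is not None:
    os.makedirs(BingHelper.__RESULT_DIR_PATH__, exist_ok=True)
    out_path = os.path.join(BingHelper.__RESULT_DIR_PATH__,
                            'jie_tang_mail.html')
    with open(out_path, 'wb') as f:
        f.write(page)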