示例#1
0
 def main(self):
     """Crawl every listing URL in self.url_list, page by page, and
     persist the collected articles to MySQL.

     For each template URL: fetch the first page (empty page suffix),
     parse the inline ``var maxPage = N;`` script line to learn how many
     pages exist, then walk pages max_page-1 down to 1 until self.flag
     signals that content older than the crawl window was reached.
     """
     for url in self.url_list:
         # First page: render the template with an empty page suffix.
         new_url = url.format(page='')
         try:
             content = self.get_content(new_url)
         except Exception:
             logger.debug(traceback.format_exc())
             continue
         # Extract max page from the "var maxPage = N;" line in the HTML.
         max_page = 0
         for line in content.split('\n'):
             if 'var maxPage = ' in line:
                 start_index = line.find('=') + 1
                 # [start_index:-1] drops the trailing ';'; the last page
                 # crawled is max_page-1 because page 1 has no suffix.
                 max_page = int(line[start_index:-1].strip()) - 1
                 break
         try:
             # BUG FIX: the original passed the raw template `url`, which
             # still contains the literal "{page}" placeholder; the first
             # page must be fetched via the rendered `new_url`.
             self.detail_spider(new_url)
         except Exception:
             logger.debug(traceback.format_exc())
             continue
         while self.flag != 1 and max_page != 0:
             max_page_str = '_' + str(max_page)
             print(url.format(page=max_page_str))
             try:
                 self.detail_spider(url.format(page=max_page_str))
             except Exception:
                 # BUG FIX: the original did `continue` here, skipping the
                 # decrement below — a page that kept failing looped
                 # forever. Log and fall through to the decrement instead.
                 logger.debug(traceback.format_exc())
             max_page -= 1
         # Reset the stop flag before moving on to the next URL.
         self.flag = 0
     print(self.article_data_list)
     insert_news_to_mysql(self.article_data_list)
示例#2
0
 def pic_main(self):
     """Crawl picture-news listing pages for every URL in
     self.pic_url_list, advancing the page number until self.flag
     signals the crawl window was exhausted, then persist to MySQL.
     """
     for url in self.pic_url_list:
         page = 1
         while self.flag != 1:
             news_url = url.format(page=page)
             try:
                 self.pic_detail_spider(news_url)
             except Exception:
                 # BUG FIX: the original did `continue` here, which skipped
                 # the increment below — a persistently failing page was
                 # retried forever. Log and advance to the next page.
                 logger.debug(traceback.format_exc())
                 print(traceback.format_exc())
             page += 1
         # Reset the stop flag before moving on to the next URL.
         self.flag = 0
     insert_news_to_mysql(self.article_data_list)
示例#3
0
 def main(self):
     """Crawl page 1 of each Sina listing URL, then persist results.

     The original paging loop increments the page counter and breaks as
     soon as it reaches 2, so exactly one page (page=1) is fetched per
     URL unless the stop flag is already set — expressed here as a
     guard clause.
     """
     for list_url in self.url_list:
         if self.flag != 1:
             try:
                 self.sina(list_url.format(page=1))
             except Exception:
                 logger.debug(traceback.format_exc())
                 print(traceback.format_exc())
         self.flag = 0
     insert_news_to_mysql(self.article_data_list)
示例#4
0
 def pic_main(self):
     """Crawl the picture-news listing pages: for each linked article,
     extract title, body, publish time and images (re-uploaded to
     Aliyun OSS), stop once articles fall outside the crawl window,
     and persist everything to MySQL.
     """
     for url in self.pic_url_list:
         try:
             content = self.get_content(url)
         except Exception:
             logger.debug(traceback.format_exc())
             continue
         soup = BeautifulSoup(content)
         for data in soup.select("#item-list a"):
             tmp_dict = dict()
             news_url = data['href']
             try:
                 news_body = self.get_content(news_url)
             except Exception:
                 logger.debug(traceback.format_exc())
                 continue
             news_soup = BeautifulSoup(news_body)
             title = get_tag_html(news_soup, '#contentE h2')
             # Publish time is rendered as u'日期: YYYY-MM-DD'; strip the
             # label and normalise to a midnight timestamp.
             pub_time = get_tag_html(news_soup, '[class~=timt]')
             pub_time = pub_time.replace(u'日期:', '').strip()
             pub_timestamp = string_transform_timestamp(pub_time + ' 00:00:00')
             if pub_timestamp < self.start_timestamp:
                 # Reached articles older than the crawl window: stop
                 # scanning this listing URL.
                 self.flag = 1
                 break
             tmp_dict['title'] = title
             # Article body. NOTE(review): 'artile' is a typo, but it is a
             # runtime dict key that downstream consumers may depend on,
             # so it is kept as-is.
             tmp_dict['artile'] = get_tag_html(news_soup, '[class~=explain]')
             # Collect images and re-host them on Aliyun OSS.
             img_list = list()
             for img in news_soup.select("#picPlayerTab img"):
                 img_title = img['alt']
                 # Presumably strips a thumbnail 'st' marker from the URL.
                 # NOTE(review): replace() removes EVERY 'st' occurrence,
                 # which could corrupt hosts/paths containing 'st' — verify.
                 img_url = img['src'].replace('st', '')
                 status, msg, img_url = upload_img_to_oss2(img_url)
                 if status:
                     img_list.append([img_title, img_url])
             tmp_dict['img_list'] = img_list
             tmp_dict['source'] = news_url
             self.article_data_list.append(tmp_dict)
         # BUG FIX: reset the stop flag per URL, matching the other crawl
         # entry points in this file; the original never reset it, so a
         # stale flag == 1 short-circuited any later loop gated on
         # `while self.flag != 1`.
         self.flag = 0
     insert_news_to_mysql(self.article_data_list)