# Assumes the enclosing module imports: re, sys, time, pandas as pd, plus the
# project-local helpers s_utils and cons; self.log is the class logger.
def info(self):
     information = {
         "title": [],
         "article_url": [],
         "type": [],
         "publish_time": [],
         "institution": [],
         "author": [],
         "content": []
     }
     for i in range(1, self._get_pages() + 1):
         bsObj = s_utils.conn_get(cons.SIAN_REPORT_URL + self.yesterday +
                                  "&p=" + str(i))
         contents = bsObj.find("div", {
             "class": "main"
         }).find("table").find_all("tr")[2:]
         for content in contents:
             article = content.find_all("td")
             article_info = article[1].find("a")
             article_url = article_info.attrs["href"]
             information["title"].append(
                 article_info.attrs["title"].encode('latin1').decode(
                     'gb2312', 'ignore'))
             information["article_url"].append(article_url)
             information["type"].append(
                 article[2].text.encode('latin1').decode(
                     'gb2312', 'ignore'))
             information["publish_time"].append(self.yesterday)
             information["institution"].append(article[4].find("a").find(
                 "div").find("span").text.encode('latin1').decode(
                     'gb2312', 'ignore'))
             information["author"].append(article[5].find("div").find(
                 "span").text.encode('latin1').decode('gb2312', 'ignore'))
             try:
                 content_bs = s_utils.conn_get(article_url)
                 content_text = content_bs.find("div", {
                     "class": "blk_container"
                 }).find("p").text
                 content_text = content_text.encode('latin1').decode(
                     'gb2312', 'ignore')
                 content_text = re.sub(r"\n+", "\n", content_text)
                 content_text = re.sub(r" +", " ", content_text)
                 information["content"].append(content_text)
             except Exception as e:
                 self.log.info("\n{}".format(e))
                 information["content"].append("")
         self.log.info("the {} page scrapy successful..".format(i))
         time.sleep(0.01)
     df = pd.DataFrame(information,
                       columns=[
                           "title", "article_url", "type", "publish_time",
                           "institution", "author", "content"
                       ])
     return df
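A note on the repeated encode('latin1').decode('gb2312', 'ignore') chain above: it repairs text that was mis-decoded upstream, presumably because s_utils.conn_get falls back to a Latin-1 decode for pages that are really GB2312. A minimal standalone sketch of that round-trip, using made-up sample text:

# Latin-1 maps every byte value to a character, so the original GB2312 bytes
# survive the wrong decode and can be re-encoded and decoded correctly.
raw = "宏观研究".encode("gb2312")   # bytes as the site serves them
mojibake = raw.decode("latin1")     # what a Latin-1 decode yields
recovered = mojibake.encode("latin1").decode("gb2312", "ignore")
assert recovered == "宏观研究"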
Example #2
def _url_for_pdf(self, url, retry=10):
     # Pull the PDF link out of the detail-page header. Re-fetch on every
     # attempt so a transient failure can actually recover; return None once
     # every retry has been used up.
     for i in range(retry):
         try:
             bs_obj = s_utils.conn_get(url)
             pdf_url = bs_obj.find("div", {"class": "detail-header"}).find(
                 "h1").find("span").find("a").attrs["href"]
             return pdf_url
         except Exception as e:
             self.log.info(e)
             time.sleep(0.01)
     return None
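# A hypothetical standalone helper (not in the original class) capturing the
# retry shape that _url_for_pdf above and _get_pages below both hand-roll:
# call `fetch` up to `retry` times, pause briefly between attempts, and fall
# back to `default` once every attempt has failed.
import time

def retry_call(fetch, retry=10, delay=0.01, default=None):
    for _ in range(retry):
        try:
            return fetch()
        except Exception:
            time.sleep(delay)
    return default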
def _get_pages(self, retry=3):
     # The last "pagebox_next" span links to the final page; its onclick
     # handler embeds the total page count as the first run of digits.
     for i in range(retry):
         try:
             bsObj = s_utils.conn_get(cons.SIAN_REPORT_URL + self.yesterday)
             pagebox = bsObj.find("div", {"class": "page"}).find("tr").find(
                 "td").find("div", {"class": "pagebox"})
             onclick = pagebox.find_all(
                 "span", {"class": "pagebox_next"})[-1].find(
                     "a").attrs["onclick"]
             return int(re.search(r"(\d+)", onclick).group())
         except Exception as e:
             if i != retry - 1:
                 self.log.info(e)
             else:
                 self.log.info(
                     "==========>No data day:{}<==========".format(
                         self.yesterday))
                 sys.exit()
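# For reference, the onclick parse above boils down to grabbing the first run
# of digits in the handler string; a self-contained sketch with a made-up
# attribute value (the real payload's shape is an assumption):
import re

onclick = "set_page_num('', '37', '')"   # hypothetical onclick payload
assert int(re.search(r"(\d+)", onclick).group()) == 37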
def info(self):
     # Incremental variant: walk today's listing pages and stop once the
     # newest article already stored in the database is reached.
     information = {
         "title": [],
         "article_url": [],
         "type": [],
         "publish_time": [],
         "institution": [],
         "author": [],
         "content": []
     }
     first_record = s_utils.get_first_info(cons.get_first_sina_report,
                                           cons.research_report_table_name,
                                           column_name='article_url')
     page = 1
     found_first = False
     while True:
         bsObj = s_utils.conn_get(cons.SIAN_REPORT_URL + self.today +
                                  "&p=" + str(page))
         contents = bsObj.find("div", {
             "class": "main"
         }).find("table").find_all("tr")[2:]
         for content in contents:
             article = content.find_all("td")
             article_info = article[1].find("a")
             article_url = article_info.attrs["href"]
             if article_url != first_record:
                 information["title"].append(
                     article_info.attrs["title"].encode('latin1').decode(
                         'gb2312', 'ignore'))
                 information["article_url"].append(article_url)
                 information["type"].append(
                     article[2].text.encode('latin1').decode(
                         'gb2312', 'ignore'))
                 information["publish_time"].append(self.today)
                 information["institution"].append(
                     article[4].find("a").find("div").find(
                         "span").text.encode('latin1').decode(
                             'gb2312', 'ignore'))
                 information["author"].append(article[5].find("div").find(
                     "span").text.encode('latin1').decode(
                         'gb2312', 'ignore'))
                 try:
                     content_bs = s_utils.conn_get(article_url)
                     content_text = content_bs.find("div", {
                         "class": "blk_container"
                     }).find("p").text
                     content_text = content_text.encode('latin1').decode(
                         'gb2312', 'ignore')
                     content_text = re.sub(r"\n+", "\n", content_text)
                     content_text = re.sub(r" +", " ", content_text)
                     information["content"].append(content_text)
                 except Exception as e:
                     self.log.info("\n{}".format(e))
                     information["content"].append("")
             else:
                 # Hit the newest record already stored; stop paging.
                 found_first = True
                 break
         time.sleep(0.01)
         # The original modulo check assumes a full listing page holds 40
         # rows, so a short page means the feed is exhausted; also stop once
         # first_record has been seen.
         if found_first or len(information['article_url']) % 40 != 0:
             break
         page += 1
     if len(information['article_url']) > 0:
         df = pd.DataFrame(information,
                           columns=[
                               "title", "article_url", "type",
                               "publish_time", "institution", "author",
                               "content"
                           ])
         return df
     else:
         self.log.info("No data now.")
         sys.exit()
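Stripped of the scraping, the incremental cut-off above amounts to taking items from a newest-first feed until the previously stored newest item reappears. A minimal sketch over plain lists (the function name and sample values are illustrative, not from the original):

def take_until(feed, first_record):
    # `feed` is newest-first; stop as soon as the stored record shows up.
    new_items = []
    for item in feed:
        if item == first_record:
            break
        new_items.append(item)
    return new_items

assert take_until(["u5", "u4", "u3", "u2"], "u3") == ["u5", "u4"]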