Example #1
 def zhuHai(self):
     url = urls[3]
     driver.get(url)
     if WebDriverWait(driver, 10, 1).until(
             EC.presence_of_element_located(
                 (By.CSS_SELECTOR, "[class='gl-con']"))):
         html = driver.page_source
         soup = bs4(html, 'lxml')
         temp = soup.find('div', class_="gl-con").find_all('li')
     print(len(temp))
     for i in temp:
         info = []
         # Drop the "· " bullet prefix in front of each list entry.
         name = i.get_text().replace('· ', '')
         # Skip district-level entries (names ending in '区').
         if name[-1] == '区':
             continue
         # Names starting with '市' lack the city prefix; prepend '珠海'.
         if name[0] == '市':
             name = '珠海' + name
         info.append(name)
         info.append('广东省')
         info.append('珠海')
         info.append(datetime.datetime.now().strftime('%Y-%m-%d'))
         info.append(url)
         sql = whereNotInsert.main(tablename, t_eventField, info, 'name',
                                   info[0])
         save2db().save2db(sql)
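All five examples lean on module-level names (req, bs4, driver, headers, urls, tablename, t_eventField, whereNotInsert, save2db) that are defined elsewhere in the project. A minimal sketch of the imports and setup they appear to assume; the concrete values below (headers, URL list, table name, column list) are placeholders, not the project's real ones:

import datetime
import re
import sys

import requests as req
from bs4 import BeautifulSoup as bs4
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Placeholder configuration; the real project supplies its own values.
headers = {'User-Agent': 'Mozilla/5.0'}
urls = ['...', '...', '...', '...', '...']  # target page URLs indexed by the methods above
driver = webdriver.Chrome()                 # shared Selenium driver
tablename = '...'                           # destination table name
t_eventField = ['name', 'province', 'city', 'date', 'url']  # assumed column order matching info[]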
Example #2
 def guangDong(self):
     url = urls[0]
     driver.get(url)
     if WebDriverWait(driver, 10, 1).until(
             EC.presence_of_element_located(
                 (By.CSS_SELECTOR, "[class='glbox']"))):
         html = driver.page_source
         soup = bs4(html, 'lxml')
         temp = soup.find('div', class_="con LinkList").find_all('li')
     for i in temp:
         info = []
         name = i.get_text()
         info.append(name)
         info.append('广东省')
         info.append('')
         info.append(datetime.datetime.now().strftime('%Y-%m-%d'))
         info.append(url)
         sql = whereNotInsert.main(tablename, t_eventField, info, 'name',
                                   info[0])
         save2db().save2db(sql)
Example #3
 def shanTou(self):
     for i in range(1, 4):
         url = 'http://www.shantou.gov.cn/u/zzjg?ajax=&name=&deptType=' + str(
             i) + '&callback=handlerDept&currentPage=1&pageSize=40'
         html = req.get(url, headers=headers).content.decode('utf8')
         # The response is JSONP: strip the "handlerDept(" prefix and the
         # trailing ")" before evaluating the JSON payload.
         res = eval(html[12:-1])
         for i1 in res['data']:
             name = i1['name']
             info = []
             if name[-1] == '区':
                 continue
             if name[0] == '市':
                 name = '汕头' + name
             info.append(name)
             info.append('广东省')
             info.append('汕头')
             info.append(datetime.datetime.now().strftime('%Y-%m-%d'))
             info.append(urls[4])
             sql = whereNotInsert.main(tablename, t_eventField, info,
                                       'name', info[0])
             save2db().save2db(sql)
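The Shantou endpoint returns JSONP, i.e. handlerDept( ... JSON ... ), and eval(html[12:-1]) executes whatever arrives from the server. If the payload is plain JSON, stripping the wrapper and using json.loads is a safer equivalent; a small sketch under that assumption (parse_jsonp is not part of the original code):

import json

def parse_jsonp(body, callback='handlerDept'):
    # Strip the "handlerDept(" prefix and the trailing ")" and parse the JSON payload.
    payload = body[len(callback) + 1:-1]
    return json.loads(payload)

# Usage, mirroring the loop above:
# res = parse_jsonp(req.get(url, headers=headers).content.decode('utf8'))
# for dept in res['data']:
#     name = dept['name']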
Example #4
 def guangZhou(self):
     url = urls[1]
     driver.get(url)
     if WebDriverWait(driver, 10, 1).until(
             EC.presence_of_element_located(
                 (By.CSS_SELECTOR, "[class='main_border SinglePage']"))):
         html = driver.page_source
         soup = bs4(html, 'lxml')
         temp = soup.find('div',
                          class_="main_border SinglePage").find_all('td')
     print(len(temp))
     for i in temp:
         info = []
         name = i.get_text().replace('· ', '')
         info.append(name)
         info.append('广东省')
         info.append('广州')
         info.append(datetime.datetime.now().strftime('%Y-%m-%d'))
         info.append(url)
         sql = whereNotInsert.main(tablename, t_eventField, info, 'name',
                                   info[0])
         save2db().save2db(sql)
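whereNotInsert.main and save2db are project helpers whose code is not shown here. Judging only from the call sites, whereNotInsert.main(tablename, t_eventField, info, 'name', info[0]) builds an INSERT that is skipped when a row with the same name already exists, and save2db executes it. A rough MySQL-flavored sketch of that contract, purely as an assumption (a real implementation should use parameterized queries rather than string formatting):

def where_not_insert(table, fields, values, key_field, key_value):
    # Assumed behavior: insert only if no row with key_field == key_value exists yet.
    cols = ', '.join(fields)
    vals = ', '.join("'%s'" % v for v in values)
    return ("INSERT INTO %s (%s) SELECT %s FROM DUAL "
            "WHERE NOT EXISTS (SELECT 1 FROM %s WHERE %s = '%s')"
            % (table, cols, vals, table, key_field, key_value))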
Example #5
    def start_spider(self):
        url = self.get_newsUrl()
        if url is None:
            sys.exit()
        html = req.get(url, headers=headers).content.decode('utf8')
        soup = bs4(html, 'lxml')
        # NOTE: "[0-9]*?" also matches the empty string, so this picks up every
        # <section> that has an id attribute, not only numeric ones.
        section_day = soup.find_all('section', id=re.compile("[0-9]*?"))
        section_monthYear = soup.find_all('section', class_="item_content")
        section_timelineHistory = soup.find_all('section',
                                                class_="tl_content_item")
        pageType = soup.find(class_="listview2").get_text()

        if pageType == '大事记365':
            # multiple pages
            if len(self.page_re.findall(html)) > 0:
                pass  # multi-page handling is not implemented for this page type
            else:
                for i in section_day:
                    info = self.get_dayNewsDes(i, soup)
                    info.append(url)
                    info.append(datetime.datetime.now().strftime('%Y-%m-%d'))
                    sql = whereNotInsert.main(tablename, t_eventField, info,
                                              'name', info[0])
                    self.save2db(sql)

        if pageType == '月度事件' or pageType == '年度事件':
            # multiple pages
            if len(self.page_re.findall(html)) > 0:
                urls = self.get_newsurlINpage(soup)
                for i in urls:
                    html = req.get(i, headers=headers).content.decode('utf8')
                    soup = bs4(html, 'lxml')
                    section_monthYear = soup.find_all('section',
                                                      class_="item_content")
                    # iterate over each news item on this page; use a separate
                    # loop variable so changeUrlType() below still gets the page URL
                    for sec in section_monthYear:
                        info = self.get_monthYearNewsDes(sec, soup)
                        info.append(url)
                        info.append(
                            datetime.datetime.now().strftime('%Y-%m-%d'))
                        sql = whereNotInsert.main(tablename, t_eventField,
                                                  info, 'name', info[0])
                        self.save2db(sql)
                    self.changeUrlType(i)
            else:
                for i in section_monthYear:
                    info = self.get_monthYearNewsDes(i, soup)
                    info.append(url)
                    info.append(datetime.datetime.now().strftime('%Y-%m-%d'))
                    sql = whereNotInsert.main(tablename, t_eventField, info,
                                              'name', info[0])
                    self.save2db(sql)

        if pageType == '时间轴':
            # multiple pages
            if len(self.page_re.findall(html)) > 0:
                urls = self.get_newsurlINpage(soup)
                for i in urls:
                    html = req.get(i, headers=headers).content.decode('utf8')
                    soup = bs4(html, 'lxml')
                    section_timelineHistory = soup.find_all(
                        'section', class_="tl_content_item")
                    # iterate over each news item on this page; use a separate
                    # loop variable so changeUrlType() below still gets the page URL
                    for sec in section_timelineHistory:
                        info = self.get_timelineNewsDes(sec, soup)
                        info.append(url)
                        info.append(
                            datetime.datetime.now().strftime('%Y-%m-%d'))
                        sql = whereNotInsert.main(tablename, t_eventField,
                                                  info, 'name', info[0])
                        self.save2db(sql)
                    self.changeUrlType(i)
            else:
                for i in section_timelineHistory:
                    info = self.get_timelineNewsDes(i, soup)
                    info.append(url)
                    info.append(datetime.datetime.now().strftime('%Y-%m-%d'))
                    sql = whereNotInsert.main(tablename, t_eventField, info,
                                              'name', info[0])
                    self.save2db(sql)

        if pageType == '历史事件':
            # multiple pages
            if len(self.page_re.findall(html)) > 0:
                urls = self.get_newsurlINpage(soup)
                for i in urls:
                    html = req.get(i, headers=headers).content.decode('utf8')
                    soup = bs4(html, 'lxml')
                    section_timelineHistory = soup.find_all(
                        'section', class_="tl_content_item")
                    # iterate over each news item on this page; use a separate
                    # loop variable so changeUrlType() below still gets the page URL
                    for sec in section_timelineHistory:
                        info = self.get_HistoryNewsDes(sec, soup)
                        info.append(url)
                        info.append(
                            datetime.datetime.now().strftime('%Y-%m-%d'))
                        sql = whereNotInsert.main(tablename, t_eventField,
                                                  info, 'name', info[0])
                        self.save2db(sql)
                    self.changeUrlType(i)
            else:
                for i in section_timelineHistory:
                    info = self.get_HistoryNewsDes(i, soup)
                    info.append(url)
                    info.append(datetime.datetime.now().strftime('%Y-%m-%d'))
                    sql = whereNotInsert.main(tablename, t_eventField, info,
                                              'name', info[0])
                    self.save2db(sql)
        self.changeUrlType(url)
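The four pageType branches above repeat the same fetch/extract/save shape and differ only in the section class and the extractor method. One possible way to express that, sketched here rather than taken from the original project (PAGE_TYPES and handle_page are hypothetical; the extractor names come from the snippet):

# Hypothetical tidy-up; not part of the original spider.
PAGE_TYPES = {
    '月度事件': ('item_content', 'get_monthYearNewsDes'),
    '年度事件': ('item_content', 'get_monthYearNewsDes'),
    '时间轴': ('tl_content_item', 'get_timelineNewsDes'),
    '历史事件': ('tl_content_item', 'get_HistoryNewsDes'),
}

def handle_page(self, soup, page_url, page_type):
    css_class, extractor = PAGE_TYPES[page_type]
    for sec in soup.find_all('section', class_=css_class):
        info = getattr(self, extractor)(sec, soup)
        info.append(page_url)
        info.append(datetime.datetime.now().strftime('%Y-%m-%d'))
        sql = whereNotInsert.main(tablename, t_eventField, info, 'name', info[0])
        self.save2db(sql)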