Example #1
    def crawl_main_page(self, keyword):
        try:
            self.driver.get(self.url)
        except TimeoutException:
            self.driver.execute_script('window.stop();')

        try:
            self.wait.until(ec.presence_of_element_located((By.ID, 'query')))
        except TimeoutException:
            CustomLogging.log_to_file('搜狗搜索打开失败', LogType.ERROR)

        # Advanced search settings
        elem = self.driver.find_element_by_id('settings')
        ActionChains(self.driver).move_to_element(elem).perform()
        time.sleep(1)
        self.driver.find_element_by_id('advanced-search').click()

        self.driver.find_element_by_xpath('//input[@name="q"]').send_keys(
            keyword)
        self.driver.find_element_by_xpath(
            '//input[@name="sitequery"]').send_keys(self.site)
        self.driver.find_element_by_xpath(
            '//a[@uigs-id="adv_time-sort"]').click()
        self.driver.find_element_by_xpath(
            '//input[@id="adv-search-btn"]').click()
        # search_keyword = '{0} site:{1}'.format(keyword, self.site)
        # self.driver.find_element_by_id('query').click()
        # self.driver.find_element_by_id('query').send_keys(search_keyword + Keys.ENTER)

        return self.crawl_search_results()
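
These snippets share scaffolding that never appears in the examples: a WebDriver on self.driver, a WebDriverWait on self.wait, expected_conditions aliased as ec, plus helpers such as CustomLogging, Entity, in_date_range and conv_pub_date. A minimal sketch of the Selenium side, assuming that setup (the BaseCrawler name and the timeout values are guesses):

import time

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import (NoSuchElementException,
                                        TimeoutException)


class BaseCrawler:
    def __init__(self, url):
        self.url = url
        self.driver = webdriver.Chrome()            # any WebDriver works
        self.driver.set_page_load_timeout(10)       # lets get() raise TimeoutException
        self.wait = WebDriverWait(self.driver, 10)  # backs the wait.until(...) calls

The find_element_by_* / find_elements_by_* calls used throughout are the Selenium 3 API; Selenium 4 replaced them with find_element(By..., ...).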
Example #2
    def crawl_main_page(self, keyword):
        try:
            self.driver.set_page_load_timeout(10)
            self.driver.get(self.url)
        except TimeoutException:
            self.driver.execute_script('window.stop();')

        try:
            self.driver.find_element_by_class_name('snp-btn-close').click()
        except NoSuchElementException:
            pass

        try:
            self.wait.until(
                ec.presence_of_element_located((By.ID, 'suggest01_input')))
        except TimeoutException:
            CustomLogging.log_to_file('新浪财经页面打开失败', LogType.ERROR)

        # self.driver.find_elements_by_class_name('ds_button')[-1].click()

        self.driver.execute_script(
            'document.getElementsByClassName("ds_button")[1].click();')
        self.driver.find_elements_by_class_name('dsl_cont')[
            -1].find_element_by_xpath('.//p[contains(text(), "新闻")]').click()

        self.driver.find_element_by_id('suggest01_input').click()
        self.driver.find_element_by_id('suggest01_input').send_keys(keyword +
                                                                    Keys.ENTER)

        self.crawl_search_results()
Example #3
 def parse_html(self, url, html):
     bs = BeautifulSoup(html, 'lxml')
     try:
         full_content = bs.find('div', attrs={'id': 'ctrlfscont'}).text
         return full_content
     except Exception:
         CustomLogging.log_to_file('页面解析错误: {0}|{1}'.format(self.name, url), LogType.ERROR)
         return
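
The pattern above leans on BeautifulSoup returning None when the selector misses: .text then raises AttributeError, which the except branch turns into a log entry. A quick illustration with made-up markup:

from bs4 import BeautifulSoup

html = '<div id="ctrlfscont">full article text</div>'
bs = BeautifulSoup(html, 'lxml')

print(bs.find('div', attrs={'id': 'ctrlfscont'}).text)  # -> full article text
print(bs.find('div', attrs={'id': 'other'}))            # -> None; .text would raise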
Example #4
    def crawl_main_page(self, keyword):
        self.driver.get(self.url)
        try:
            self.wait.until(ec.presence_of_element_located((By.ID, 'keyword')))
        except TimeoutException:
            CustomLogging.log_to_file('人民网主页打开失败', LogType.ERROR)

        self.driver.find_element_by_id('keyword').clear()
        self.driver.find_element_by_id('keyword').send_keys(keyword + Keys.ENTER)

        self.crawl_search_results()
Example #5
 def parse_html(self, url, html):
     bs = BeautifulSoup(html, 'lxml')
     try:
         full_content = bs.find('div', attrs={'id': 'rwb_zw'}).text  # rwb_zw
         return full_content
     except Exception:
         try:
             full_content = bs.find('div', attrs={'class': re.compile('(show_text)|(con)|(gray box_text)')}).text
             return full_content
         except Exception:
             CustomLogging.log_to_file('页面解析错误: {0}|{1}'.format(self.name, url), LogType.ERROR)
             return
Example #6
 def parse_html(self, url, html):
     bs = BeautifulSoup(html, 'lxml')
     try:
         full_content = bs.find(
             'div',
             attrs={
                 'id': re.compile('(Cnt-Main-Article-QQ)|(ArticleCnt)')
             }).text
         return full_content
     except Exception:
         CustomLogging.log_to_file('页面解析错误: {0}|{1}'.format(self.name, url),
                                   LogType.ERROR)
Example #7
def download_as_html(url, encoding='utf-8'):
    session.mount(url, retry_adr)
    try:
        r = session.get(url)
        # override the detected encoding, except when requests fell back to
        # ISO-8859-1 (no charset header) and the caller did not ask for gbk;
        # that case is handled downstream via ret['encoding']
        if r.encoding != 'ISO-8859-1' or encoding == 'gbk':
            r.encoding = encoding
        return {'text': r.text, 'url': r.url, 'encoding': r.encoding}
    except Exception:
        CustomLogging.log_to_file('download failed: {0}'.format(url),
                                  LogType.ERROR)
        return ''
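
download_as_html relies on module-level session and retry_adr objects that the example never defines. A plausible minimal setup, assuming requests with urllib3 retries (the retry counts are guesses):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

retry_adr = HTTPAdapter(max_retries=Retry(total=3, backoff_factor=0.5))
session = requests.Session()

session.mount(url, retry_adr) then routes every request whose URL starts with that prefix through the retrying adapter.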
Example #8
 def parse_html(self, url, html):
     bs = BeautifulSoup(html, 'lxml')
     try:
         full_content = bs.find(
             'div',
             attrs={
                 'class':
                 re.compile('(ArticleBlogText)|(art_contextBox)|(article)')
             }).text  # ArticleBlogText  #art_contextBox  #article
         return full_content
     except Exception:
         CustomLogging.log_to_file('页面解析错误: {0}|{1}'.format(self.name, url),
                                   LogType.ERROR)
Example #9
    def write_to_excel(self, filepath):
        # self.db_handler.close_conn()
        article_list = []
        while not self.q.empty():
            item = self.q.get()
            article_list.append(
                (self.name, item.url, item.title, item.short_description,
                 item.publish_date, item.full_content))

        if article_list:
            df = pd.DataFrame(article_list)
            xl_writer = pd.ExcelWriter(filepath, engine='openpyxl')
            if os.path.exists(filepath):
                wb = load_workbook(filepath)
                xl_writer.book = wb
                xl_writer.sheets = dict([(ws.title, ws)
                                         for ws in wb.worksheets])
            df.to_excel(xl_writer,
                        index=False,
                        sheet_name=self.name,
                        header=[
                            'source', 'url', 'title', 'short_desc',
                            'issue_date', 'full_content'
                        ])
            try:
                xl_writer.save()
            except PermissionError:
                xl_writer_new = pd.ExcelWriter(filepath.replace(
                    '.xlsx', '{0}.xlsx'.format(str(int(time.time())))),
                                               engine='openpyxl')
                if os.path.exists(filepath):
                    wb = load_workbook(filepath)
                    xl_writer_new.book = wb
                    xl_writer_new.sheets = dict([(ws.title, ws)
                                                 for ws in wb.worksheets])
                df.to_excel(xl_writer_new,
                            index=False,
                            sheet_name=self.name,
                            header=[
                                'source', 'url', 'title', 'short_desc',
                                'issue_date', 'full_content'
                            ])
                xl_writer_new.save()
        else:
            print('找不到{0}年内任何与关键字 "{1}" 相关的内容'.format(self.year_range,
                                                      self.keyword))
            CustomLogging.log_to_file('搜索不到匹配记录', LogType.INFO)

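
Assigning xl_writer.book and xl_writer.sheets by hand is an old-pandas idiom for appending a sheet to an existing workbook; later pandas versions removed those setters. On pandas 1.3+ the same effect is available directly, sketched here (not part of the original code):

import os
import pandas as pd

def append_sheet(df, filepath, sheet_name):
    mode = 'a' if os.path.exists(filepath) else 'w'
    extra = {'if_sheet_exists': 'replace'} if mode == 'a' else {}
    with pd.ExcelWriter(filepath, engine='openpyxl', mode=mode, **extra) as writer:
        df.to_excel(writer, index=False, sheet_name=sheet_name)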
Example #10
    def crawl_search_results(self):
        search_results = []

        try:
            self.wait.until(ec.presence_of_element_located((By.ID, 'searchlist')))
        except TimeoutException:
            CustomLogging.log_to_file('第一财经网搜索结果页错误', LogType.ERROR)

        exit_flag = 0
        start_index = 0
        while True:
            try:
                self.wait.until(ec.presence_of_all_elements_located((By.CLASS_NAME, 'f-db')))
            except TimeoutException:
                CustomLogging.log_to_file('文章列表加载失败', LogType.ERROR)
                break

            try:
                result_articles = self.driver.find_elements_by_class_name('f-db')

                for each_article in result_articles[start_index:]:
                    item = Entity()
                    item.publish_date = \
                        each_article.find_element_by_class_name('author').find_elements_by_tag_name('span')[
                            -1].text

                    if not in_date_range(conv_pub_date(item.publish_date, 'yicai'), self.year_range):
                        exit_flag = 1
                        # break out of the for loop
                        break
                    item.title = each_article.find_element_by_tag_name('h2').text
                    item.short_description = each_article.find_element_by_tag_name('p').text

                    if self.keyword not in item.title and self.keyword not in item.short_description:
                        continue

                    item.url = each_article.get_attribute('href')
                    threading.Thread(target=self.download_and_save_item, args=(item,)).start()

                if exit_flag == 1:
                    break

            except NoSuchElementException:
                CustomLogging.log_to_file('没有搜索结果', LogType.ERROR)

            try:
                # next_page = self.wait.until(ec.visibility_of_element_located(
                #     (By.XPATH, '//button[@class="u-btn" and contains(text(), "加载更多内容")]')))
                # next_page = self.driver.find_element_by_xpath('//button[@class="u-btn" and contains(text(), "加载更多内容")]')
                # next_page.click()
                self.driver.execute_script('document.getElementsByClassName("u-btn")[0].click()')
                time.sleep(2)
                start_index += 20
            except Exception:  # JS click fails once the "load more" button is gone
                CustomLogging.log_to_file('全部页面加载完成', LogType.INFO)
                break

        return search_results
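
in_date_range and conv_pub_date are shared helpers that never appear in these examples; judging by the call sites they parse a site-specific date string and test it against self.year_range. A minimal sketch under those assumptions (the format table and the years-based range are guesses):

from datetime import datetime, timedelta

SITE_DATE_FORMATS = {
    'yicai': '%Y-%m-%d %H:%M:%S',   # assumed; one entry per site key
}

def conv_pub_date(raw, site):
    # parse a site-specific publish-date string into a datetime
    return datetime.strptime(raw.strip(), SITE_DATE_FORMATS[site])

def in_date_range(pub_date, year_range):
    # True if the article was published within the last `year_range` years
    return pub_date >= datetime.now() - timedelta(days=365 * year_range)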
Example #11
    def crawl_main_page(self, keyword):
        try:
            self.driver.get(self.url)
        except TimeoutException:
            pass

        try:
            self.wait.until(ec.presence_of_element_located((By.ID, 'contentInput_0')))
        except TimeoutException:
            CustomLogging.log_to_file('证券时报网页面打开失败,不能定位搜索框元素', LogType.ERROR)

        self.driver.find_element_by_id('contentInput_0').clear()
        self.driver.find_element_by_id('contentInput_0').send_keys(keyword + Keys.ENTER)

        self.crawl_search_results()
Example #12
    def crawl_main_page(self, keyword):
        self.driver.get(self.url)
        try:
            self.wait.until(
                ec.presence_of_element_located((By.CLASS_NAME, 'ifengSS')))
        except TimeoutException:
            CustomLogging.log_to_file('凤凰财经网页面打开失败', LogType.ERROR)

        # self.driver.execute_script('document.getElementsByClassName("btn-1NI76BXl clearfix")[0].click()')
        self.driver.find_element_by_class_name('ifengSS').click()

        self.driver.find_element_by_class_name('ifengSS').send_keys(keyword +
                                                                    Keys.ENTER)

        self.crawl_search_results()
Example #13
    def crawl_main_page(self, keyword):
        try:
            self.driver.get(self.url)
        except TimeoutException:
            CustomLogging.log_to_file('加载页面过慢,停止加载,继续下一步操作', LogType.INFO)
            self.driver.execute_script('window.stop()')
        try:
            self.wait.until(ec.presence_of_element_located((By.ID, 'searchkeys')))
        except TimeoutException:
            CustomLogging.log_to_file('第一财经网页面打开失败', LogType.ERROR)

        self.driver.find_element_by_id('searchkeys').clear()
        self.driver.find_element_by_id('searchkeys').send_keys(keyword + Keys.ENTER)

        self.crawl_search_results()
Example #14
 def parse_html(self, url, html):
     bs = BeautifulSoup(html, 'lxml')
     try:
         full_content = bs.find('div', attrs={
             'class': 'g-articl-text'
         }).text
         return full_content
     except Exception:
         try:
             full_content = bs.find('article').text
             return full_content
         except Exception:
             # CustomLogging.log_to_file('{0}\n'.format(traceback.format_exc()), LogType.ERROR)
             CustomLogging.log_to_file(
                 '页面解析错误: {0}|{1}'.format(self.name, url), LogType.ERROR)
             return ''
Example #15
 def parse_html(self, url, html):
     bs = BeautifulSoup(html, 'lxml')
     # strip <script> tags so .text returns only visible page text
     for s in bs('script'):
         s.extract()
     try:
         full_content = bs.find(
             'div',
             attrs={
                 'class':
                 re.compile(
                     '(main-text)|(article-con)|(post-detail-text )|(rich_media_content )|(art_main)'
                 )
             }).text
         return full_content
     except Exception:
         CustomLogging.log_to_file('页面解析错误: {0}|{1}'.format(self.name, url),
                                   LogType.ERROR)
Example #16
 def parse_html(self, url, html):
     bs = BeautifulSoup(html, 'lxml')
     try:
         full_content = bs.find('div',
                                attrs={
                                    'class':
                                    re.compile('(article-content)|(pcb)')
                                }).text
         return full_content
     except Exception:
         try:
             full_content = bs.find('div', attrs={'id': 'news_main'}).text
             return full_content
         except Exception:
             CustomLogging.log_to_file(
                 '页面解析错误: {0}|{1}'.format(self.name, url), LogType.ERROR)
Example #17
    def crawl_search_results(self):
        search_results = []
        self.driver.switch_to.window(self.driver.window_handles[-1])
        self.driver.maximize_window()

        while True:
            try:
                self.wait.until(
                    ec.presence_of_element_located(
                        (By.CLASS_NAME, 'result-cont')))
            except TimeoutException:
                CustomLogging.log_to_file('东方财富网搜索页面加载失败', LogType.ERROR)
                break

            try:
                result_articles = self.driver.find_elements_by_class_name(
                    'result-article')

                for each_article in result_articles:
                    item = Entity()
                    item.title = each_article.find_element_by_tag_name(
                        'a').text
                    item.url = each_article.find_element_by_tag_name(
                        'a').get_attribute('href')
                    item.short_description = each_article.find_element_by_class_name(
                        'des').text
                    item.publish_date = each_article.find_element_by_class_name(
                        'g').text

                    threading.Thread(target=self.download_and_save_item,
                                     args=(item, )).start()

            except NoSuchElementException:
                print('没有搜索结果')
                break

            try:
                next_page = self.driver.find_element_by_xpath(
                    '//div[@class="pagination pagination-centered"]//a[contains(text(), "下一页")]'
                )
                self.driver.get(next_page.get_attribute('href'))
                # next_page.click()
            except NoSuchElementException:
                print('已经是最后一页')
                break

        return search_results
Example #18
    def crawl_search_results(self):
        exit_flag = 0
        index = 0
        while True:
            try:
                self.wait.until(ec.presence_of_element_located((By.CLASS_NAME, 'fr')))
            except TimeoutException:
                CustomLogging.log_to_file('人民网搜索结果页面加载失败', LogType.ERROR)
                CustomLogging.log_to_file(traceback.format_exc(), LogType.ERROR)
                break

            try:
                result_articles = self.driver.find_elements_by_xpath('//div[@class="fr w800"]//ul')

                for each_article in result_articles:
                    item = Entity()
                    pub_date = each_article.find_elements_by_tag_name('li')[2].text

                    item.publish_date = re.search(re.compile(
                        r'[1-9]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[1-2][0-9]|3[0-1])\s+(20|21|22|23|[0-1]\d):[0-5]\d:[0-5]\d'),
                        pub_date).group()

                    if not in_date_range(conv_pub_date(item.publish_date, 'peoplecn'), self.year_range):
                        exit_flag = 1
                        # break out of the for loop
                        break
                    item.title = each_article.find_element_by_tag_name('a').text
                    item.short_description = each_article.find_elements_by_tag_name('li')[1].text
                    if self.keyword not in item.short_description and self.keyword not in item.title:
                        continue

                    item.url = each_article.find_element_by_tag_name('a').get_attribute('href')
                    threading.Thread(target=self.download_and_save_item, args=(item,)).start()

            except NoSuchElementException:
                break

            if exit_flag == 1:
                break

            try:
                next_page = self.driver.find_element_by_xpath(
                    '//div[@class="show_nav_bar"]//a[contains(text(), "下一页")]')
                next_page.click()
                time.sleep(2)
            except NoSuchElementException:
                break
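
The timestamp regex in this example (and several below) extracts a "YYYY-MM-DD HH:MM:SS" substring from the surrounding list text. A quick check against a made-up sample:

import re

TS = re.compile(
    r'[1-9]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[1-2][0-9]|3[0-1])'
    r'\s+(20|21|22|23|[0-1]\d):[0-5]\d:[0-5]\d')

print(TS.search('来源: 人民网 2018-07-16 14:23:05').group())  # -> 2018-07-16 14:23:05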
Example #19
    def crawl_main_page(self, keyword):
        self.driver.set_page_load_timeout(5)
        try:
            self.driver.get(self.url)
        except TimeoutException:
            self.driver.execute_script('window.stop()')
        try:
            self.wait.until(
                ec.presence_of_element_located((By.ID, 'textMessage')))
        except TimeoutException:
            CustomLogging.log_to_file('和讯财经网打开失败', LogType.ERROR)

        self.driver.find_element_by_id('textMessage').clear()
        self.driver.find_element_by_id('textMessage').send_keys(keyword +
                                                                Keys.ENTER)

        return self.crawl_search_results()
Example #20
    def crawl_search_results(self):
        search_results = []
        self.driver.switch_to.window(self.driver.window_handles[-1])
        self.driver.maximize_window()
        try:
            self.wait.until(
                ec.presence_of_element_located((By.CLASS_NAME, 'result')))
        except TimeoutException:
            CustomLogging.log_to_file('中国经济网搜索结果页面加载失败', LogType.ERROR)

        while True:
            try:
                result_articles = self.driver.find_elements_by_class_name(
                    'res-list')

                for each_article in result_articles:
                    item = Entity()
                    item.title = each_article.find_element_by_class_name(
                        'res-title').text
                    item.url = each_article.find_element_by_tag_name(
                        'a').get_attribute('href')
                    try:
                        item.short_description = each_article. \
                            find_element_by_xpath('./div[@class="res-rich so-rich-news clearfix"]//*').text
                        item.publish_date = each_article.find_element_by_class_name(
                            'gray').text
                    except NoSuchElementException:
                        # fall back to the plain-result layout
                        try:
                            item.short_description = each_article.find_element_by_class_name(
                                'res-desc').text
                            item.publish_date = ''
                        except NoSuchElementException:
                            item.short_description = ''
                            item.publish_date = ''

                    threading.Thread(target=self.download_and_save_item,
                                     args=(item, )).start()

            except NoSuchElementException:
                pass

            return search_results
Example #21
    def crawl_main_page(self, keyword):
        self.driver.set_page_load_timeout(10)
        try:
            self.driver.get(self.url)
        except TimeoutException:
            self.driver.execute_script('window.stop();')

        try:
            self.wait.until(
                ec.presence_of_element_located((By.ID, 'searchKey')))
        except TimeoutException:
            CustomLogging.log_to_file('北京工商局网站打开失败', LogType.ERROR)

        self.driver.find_element_by_id('searchKey').click()
        self.driver.find_element_by_id('searchKey').send_keys(keyword)
        self.driver.find_element_by_xpath(
            '//span[contains(text(), "我要搜索")]').click()

        return self.crawl_search_results()
Example #22
    def download_and_save_item(self, item):
        '''
        Download the HTML page for the url on the item object.
        :param item: the entity for one news article
        :return:
        '''
        print('downloading url: %s' % item.url)
        try:
            if self.name in (
                    '人民网',
                    '中国证券网',
                    '中国经济网',
                    '腾讯财经',
            ) or (self.name == '千龙网' and
                  ('qndj' in item.url or 'bbs' in item.url)):
                ret = download.download_as_html(item.url, encoding='gbk')
                html = ret['text'].encode('gbk', 'ignore')
            elif self.name in ('每经网', ):
                ret = download.download_as_html(item.url)
                html = ret['text'].encode('utf-8', 'ignore')
            elif self.name in ('同花顺', ):
                ret = download.download_as_html(item.url, encoding='gbk')
                html = ret['text'].encode('gbk', 'ignore')
            else:
                ret = download.download_as_html(item.url)
                if ret['encoding'] == 'ISO-8859-1':
                    html = ret['text'].encode('ISO-8859-1', 'ignore')
                else:
                    html = ret['text']
            if html:
                item.url = ret['url']
                item.full_content = self.parse_html(item.url, html)
                # persist the parsed article
                # self.db_handler.save_article_info(item, self.case_id, )
                self.q.put(item)

            else:
                return
        except TypeError:
            CustomLogging.log_to_file(traceback.format_exc(), LogType.INFO)
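
The encoding branches above compensate for requests defaulting r.encoding to ISO-8859-1 whenever the server omits a charset, which mangles Chinese pages. An alternative (a sketch, not what the examples do) is to fall back to requests' content-sniffed guess:

import requests

def fetch_text(url):
    r = requests.get(url, timeout=10)
    if r.encoding == 'ISO-8859-1':        # no charset header; probably wrong
        r.encoding = r.apparent_encoding  # charset guessed from the bytes
    return r.text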
Example #23
    def crawl_main_page(self, keyword):
        try:
            self.driver.get(self.url)
        except TimeoutException:
            self.driver.execute_script('window.stop();')

        try:
            self.wait.until(
                ec.presence_of_element_located(
                    (By.XPATH, '//input[@value="百度一下"]')))
        except TimeoutException:
            CustomLogging.log_to_file('百度搜索打开失败', LogType.ERROR)

        # Advanced search settings
        self.driver.find_element_by_name('q2').send_keys(keyword)
        self.driver.find_element_by_name('q6').send_keys(self.site)
        self.driver.find_element_by_name('rn').click()
        self.driver.find_element_by_xpath(
            '//select/option[@value="50"]').click()
        self.driver.find_element_by_xpath('//input[@value="百度一下"]').click()

        return self.crawl_search_results()
Example #24
    def crawl_search_results(self):
        search_results = []
        self.driver.switch_to.window(self.driver.window_handles[-1])
        self.driver.maximize_window()

        exit_flag = 0
        while True:
            try:
                self.wait.until(
                    ec.presence_of_all_elements_located(
                        (By.XPATH, '//div[@class="content"]//div')))
            except TimeoutException:
                CustomLogging.log_to_file('搜索结果出错', LogType.ERROR)
                break

            try:
                result_articles = self.driver.find_elements_by_xpath(
                    '//div[@class="content"]//div[@class="news"]')

                for each_article in result_articles:
                    item = Entity()

                    publish_date = each_article.find_element_by_id(
                        'essaypubtime').text
                    item.publish_date = re.search(
                        re.compile(
                            r'[1-9]\d{3}.(0[1-9]|1[0-2]).(0[1-9]|[1-2][0-9]|3[0-1])\s+(20|21|22|23|[0-1]\d):[0-5]\d:[0-5]\d'
                        ), publish_date).group()

                    if not in_date_range(
                            conv_pub_date(item.publish_date, 'bjgsj'),
                            self.year_range):
                        exit_flag = 1
                        # break out of the for loop
                        break
                    item.short_description = each_article.find_element_by_id(
                        'essaycontent').text
                    item.title = each_article.find_element_by_id(
                        'essaytitlelinks').text

                    if self.keyword not in item.short_description and self.keyword not in item.title:
                        continue

                    if item.title in self.titles:
                        continue
                    else:
                        self.titles.append(item.title)

                    item.url = each_article.find_element_by_xpath(
                        './/li[@id="essaytitlelinks"]/a').get_attribute("href")
                    threading.Thread(target=super().download_and_save_item,
                                     args=(item, )).start()

                if exit_flag == 1:
                    break
            except NoSuchElementException:
                CustomLogging.log_to_file('没有搜索结果', LogType.INFO)
                break

            try:
                next_page = self.driver.find_element_by_class_name('next-page')
                self.driver.get(next_page.get_attribute('href'))
                # next_page.click()
            except NoSuchElementException:
                break

        return search_results
Example #25
    def crawl_search_results(self):
        search_results = []
        self.driver.switch_to.window(self.driver.window_handles[-1])
        self.driver.maximize_window()
        exit_flag = 0
        while True:
            try:
                self.wait.until(
                    ec.presence_of_all_elements_located(
                        (By.CLASS_NAME, 'r-info')))
            except TimeoutException:
                CustomLogging.log_to_file('搜索结果为空', LogType.ERROR)

            result_articles = self.driver.find_elements_by_class_name('r-info')

            for each_article in result_articles:
                item = Entity()
                try:
                    pub_date = each_article.find_element_by_class_name(
                        'fgray_time').text
                except NoSuchElementException:
                    continue
                item.publish_date = re.search(
                    re.compile(
                        r'[1-9]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[1-2][0-9]|3[0-1])\s+(20|21|22|23|[0-1]\d):[0-5]\d:[0-5]\d'
                    ), pub_date).group()
                # exit if the article falls outside the configured year range
                if not in_date_range(conv_pub_date(item.publish_date, 'sina'),
                                     self.year_range):
                    exit_flag = 1
                    # break out of the for loop
                    break
                item.short_description = each_article.find_element_by_class_name(
                    'content').text
                item.title = each_article.find_element_by_tag_name('h2').text

                # keyword filter: skip this entry if the keyword appears in neither the title nor the summary
                if self.keyword not in item.short_description and self.keyword not in item.title:
                    continue

                if item.title in self.titles:
                    continue
                else:
                    self.titles.append(item.title)

                item.url = each_article.find_element_by_xpath(
                    './/h2/a').get_attribute('href')
                threading.Thread(target=self.download_and_save_item,
                                 args=(item, )).start()

            # break out of the while loop
            if exit_flag == 1:
                break

            try:
                next_page = self.driver.find_element_by_xpath(
                    '//div[@class="pagebox"]/a[@title="下一页"]')
                # next_page.click()
                self.driver.get(next_page.get_attribute('href'))
                # time.sleep(2)
            except NoSuchElementException:
                # print('已经是最后一页')
                break

        return search_results
Example #26
    def crawl_search_results(self):
        search_results = []
        self.driver.switch_to.window(self.driver.window_handles[-1])
        self.driver.maximize_window()

        exit_flag = 0
        while True:
            try:
                self.wait.until(
                    ec.presence_of_element_located(
                        (By.CLASS_NAME, 'result-cont')))
            except TimeoutException:
                CustomLogging.log_to_file('中国证券网搜索结果页错误', LogType.ERROR)
                break

            try:
                result_articles = self.driver.find_elements_by_class_name(
                    'result-article')

                for each_article in result_articles:
                    item = Entity()

                    publish_date = each_article.find_element_by_class_name(
                        'g').text
                    item.publish_date = re.search(
                        re.compile(
                            r'[1-9]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[1-2][0-9]|3[0-1])\s+(20|21|22|23|[0-1]\d):[0-5]\d'
                        ), publish_date).group()

                    if not in_date_range(
                            conv_pub_date(item.publish_date, 'cnstock'),
                            self.year_range):
                        exit_flag = 1
                        # break out of the for loop
                        break
                    item.short_description = each_article.find_element_by_class_name(
                        'des').text
                    item.title = each_article.find_element_by_tag_name(
                        'a').text
                    if self.keyword not in item.short_description and self.keyword not in item.title:
                        continue

                    if item.title in self.titles:
                        continue
                    else:
                        self.titles.append(item.title)

                    item.url = each_article.find_element_by_tag_name(
                        'a').get_attribute('href')
                    threading.Thread(target=super().download_and_save_item,
                                     args=(item, )).start()

                if exit_flag == 1:
                    break
            except NoSuchElementException:
                CustomLogging.log_to_file('没有搜索结果', LogType.INFO)
                break

            try:
                next_page = self.driver.find_element_by_xpath(
                    '//div[@class="pagination pagination-centered"]//a[contains(text(), "下一页")]'
                )
                self.driver.get(next_page.get_attribute('href'))
                # next_page.click()
            except NoSuchElementException:
                break

        return search_results
Example #27
    def crawl_search_results(self):
        search_results = []
        self.driver.switch_to.window(self.driver.window_handles[-1])
        self.driver.maximize_window()

        exit_flag = 0
        while True:
            try:
                self.wait.until(
                    ec.presence_of_element_located((By.CLASS_NAME, 'results')))
            except TimeoutException:
                CustomLogging.log_to_file('每经网搜索结果页错误', LogType.ERROR)
                break

            try:
                result_articles = self.driver.find_elements_by_xpath(
                    '//div[@class="results"]/div[@class="rb" or @class="vrwrap"]'
                )

                for each_article in result_articles[1:]:
                    item = Entity()

                    item.publish_date = format_sougou_date(
                        each_article.find_element_by_xpath(
                            './/cite[contains(@id,"cacheresult_info_")]').text)
                    if each_article.get_attribute('class') == 'rb':
                        try:
                            article_cont = each_article.find_element_by_xpath(
                                './/div[contains(@id, "cacheresult_summary_")]'
                            )
                        except NoSuchElementException:
                            continue
                        short_description = article_cont.text
                        item.short_description = re.sub(
                            re.compile(
                                r'[1-9]\d{3}年(0?[1-9]|1[0-2])月(0?[1-9]|[1-2][0-9]|3[0-1])日\s+-'
                            ), '', short_description)
                        item.title = each_article.find_element_by_xpath(
                            './/a[contains(@id, "uigs_")]').text
                        if self.keyword not in item.short_description and self.keyword not in item.title:
                            continue

                        if item.publish_date == '':
                            try:
                                publish_date = each_article.find_element_by_xpath(
                                    './/div[contains(@id, "cacheresult_summary_")]/span'
                                ).text
                                item.publish_date = publish_date.replace(
                                    '-', '').replace('年', '-').replace(
                                        '月', '-').replace('日', '')
                            except NoSuchElementException:
                                continue
                    else:
                        item.title = each_article.find_element_by_class_name(
                            'vrTitle').text
                        try:
                            short_description = each_article.find_element_by_class_name(
                                'str_info').text
                        except NoSuchElementException:
                            continue
                        item.short_description = re.sub(
                            re.compile(
                                r'[1-9]\d{3}年(0?[1-9]|1[0-2])月(0?[1-9]|[1-2][0-9]|3[0-1])日\s+-'
                            ), '', short_description)

                        if self.keyword not in item.short_description and self.keyword not in item.title:
                            continue

                        if item.publish_date == '':
                            try:
                                publish_date = each_article.find_element_by_class_name(
                                    'gray-color').text
                                item.publish_date = publish_date.replace(
                                    '-', '').replace('年', '-').replace(
                                        '月', '-').replace('日', '')
                            except NoSuchElementException:
                                continue

                    if not in_date_range(
                            conv_pub_date(item.publish_date, 'sougou'),
                            self.year_range):
                        exit_flag = 1
                        break

                    if item.title in self.titles:
                        continue
                    else:
                        self.titles.append(item.title)

                    url = each_article.find_element_by_xpath(
                        './/a[contains(@id, "sogou_snapshot_")]'
                    ).get_attribute('href')
                    item.url = urllib.parse.unquote(
                        url.split('&')[1].replace('url=', ''))

                    threading.Thread(target=super().download_and_save_item,
                                     args=(item, )).start()
                if exit_flag == 1:
                    break

            except NoSuchElementException:
                CustomLogging.log_to_file('没有搜索结果', LogType.INFO)
                break

            try:
                next_page = self.driver.find_element_by_id('sogou_next')
                # self.driver.get(next_page.get_attribute('href'))
                next_page.click()
                time.sleep(2)
            except TimeoutException:
                self.driver.execute_script('window.stop();')
            except NoSuchElementException:
                break

        return search_results
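
Extracting the real target from Sogou's snapshot link by splitting on '&' assumes 'url=' is always the second query field. Parsing the query string is sturdier; a sketch (the 'url' parameter name is taken from the original split):

import urllib.parse

def real_url(snapshot_href):
    query = urllib.parse.urlparse(snapshot_href).query
    return urllib.parse.parse_qs(query).get('url', [''])[0]  # values arrive already unquoted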
Example #28
    def crawl_search_results(self):
        search_results = []
        self.driver.switch_to.window(self.driver.window_handles[-1])
        self.driver.maximize_window()

        # Hexun articles
        try:
            wz_btn = self.driver.find_element_by_xpath(
                '//div[@class="searchRe-top-b"]/a[contains(text(), "文章")]')
            wz_btn.click()

            while True:
                try:
                    result_articles = self.driver.find_elements_by_xpath(
                        '//table[@class="stocktab mt6"]//tr')

                    for each_article in result_articles[1:]:
                        item = Entity()
                        item.publish_date = each_article.find_elements_by_tag_name(
                            'td')[3].text

                        if not in_date_range(
                                conv_pub_date(item.publish_date, 'hexun'),
                                self.year_range):
                            continue
                        item.title = each_article.find_elements_by_tag_name(
                            'td')[1].text
                        item.short_description = each_article.find_elements_by_tag_name(
                            'td')[2].text
                        if self.keyword not in item.short_description and self.keyword not in item.title:
                            continue

                        item.url = each_article.find_elements_by_tag_name(
                            'td')[1].find_element_by_tag_name(
                                'a').get_attribute('href')
                        threading.Thread(target=self.download_and_save_item,
                                         args=(item, )).start()
                except NoSuchElementException:
                    break
                try:
                    next_page = self.driver.find_element_by_xpath(
                        '//div[@class="hx_paging"]//a[contains(text(), "下一页")]'
                    )
                    next_page_class = self.driver.find_element_by_xpath(
                        '//div[@class="hx_paging"]//a[contains(text(), "下一页")]/..'
                    ).get_attribute('class')

                    if next_page_class == 'no_next':
                        break

                    next_page.click()
                    time.sleep(2)
                except Exception:
                    break
        except NoSuchElementException:
            pass

        # Hexun news
        news_btn = self.driver.find_element_by_xpath(
            '//div[@id="headLayer"]/a[contains(text(), "新闻")]')
        news_btn.click()
        time.sleep(1)
        while True:
            try:
                self.wait.until(
                    ec.presence_of_element_located(
                        (By.CLASS_NAME, 'searchResult')))
            except TimeoutException:
                CustomLogging.log_to_file('和讯财经新闻搜索结果加载失败', LogType.ERROR)
                break

            try:
                result_articles = self.driver.find_elements_by_class_name(
                    'newslist-a')

                for each_article in result_articles:
                    item = Entity()
                    item.publish_date = \
                        each_article.find_element_by_class_name('news-l-t').find_elements_by_tag_name('span')[-1].text
                    if not in_date_range(
                            conv_pub_date(item.publish_date, 'hexun_news'),
                            self.year_range):
                        continue

                    item.title = each_article.find_element_by_xpath(
                        './/span[@class="breakdiv"]/a').text
                    item.short_description = each_article.find_element_by_class_name(
                        'news-l-c').text
                    if self.keyword not in item.short_description and self.keyword not in item.title:
                        continue

                    item.url = each_article.find_element_by_xpath(
                        './/span[@class="breakdiv"]/a').get_attribute('href')
                    threading.Thread(target=self.download_and_save_item,
                                     args=(item, )).start()

            except NoSuchElementException:
                break

            try:
                next_page = self.driver.find_element_by_xpath(
                    '//div[@class="hx_paging"]//a[contains(text(), "下一页")]')
                next_page_class = self.driver.find_element_by_xpath(
                    '//div[@class="hx_paging"]//a[contains(text(), "下一页")]/..'
                ).get_attribute('class')

                if next_page_class == 'no_next':
                    break

                next_page.click()
                time.sleep(2)
            except Exception:
                break

        # Hexun blogs
        news_btn = self.driver.find_element_by_xpath(
            '//div[@class="search-rs-list-ty"]/a[contains(text(), "博客")]')
        news_btn.click()
        self.driver.find_element_by_id('s1_t').click()
        exit_flag = 0
        while True:
            try:
                self.wait.until(
                    ec.presence_of_element_located(
                        (By.CLASS_NAME, 'searchResult')))
            except TimeoutException:
                CustomLogging.log_to_file('和讯财经博客搜索结果加载失败', LogType.ERROR)
                break

            try:
                result_articles = self.driver.find_elements_by_class_name(
                    'newslist-a')

                for each_article in result_articles:
                    item = Entity()
                    item.publish_date = \
                        each_article.find_element_by_class_name('news-l-t').find_elements_by_tag_name('span')[
                            -1].text
                    if not in_date_range(
                            conv_pub_date(item.publish_date, 'hexun_blog'),
                            self.year_range):
                        exit_flag = 1
                        break

                    item.title = each_article.find_element_by_xpath(
                        './/span[@class="breakdiv"]/a').text
                    item.short_description = each_article.find_element_by_class_name(
                        'news-l-c').text
                    if self.keyword not in item.short_description and self.keyword not in item.title:
                        continue

                    item.url = each_article.find_element_by_xpath(
                        './/span[@class="breakdiv"]/a').get_attribute('href')
                    threading.Thread(target=self.download_and_save_item,
                                     args=(item, )).start()

            except NoSuchElementException:
                break

            if exit_flag == 1:
                break

            try:
                next_page = self.driver.find_element_by_xpath(
                    '//div[@class="hx_paging"]//a[contains(text(), "下一页")]')
                next_page_class = self.driver.find_element_by_xpath(
                    '//div[@class="hx_paging"]//a[contains(text(), "下一页")]/..'
                ).get_attribute('class')

                if next_page_class == 'no_next':
                    break

                next_page.click()
                time.sleep(2)
            except Exception:
                break

        return search_results
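
The next-page block above is repeated verbatim for the article, news and blog tabs. A small helper would keep a single copy; a sketch against the same hx_paging markup:

import time

from selenium.common.exceptions import NoSuchElementException

NEXT_XPATH = '//div[@class="hx_paging"]//a[contains(text(), "下一页")]'

def goto_next_page(driver, delay=2):
    # click Hexun's next-page link; False means we are on the last page
    try:
        next_page = driver.find_element_by_xpath(NEXT_XPATH)
        if driver.find_element_by_xpath(NEXT_XPATH + '/..') \
                 .get_attribute('class') == 'no_next':
            return False
        next_page.click()
        time.sleep(delay)
        return True
    except NoSuchElementException:
        return False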
Example #29
    def crawl_search_results(self):
        search_results = []
        self.driver.switch_to.window(self.driver.window_handles[-1])
        self.driver.maximize_window()

        while True:
            try:
                self.wait.until(
                    ec.presence_of_element_located((By.ID, 'container')))
            except TimeoutException:
                CustomLogging.log_to_file('百度搜索结果页错误', LogType.ERROR)
                break

            try:
                result_articles = self.driver.find_elements_by_xpath(
                    '//div[@class="result c-container "]')

                for each_article in result_articles:
                    item = Entity()
                    try:
                        item.publish_date = each_article.find_element_by_xpath(
                            './/span[contains(@class,"newTimeFactor_before_abs")]'
                        ).text.replace('-', '')
                    except NoSuchElementException:
                        continue
                    try:
                        article_cont = each_article.find_element_by_class_name(
                            'c-abstract')
                    except NoSuchElementException:
                        continue
                    short_description = article_cont.text
                    item.short_description = re.sub(
                        re.compile(
                            r'[1-9]\d{3}年(0?[1-9]|1[0-2])月(0?[1-9]|[1-2][0-9]|3[0-1])日\s+-'
                        ), '', short_description)
                    item.title = each_article.find_element_by_class_name(
                        't').text
                    if self.keyword not in item.short_description and self.keyword not in item.title:
                        continue

                    if not in_date_range(
                            conv_pub_date(item.publish_date, 'baidu'),
                            self.year_range):
                        continue

                    if item.title in self.titles:
                        continue
                    else:
                        self.titles.append(item.title)

                    item.url = each_article.find_element_by_xpath(
                        './/h3[@class="t"]//a').get_attribute('href')

                    threading.Thread(target=super().download_and_save_item,
                                     args=(item, )).start()

            except NoSuchElementException:
                CustomLogging.log_to_file('没有搜索结果', LogType.INFO)
                break

            try:
                next_page = self.driver.find_element_by_id(
                    'page').find_element_by_class_name('n')
                # self.driver.get(next_page.get_attribute('href'))
                next_page.click()
                time.sleep(2)
            except TimeoutException:
                self.driver.execute_script('window.stop();')
            except NoSuchElementException:
                break

        return search_results
Example #30
    def crawl_search_results(self):
        search_results = []
        self.driver.switch_to.window(self.driver.window_handles[-1])
        self.driver.maximize_window()
        try:
            self.wait.until(ec.presence_of_element_located(
                (By.XPATH, '//div[@id="search_result"]//a[contains(text(), "按更新时间排序")]')))
            self.driver.find_element_by_xpath('//div[@id="search_result"]//a[contains(text(), "按更新时间排序")]').click()
        except (TimeoutException, NoSuchElementException):
            CustomLogging.log_to_file('证券时报搜索结果页打开失败', LogType.ERROR)

        exit_flag = 0
        page_num = 1
        while True:
            # the site only serves the first 100 result pages
            if page_num == 100:
                break

            try:
                self.wait.until(ec.presence_of_element_located((By.ID, 'search_list')))
            except TimeoutException:
                CustomLogging.log_to_file('中国证券网搜索结果页错误', LogType.ERROR)
                break

            try:
                result_articles = self.driver.find_elements_by_xpath('//div[@id="search_list"]//dl')

                for each_article in result_articles:
                    item = Entity()
                    item.publish_date = each_article.find_elements_by_tag_name('dd')[1].find_element_by_tag_name(
                        'span').text
                    # exit if the article falls outside the configured year range
                    if not in_date_range(conv_pub_date(item.publish_date, 'STCN'), self.year_range):
                        exit_flag = 1
                        # break out of the for loop
                        break
                    item.short_description = each_article.find_elements_by_tag_name('dd')[0].text
                    item.title = each_article.find_element_by_tag_name('a').text

                    # keyword filter: skip this entry if the keyword appears in neither the title nor the summary
                    if self.keyword not in item.short_description and self.keyword not in item.title:
                        continue

                    if item.title in self.titles:
                        continue
                    else:
                        self.titles.append(item.title)

                    item.url = each_article.find_element_by_tag_name('a').get_attribute('href')
                    threading.Thread(target=self.download_and_save_item, args=(item,)).start()

                # break out of the while loop
                if exit_flag == 1:
                    break
            except TimeoutException:
                CustomLogging.log_to_file('中国证券网搜索结果页加载错误', LogType.ERROR)
            try:
                next_page = self.driver.find_element_by_class_name('next')
                next_page.click()
                page_num += 1
            except NoSuchElementException:
                # print('已经是最后一页')
                break

        return search_results