Example #1
def catalog_url(url='http://www.meitun.com/'):
    # the catalog page is rendered via AJAX, so use PhantomJS
    driver = PhantomJS()
    driver.get(url)
    driver.maximize_window()
    mov_ele = driver.find_element_by_css_selector('.nav>ul>li:nth-child(1)')
    # move the mouse to the lazily rendered menu element and perform the action
    ActionChains(driver).move_to_element(mov_ele).perform()
    time.sleep(3)
    response = driver.page_source
    driver.quit()
    # parse the page source with pyquery, which is faster
    d = pq(response)
    return map(lambda x: 'http:' + pq(x).attr('href'), d.find('.cg-pdts a'))
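A minimal usage sketch for the snippet above, with the imports the original module presumably relied on (they are not shown in the example); note that in Python 3 map() returns a lazy iterator, so the result is wrapped in list():

# Assumed imports for catalog_url (not part of the original example)
import time
from pyquery import PyQuery as pq
from selenium.webdriver import PhantomJS
from selenium.webdriver.common.action_chains import ActionChains

if __name__ == '__main__':
    # map() is lazy in Python 3, so materialize the category links into a list
    category_links = list(catalog_url('http://www.meitun.com/'))
    for link in category_links:
        print(link)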
Example #2
 def on_start_again(self, url):
     driver = PhantomJS()
     driver.get(url)
     time.sleep(2)
     driver.maximize_window()
     t = driver.find_element_by_css_selector('.page-txt').text
     res_t = []
     if t:
         t = int(t.split('/')[1][:-1]) - 1  # get the page count
         # the number of page turns is the page count minus 1
         while t:
             t -= 1
             move_ele = driver.find_element_by_css_selector('#next')
             ActionChains(driver).move_to_element(move_ele).click().perform()
             time.sleep(1)
             res_t.append(driver.page_source)
     driver.quit()
     for item in res_t:
         self.step_first(item)
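The method above relies on fixed time.sleep() delays between clicks; a more robust variant (a sketch under assumed imports, not part of the original example) waits explicitly for the pagination control before clicking:

from selenium.webdriver import PhantomJS
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver = PhantomJS()
driver.get('http://example.com/listing')  # placeholder URL
# wait until the "#next" button is clickable instead of sleeping a fixed interval
next_button = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.CSS_SELECTOR, '#next')))
ActionChains(driver).move_to_element(next_button).click().perform()
driver.quit()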
Example #3
class CamaraCGCrawler(object):
    """ Camara CG Ementa Crawler """
    def __init__(self, starting_year):
        self.base_url = "http://187.115.174.90:8080/ScanLexWeb"
        self.starting_year = starting_year
        self.browser = None

    @staticmethod
    def get_ementa_id(published_date, ementa_type, ementa_doc_number,
                      ementa_situation):
        """ Return the Ementa Unique Id """
        return "%s#%s#%s#%s" % (datetime.strftime(
            published_date,
            "%Y-%m-%d"), ementa_type, ementa_doc_number, ementa_situation)

    def get_all_ementas_summary(self):
        """ Yield the next ementa information row """

        browser_table = self.browser.find_element_by_id(
            "frmMenu:tabEmentas_data")
        bs_ementa_table = BeautifulSoup(
            browser_table.get_attribute("innerHTML"))

        for row in bs_ementa_table.find_all("tr"):
            cols = row.find_all("td")
            if len(cols) == 6:
                published_date = datetime.strptime(
                    cols[0].span.text.encode("utf-8"), "%d/%m/%Y")
                doc_number = int(cols[1].span.text.encode("utf-8"))
                title = cols[2].span.text.encode("utf-8")
                ementa_type = cols[3].span.text.encode("utf-8")
                ementa_situation = cols[4].span.text.encode("utf-8")
                details_js = cols[5].a['onclick'].encode("utf-8")

                if published_date > datetime.now():
                    continue

                yield published_date, doc_number, title, ementa_type, ementa_situation, details_js

    def get_ementa_details(self, ementa_details_js):
        """ Crawl the second ementa page """

        # Waiting...
        _ = WebDriverWait(self.browser, 30).until(
            EC.visibility_of_element_located(
                (By.ID, "frmfuncao:j_idt13_content")))
        _ = WebDriverWait(self.browser, 30).until(
            EC.visibility_of_element_located(
                (By.ID, "frmfuncao:tabProponentes")))

        # Get Ementail Details
        bs_ementa_details = BeautifulSoup(self.browser \
            .find_element_by_id("frmfuncao:j_idt13_content").get_attribute("innerHTML"))

        rows = bs_ementa_details.find_all("tr")

        source = rows[3].td.text
        main_theme = rows[7].td.text
        sys_enter_date = datetime.strptime(rows[9].td.text, "%d/%m/%Y")
        approval_date = datetime.strptime(rows[11].td.text, "%d/%m/%Y")
        process_number = int(rows[15].td.text or "-1")
        autograph_number = int(rows[19].td.text or "-1")
        process_year = int(rows[21].td.text or "-1")
        has_image = rows[23].td.text == "Sim"

        # Get Proponent names
        bs_proponent = BeautifulSoup(
            self.browser.find_element_by_id(
                "frmfuncao:tabProponentes").get_attribute("innerHTML"))

        proponents = ",".join(
            [col.text for col in bs_proponent.find_all("td")])

        return source, proponents, main_theme, sys_enter_date, approval_date, process_number, \
            autograph_number, process_year, has_image

    def next_ementa(self, select_curs):
        """ Iterate in the years onwards and collect all the ementas """

        try:
            LOGGER.info("Opening Browser")
            self.browser = PhantomJS()

            LOGGER.info("GET [%s]", self.base_url)
            self.browser.maximize_window()

            cur_year = int(datetime.now().year)

            # Define the initial collection year
            select_curs.execute(
                "SELECT EXTRACT (YEAR FROM MAX(published_date)) FROM ementas;")
            last_exec_year = select_curs.fetchone()
            if last_exec_year:
                collection_year = max(self.starting_year, last_exec_year[0])
            else:
                collection_year = self.starting_year

            all_proponents = [
                "ANDERSON MAIA", "Afonso Alexandre Régis",
                "Alcides Cavalcante", "Alcindor Villarim", "Aldo Cabral",
                "Alexandre do Sindicato", "Antonio Pereira",
                "Antônio Alves Pimentel Filho", "Aragão Júnior",
                "Bruno Cunha Lima Branco", "Bruno Gaudêncio", "Buchada",
                "Cassiano Pascoal", "Cozete Babosa",
                "Cássio Murilo Galdino de Araujo", "Daniella Ribeiro",
                "Dr. Nunes", "Executivo", "Fabrinni Brito",
                "Fernando carvalho", "Francisco Dantas Lira",
                "Galego do Leite", "Inacio Falcao", "Ivan Batista",
                "Ivonete Ludgerio", "Joao Dantas", "Josimar Henrique da Silva",
                "José Marcos Raia ", "José Ribamar", "João Dantas",
                "Jóia Germano", "Laelson Patricio", "Lafite",
                "Lindaci Medeiros Nápolis", "Lourdes Costa", "Lula Cabral",
                "Marcos Marinho", "Maria Lopes Barbosa", "Marinaldo Cardoso",
                "Metuselá Agra", "Miguel Rodrigues da Silva",
                "Miguel da Construção", "Napoleão Maracajá",
                "Nelson Gomes Filho", "Olimpio Oliveira", "Orlandino Farias",
                "Paulo Muniz", "Paulo de Tarso", "Peron Ribeiro Japiassú",
                "Renato Feliciano", "Rodolfo Rodrigues",
                "Rodrigo Ramos Victor", "Romero Rodrigues", "Rostand Paraíba",
                "Rômulo Gouveia", "Saulo Germano", "Saulo Noronha", "Tia Mila",
                "Tovar Correia Lima", "Vaninho Aragão",
                "Veneziano Vital do rego", "Walter Brito Neto", "Todos"
            ]

            while collection_year <= cur_year:

                for i_prop in range(len(all_proponents)):

                    ementa_prop = all_proponents[i_prop].decode("utf-8")

                    self.browser.get(self.base_url)

                    # Waiting...
                    WebDriverWait(self.browser, 30).until(
                        EC.element_to_be_clickable((By.ID, "frmMenu:button1")))

                    LOGGER.info("Collecting Ementas from [%d][%s - %d/%d]",
                                collection_year, ementa_prop, i_prop + 1,
                                len(all_proponents))

                    # Set Year
                    year_field = self.browser.find_element_by_id("frmMenu:ano")
                    year_field.send_keys(collection_year)

                    # Set Proponent
                    proponent_field = self.browser.find_element_by_id(
                        "frmMenu:autoridade")
                    proponent_field.send_keys(ementa_prop)

                    # Submit the form
                    self.browser.find_element_by_id("frmMenu:button1").click()

                    # Waiting...
                    # _ = WebDriverWait(self.browser, 60).until(EC.visibility_of_element_located((By.ID, "frmMenu:tabEmentas_data")))
                    time.sleep(3)

                    for published_date, document_number, title, ementa_type, ementa_situation, \
                            ementa_details_js in self.get_all_ementas_summary():
                        ementa_id = self.get_ementa_id(published_date,
                                                       ementa_type,
                                                       document_number,
                                                       ementa_situation)

                        select_curs.execute("""
                            SELECT ementa_id
                            FROM ementas
                            WHERE ementa_id = '%s';
                            """ % ementa_id)

                        if not select_curs.fetchone():
                            # Run the details script
                            self.browser.execute_script(ementa_details_js)
                            ementa_source, proponents, main_theme, sys_enter_date, approval_date, \
                                process_number, autograph_number, process_year, has_image = self.get_ementa_details(ementa_details_js)

                            # Come back to the table page
                            self.browser.back()

                            # Waiting...
                            _ = WebDriverWait(self.browser, 60).until(
                                EC.visibility_of_element_located(
                                    (By.ID, "frmMenu:tabEmentas_data")))

                            yield ementa_id, published_date, ementa_type, document_number, title, \
                                ementa_source, proponents, ementa_situation, main_theme, sys_enter_date, \
                                approval_date, process_number, autograph_number, process_year, has_image

                LOGGER.info("DONE [%d]", collection_year)

                self.browser.back()

                collection_year += 1

        finally:
            if self.browser:
                self.browser.quit()
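A minimal sketch of how the crawler might be driven; the psycopg2 connection, database name, and user are assumptions suggested by the PostgreSQL-style EXTRACT query in next_ementa, not part of the original code:

import psycopg2  # assumed driver: the EXTRACT(...) query suggests PostgreSQL

conn = psycopg2.connect(dbname='ementas_db', user='crawler')  # hypothetical connection settings
crawler = CamaraCGCrawler(starting_year=2010)

with conn.cursor() as select_curs:
    for ementa_row in crawler.next_ementa(select_curs):
        # each yielded row starts with (ementa_id, published_date, ementa_type, ...)
        print(ementa_row[0], ementa_row[1])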
Example #4
class CNStock(SentimentCrawler):
    def __init__(self):
        super().__init__(init=False)
        self.driver = PhantomJS()
        self.driver.maximize_window()
        self.wait = WebDriverWait(self.driver, 15)
        self.url = 'http://www.cnstock.com/'
        self.name = '中国证券网'

    def crawl_main_page(self, keyword):
        self.driver.set_page_load_timeout(10)
        try:
            self.driver.get(self.url)
        except TimeoutException:
            self.driver.execute_script('window.stop();')

        try:
            self.wait.until(
                ec.presence_of_element_located((By.ID, 'nav_keywords')))
        except TimeoutException:
            CustomLogging.log_to_file('中国证券网打开失败', LogType.ERROR)

        self.driver.find_element_by_id('nav_keywords').clear()
        self.driver.find_element_by_id('nav_keywords').send_keys(keyword +
                                                                 Keys.ENTER)

        return self.crawl_search_results()

    def crawl_search_results(self):
        search_results = []
        self.driver.switch_to.window(self.driver.window_handles[-1])
        self.driver.maximize_window()

        exit_flag = 0
        while True:
            try:
                self.wait.until(
                    ec.presence_of_element_located(
                        (By.CLASS_NAME, 'result-cont')))
            except TimeoutException:
                CustomLogging.log_to_file('中国证券网搜索结果页错误', LogType.ERROR)
                break

            try:
                result_articles = self.driver.find_elements_by_class_name(
                    'result-article')

                for each_article in result_articles:
                    item = Entity()

                    publish_date = each_article.find_element_by_class_name(
                        'g').text
                    item.publish_date = re.search(
                        re.compile(
                            '[1-9]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[1-2][0-9]|3[0-1])\s+(20|21|22|23|[0-1]\d):[0-5]\d'
                        ), publish_date).group()

                    if not in_date_range(
                            conv_pub_date(item.publish_date, 'cnstock'),
                            self.year_range):
                        exit_flag = 1
                        # break out of the for loop
                        break
                    item.short_description = each_article.find_element_by_class_name(
                        'des').text
                    item.title = each_article.find_element_by_tag_name(
                        'a').text
                    if self.keyword not in item.short_description and self.keyword not in item.title:
                        continue

                    if item.title in self.titles:
                        continue
                    else:
                        self.titles.append(item.title)

                    item.url = each_article.find_element_by_tag_name(
                        'a').get_attribute('href')
                    threading.Thread(target=super().download_and_save_item,
                                     args=(item, )).start()

                if exit_flag == 1:
                    break
            except NoSuchElementException:
                CustomLogging.log_to_file('没有搜索结果', LogType.INFO)
                break

            try:
                next_page = self.driver.find_element_by_xpath(
                    '//div[@class="pagination pagination-centered"]//a[contains(text(), "下一页")]'
                )
                self.driver.get(next_page.get_attribute('href'))
                # next_page.click()
            except NoSuchElementException:
                break

        return search_results

    def parse_html(self, url, html):
        bs = BeautifulSoup(html, 'lxml')
        try:
            full_content = bs.find('div', attrs={'id': 'qmt_content_div'}).text
            return full_content
        except Exception:
            CustomLogging.log_to_file('页面解析错误: {0}|{1}'.format(self.name, url),
                                      LogType.ERROR)
            pass
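For reference, a standalone check of the publish-date pattern used in crawl_search_results; the sample string is purely illustrative:

import re

# same timestamp pattern as in crawl_search_results, written as a raw string
pattern = re.compile(
    r'[1-9]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[1-2][0-9]|3[0-1])\s+(20|21|22|23|[0-1]\d):[0-5]\d')
sample = '2018-08-07 15:48'  # hypothetical publish-date text from a result card
print(pattern.search(sample).group())  # -> '2018-08-07 15:48'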
Example #5
"""
Created on Tue Aug  7 15:48:28 2018
@author: 肖
Original URL: http://ac.qq.com/ComicView/index/id/521825/cid/1
The page refuses to show its source code directly, but prefixing the URL with view-source:
exposes it, i.e. view-source:http://ac.qq.com/ComicView/index/id/521825/cid/1
"""

from selenium.webdriver import PhantomJS, DesiredCapabilities
import time
import re

header = DesiredCapabilities.CHROME.copy()  # DesiredCapabilities can make PhantomJS masquerade as Chrome
web = PhantomJS(desired_capabilities=header,
                executable_path='F:/phantomjs-2.1.1-windows/bin/phantomjs'
                )  # the PhantomJS executable path must be set, otherwise the driver cannot run
web.maximize_window()  # maximize the browser window
web.get('http://ac.qq.com/ComicView/index/id/521825/cid/1')  # load the page
web.get_screenshot_as_file('./abc.png')  # take a screenshot and save it as abc.png

for page in range(1, 30):  # scroll down one screen per iteration with window.scrollTo(0, ...)
    web.execute_script('window.scrollTo(0,{})'.format(
        1080 * page))  # 1080*1 is the first screen, 1080*2 the second, and so on
    time.sleep(1)
web.get_screenshot_as_file('./abc.png')  # capture the last page

pat = 'https://manhua.qpic.cn/vertical/0/(.*?)"'  # regex to pull the image addresses
ls = re.compile(pat, re.S).findall(web.page_source)  # web.page_source is the rendered page source

import urllib.request as r
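The original snippet breaks off at the urllib.request import; a hedged sketch of the likely next step, downloading each matched image fragment (the output file names are illustrative):

# Hypothetical continuation: fetch every matched image fragment and save it locally
for index, fragment in enumerate(ls):
    image_url = 'https://manhua.qpic.cn/vertical/0/' + fragment
    r.urlretrieve(image_url, './page_{}.jpg'.format(index))  # illustrative file name
web.quit()  # close the PhantomJS driver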