Example #1
    def start_requests(self):
        response = parsel.Selector(
            requests.post(self.search_url, {
                'pageNo': '1',
                'keyword': 'PP粒进出口数据简析'
            }).text)
        items = response.xpath("//ul[@class='contentList']/li").getall()

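        # Retry captcha recognition until the verification service responds with 'true'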
        temp = code_verify(self.img_url, self.code_verify_url)
        while temp.text != 'true':
            self.count += 1
            print("第{}次识别出错。".format(self.count))
            temp = code_verify(self.img_url, self.code_verify_url)

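        # Request each article through the login endpoint, passing its URL as the post-login redirect target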
        for item in items:
            time.sleep(2)
            url = parsel.Selector(item).xpath("//h2//a/@href").get()
            date = parsel.Selector(item).xpath(
                "//span[@class='date']//text()").get()
            data = {
                'username': self.username,
                'password': self.password,
                'target': url,
                'errorPaw': self.errorPaw
            }
            yield FormRequest(url=self.login_succeed_url,
                              formdata=data,
                              callback=self.parse,
                              meta={'date': date})
Example #2
    def start_requests(self):
        response = parsel.Selector(
            requests.post(self.search_url, {
                'pageNo': '1',
                'keyword': '聚丙烯粉料主要生产企业开工分析'
            }).text)
        url = response.xpath("//ul[@class='contentList']/li[1]//a/@href").get()
        date = response.xpath(
            "//ul[@class='contentList']/li[1]//span[@class='date']/text()"
        ).get()
        # Today's date is rendered in red, so its DOM path differs
        if date is None:
            date = response.xpath(
                "//ul[@class='contentList']/li[1]//span[@class='date']/font/text()"
            ).get()

        temp = code_verify(self.img_url, self.code_verify_url)
        while temp.text != 'true':
            self.count += 1
            print("第{}次识别出错。".format(self.count))
            temp = code_verify(self.img_url, self.code_verify_url)

        data = {
            'username': self.username,
            'password': self.password,
            'target': url,
            'errorPaw': self.errorPaw
        }

        yield FormRequest(url=self.login_succeed_url,
                          formdata=data,
                          callback=self.parse,
                          meta={'date': date})
Example #3
 def start_requests(self):
     response = parsel.Selector(
         requests.post(self.search_url, {'pageNo': '12', 'keyword': '国内丙烯腈厂家一周产量统计'}).text)
     lists = response.css('div.zixun.contentactive > ul.contentList > li')
     temp = code_verify(self.img_url, self.code_verify_url)
     while temp.text != 'true':
         self.count += 1
         print("第{}次识别出错。".format(self.count))
         temp = code_verify(self.img_url, self.code_verify_url)
     for li in lists:
         time.sleep(5)
         url = li.css('h2 > a::attr(href)').get()
         date = li.css('span.date::text').get()
         data = {
             'username': self.username, 'password': self.password, 'target': url, 'errorPaw': self.errorPaw
         }
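         # Present the search page as the Referer so the request matches in-site navigation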
         yield FormRequest(
             url=self.login_succeed_url,
             formdata=data,
             callback=self.parse,
             meta={'date': date},
             headers={
                 'Referer': self.search_url
             }
         )
Example #4
    def before_parse(self, response):
        lists = response.css('div.zixun.contentactive > ul.contentList > li')

        temp = code_verify(self.img_url, self.code_verify_url)
        while temp.text != 'true':
            self.count += 1
            print("第{}次识别出错。".format(self.count))
            temp = code_verify(self.img_url, self.code_verify_url)

        for li in lists:
            time.sleep(5)
            url = li.css('h2 > a::attr(href)').get()
            date = li.css('span.date::text').get()
            # Today's date is rendered in red, so its DOM path differs
            if date is None:
                date = response.xpath(
                    "//ul[@class='contentList']/li[1]//span[@class='date']/font/text()"
                ).get()
            data = {
                'username': self.username,
                'password': self.password,
                'target': url,
                'errorPaw': self.errorPaw
            }
            yield FormRequest(url=self.login_succeed_url,
                              formdata=data,
                              callback=self.parse,
                              meta={'date': date},
                              headers={'Referer': self.search_url})
Example #5
 def getCode(self, response):
     # # This approach had an extremely low recognition accuracy
     # open('./code.png', 'wb').write(response.body)
     # # urlretrieve(self.url_code, './code.png')
     # image = Image.open('./code.png')
     # content = pytesseract.image_to_string(image)
     # # Manual input test
     # # image = Image.open(BytesIO(response.body))
     # # image.show()
     # # content = input("请输入验证码:")
     # return [
     #     Request(
     #         url=self.url_check_code + "?code=" + content,
     #         callback=self.checkImgCode,
     #         dont_filter=True,
     #         headers=self.get_headers(host=self.host_login_oilchem)
     #     )
     # ]
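     # Delegate recognition to the external verification service instead, retrying until it succeeds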
     temp = code_verify(self.url_code, self.code_verify_url)
     while temp.text != 'true':
         self.count += 1
         print("第{}次识别出错。".format(self.count))
         time.sleep(self.sleep_time)
         temp = code_verify(self.url_code, self.code_verify_url)
     form_data = {
         "username": "******",
         "password": "******"
     }
     return [
         FormRequest(url=self.url_login,
                     callback=self.afterLogin,
                     formdata=form_data,
                     dont_filter=True,
                     headers=self.get_headers(self.host_login_oilchem))
     ]
Example #6
    def start_requests(self):

        # Get the maximum number of result pages
        res = requests.post(self.search_url, {
            'pageNo': '1',
            'keyword': '[PP粒]:中油华南PP'
        }).text
        maxPageNo = re.findall(r"第1页/共 (\d+)页", res)[0]

        temp = code_verify(self.img_url, self.code_verify_url)
        while temp.text != 'true':
            self.count += 1
            print("第{}次识别出错。".format(self.count))
            temp = code_verify(self.img_url, self.code_verify_url)

        # Crawl each results page
        for pageNo in range(1, int(maxPageNo) + 1):
            response = parsel.Selector(
                requests.post(self.search_url, {
                    'pageNo': str(pageNo),
                    'keyword': '[PP粒]:中油华南PP'
                }).text)
            items = response.xpath("//ul[@class='contentList']/li").getall()

            for item in items:
                time.sleep(2)
                url = parsel.Selector(item).xpath("//h2//a/@href").get()
                date = parsel.Selector(item).xpath(
                    "//span[@class='date']//text()").get()
                data = {
                    'username': self.username,
                    'password': self.password,
                    'target': url,
                    'errorPaw': self.errorPaw
                }
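                # Only queue items that carry a publication date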
                if date:
                    yield FormRequest(url=self.login_succeed_url,
                                      formdata=data,
                                      callback=self.parse,
                                      meta={'date': date})
Example #7
    def before_parse(self, response):
        lists = response.css('div.zixun.contentactive > ul.contentList > li')
        temp = code_verify(self.img_url, self.code_verify_url)
        while temp.text != 'true':
            self.count += 1
            print("第{}次识别出错。".format(self.count))
            temp = code_verify(self.img_url, self.code_verify_url)

        for li in lists:
            time.sleep(5)
            url = li.css('h2 > a::attr(href)').get()
            date = li.css('span.date::text').get()
            # Today's date is rendered in red, so its DOM path differs
            if date is None:
                date = response.xpath("//ul[@class='contentList']/li[1]//span[@class='date']/font/text()").get()
            data = {
                'username': self.username, 'password': self.password, 'target': url, 'errorPaw': self.errorPaw
            }
            yield FormRequest(
                url=self.login_succeed_url,
                formdata=data,
                callback=self.parse,
                meta={'date': date},
                headers={
                    'Referer': self.search_url
                }
            )

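        # Pause before moving on to the next results page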
        time.sleep(120)
        next_url = response.css('#simpledatatable_paginate > ul > li:nth-last-child(2) > a::attr(href)').get()
        match = re.search(r"(?<=goPage)\((\d+)\)", next_url or '')
        if match is not None:
            next_page = match.group(1)
            yield FormRequest(
                url=self.search_url,
                formdata={
                    'pageNo': next_page,
                    'keyword': '库存早报'
                },
                callback=self.before_parse
            )
Example #8
 def getCode(self, response):
     temp = code_verify(self.url_code, self.code_verify_url)
     count = 0
     while temp.text != 'true':
         count += 1
         print("第{}次识别出错。".format(count))
         time.sleep(0.5)
         temp = code_verify(self.url_code, self.code_verify_url)
     form_data = {
         "username": "******",
         "password": "******",
         'errorPaw': "deya1589",
     }
     return [
         FormRequest(
             url=self.url_login,
             callback=self.afterLogin,
             formdata=form_data,
             dont_filter=True,
             headers=self.get_headers(self.host_login_oilchem)
         )
     ]
Example #9
    def start_requests(self):
        try:
            res = parsel.Selector(
                requests.post(
                    "https://search.oilchem.net/solrSearch/select.htm",
                    headers={
                        'User-Agent':
                        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36'
                    },
                    data={
                        'pageNo': 1,
                        'keyword': '全国PP装置生产情况汇总'
                    }).text)

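            # mode 1: crawl every results page; otherwise only check whether today's report is available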
            if self.mode == 1:
                urls = []
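                # The paging info text appears to read like '第1页/共 12页'; characters 6:8 hold the (two-digit) total page count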
                last_page = res.xpath('//*[@id="simpledatatable_info"]/text()'
                                      ).extract_first()[6:8]
                for i in range(1, int(last_page) + 1):
                    res = parsel.Selector(
                        requests.post(
                            "https://search.oilchem.net/solrSearch/select.htm",
                            headers={
                                'User-Agent':
                                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36'
                            },
                            data={
                                'pageNo': i,
                                'keyword': '全国PP装置生产情况汇总'
                            }).text)
                    temp_urls = list(
                        set(
                            res.xpath(
                                '/html/body/div/div[3]/div[1]/div[2]/ul/li//a/@href'
                            ).extract()))
                    time.sleep(5 + random.uniform(1, 10))
                    urls.extend(temp_urls)
                print("共爬取 " + str(len(urls)) + ' 条url')
                for target in urls:
                    # Count of failed captcha recognitions
                    count = 0
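                    # Prepend the scheme when the result link is protocol-relative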
                    if target[0:4] != 'http':
                        target = 'https:' + target
                    temp = code_verify(self.img_url, self.code_verify_url)
                    while temp.text != 'true':
                        count += 1
                        print("第{}次识别出错。".format(count))
                        temp = code_verify(self.img_url, self.code_verify_url)
                    yield FormRequest(url=self.login_succeed_url,
                                      formdata={
                                          'username': self.username,
                                          'password': self.password,
                                          'target': target,
                                          'errorPaw': self.errorPaw
                                      },
                                      callback=self.parse)
                    time.sleep(10 + int(random.uniform(1, 10)))
            else:
                url = res.xpath(
                    '/html/body/div/div[3]/div[1]/div[2]/ul/li[1]/h2/a/@href'
                ).extract_first()
                date = res.xpath(
                    '/html/body/div/div[3]/div[1]/div[2]/ul/li[1]/div/div/span/span/text()'
                ).extract_first()
                today = datetime.date.today().strftime('%Y-%m-%d')
                if today != date:
                    print("当日全国PP数据还没有出来")
                    return
                else:
                    count = 0
                    if url[0:4] != 'http':
                        url = 'https:' + url
                    temp = code_verify(self.img_url, self.code_verify_url)
                    while temp.text != 'true':
                        count += 1
                        print("第{}次识别出错。".format(count))
                        temp = code_verify(self.img_url, self.code_verify_url)
                    yield FormRequest(url=self.login_succeed_url,
                                      formdata={
                                          'username': self.username,
                                          'password': self.password,
                                          'target': url,
                                          'errorPaw': self.errorPaw
                                      },
                                      callback=self.parse)

        except Exception as e:
            print('!' * 30)
            print('step1')
            print(e)
            print('!' * 30)