def get_query_request(self, response):
    """Build the search request, narrowed to the configured time range.

    When ``self.intime`` is the "all time" label, the parent class's
    request is returned untouched.  Otherwise the search form is
    submitted through the shared mechanize browser, the link whose text
    equals ``self.intime`` is clicked, and the resulting URL gets its
    ``num`` query parameter set to 100.

    :param response: Scrapy response for the page holding the search
        form; ``response.meta['query']`` supplies the search keyword.
    :return: Scrapy request whose callback is ``self.query_callback``.
    """
    time_filter = self.intime
    if time_filter == '全部时间':
        # "All time": no extra filtering, defer to the parent class.
        return super(SogouBbsSpider, self).get_query_request(response)

    # noinspection PyPropertyAccess
    browser = self.br
    browser.set_response(response_scrapy2mechanize(response))
    browser.select_form(nr=self.search_form_order)

    # Keyword and field name are both encoded with the page's encoding
    # before being handed to mechanize.
    keyword = response.meta['query']
    charset = response.encoding
    encoded_keyword = keyword.encode(charset)
    browser[self.search_input_name.encode(charset)] = encoded_keyword
    browser.submit()

    # After the submit, follow the link labelled with the time range.
    filter_click = browser.click_link(text=time_filter.encode('utf8'))
    request = request_mechanize2scrapy(filter_click)
    request.callback = self.query_callback

    # Rewrite the URL so that its "num" parameter is 100
    # (presumably the number of results per page -- confirm).
    base_url = request.url
    params = get_url_query(base_url)
    params['num'] = 100
    return request.replace(url=change_url_query(base_url, params))
def get_next_page_request(self, response):
    """Build the request for the next page of results, if there is one.

    The response is loaded into the shared mechanize browser.  If
    ``self.get_next_page_link()`` yields a link it is clicked directly;
    otherwise the link whose text equals ``self.next_page_word`` is
    clicked.

    :param response: Scrapy response for the current result page.
    :return: Scrapy request whose callback is ``self.query_callback``,
        or ``None`` when no next-page link is found or building the
        request fails.
    """
    # noinspection PyPropertyAccess
    browser = self.br
    browser.set_response(response_scrapy2mechanize(response))

    # The site's default encoding wins; fall back to the response's own.
    charset = self._site_default_encoding or response.encoding
    link_text = self.next_page_word.encode(charset)
    explicit_link = self.get_next_page_link()

    try:
        mech_request = (
            browser.click_link(link=explicit_link)
            if explicit_link
            else browser.click_link(text=link_text)
        )
        request = request_mechanize2scrapy(mech_request)
        request.callback = self.query_callback
        return request
    except LinkNotFoundError:
        # No next-page link: nothing more to request.
        return None
    except Exception as exc:
        # Reaching here usually means the parsed link is a JavaScript one.
        # This log line is monitored by keyword, so keep its text stable.
        log.msg('spider turn page error:%s' % str(exc), level=log.INFO)
        return None
def get_query_request(self, response):
    """Fill in the search form and build the request that submits it.

    The keyword and the input field name are encoded with the site's
    default encoding when one is configured, otherwise with the
    response's own encoding.

    :param response: Scrapy response for the page holding the search
        form; ``response.meta['query']`` supplies the search keyword.
    :return: Scrapy request whose callback is ``self.query_callback``.
    """
    # noinspection PyPropertyAccess
    browser = self.br
    browser.set_response(response_scrapy2mechanize(response))
    browser.select_form(nr=self.search_form_order)

    keyword = response.meta['query']
    charset = self._site_default_encoding or response.encoding
    encoded_keyword = keyword.encode(charset)
    browser[self.search_input_name.encode(charset)] = encoded_keyword

    # click() builds the submit request without sending it.
    request = request_mechanize2scrapy(browser.click())
    request.callback = self.query_callback
    return request
def get_query_request(self, response):
    """Fill in the search form and build the corresponding request.

    Besides the keyword, the form receives the page size (``rn``) and
    the date-range controls.  With no date configured, ``s`` is set to
    ``'1'`` and the date fields are blanked; otherwise ``s`` is set to
    ``'2'`` and the begin/end dates are written as full strings, as
    year/month/day parts, and as epoch-second timestamps.

    NOTE: when either date is set, both ``self.begin_date`` and
    ``self.end_date`` must be ``'YYYY-MM-DD'`` strings, since both are
    encoded, split and parsed.

    :param response: Scrapy response for the page holding the search
        form; ``response.meta['query']`` supplies the search keyword.
    :return: Scrapy request whose callback is ``self.query_callback``.
    """
    def epoch_seconds(day):
        # time.mktime interprets the parsed date as local time.
        return str(int(time.mktime(time.strptime(day, '%Y-%m-%d'))))

    # noinspection PyPropertyAccess
    browser = self.br
    browser.set_response(response_scrapy2mechanize(response))
    browser.select_form(nr=self.search_form_order)

    keyword = response.meta['query']
    charset = response.encoding
    encoded_keyword = keyword.encode(charset)
    browser[self.search_input_name.encode(charset)] = encoded_keyword
    browser[b'rn'] = [self.item_count_per_page]
    # Called before the date controls below are assigned.
    activate_controls(browser.form)

    if self.begin_date or self.end_date:
        browser[b's'] = [b'2']
        browser[b'begin_date'] = self.begin_date.encode(charset)
        browser[b'end_date'] = self.end_date.encode(charset)

        # Split both dates up front so a malformed one fails before any
        # of the per-part fields are written.
        start_year, start_month, start_day = self.begin_date.split('-')
        stop_year, stop_month, stop_day = self.end_date.split('-')
        date_parts = (
            (b'y0', start_year),
            (b'm0', start_month),
            (b'd0', start_day),
            (b'y1', stop_year),
            (b'm1', stop_month),
            (b'd1', stop_day),
        )
        for field, part in date_parts:
            browser[field] = part.encode(charset)

        browser[b'bt'] = epoch_seconds(self.begin_date)
        browser[b'et'] = epoch_seconds(self.end_date)
    else:
        browser[b's'] = [b'1']
        browser[b'begin_date'] = b''
        browser[b'end_date'] = b''

    request = request_mechanize2scrapy(browser.click())
    request.callback = self.query_callback
    return request