Example #1
 def get_media_requests(self, item, info):
     item['proxy'] = VSpider.proxy
     yield Request(item['src'], meta=item)
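
A note on Example #1: get_media_requests runs inside a media pipeline and routes every file download through the spider's proxy by passing the whole item as meta. A minimal sketch of the surrounding pipeline, assuming the item carries a single 'src' URL; the class name and the use of info.spider (instead of the VSpider class attribute) are illustrative.

from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline

class ProxiedImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # info.spider is the running spider, so the proxy can be read
        # without importing the spider class directly.
        item['proxy'] = getattr(info.spider, 'proxy', None)
        yield Request(item['src'], meta=item)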
Example #2
 def parse(self, response, **kwargs):
     for index in range(16):
         yield Request('https://httpstat.us/200',
                       callback=self.parse_first_callback,
                       dont_filter=True,
                       cb_kwargs={'index': index})
 def start_requests(self):  # initial request
     url = "https://yz.chsi.com.cn/sch/"
     yield Request(url)
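
The first snippet in Example #2 fans out sixteen requests and tags each one with its position through cb_kwargs, which Scrapy passes to the callback as keyword arguments (see Example #27 for the receiving side). A minimal, self-contained sketch of the same pattern, assuming the httpstat.us endpoints from the example; the spider name is illustrative.

import scrapy
from scrapy import Request

class IndexedSpider(scrapy.Spider):
    name = 'indexed'
    start_urls = ['https://httpstat.us/200']

    def parse(self, response, **kwargs):
        for index in range(16):
            # Each value in cb_kwargs arrives as a keyword argument
            # of the callback.
            yield Request('https://httpstat.us/200',
                          callback=self.parse_first_callback,
                          dont_filter=True,
                          cb_kwargs={'index': index})

    def parse_first_callback(self, response, index: int):
        self.logger.info(f'INDEX - {index}')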
Example #4
    def process_spider_output(self, response, result, spider):
        def get_dict(string):
            if string is None:
                return None
            try:
                d = {}
                arr = string.split(';;')
                for a in arr:
                    infos = a.split(',,')
                    d[infos[1]] = infos[0]
                return d
            except:
                return None

        def get_infos(string):
            d = get_dict(string)
            try:
                values = []
                for key, value in d.items():
                    values.append(key)
                return ','.join(values)
            except:
                return ''

        if not (200 <= response.status < 300):  # common case
            return result if result else []
        if response.meta.get('PageType') != 'ProjectInfo':
            return result if result else []

        result = list(result)
        # print('ProjectInfoHandleMiddleware')
        projectInfoItem = ProjectInfoItem()
        projectInfoItem['SourceUrl'] = response.url
        projectInfoItem['ProjectUUID'] = uuid.uuid3(uuid.NAMESPACE_DNS,
                                                    response.url)
        projectInfoItem['ProjectNO'] = response.xpath(
            '//*[@id="PROJECT_XMBH"]/text()').extract_first()
        projectInfoItem['ProjectName'] = response.xpath(
            '//*[@id="PROJECT_XMMC_1"]/text()').extract_first()
        projectInfoItem['Developer'] = response.xpath(
            '//*[@id="PROJECT_KFQY_NAME"]/text()').extract_first()
        projectInfoItem['ProjectAddress'] = response.xpath(
            '//*[@id="PROJECT_XMDZ"]/text()').extract_first()
        projectInfoItem['DistrictName'] = response.xpath(
            '//*[@id="PROJECT_SZQY"]/text()').extract_first()
        projectInfoItem['FloorAreaRatio'] = response.xpath(
            '//*[@id="PROJECT_RJL"]/text()').extract_first()
        projectInfoItem['TotalBuidlingArea'] = response.xpath(
            '//*[@id="PROJECT_GHZJZMJ"]/text()').extract_first()
        projectInfoItem['PlanningAcceptanceDate'] = response.xpath(
            '//*[@id="PROJECT_GHYSRQ"]/text()').extract_first()
        projectInfoItem['ComprehensiveAcceptanceDate'] = response.xpath(
            '//*[@id="PROJECT_ZHYSRQ"]/text()').extract_first()
        projectInfoItem['PlanInvest'] = response.xpath(
            '//*[@id="PROJECT_JHZTZ"]/text()').extract_first()
        projectInfoItem['PresalePermitNumber'] = response.xpath(
            '//*[@id="YSXKZH"]/text()').extract_first()
        projectInfoItem['SoldAmount'] = response.xpath(
            '//*[@id="YSZTS"]/text()').extract_first()
        projectInfoItem['UnsoldAmount'] = response.xpath(
            '//*[@id="WSZTS"]/text()').extract_first()
        projectInfoItem['SoldArea'] = response.xpath(
            '//*[@id="YSZMJ"]/text()').extract_first()
        projectInfoItem['UnsoldArea'] = response.xpath(
            '//*[@id="WSZMJ"]/text()').extract_first()
        projectInfoItem['CheckOutAmount'] = response.xpath(
            '//*[@id="TFCS"]/text()').extract_first()
        projectInfoItem['CheckOutRatio'] = response.xpath(
            '//*[@id="TFL"]/text()').extract_first()
        projectInfoItem['SellAddress'] = response.xpath(
            '//*[@id="PROJECT_SLCDH"]/text()').extract_first()
        projectInfoItem['Selltel'] = response.xpath(
            '//*[@id="PROJECT_SLDH"]/text()').extract_first()
        projectInfoItem['DesignUnit'] = response.xpath(
            '//*[@id="PROJECT_SJDW"]/text()').extract_first()
        projectInfoItem['ConstructionUnit'] = response.xpath(
            '//*[@id="PROJECT_SGDW"]/text()').extract_first()
        projectInfoItem['SupervisionUnit'] = response.xpath(
            '//*[@id="PROJECT_JLDW"]/text()').extract_first()
        projectInfoItem['ManagementCompany'] = response.xpath(
            '//*[@id="PROJECT_WYGLGS"]/text()').extract_first()
        projectInfoItem['ProjectSupporting'] = response.xpath(
            '//*[@id="PROJECT_XMPT"]/text()').extract_first()
        projectInfoItem['AroundSupporting'] = response.xpath(
            '//*[@id="PROJECT_ZBPT"]/text()').extract_first()
        projectInfoItem['ProjectIntro'] = response.xpath(
            '//*[@id="PROJECT_XMJL"]/text()').extract_first()

        projectInfoItem['TodayHousingSoldAmount'] = response.xpath(
            '//*[@id="ZZ_JRYSTS"]/text()').extract_first()
        projectInfoItem['TodayHousingSoldArea'] = response.xpath(
            '//*[@id="ZZ_JRYSMJ"]/text()').extract_first()
        projectInfoItem['HousingSoldAmount'] = response.xpath(
            '//*[@id="ZZ_LJYSTS"]/text()').extract_first()
        projectInfoItem['HousingSoldArea'] = response.xpath(
            '//*[@id="ZZ_LJYSMJ"]/text()').extract_first()
        projectInfoItem['HousingUnsoldAmount'] = response.xpath(
            '//*[@id="ZZ_WSTS"]/text()').extract_first()
        projectInfoItem['HousingUnsoldArea'] = response.xpath(
            '//*[@id="ZZ_WSMJ"]/text()').extract_first()

        projectInfoItem['TodayShopSoldAmount'] = response.xpath(
            '//*[@id="SY_JRYSTS"]/text()').extract_first()
        projectInfoItem['TodayShopSoldArea'] = response.xpath(
            '//*[@id="SY_JRYSMJ"]/text()').extract_first()
        projectInfoItem['ShopSoldAmount'] = response.xpath(
            '//*[@id="SY_LJYSTS"]/text()').extract_first()
        projectInfoItem['ShopSoldArea'] = response.xpath(
            '//*[@id="SY_LJYSMJ"]/text()').extract_first()
        projectInfoItem['ShopUnsoldAmount'] = response.xpath(
            '//*[@id="SY_WSTS"]/text()').extract_first()
        projectInfoItem['ShopUnsoldArea'] = response.xpath(
            '//*[@id="SY_WSMJ"]/text()').extract_first()

        projectInfoItem['TodayOfficeSoldAmount'] = response.xpath(
            '//*[@id="BG_JRYSTS"]/text()').extract_first()
        projectInfoItem['TodayOfficeSoldArea'] = response.xpath(
            '//*[@id="BG_JRYSMJ"]/text()').extract_first()
        projectInfoItem['OfficeSoldAmount'] = response.xpath(
            '//*[@id="BG_LJYSTS"]/text()').extract_first()
        projectInfoItem['OfficeSoldArea'] = response.xpath(
            '//*[@id="BG_LJYSMJ"]/text()').extract_first()
        projectInfoItem['OfficeUnsoldAmount'] = response.xpath(
            '//*[@id="BG_WSTS"]/text()').extract_first()
        projectInfoItem['OfficeUnsoldArea'] = response.xpath(
            '//*[@id="BG_WSMJ"]/text()').extract_first()

        projectInfoItem['TodayOtherSoldAmount'] = response.xpath(
            '//*[@id="QT_JRYSTS"]/text()').extract_first()
        projectInfoItem['TodayOtherSoldArea'] = response.xpath(
            '//*[@id="QT_JRYSMJ"]/text()').extract_first()
        projectInfoItem['OtherSoldAmount'] = response.xpath(
            '//*[@id="QT_LJYSTS"]/text()').extract_first()
        projectInfoItem['OtherSoldArea'] = response.xpath(
            '//*[@id="QT_LJYSMJ"]/text()').extract_first()
        projectInfoItem['OtherUnsoldAmount'] = response.xpath(
            '//*[@id="QT_WSTS"]/text()').extract_first()
        projectInfoItem['OtherUnsoldArea'] = response.xpath(
            '//*[@id="QT_WSMJ"]/text()').extract_first()

        # Certificate of use of state-owned land
        tdzInfo = response.xpath('//*[@id="tdzInfo"]/@value').extract_first()
        projectInfoItem['CertificateOfUseOfStateOwnedLand'] = get_infos(
            tdzInfo)
        # Construction permit
        sgxkzInfo = response.xpath(
            '//*[@id="sgxkzInfo"]/@value').extract_first()
        projectInfoItem['ConstructionPermitNumber'] = get_infos(sgxkzInfo)
        # Land use planning permit
        jsydghxkzInfo = response.xpath(
            '//*[@id="ghxkzInfo"]/@value').extract_first()
        projectInfoItem['LandUsePermit'] = get_infos(jsydghxkzInfo)
        # Construction works planning permit
        ghxkzInfo = response.xpath(
            '//*[@id="ghxkzInfo"]/@value').extract_first()
        projectInfoItem['BuildingPermit'] = get_infos(ghxkzInfo)

        result.append(projectInfoItem)

        # Presale permit information
        presellInfo = response.xpath(
            '//*[@id="presellInfo"]/@value').extract_first()
        presellInfo_dict = get_dict(presellInfo)
        if presellInfo_dict:
            for key, value in presellInfo_dict.items():
                if value:
                    url = 'http://www.ytfcjy.com/public/project/presellCertInfo.aspx?code={code}'.format(
                        code=value)
                    presell_info_req = Request(
                        url=url,
                        headers=self.settings.get('DEFAULT_REQUEST_HEADERS'),
                        dont_filter=True,
                        meta={
                            'PageType': 'PresellInfo',
                            'ProjectUUID': str(projectInfoItem['ProjectUUID']),
                            'ProjectName': projectInfoItem['ProjectName'],
                        })
                    result.append(presell_info_req)
        # Building information
        buildingInfo = response.xpath(
            '//*[@id="buildInfo"]/@value').extract_first()
        buildingInfo_dict = get_dict(buildingInfo)
        if buildingInfo_dict:
            for key, value in buildingInfo_dict.items():
                if value:
                    # URL of the room list (sales control table) for this building
                    url = 'http://www.ytfcjy.com/public/project/RoomList.aspx?code={code}&rsr=1001&rse=0&jzmj=&tnmj='.format(
                        code=value)
                    buildingInfoItem = BuildingInfoItem()
                    buildingInfoItem['SourceUrl'] = url
                    buildingInfoItem['ProjectUUID'] = projectInfoItem[
                        'ProjectUUID']
                    buildingInfoItem['ProjectName'] = projectInfoItem[
                        'ProjectName']
                    buildingInfoItem['BuildingUUID'] = uuid.uuid3(
                        uuid.NAMESPACE_DNS, url)
                    buildingInfoItem['BuildingName'] = key[:key.index('(')]
                    buildingInfoItem['BuildingID'] = value
                    result.append(buildingInfoItem)

                    body = '%3C?xml%20version=%221.0%22%20encoding=%22utf-8%22%20standalone=%22yes%22?%3E%0A%3Cparam%20funname=%22SouthDigital.Wsba.CBuildTableEx.GetBuildHTMLEx%22%3E%0A%3Citem%3E{BuildingID}%3C/item%3E%0A%3Citem%3E1%3C/item%3E%0A%3Citem%3E1%3C/item%3E%0A%3Citem%3E80%3C/item%3E%0A%3Citem%3E720%3C/item%3E%0A%3Citem%3Eg_oBuildTable%3C/item%3E%0A%3Citem%3E%201=1%3C/item%3E%0A%3C/param%3E%0A'
                    building_info_req = Request(
                        url=
                        'http://www.ytfcjy.com/Common/Agents/ExeFunCommon.aspx?&req={time}'
                        .format(time=round(time.time() * 1000)),
                        headers={
                            'Host': 'www.ytfcjy.com',
                            'Connection': 'keep-alive',
                            'Origin': 'http://www.ytfcjy.com',
                            'Content-Type': 'text/plain;charset=UTF-8',
                            'Accept': '*/*',
                            'Accept-Encoding': 'gzip, deflate',
                            'Accept-Language': 'zh-CN,zh;q=0.9',
                        },
                        dont_filter=True,
                        method='POST',
                        body=body.format(BuildingID=value),
                        meta={
                            'PageType': 'HouseList',
                            'ProjectUUID': str(projectInfoItem['ProjectUUID']),
                            'ProjectName': projectInfoItem['ProjectName'],
                            'BuildingName': buildingInfoItem['BuildingName'],
                            'BuildingUUID':
                            str(buildingInfoItem['BuildingUUID']),
                        })
                    result.append(building_info_req)
        return result
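
Example #4 is a spider middleware: process_spider_output inspects response.meta['PageType'] (set by whichever code scheduled the request, as in Example #23) and appends extra items and follow-up requests to the spider's output. A sketch of how such a middleware would be enabled, with an illustrative module path and priority.

# settings.py
SPIDER_MIDDLEWARES = {
    'myproject.middlewares.ProjectInfoHandleMiddleware': 543,
}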
Example #5
    def parse_list(self, response):
        print('parse_list response.url:' + response.url)
        self.logger.debug('parse_list response.url:' + response.url)


        div = response.css('div.f-list.js-tips-list > div')
        print('parse_list div:{}  response.url: {}'.format(
            div.css('dd.dd-item.title > a::text').extract(), response.url))
        self.logger.debug('parse_list div:{}  response.url: {}'.format(
            div.css('dd.dd-item.title > a::text').extract(), response.url))
        for i in div:
            # Create a fresh item for each listing so that concurrently
            # scheduled detail requests do not share and overwrite one object.
            item = GanjiShoprentalItem()
            item['title'] = i.css('dd.dd-item.title > a::text').extract_first()
            u = i.css('dd.dd-item.title > a::attr(href)').extract_first()
            if u.startswith('http'):
                item['url'] = u
                item['number'] = item['url'].split('?')[0].split('/')[-2]
            else:
                item['url'] = 'http://sz.ganji.com' + u
                item['number'] = item['url'].split('?')[0].split('/')[-1]
            if i.css('.unit::text').extract_first():
                item['month_price'] = i.css('.num::text').extract_first(
                ) + i.css('.unit::text').extract_first()
            else:
                item['month_price'] = i.css('.num::text').extract_first()
            item['day_price'] = i.css('.small-price::text').extract_first()
            if i.css('img[data-original]'):
                item['img'] = i.css('img::attr(data-original)').extract_first()
            else:
                item['img'] = i.css('img::attr(src)').extract_first()
            con = i.css('dd.dd-item.size > span::text').extract()
            if len(con) == 3:
                item['area'] = float(
                    re.findall(r'[1-9]\d*\.\d*|0\.\d*[1-9]\d*|[1-9]\d*|0',
                               con[0].strip())[0])
                item['floor'] = con[1].strip()
                item['type'] = con[2].strip()
            # elif len(con) == 2:
            #     item['area'] = float(re.findall('[1-9]\d*\.\d*|0\.\d*[1-9]\d*|[1-9]\d*|0', con[0].strip())[0])
            #     item['floor'] = con[1].strip()
            #     item['type'] = con[2].strip()
            else:
                item['area'] = 0
                item['floor'] = ''
                item['type'] = ''
            site = i.css('dd.dd-item.address > span.area > a::text').extract()
            if len(site) == 3:
                item['district'] = site[0].strip()
                item['location'] = site[1].strip().strip(' - ')
                item['address'] = site[2].strip().strip(' - ')
            elif len(site) == 2:
                item['district'] = site[0].strip()
                item['location'] = ''
                item['address'] = site[1].strip().strip(' - ')
            dd = i.css('dd.dd-item.source > span::text').extract()
            if len(dd) == 3:
                item['transfer'] = dd[0].strip()
                item['status'] = dd[1].strip().strip(' - ')
                item['industry'] = dd[2].strip().strip(' - ')
            elif len(dd) == 2:
                item['transfer'] = dd[0].strip()
                item['status'] = dd[1].strip().strip(' - ')
                item['industry'] = ''

            # getlocation(item)
            # yield item
            yield Request(url=item['url'],
                          meta={'item': item},
                          callback=self.parse_details)

        le = LinkExtractor(restrict_css='ul.pageLink a.next')
        print('5' * 200)
        links = le.extract_links(response)
        if links:
            next_url = links[0].url
            print('next_url:', next_url)
            self.logger.debug('next_url:' + next_url)
            yield Request(next_url, callback=self.parse_list)
 def parse(self, response):
     item = ClubCircleDetailScrapyItem()
     yield Request(url=response.url, callback=self.parse_club_circle_detail_items,
                   meta={"item": copy.deepcopy(item)}, dont_filter=True)
Example #7
 def start_requests(self):
     for year in range(2007, 2017):
         yield Request(self.BASE_URL(year), meta={'year': year})
Example #8
 def parseNews(self, response):
     self.response_body_decode(response)
     sel = Selector(response)
     homeurl = tools.getHomeUrl(response.url)
     brandname = response.meta['brandname']
     news = None  # SelectorList holding the main body of the news page
     pagerule = None
     # Check whether the page rule has already been determined
     if 'pagerule' in response.meta:
         pagerule = response.meta['pagerule']
         news = sel.xpath(pagerule['pageform'])
     else:
         # Match each rule in the news-page rule library, then crawl pages of that type
         for each_rule in newspage_type.page_rules:
             news = sel.xpath(each_rule['pageform'])
             if len(news) > 0:
                 pagerule = each_rule
                 break
     if pagerule is None:
         raise ValueError('Error processing (' + response.url +
                          ') This page do not have corresponding rules')
     # Get the allpage and nextpage URLs
     if pagerule['allpage'] is None:
         allpage = []
     else:
         allpage = news.xpath(pagerule['allpage']).extract()
     if pagerule['nextpage'] is None:
         nextpage = []
     else:
         nextpage = news.xpath(pagerule['nextpage']).extract()
     # If a "read full article" URL is present, follow it
     if len(allpage) > 0:
         if tools.isCompleteUrl(allpage[0]):
             url = allpage[0]
         else:
             url = homeurl + allpage[0]
         r = Request(url, callback=self.parseNews)
         r.meta['brandname'] = brandname
         r.meta['pagerule'] = pagerule
         yield r
     elif len(nextpage) > 0:
         # If there is a next-page link, follow it
         if tools.isCompleteUrl(nextpage[0]):
             url = nextpage[0]
         else:
             url = homeurl + nextpage[0]
         # Extract the current page's title, date and content, store them in article and pass it to the next request
         title = news.xpath(pagerule['title']).extract()
         date = self.getDate(news, response.url, pagerule['date'])
         content = self.getContent(news, pagerule['content'])
         article = {
             'brandname': brandname,
             'title': title,
             'date': date,
             'content': content
         }
         r = Request(url, callback=self.parseNextPage)
         r.meta['article'] = article
         r.meta['pagerule'] = pagerule
         yield r
     else:
         # If the article has only one page, extract its content directly
         title = news.xpath(pagerule['title']).extract()
         date = self.getDate(news, response.url, pagerule['date'])
         content = self.getContent(news, pagerule['content'])
         item = NewsItem()
         item['brandname'] = brandname
         item['date'] = date
         item['title'] = "".join(title)
         item['content'] = "".join(content)
         yield item
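
Example #8 drives extraction from a rule library, newspage_type.page_rules, that is not shown here. Judging from the keys parseNews reads, each rule is a dict of XPath expressions; a hypothetical entry might look like the following (all paths are illustrative).

page_rules = [
    {
        'pageform': '//div[@class="article"]',          # locates the news body
        'allpage': './/a[@class="read-all"]/@href',     # "read full article" link, or None
        'nextpage': './/a[@class="next-page"]/@href',   # next-page link, or None
        'title': './/h1/text()',
        'date': './/span[@class="pub-date"]/text()',
        'content': './/div[@class="content"]//text()',
    },
]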
Example #9
 def get_novel_url(self, response):
     novel_urls = response.xpath(
         '//*[@id="results"]/div[3]/div/div[2]/h3/a/@href').extract()
     for i in novel_urls:
         yield Request(i, callback=self.parse_novel_link)
Example #10
 def start_requests(self):
     yield Request(url=self.START_URL,
                   callback=self.parse_page,
                   dont_filter=True,
                   headers=self.headers
                   )
Example #11
    def parse_list(self, response):
        pageNo = response.meta['pageNo']
        list = response.xpath('//*[@id="data_list"]/tr')
        if len(list) == 1 and response.xpath(
                '//*[@id="data_list"]/tr/td/text()').extract_first(
                ) == u'没有可显示资源':
            self.logger.info('---%s,%s---' % (response.url, u'没有数据'))
            return
        self.logger.info('---pageNo:%s,%s---' % (pageNo, response.url))
        record_not_exist = True
        for item in list:
            # record_not_exist = True
            if len(item.xpath('td')) == 1:
                continue
            movie = MovieItem()
            movie['site'] = self.site
            movie['type'] = item.xpath('td[2]/a/text()').extract_first()
            movie['full_name'] = item.xpath(
                'td[3]/a/text()').extract_first().replace('\r', '').replace(
                    '\n', '').strip()
            movie['name'] = movie['full_name'].split('.')[0]
            movie['total'] = item.xpath('td[4]/text()').extract_first()
            movie['page_url'] = self.base + item.xpath(
                'td[3]/a/@href').extract_first()
            movie['id'] = '%s_%s' % (self.site,
                                     re.search('\?hash=(\w+)',
                                               movie['page_url']).group(1))

            if item.xpath('td[8]/a/text()').extract_first() != u'高清MP4吧':
                print(item.xpath('td[8]/a/text()').extract_first())
            if self.blogs.isExistsMoviesByid(movie['id']):
                # self.logger.info(
                #     '*****************type:%s,pageNo:%s,record exist! crawl total count:%s,title%s*****************' % (
                #         movie['type'], pageNo, self.count, movie['full_name']))
                record_not_exist = False
                continue
            else:
                self.logger.info('------latest file,type:%s,title:%s-------' %
                                 (movie['type'], movie['full_name']))
                yield Request(
                    movie['page_url'],
                    meta={
                        'type': 'detail',
                        'movie': movie,
                        'download_path': response.meta['download_path']
                    },
                    dont_filter=True,
                    headers={
                        'User-Agent':
                        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
                    })
        next_page = response.xpath(
            '//*[@class="pages clear"]/a[@class="nextprev"]/@href').extract()
        if next_page and record_not_exist:
            next_page_no = int(re.search('page=(\d+)', next_page[-1]).group(1))
            if next_page_no > pageNo:
                yield Request(
                    "http://www.mp4ba.com/%s" % next_page[-1],
                    meta={
                        'type': 'list',
                        'pageNo': next_page_no,
                        'download_path': response.meta['download_path']
                    },
                    dont_filter=True,
                    headers={
                        'User-Agent':
                        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
                    })
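
Example #11 crawls incrementally: it stops following the next-page link once a page contains only records that are already stored, using self.blogs.isExistsMoviesByid, which is not shown. A minimal stand-in for that helper, backed by an in-memory set (a real implementation would query a database), assuming this interface.

class SeenMovies:
    def __init__(self):
        self._seen = set()

    def isExistsMoviesByid(self, movie_id):
        return movie_id in self._seen

    def add(self, movie_id):
        self._seen.add(movie_id)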
Example #12
 def parse(self, response):
     for page_index in range(1, 500):
         yield Request(url='https://www.luoqiu.com/top/lastupdate_' +
                       str(page_index) + '.html',
                       callback=self.parse_page)
 def start_requests(self):
     for url in self.start_urls:
         yield Request(url, headers=self.headers)
Example #14
    def start_requests(self):
        def mk_url_headers_body(page, key):
            def quote_val(url):
                return re.sub(
                    r'([\?&][^=&]*=)([^&]*)',
                    lambda i: i.group(1) + quote(unquote(i.group(2),
                                                         encoding='utf-8'),
                                                 encoding='utf-8'), url)

            url = (
                'https://www.google.com/_/VisualFrontendUi/data/batchexecute'
                '?rpcids=HoAMBc'
                '&f.sid=1484552913048631198'
                '&bl=boq_visualfrontendserver_20200214.01_p1'
                '&hl=en-US'
                '&authuser'
                '&soc-app=162'
                '&soc-platform=1'
                '&soc-device=1'
                '&_reqid=142196'
                '&rt=c')
            url = quote_val(url)
            headers = {
                "accept":
                "*/*",
                "accept-encoding":
                "gzip, deflate, ",  # auto delete br encoding. cos requests and scrapy can not decode it.
                "accept-language":
                "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
                "content-type":
                "application/x-www-form-urlencoded;charset=UTF-8",
                "cookie":
                ("DV=MzB0UiguEPQtwDWaKe0v0OimEEJmBddGUz1EEumeBAIAAAA; "
                 "NID=198=SU1N0X0TlV1FfnWH1NVNJ2OmHzx7hnM2Bb1Iwgkh-7h6wpDFwIsf8DK1vZOwU7G4ZaGc6bGca2ZsdoMbB8uhfezMREiX9T53Ldv0GOq-KXT3q9Z4Y18rDl5Coes2SoHfd69mtDk7XmmtFgi0z0s8Zh-GyRA02IPjbrMqZrSnv2k; "
                 "OTZ=5328196_24_24__24_; "
                 "1P_JAR=2020-02-18-03"),
                "origin":
                "https://www.google.com",
                "referer":
                "https://www.google.com/",
                "sec-fetch-mode":
                "cors",
                "sec-fetch-site":
                "same-origin",
                "user-agent":
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36",
                "x-client-data":
                "CLO1yQEIhrbJAQiitskBCMG2yQEIqZ3KAQjiqMoBCMuuygEIyq/KAQjOsMoBCPe0ygEIjrrKAQ==",
                "x-goog-ext-190139975-jspb":
                "[\"ZZ\"]",
                "x-same-domain":
                "1"
            }
            body = {
                "f.req":
                "[[[\"HoAMBc\",\"[null,null,[" + str(page) +
                ",null,450,1,1280,[[\\\"m_jkppg-2ek_iM\\\",200,200,-2147352321]],[],[],null,null,null,534],\\\"\\\",\\\"\\\",null,null,null,null,null,null,\\\"\\\",null,null,null,null,null,null,null,null,null,null,null,null,\\\"\\\",null,null,null,[\\\""
                + key +
                "\\\",\\\"\\\",null,\\\"\\\",\\\"\\\",\\\"\\\",\\\"\\\",\\\"\\\",\\\"\\\",null,null,\\\"\\\",\\\"\\\",\\\"\\\",\\\"\\\"]]\",null,\"generic\"]]]"
            }
            return url, headers, body

        keys = ['狗']
        for key in keys:
            for page in range(1, 80):
                url, headers, body = mk_url_headers_body(page, key)
                meta = {}
                meta['key'] = key
                meta['proxy'] = self.proxy
                r = Request(
                    url,
                    method='POST',
                    headers=headers,
                    body=urlencode(body),
                    callback=self.parse,
                    meta=meta,
                )
                yield r
                break
            break
Example #15
    def process_spider_output(self, response, result, spider):

        out_come = cheack_response(pagetype=['bd_url', 'bd_url2'],
                                   response=response,
                                   result=result)

        outcome_list, pagetype = [], response.meta['PageType']

        if (out_come == 'right') and (pagetype == 'bd_url'):

            cheack_key = response.xpath(
                '//*[@id="house"]/div[3]/dl/dd/table/tr[2]/td/font/text()'
            ).extract_first()

            record_dict = response.meta['Record_Data']

            if cheack_key != '暂无数据!':

                BuildingItem = Building_Detail_Item()

                item_cd = Certificate_Detail_Item()

                BuildingItem['ProjectName'] = record_dict['ProjectName']

                BuildingItem['ProjectUUID'] = record_dict['ProjectUUID']

                item_cd['ProjectName'] = record_dict['ProjectName']

                item_cd['ProjectUUID'] = record_dict['ProjectUUID']

                content = response.xpath(
                    '//*[@id="house"]/div[3]/dl/dd/table/tr')

                for i in content[1:]:

                    item_cd['PresalePermitUrl'] = clean_rule1(
                        i.xpath('./td[4]/a/@href').extract_first())

                    item_cd['PresalePermitNumber'] = clean_rule1(
                        i.xpath('./td[4]/a/text()').extract_first())

                    BuildingItem['PresalePermitNumber'] = item_cd[
                        'PresalePermitNumber']
                    BuildingItem['SourceUrl'] = response.url

                    if item_cd['PresalePermitNumber'] != '':
                        item_cd['PresalePermitNumberUUID'] = str(
                            uuid.uuid3(uuid.NAMESPACE_DNS,
                                       item_cd['PresalePermitNumber']))
                        BuildingItem['PresalePermitNumberUUID'] = item_cd[
                            'PresalePermitNumberUUID']

                        cheack_key2 = re.findall(r'ysz_id',
                                                 item_cd['PresalePermitUrl'])
                        if cheack_key2:

                            item_cd[
                                'PresalePermitUrl'] = 'http://newhouse.fz0752.com' + item_cd[
                                    'PresalePermitUrl']

                            re_get2 = Request(url=item_cd['PresalePermitUrl'],
                                              method='GET',
                                              headers=self.headers,
                                              meta={
                                                  'PageType': 'cd_url',
                                                  "item": item_cd
                                              },
                                              dont_filter=True)

                        else:
                            re_get2 = Request(url=item_cd['PresalePermitUrl'],
                                              method='GET',
                                              headers=self.headers,
                                              meta={
                                                  'PageType': 'cd_url',
                                                  "item": item_cd
                                              },
                                              dont_filter=True)
                        outcome_list.append(re_get2)
                    else:
                        BuildingItem['PresalePermitNumberUUID'] = ''

                    BuildingItem['BuildingName'] = clean_rule1(
                        i.xpath('./td[2]/span/text()').extract_first())

                    BuildingItem['BuildingNumber'] = clean_rule1(
                        i.xpath('./td[3]/text()').extract_first())

                    BuildingItem['BuildingUrl'] = clean_rule1(
                        i.xpath('./td[8]/a/@href').extract_first())
                    # When the filing link is empty, fall back to the project page link
                    if BuildingItem['BuildingUrl'] == '':
                        BuildingItem[
                            'BuildingUrl'] = 'http://data.fz0752.com' + clean_rule1(
                                i.xpath('./td[7]/a/@href').extract_first())

                    BuildingItem['BuildingUUID'] = uuid.uuid3(
                        uuid.NAMESPACE_DNS, BuildingItem['ProjectName'] +
                        BuildingItem['BuildingName'] +
                        BuildingItem['BuildingNumber'] +
                        BuildingItem['PresalePermitNumber'])
                    BuildingItem['SourceUrl'] = response.url
                    outcome_list.append(BuildingItem)

                now_page = response.xpath(
                    '//*[@id="house"]/div[4]/div/span/text()').extract_first()

                next_page = int(now_page) + 1 if now_page else 0

                cheack_page = response.xpath(
                    '//*[@id="house"]/div[4]/div/a[contains(@href,"javascript:goPage({0})")]/@href'
                    .format(next_page)).extract()

                if cheack_page:
                    url_next = "http://newhouse.fz0752.com/project/selist.shtml?num={0}&old=&pageNO={1}".format(
                        record_dict['ProjectID'], next_page)

                    # print(url_next)

                    re_get = Request(url=url_next,
                                     method='GET',
                                     headers=self.headers,
                                     meta={
                                         'PageType': 'bd_url',
                                         "Record_Data": record_dict
                                     },
                                     dont_filter=True)

                    outcome_list.append(re_get)

            return outcome_list

        else:
            # print('BuildingDetailMiddlewareover')
            return out_come
Example #16
 def start_requests(self):
     for i in range(5):
         start_url = 'http://zhannei.baidu.com/cse/search?q=&p={0}&s=920895234054625192&srt=def&nsid=0&entry=1'.format(
             i)
         yield Request(start_url, callback=self.get_novel_url)
Example #17
    def process_spider_output(self, response, result, spider):

        out_come = cheack_response(pagetype=['pl_url'],
                                   response=response,
                                   result=result)

        if out_come == 'right':

            outcome_list, pagetype = [], response.meta['PageType']

            print('ProjectGetFidMiddleware')

            if pagetype == 'pl_url':

                content = response.xpath('//*[@id="searchResult"]/dd/div')

                now_page = int(
                    response.xpath(
                        '//*[@id="searchResult"]/dt[2]/h2/div/text()').
                    extract_first())

                crawler_page = response.xpath(
                    '//*[@id="searchResult"]/dt[2]/h2/a[last()]/text()').re(
                        r'\d+')[0]
                crawler_page = int(crawler_page)

                # If this is the first page, schedule requests for the remaining pages
                if now_page == 1:
                    for page in range(2, crawler_page + 1):
                        next_url = 'http://newhouse.fz0752.com/project/list.shtml?state=&key=&qy=&area=&danjia=&func=&fea=&type=&kp=&mj=&pageNO={0}'.format(
                            page)

                        list_req = Request(url=next_url,
                                           method='GET',
                                           headers=self.headers,
                                           meta={'PageType': 'pl_url'},
                                           dont_filter=True)

                        outcome_list.append(list_req)

                # Collect the projects listed on this page
                for i in content:
                    item_pd = Project_Detail_Item()

                    ProjectName = clean_rule1(
                        i.xpath('./h1/span[1]/a/text()').extract_first())

                    ProjectUUID = uuid.uuid3(uuid.NAMESPACE_DNS, ProjectName)

                    ProjectUrl = clean_rule1(
                        i.xpath('./h1/span[1]/a/@href').extract_first())

                    Developer = clean_rule1(
                        i.xpath('./h2[2]/a/text()').extract_first())

                    phone = clean_rule1(
                        i.xpath('./h4/span/text()').extract_first())

                    if ProjectUrl:
                        item_pd['ProjectName'] = ProjectName

                        item_pd['ProjectUUID'] = ProjectUUID

                        item_pd['ProjectUrl'] = ProjectUrl

                        item_pd['Developer'] = Developer

                        item_pd['SaleTelphoneNumber'] = phone

                        re_get = Request(url=item_pd['ProjectUrl'],
                                         method='GET',
                                         headers=self.headers,
                                         meta={
                                             'PageType': 'pd_url',
                                             "item": item_pd
                                         },
                                         dont_filter=True)

                        outcome_list.append(re_get)
                # print(now_page, crawler_page)
                return outcome_list
        else:
            return out_come
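
Examples #15, #17 and #28 all call a shared helper, cheack_response, that is not included here. From the way its return value is used, it appears to return the string 'right' when the response succeeded and its meta['PageType'] is one of the expected types, and otherwise something that is safe to return from process_spider_output. A sketch under that assumption:

def cheack_response(pagetype, response, result):
    # Assumed behaviour: 'right' signals that the middleware should handle
    # this response; anything else is passed straight back to Scrapy.
    if 200 <= response.status < 300 and response.meta.get('PageType') in pagetype:
        return 'right'
    return result if result else []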
Example #18
 def get_media_requests(self, item, info):
     yield Request(item['url'])
Example #19
            log.warning("%s is not a news item" % response.url)
            return

        date_elem = soup.find('p', class_='noticia-data')
        if not hasattr(date_elem, "parent"):
            log.warning("Skipped %s" % response.url)
            return
        content_elem = date_elem.parent
        item['datetime'] = make_aware(
            datetime.strptime(date_elem.text.strip(), '%d-%m-%Y'))
        content = str(content_elem.find('div', class_="noticia-corpo"))
        content = mdconverter.handle(content).strip()
        item['content'] = content
        item['html'] = response.text
        item['source'] = response.url
        img_elem = content_elem.find('img', class_="imagem-noticia")
        if img_elem is None:
            yield item
        else:
            yield Request(img_elem.attrs['src'],
                          self.parse_image,
                          meta={'item': item})

    def parse_image(self, response):
        body = response.body
        item = response.meta['item']
        if body is not None and body != b'':
            item['image_data'] = body
            item['image_filename'] = response.url.split('/')[-1]
        yield item
    def sub_category_parse(self, response):
        '''Each sub category page is parsed to yield the 
        parent category, the current category and 
        the url and number of titles at this level. 
        Then, if none of the sub-categories are in bold
        (indicating end of path), each sub-category is scraped and
        passed recursively to this function.
        '''

        audible_url = 'https://www.audible.com'

        #attempt to get parent category if available, otherwise tag with Audible
        # as root.
        try:
            parent_category = response.xpath(
                ".//div[@id='center-0']//a[contains(@class,'parentCategoryUrl')]/text()"
            ).extract()[0]
            parent_url = audible_url + response.xpath(
                ".//div[@id='center-0']//a[contains(@class,'parentCategoryUrl')]/@href"
            ).extract()[0]
        except:
            parent_category = "Audible"
            parent_url = "https://www.audible.com/categories"

        #get category name
        cat_name = response.xpath(
            ".//div[@id = 'center-0']//h1/text()").extract()[0]

        #get number of titles and clean input
        cat_number = response.xpath(
            ".//div[@id = 'center-0']//span/text()").extract()[0]
        cat_number = re.sub("[^0-9]", "", cat_number)

        #get Best Sellers list URL or "See all in __" URL as alternative.
        try:
            title_list_link = audible_url + response.xpath(
                ".//a[contains(@aria-label, 'View all in Best sellers')][1]/@href"
            ).extract()[0]
        except:
            title_list_link = response.xpath(
                ".//a[contains(@class,'allInCategoryPageLink')]/@href"
            ).extract()[0]
            title_list_link = audible_url + title_list_link

        #Detects if any of the Sub-categories have a bold text, indicating no further sub-categories.
        sub_cat_bold = response.xpath(
            ".//div[@id='center-0']//ul[contains(@class,'bc-list')]//span[contains(@class,'bc-text-bold')]"
        )

        if sub_cat_bold != []:
            leaf_flag = True
        else:
            leaf_flag = False

        #Create object and store information for CSV storage
        category_entry = CategoryItem()
        category_entry['parent_category'] = parent_category
        category_entry['parent_url'] = parent_url
        category_entry['self_url'] = response.url
        category_entry['category_name'] = cat_name
        category_entry['category_numb_title'] = cat_number
        category_entry['title_list_url'] = title_list_link
        category_entry['leaf_flag'] = leaf_flag
        yield (category_entry)

        #Sub-category url list for current page
        sub_cat_list = response.xpath(
            ".//div[@id='center-0']//ul[contains(@class,'bc-list')]//li[@class='bc-list-item']/a/@href"
        ).extract()

        #If there are no bold sub-categories, for each sub category
        # request the page and scrape it with sub_category_parse.
        if sub_cat_bold == []:
            for sub_cat in sub_cat_list:
                url = audible_url + sub_cat
                yield Request(url=url, callback=self.sub_category_parse)
Example #21
 def get_media_requests(self, item, info):
     for image_url in item['image_urls']:
         yield Request(image_url)
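
Example #21 is the stock ImagesPipeline hook: it expects each item to carry an 'image_urls' list, and the pipeline fills an 'images' field with the download results. The minimal item definition and configuration it assumes (the store path is illustrative):

import scrapy

class ImageItem(scrapy.Item):
    image_urls = scrapy.Field()   # input: list of image URLs to fetch
    images = scrapy.Field()       # output: populated by ImagesPipeline

# settings.py
# ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
# IMAGES_STORE = '/tmp/images'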
Example #22
 def start_requests(self):
     url_str = 'https://www.51job.com/zhengzhou/'
     yield Request(url=url_str,
                   callback=self.parse,
                   dont_filter=True,
                   meta={'page': '0'})
Example #23
    def process_spider_output(self, response, result, spider):
        def get_total_page(response):
            total_page = 1
            if debug:
                return 1
            try:
                t = response.xpath(
                    '//*[@id="PageNavigator1_LblPageCount"]/text()'
                ).extract_first()
                total_page = int(t)
            except:
                import traceback
                traceback.print_exc()
            return total_page

        if not (200 <= response.status < 300):  # common case
            return result if result else []
        if response.meta.get('PageType') != 'ProjectBase':
            return result if result else []
        # print('ProjectBaseHandleMiddleware')
        result = list(result)
        if response.request.method == 'GET':
            total_page = get_total_page(response)
            for page in range(1, total_page + 1):
                req_dict = {
                    'PageNavigator1$txtNewPageIndex':
                    str(page),
                    'txtPrjName':
                    '',
                    'txtYsxkz':
                    '',
                    'txtKfsName':
                    '',
                    'txtPrjAdress':
                    '',
                    '__EVENTARGUMENT':
                    '',
                    '__EVENTTARGET':
                    'PageNavigator1$LnkBtnGoto',
                    '__EVENTVALIDATION':
                    response.xpath(
                        '//*[@id="__EVENTVALIDATION"]/@value').extract_first(),
                    '__VIEWSTATE':
                    response.xpath(
                        '//*[@id="__VIEWSTATE"]/@value').extract_first(),
                }
                req = Request(
                    url='http://www.ytfcjy.com/public/project/ProjectList.aspx',
                    headers=self.settings.get('POST_DEFAULT_REQUEST_HEADERS'),
                    body=urlparse.urlencode(req_dict),
                    method='POST',
                    dont_filter=True,
                    meta={'PageType': 'ProjectBase'})
                result.append(req)
        else:
            tr_arr = response.xpath('//tr[@class="TR_BG_list"]')
            for tr in tr_arr:
                projectBaseItem = ProjectBaseItem()
                href = 'http://www.ytfcjy.com/public/project/' + tr.xpath(
                    'td[2]/a/@href').extract_first()
                projectBaseItem['SourceUrl'] = href
                projectBaseItem['ProjectUUID'] = uuid.uuid3(
                    uuid.NAMESPACE_DNS, href)
                projectBaseItem['ProjectName'] = tr.xpath(
                    'td[2]/a/u/text()').extract_first()
                projectBaseItem['Developer'] = tr.xpath(
                    'td[3]/text()').extract_first()
                projectBaseItem['ProjectAddress'] = tr.xpath(
                    'td[4]/text()').extract_first()
                projectBaseItem['PresalePermitNumber'] = tr.xpath(
                    'td[5]/text()').extract_first()
                projectBaseItem['SoldAmount'] = tr.xpath(
                    'td[6]/text()').extract_first()
                projectBaseItem['UnsoldAmount'] = tr.xpath(
                    'td[7]/text()').extract_first()
                result.append(projectBaseItem)

                projectInfo_req = Request(
                    url=projectBaseItem['SourceUrl'],
                    headers=self.settings.get('DEFAULT_REQUEST_HEADERS'),
                    dont_filter=True,
                    meta={'PageType': 'ProjectInfo'})
                result.append(projectInfo_req)
        return result
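
Example #23 paginates an ASP.NET page by re-posting the __VIEWSTATE and __EVENTVALIDATION hidden fields by hand. Scrapy's FormRequest.from_response can collect those hidden inputs automatically when the page exposes a form; a sketch of the same postback under that assumption, reusing the field names from the example.

from scrapy import FormRequest

def build_page_request(response, page):
    # from_response copies the page's hidden inputs (__VIEWSTATE,
    # __EVENTVALIDATION, ...) into the POST body.
    return FormRequest.from_response(
        response,
        formdata={
            'PageNavigator1$txtNewPageIndex': str(page),
            '__EVENTTARGET': 'PageNavigator1$LnkBtnGoto',
            '__EVENTARGUMENT': '',
        },
        dont_filter=True,
        meta={'PageType': 'ProjectBase'},
    )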
Example #24
 def start_requests(self):
     for url in self.start_urls:
         yield Request(url, headers=self.headers, dont_filter=True)
Example #25
 def next_request(self, message: BaseRmqMessage) -> Request:
     return Request('https://httpstat.us/200', dont_filter=True)
Example #26
 def start_requests(self):
     for base_url in self.base_urls:
         yield Request(url=base_url, callback=self.parse)
Example #27
 def parse_first_callback(self, response, index: int):
     self.logger.info(f'INDEX - {index}')
     yield Request('https://httpstat.us/201',
                   callback=self.parse_second_callback,
                   dont_filter=True)
Example #28
    def process_spider_output(self, response, result, spider):

        out_come = cheack_response(pagetype=['pd_url', 'pd_url2'],
                                   response=response,
                                   result=result)

        outcome_list, pagetype = [], response.meta['PageType']

        def clean_district(status):
            if status in ("河南岸街道办", "江北街道办", "江南街道办", '桥东街道办', '桥西街道办',
                          '水口街道办', '小金口街道办', '龙丰街道办', '汝湖镇', '马安镇', '三栋镇',
                          '横沥镇', '芦洲镇'):
                return '惠城区'

            elif status in ("惠环街道办", "陈江街道办", "潼湖镇", "潼侨镇", "沥林镇", "东江科技园",
                            "惠南科技园"):
                return '仲恺区'

            elif status in ("澳头街道办", "西区街道办", "霞涌街道办", "大亚湾中心区"):

                return "大亚湾"

            elif status in ("罗阳镇", "石湾镇", "罗浮山", "园洲镇", "龙溪镇", "杨村镇", "泰美镇",
                            "长宁镇", "观音阁镇", "石坝镇", "麻陂镇", "公庄镇", "湖镇镇", "横河镇",
                            "龙华镇", "福田镇", "柏塘镇", "杨侨镇"):

                return "博罗县"

            elif status in ("平山街道", "巽寮滨海旅游度假区", "港口滨海旅游度假区", "稔山镇", "平海镇",
                            "黄埠镇", "多祝镇", "白花镇", "安墩镇", "大岭镇", "梁化镇", "铁涌镇",
                            "吉隆镇", "宝口镇", "高潭镇", "白盆珠镇"):

                return "惠东县"

            else:
                return "龙门县"

        if (out_come == 'right') and (pagetype == 'pd_url'):

            item_pd = response.meta['item']

            url_next = response.xpath(
                '//*[@id="house"]/div[7]/dl[3]/dt/div/a/@href').extract_first(
                )

            re_get = Request(url=url_next,
                             method='GET',
                             headers=self.headers,
                             meta={
                                 'PageType': 'pd_url2',
                                 "item": item_pd
                             },
                             dont_filter=True)

            outcome_list.append(re_get)

            return outcome_list

        elif (out_come == 'right') and (pagetype == 'pd_url2'):

            item_pd = response.meta['item']

            item_pd["RealEstateProjectID"] = re.findall(
                r'num\=(.+)', response.url)[0]

            item_pd['RegionName'] = clean_rule1(
                response.xpath(
                    '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[1]/td[1]/text()'
                ).extract_first())

            item_pd['ProjectBlock'] = clean_rule1(
                response.xpath(
                    '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[1]/td[2]/text()'
                ).extract_first())

            item_pd['BuildingType'] = clean_rule1(
                response.xpath(
                    '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[2]/td[1]/text()'
                ).extract_first())

            item_pd['ProjectHouseType'] = clean_rule1(
                response.xpath(
                    '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[2]/td[2]/text()'
                ).extract_first())

            item_pd["AveragePrice"] = clean_rule1(
                response.xpath(
                    '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[3]/td[1]/text()'
                ).extract_first())

            item_pd['ProjectMainShape'] = clean_rule1(
                response.xpath(
                    '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[3]/td[2]/text()'
                ).extract_first())

            item_pd['FloorArea'] = clean_rule1(
                response.xpath(
                    '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[4]/td[1]/text()'
                ).extract_first())

            item_pd['TotalBuidlingArea'] = clean_rule1(
                response.xpath(
                    '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[4]/td[2]/text()'
                ).extract_first())

            item_pd['HouseBuildingCount'] = clean_rule1(
                response.xpath(
                    '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[5]/td[1]/text()'
                ).extract_first())

            item_pd['HousingCount'] = clean_rule1(
                response.xpath(
                    '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[5]/td[2]/text()'
                ).extract_first())

            item_pd['GreeningRate'] = clean_rule1(
                response.xpath(
                    '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[6]/td[2]/text()'
                ).extract_first())

            item_pd['FloorAreaRatio'] = clean_rule1(
                response.xpath(
                    '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[6]/td[1]/text()'
                ).extract_first())

            item_pd['PropertyRightsDescription'] = clean_rule1(
                response.xpath(
                    '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[7]/td[2]/text()'
                ).extract_first())

            item_pd['ParkingSpaceAmount'] = clean_rule1(
                response.xpath(
                    '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[8]/td[1]/text()'
                ).extract_first())

            item_pd['ParkingSpaceMatching'] = clean_rule1(
                response.xpath(
                    '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[8]/td[2]/text()'
                ).extract_first())

            # item_pd['EarliestOpeningTime']   = clean_rule1(response.xpath('//*[@id="house"]/div[2]/div[5]/table/tr[8]/td[2]/text()').extract_first())

            # item_pd['FloorArea']             = clean_rule1(response.xpath('//*[@id="house"]/div[3]/dl[1]/dd/table/tr[4]/td[1]/text()').extract_first())

            item_pd['ProjectAddress'] = clean_rule1(
                response.xpath(
                    '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[19]/td/text()').
                extract_first())

            item_pd['TotalBuidlingArea'] = clean_rule1(
                response.xpath(
                    '//*[@id="house"]/div[3]/dl[1]/dd/tr[4]/td[2]/text()').
                extract_first())

            item_pd['ProjectLandNumber'] = clean_rule1(
                response.xpath(
                    '//*[@id="house"]/div[2]/div[5]/table/tr[10]/td[2]/text()'
                ).extract_first())

            item_pd['ManagementFees'] = clean_rule1(
                response.xpath(
                    '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[9]/td[2]/text()'
                ).extract_first())

            item_pd['ManagementCompany'] = clean_rule1(
                response.xpath(
                    '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[9]/td[1]/text()'
                ).extract_first())

            item_pd['EarliestOpeningTime'] = clean_rule1(
                response.xpath(
                    '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[10]/td[1]/text()'
                ).extract_first())

            item_pd['Project_LivingTime'] = clean_rule1(
                response.xpath(
                    '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[10]/td[2]/text()'
                ).extract_first())

            item_pd['Decoration'] = clean_rule1(
                response.xpath(
                    '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[12]/td[1]/text()'
                ).extract_first())

            item_pd['ProjectLandNumber'] = clean_rule1(
                response.xpath(
                    '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[17]/td/text()').
                extract_first())

            item_pd['Project_Feature'] = clean_rule1(
                response.xpath(
                    '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[20]/td/text()').
                extract_first())

            item_pd['Project_Traffic'] = clean_rule2(
                response.xpath('//*[@id="house"]/div[3]/dl[4]/dd/p[1]/text()').
                extract_first())

            item_pd['Project_Introduce'] = clean_rule2(
                response.xpath('//*[@id="house"]/div[3]/dl[2]/dd/p/text()').
                extract_first())

            item_pd['Project_Surround'] = clean_rule2(
                response.xpath('//*[@id="house"]/div[3]/dl[3]/dd/p[1]/text()').
                extract_first())

            item_pd['ProjectUrl'] = response.url

            item_pd['DistrictName'] = clean_district(item_pd['RegionName'])

            outcome_list.append(item_pd)

            return outcome_list

        else:

            return out_come
 def nextPage(self, response):  # parse callback for the intermediate page
     tempUrl = "https://yz.chsi.com.cn" +\
               response.xpath("//div[@class='yxk-index-con']/div[2]/ul/li[last()-2]/a/@href").extract()[0]
     item = response.meta["item"]
     url = tempUrl
     yield Request(url, meta={"item": item}, callback=self.typePase)
Example #30
            'KINGFISHER_PLUCK_MAX_BYTES': 10
        },
                                     release_pointer='/date')
        extension = KingfisherPluck.from_crawler(spider.crawler)
        request = Request('http://example.com',
                          meta={'file_name': 'test.json'})

        extension.bytes_received(data=b'12345', spider=spider, request=request)

        assert extension.total_bytes_received == 5
        assert extension.max_bytes == 10


@pytest.mark.parametrize('test_request,spider_class,attributes', [
    (Request('http://example.com',
             callback=lambda item: item,
             meta={'file_name': 'test.json'}), BaseSpider, {}),
    (Request('http://example.com', meta={'file_name': 'test.rar'
                                         }), CompressedFileSpider, {}),
    (Request('http://example.com', meta={'file_name': 'test.zip'
                                         }), CompressedFileSpider, {}),
    (Request('http://example.com', meta={'file_name': 'test.xlsx'
                                         }), BaseSpider, {
                                             'unflatten': True
                                         }),
    (Request('http://example.com', meta={'file_name': 'test.json'
                                         }), BaseSpider, {
                                             'root_path': 'item'
                                         }),
    (Request('http://example.com', meta={'file_name': 'test.json'
                                         }), BaseSpider, {