def get_media_requests(self, item, info):
    item['proxy'] = VSpider.proxy
    yield Request(item['src'], meta=item)
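Context that the snippet above assumes but does not show: get_media_requests is a hook of Scrapy's media pipelines, so an override like this normally sits in an ImagesPipeline (or FilesPipeline) subclass that is enabled in the project settings. A minimal sketch of that wiring follows; the module path, pipeline class name, and storage directory are illustrative, while VSpider.proxy and item['src'] are taken from the snippet.

# settings.py (sketch): enable the custom pipeline and point it at a storage directory.
ITEM_PIPELINES = {'myproject.pipelines.ProxyImagesPipeline': 1}
IMAGES_STORE = 'images'

# pipelines.py (sketch)
from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline

class ProxyImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # As in the snippet above: route the image download through the
        # spider-level proxy and carry the whole item along in meta.
        item['proxy'] = VSpider.proxy  # VSpider.proxy is assumed to be defined in the project
        yield Request(item['src'], meta=item)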
def parse(self, response, **kwargs):
    for index in range(16):
        yield Request('https://httpstat.us/200',
                      callback=self.parse_first_callback,
                      dont_filter=True,
                      cb_kwargs={'index': index})
def start_requests(self):  # initial request
    url = "https://yz.chsi.com.cn/sch/"
    yield Request(url)
def process_spider_output(self, response, result, spider):
    def get_dict(string):
        # Parse a ';;'-separated list of ',,'-separated pairs into a dict.
        if string is None:
            return None
        try:
            d = {}
            arr = string.split(';;')
            for a in arr:
                infos = a.split(',,')
                d[infos[1]] = infos[0]
            return d
        except:
            return None

    def get_infos(string):
        # Return the keys of the parsed dict as a comma-separated string.
        d = get_dict(string)
        try:
            values = []
            for key, value in d.items():
                values.append(key)
            return ','.join(values)
        except:
            return ''

    if not (200 <= response.status < 300):  # common case
        return result if result else []
    if response.meta.get('PageType') != 'ProjectInfo':
        return result if result else []
    result = list(result)
    projectInfoItem = ProjectInfoItem()
    projectInfoItem['SourceUrl'] = response.url
    projectInfoItem['ProjectUUID'] = uuid.uuid3(uuid.NAMESPACE_DNS, response.url)
    # Map item fields to the page element ids they are read from; every value
    # is extracted with '//*[@id="<element_id>"]/text()'.
    field_ids = {
        'ProjectNO': 'PROJECT_XMBH',
        'ProjectName': 'PROJECT_XMMC_1',
        'Developer': 'PROJECT_KFQY_NAME',
        'ProjectAddress': 'PROJECT_XMDZ',
        'DistrictName': 'PROJECT_SZQY',
        'FloorAreaRatio': 'PROJECT_RJL',
        'TotalBuidlingArea': 'PROJECT_GHZJZMJ',
        'PlanningAcceptanceDate': 'PROJECT_GHYSRQ',
        'ComprehensiveAcceptanceDate': 'PROJECT_ZHYSRQ',
        'PlanInvest': 'PROJECT_JHZTZ',
        'PresalePermitNumber': 'YSXKZH',
        'SoldAmount': 'YSZTS',
        'UnsoldAmount': 'WSZTS',
        'SoldArea': 'YSZMJ',
        'UnsoldArea': 'WSZMJ',
        'CheckOutAmount': 'TFCS',
        'CheckOutRatio': 'TFL',
        'SellAddress': 'PROJECT_SLCDH',
        'Selltel': 'PROJECT_SLDH',
        'DesignUnit': 'PROJECT_SJDW',
        'ConstructionUnit': 'PROJECT_SGDW',
        'SupervisionUnit': 'PROJECT_JLDW',
        'ManagementCompany': 'PROJECT_WYGLGS',
        'ProjectSupporting': 'PROJECT_XMPT',
        'AroundSupporting': 'PROJECT_ZBPT',
        'ProjectIntro': 'PROJECT_XMJL',
        'TodayHousingSoldAmount': 'ZZ_JRYSTS',
        'TodayHousingSoldArea': 'ZZ_JRYSMJ',
        'HousingSoldAmount': 'ZZ_LJYSTS',
        'HousingSoldArea': 'ZZ_LJYSMJ',
        'HousingUnsoldAmount': 'ZZ_WSTS',
        'HousingUnsoldArea': 'ZZ_WSMJ',
        'TodayShopSoldAmount': 'SY_JRYSTS',
        'TodayShopSoldArea': 'SY_JRYSMJ',
        'ShopSoldAmount': 'SY_LJYSTS',
        'ShopSoldArea': 'SY_LJYSMJ',
        'ShopUnsoldAmount': 'SY_WSTS',
        'ShopUnsoldArea': 'SY_WSMJ',
        'TodayOfficeSoldAmount': 'BG_JRYSTS',
        'TodayOfficeSoldArea': 'BG_JRYSMJ',
        'OfficeSoldAmount': 'BG_LJYSTS',
        'OfficeSoldArea': 'BG_LJYSMJ',
        'OfficeUnsoldAmount': 'BG_WSTS',
        'OfficeUnsoldArea': 'BG_WSMJ',
        'TodayOtherSoldAmount': 'QT_JRYSTS',
        'TodayOtherSoldArea': 'QT_JRYSMJ',
        'OtherSoldAmount': 'QT_LJYSTS',
        'OtherSoldArea': 'QT_LJYSMJ',
        'OtherUnsoldAmount': 'QT_WSTS',
        'OtherUnsoldArea': 'QT_WSMJ',
    }
    for field, element_id in field_ids.items():
        projectInfoItem[field] = response.xpath(
            '//*[@id="{}"]/text()'.format(element_id)).extract_first()
    # State-owned land use certificate
    tdzInfo = response.xpath('//*[@id="tdzInfo"]/@value').extract_first()
    projectInfoItem['CertificateOfUseOfStateOwnedLand'] = get_infos(tdzInfo)
    # Construction permit
    sgxkzInfo = response.xpath('//*[@id="sgxkzInfo"]/@value').extract_first()
    projectInfoItem['ConstructionPermitNumber'] = get_infos(sgxkzInfo)
    # Land use planning permit
    jsydghxkzInfo = response.xpath('//*[@id="ghxkzInfo"]/@value').extract_first()
    projectInfoItem['LandUsePermit'] = get_infos(jsydghxkzInfo)
    # Construction works planning permit
    ghxkzInfo = response.xpath('//*[@id="ghxkzInfo"]/@value').extract_first()
    projectInfoItem['BuildingPermit'] = get_infos(ghxkzInfo)
    result.append(projectInfoItem)
    # Pre-sale permit information
    presellInfo = response.xpath('//*[@id="presellInfo"]/@value').extract_first()
    presellInfo_dict = get_dict(presellInfo)
    if presellInfo_dict:
        for key, value in presellInfo_dict.items():
            if value:
                url = ('http://www.ytfcjy.com/public/project/presellCertInfo.aspx'
                       '?code={code}').format(code=value)
                presell_info_req = Request(
                    url=url,
                    headers=self.settings.get('DEFAULT_REQUEST_HEADERS'),
                    dont_filter=True,
                    meta={
                        'PageType': 'PresellInfo',
                        'ProjectUUID': str(projectInfoItem['ProjectUUID']),
                        'ProjectName': projectInfoItem['ProjectName'],
                    })
                result.append(presell_info_req)
    # Building information
    buildingInfo = response.xpath('//*[@id="buildInfo"]/@value').extract_first()
    buildingInfo_dict = get_dict(buildingInfo)
    if buildingInfo_dict:
        for key, value in buildingInfo_dict.items():
            if value:
                # URL of the sales-control (room list) page for this building.
                url = ('http://www.ytfcjy.com/public/project/RoomList.aspx'
                       '?code={code}&rsr=1001&rse=0&jzmj=&tnmj=').format(code=value)
                buildingInfoItem = BuildingInfoItem()
                buildingInfoItem['SourceUrl'] = url
                buildingInfoItem['ProjectUUID'] = projectInfoItem['ProjectUUID']
                buildingInfoItem['ProjectName'] = projectInfoItem['ProjectName']
                buildingInfoItem['BuildingUUID'] = uuid.uuid3(uuid.NAMESPACE_DNS, url)
                buildingInfoItem['BuildingName'] = key[:key.index('(')]
                buildingInfoItem['BuildingID'] = value
                result.append(buildingInfoItem)
                body = '%3C?xml%20version=%221.0%22%20encoding=%22utf-8%22%20standalone=%22yes%22?%3E%0A%3Cparam%20funname=%22SouthDigital.Wsba.CBuildTableEx.GetBuildHTMLEx%22%3E%0A%3Citem%3E{BuildingID}%3C/item%3E%0A%3Citem%3E1%3C/item%3E%0A%3Citem%3E1%3C/item%3E%0A%3Citem%3E80%3C/item%3E%0A%3Citem%3E720%3C/item%3E%0A%3Citem%3Eg_oBuildTable%3C/item%3E%0A%3Citem%3E%201=1%3C/item%3E%0A%3C/param%3E%0A'
                building_info_req = Request(
                    url='http://www.ytfcjy.com/Common/Agents/ExeFunCommon.aspx?&req={time}'
                    .format(time=round(time.time() * 1000)),
                    headers={
                        'Host': 'www.ytfcjy.com',
                        'Connection': 'keep-alive',
                        'Origin': 'http://www.ytfcjy.com',
                        'Content-Type': 'text/plain;charset=UTF-8',
                        'Accept': '*/*',
                        'Accept-Encoding': 'gzip, deflate',
                        'Accept-Language': 'zh-CN,zh;q=0.9',
                    },
                    dont_filter=True,
                    method='POST',
                    body=body.format(BuildingID=value),
                    meta={
                        'PageType': 'HouseList',
                        'ProjectUUID': str(projectInfoItem['ProjectUUID']),
                        'ProjectName': projectInfoItem['ProjectName'],
                        'BuildingName': buildingInfoItem['BuildingName'],
                        'BuildingUUID': str(buildingInfoItem['BuildingUUID']),
                    })
                result.append(building_info_req)
    return result
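The process_spider_output hooks shown in this section only run once their middleware class is registered with the crawler. A minimal sketch of that registration; the module path and class name are illustrative, not taken from the project, and the priority value is arbitrary.

# settings.py (sketch): register the spider middleware that implements process_spider_output.
SPIDER_MIDDLEWARES = {
    'myproject.middlewares.ProjectInfoHandleMiddleware': 543,
}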
def parse_list(self, response):
    self.logger.debug('parse_list response.url:' + response.url)
    div = response.css('div.f-list.js-tips-list > div')
    self.logger.debug('parse_list div:{} response.url: {}'.format(
        div.css('dd.dd-item.title > a::text').extract(), response.url))
    for i in div:
        item = GanjiShoprentalItem()  # create a fresh item for each listing
        item['title'] = i.css('dd.dd-item.title > a::text').extract_first()
        u = i.css('dd.dd-item.title > a::attr(href)').extract_first()
        if u.startswith('http'):
            item['url'] = u
            item['number'] = item['url'].split('?')[0].split('/')[-2]
        else:
            item['url'] = 'http://sz.ganji.com' + u
            item['number'] = item['url'].split('?')[0].split('/')[-1]
        if i.css('.unit::text').extract_first():
            item['month_price'] = (i.css('.num::text').extract_first() +
                                   i.css('.unit::text').extract_first())
        else:
            item['month_price'] = i.css('.num::text').extract_first()
        item['day_price'] = i.css('.small-price::text').extract_first()
        if i.css('img[data-original]'):
            item['img'] = i.css('img::attr(data-original)').extract_first()
        else:
            item['img'] = i.css('img::attr(src)').extract_first()
        con = i.css('dd.dd-item.size > span::text').extract()
        if len(con) == 3:
            item['area'] = float(
                re.findall(r'[1-9]\d*\.\d*|0\.\d*[1-9]\d*|[1-9]\d*|0',
                           con[0].strip())[0])
            item['floor'] = con[1].strip()
            item['type'] = con[2].strip()
        else:
            item['area'] = 0
            item['floor'] = ''
            item['type'] = ''
        site = i.css('dd.dd-item.address > span.area > a::text').extract()
        if len(site) == 3:
            item['district'] = site[0].strip()
            item['location'] = site[1].strip().strip(' - ')
            item['address'] = site[2].strip().strip(' - ')
        elif len(site) == 2:
            item['district'] = site[0].strip()
            item['location'] = ''
            item['address'] = site[1].strip().strip(' - ')
        dd = i.css('dd.dd-item.source > span::text').extract()
        if len(dd) == 3:
            item['transfer'] = dd[0].strip()
            item['status'] = dd[1].strip().strip(' - ')
            item['industry'] = dd[2].strip().strip(' - ')
        elif len(dd) == 2:
            item['transfer'] = dd[0].strip()
            item['status'] = dd[1].strip().strip(' - ')
            item['industry'] = ''
        yield Request(url=item['url'], meta={'item': item},
                      callback=self.parse_details)
    le = LinkExtractor(restrict_css='ul.pageLink a.next')
    links = le.extract_links(response)
    if links:
        next_url = links[0].url
        self.logger.debug('next_url:' + next_url)
        yield Request(next_url, callback=self.parse_list)
def parse(self, response):
    item = ClubCircleDetailScrapyItem()
    yield Request(url=response.url,
                  callback=self.parse_club_circle_detail_items,
                  meta={"item": copy.deepcopy(item)},
                  dont_filter=True)
def start_requests(self):
    for year in range(2007, 2017):
        yield Request(self.BASE_URL(year), meta={'year': year})
def parseNews(self, response):
    self.response_body_decode(response)
    sel = Selector(response)
    homeurl = tools.getHomeUrl(response.url)
    brandname = response.meta['brandname']
    news = None       # SelectorList holding the main body of the news article
    pagerule = None   # the page rule, once one has been determined
    if 'pagerule' in response.meta:
        pagerule = response.meta['pagerule']
        news = sel.xpath(pagerule['pageform'])
    else:
        # Try each rule in the news-page rule library and crawl the page with
        # the first rule that matches.
        for each_rule in newspage_type.page_rules:
            news = sel.xpath(each_rule['pageform'])
            if len(news) > 0:
                pagerule = each_rule
                break
    if pagerule is None:
        raise ValueError('Error processing (' + response.url +
                         ') This page does not have corresponding rules')
    # Get the "all pages" and "next page" URLs.
    if pagerule['allpage'] is None:
        allpage = []
    else:
        allpage = news.xpath(pagerule['allpage']).extract()
    if pagerule['nextpage'] is None:
        nextpage = []
    else:
        nextpage = news.xpath(pagerule['nextpage']).extract()
    if len(allpage) > 0:
        # The page links to a "read the full article on one page" URL; follow it.
        if tools.isCompleteUrl(allpage[0]):
            url = allpage[0]
        else:
            url = homeurl + allpage[0]
        r = Request(url, callback=self.parseNews)
        r.meta['brandname'] = brandname
        r.meta['pagerule'] = pagerule
        yield r
    elif len(nextpage) > 0:
        # The article continues on a next page.
        if tools.isCompleteUrl(nextpage[0]):
            url = nextpage[0]
        else:
            url = homeurl + nextpage[0]
        # Extract the title, date and content of the current page, store them
        # in `article`, and pass it along to the next request.
        title = news.xpath(pagerule['title']).extract()
        date = self.getDate(news, response.url, pagerule['date'])
        content = self.getContent(news, pagerule['content'])
        article = {
            'brandname': brandname,
            'title': title,
            'date': date,
            'content': content
        }
        r = Request(url, callback=self.parseNextPage)
        r.meta['article'] = article
        r.meta['pagerule'] = pagerule
        yield r
    else:
        # Single-page article: extract the content directly.
        title = news.xpath(pagerule['title']).extract()
        date = self.getDate(news, response.url, pagerule['date'])
        content = self.getContent(news, pagerule['content'])
        item = NewsItem()
        item['brandname'] = brandname
        item['date'] = date
        item['title'] = "".join(title)
        item['content'] = "".join(content)
        yield item
def get_novel_url(self, response):
    novel_urls = response.xpath(
        '//*[@id="results"]/div[3]/div/div[2]/h3/a/@href').extract()
    for i in novel_urls:
        yield Request(i, callback=self.parse_novel_link)
def start_requests(self):
    yield Request(url=self.START_URL,
                  callback=self.parse_page,
                  dont_filter=True,
                  headers=self.headers)
def parse_list(self, response):
    pageNo = response.meta['pageNo']
    rows = response.xpath('//*[@id="data_list"]/tr')
    if len(rows) == 1 and response.xpath(
            '//*[@id="data_list"]/tr/td/text()').extract_first() == u'没有可显示资源':
        self.logger.info('---%s, no data---' % response.url)
        return
    self.logger.info('---pageNo:%s,%s---' % (pageNo, response.url))
    record_not_exist = True
    for row in rows:
        if len(row.xpath('td')) == 1:
            continue
        movie = MovieItem()
        movie['site'] = self.site
        movie['type'] = row.xpath('td[2]/a/text()').extract_first()
        movie['full_name'] = row.xpath(
            'td[3]/a/text()').extract_first().replace('\r', '').replace(
                '\n', '').strip()
        movie['name'] = movie['full_name'].split('.')[0]
        movie['total'] = row.xpath('td[4]/text()').extract_first()
        movie['page_url'] = self.base + row.xpath(
            'td[3]/a/@href').extract_first()
        movie['id'] = '%s_%s' % (self.site,
                                 re.search(r'\?hash=(\w+)',
                                           movie['page_url']).group(1))
        if row.xpath('td[8]/a/text()').extract_first() != u'高清MP4吧':
            print(row.xpath('td[8]/a/text()').extract_first())
        if self.blogs.isExistsMoviesByid(movie['id']):
            record_not_exist = False
            continue
        else:
            self.logger.info('------latest file, type:%s, title:%s-------' %
                             (movie['type'], movie['full_name']))
            yield Request(
                movie['page_url'],
                meta={
                    'type': 'detail',
                    'movie': movie,
                    'download_path': response.meta['download_path']
                },
                dont_filter=True,
                headers={
                    'User-Agent':
                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
                })
    next_page = response.xpath(
        '//*[@class="pages clear"]/a[@class="nextprev"]/@href').extract()
    if next_page and record_not_exist:
        next_page_no = int(re.search(r'page=(\d+)', next_page[-1]).group(1))
        if next_page_no > pageNo:
            yield Request(
                "http://www.mp4ba.com/%s" % next_page[-1],
                meta={
                    'type': 'list',
                    'pageNo': next_page_no,
                    'download_path': response.meta['download_path']
                },
                dont_filter=True,
                headers={
                    'User-Agent':
                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
                })
def parse(self, response):
    for page_index in range(1, 500):
        yield Request(url='https://www.luoqiu.com/top/lastupdate_' +
                      str(page_index) + '.html',
                      callback=self.parse_page)
def start_requests(self):
    for url in self.start_urls:
        yield Request(url, headers=self.headers)
def start_requests(self):
    def mk_url_headers_body(page, key):
        def quote_val(url):
            # Percent-encode every query-string value in place.
            return re.sub(
                r'([\?&][^=&]*=)([^&]*)', lambda i: i.group(1) + quote(
                    unquote(i.group(2), encoding='utf-8'), encoding='utf-8'),
                url)

        url = ('https://www.google.com/_/VisualFrontendUi/data/batchexecute'
               '?rpcids=HoAMBc'
               '&f.sid=1484552913048631198'
               '&bl=boq_visualfrontendserver_20200214.01_p1'
               '&hl=en-US'
               '&authuser'
               '&soc-app=162'
               '&soc-platform=1'
               '&soc-device=1'
               '&_reqid=142196'
               '&rt=c')
        url = quote_val(url)
        headers = {
            "accept": "*/*",
            # 'br' is deliberately left out: requests/Scrapy cannot decode Brotli here.
            "accept-encoding": "gzip, deflate, ",
            "accept-language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
            "content-type": "application/x-www-form-urlencoded;charset=UTF-8",
            "cookie": ("DV=MzB0UiguEPQtwDWaKe0v0OimEEJmBddGUz1EEumeBAIAAAA; "
                       "NID=198=SU1N0X0TlV1FfnWH1NVNJ2OmHzx7hnM2Bb1Iwgkh-7h6wpDFwIsf8DK1vZOwU7G4ZaGc6bGca2ZsdoMbB8uhfezMREiX9T53Ldv0GOq-KXT3q9Z4Y18rDl5Coes2SoHfd69mtDk7XmmtFgi0z0s8Zh-GyRA02IPjbrMqZrSnv2k; "
                       "OTZ=5328196_24_24__24_; "
                       "1P_JAR=2020-02-18-03"),
            "origin": "https://www.google.com",
            "referer": "https://www.google.com/",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36",
            "x-client-data": "CLO1yQEIhrbJAQiitskBCMG2yQEIqZ3KAQjiqMoBCMuuygEIyq/KAQjOsMoBCPe0ygEIjrrKAQ==",
            "x-goog-ext-190139975-jspb": "[\"ZZ\"]",
            "x-same-domain": "1"
        }
        body = {
            "f.req":
            "[[[\"HoAMBc\",\"[null,null,[" + str(page) +
            ",null,450,1,1280,[[\\\"m_jkppg-2ek_iM\\\",200,200,-2147352321]],[],[],null,null,null,534],\\\"\\\",\\\"\\\",null,null,null,null,null,null,\\\"\\\",null,null,null,null,null,null,null,null,null,null,null,null,\\\"\\\",null,null,null,[\\\"" +
            key +
            "\\\",\\\"\\\",null,\\\"\\\",\\\"\\\",\\\"\\\",\\\"\\\",\\\"\\\",\\\"\\\",null,null,\\\"\\\",\\\"\\\",\\\"\\\",\\\"\\\"]]\",null,\"generic\"]]]"
        }
        return url, headers, body

    keys = ['狗']
    for key in keys:
        for page in range(1, 80):
            url, headers, body = mk_url_headers_body(page, key)
            meta = {}
            meta['key'] = key
            meta['proxy'] = self.proxy
            r = Request(
                url,
                method='POST',
                headers=headers,
                body=urlencode(body),
                callback=self.parse,
                meta=meta,
            )
            yield r
            break  # debug limit: only the first page ...
        break      # ... of the first keyword
def process_spider_output(self, response, result, spider):
    out_come = cheack_response(pagetype=['bd_url', 'bd_url2'],
                               response=response,
                               result=result)
    outcome_list, pagetype = [], response.meta['PageType']
    if (out_come == 'right') and (pagetype == 'bd_url'):
        cheack_key = response.xpath(
            '//*[@id="house"]/div[3]/dl/dd/table/tr[2]/td/font/text()'
        ).extract_first()
        record_dict = response.meta['Record_Data']
        if cheack_key != '暂无数据!':  # "no data yet"
            BuildingItem = Building_Detail_Item()
            item_cd = Certificate_Detail_Item()
            BuildingItem['ProjectName'] = record_dict['ProjectName']
            BuildingItem['ProjectUUID'] = record_dict['ProjectUUID']
            item_cd['ProjectName'] = record_dict['ProjectName']
            item_cd['ProjectUUID'] = record_dict['ProjectUUID']
            content = response.xpath('//*[@id="house"]/div[3]/dl/dd/table/tr')
            for i in content[1:]:
                item_cd['PresalePermitUrl'] = clean_rule1(
                    i.xpath('./td[4]/a/@href').extract_first())
                item_cd['PresalePermitNumber'] = clean_rule1(
                    i.xpath('./td[4]/a/text()').extract_first())
                BuildingItem['PresalePermitNumber'] = item_cd['PresalePermitNumber']
                BuildingItem['SourceUrl'] = response.url
                if item_cd['PresalePermitNumber'] != '':
                    item_cd['PresalePermitNumberUUID'] = str(
                        uuid.uuid3(uuid.NAMESPACE_DNS,
                                   item_cd['PresalePermitNumber']))
                    BuildingItem['PresalePermitNumberUUID'] = item_cd[
                        'PresalePermitNumberUUID']
                    cheack_key2 = re.findall(r'ysz_id',
                                             item_cd['PresalePermitUrl'])
                    if cheack_key2:
                        item_cd['PresalePermitUrl'] = (
                            'http://newhouse.fz0752.com' +
                            item_cd['PresalePermitUrl'])
                    re_get2 = Request(url=item_cd['PresalePermitUrl'],
                                      method='GET',
                                      headers=self.headers,
                                      meta={
                                          'PageType': 'cd_url',
                                          "item": item_cd
                                      },
                                      dont_filter=True)
                    outcome_list.append(re_get2)
                else:
                    BuildingItem['PresalePermitNumberUUID'] = ''
                BuildingItem['BuildingName'] = clean_rule1(
                    i.xpath('./td[2]/span/text()').extract_first())
                BuildingItem['BuildingNumber'] = clean_rule1(
                    i.xpath('./td[3]/text()').extract_first())
                BuildingItem['BuildingUrl'] = clean_rule1(
                    i.xpath('./td[8]/a/@href').extract_first())
                # Fall back to the project link when the filing link is empty.
                if BuildingItem['BuildingUrl'] == '':
                    BuildingItem['BuildingUrl'] = (
                        'http://data.fz0752.com' +
                        clean_rule1(i.xpath('./td[7]/a/@href').extract_first()))
                BuildingItem['BuildingUUID'] = uuid.uuid3(
                    uuid.NAMESPACE_DNS,
                    BuildingItem['ProjectName'] + BuildingItem['BuildingName'] +
                    BuildingItem['BuildingNumber'] +
                    BuildingItem['PresalePermitNumber'])
                BuildingItem['SourceUrl'] = response.url
                outcome_list.append(BuildingItem)
        now_page = response.xpath(
            '//*[@id="house"]/div[4]/div/span/text()').extract_first()
        next_page = int(now_page) + 1 if now_page else 0
        cheack_page = response.xpath(
            '//*[@id="house"]/div[4]/div/a[contains(@href,"javascript:goPage({0})")]/@href'
            .format(next_page)).extract()
        if cheack_page:
            url_next = ("http://newhouse.fz0752.com/project/selist.shtml"
                        "?num={0}&old=&pageNO={1}").format(
                            record_dict['ProjectID'], next_page)
            re_get = Request(url=url_next,
                             method='GET',
                             headers=self.headers,
                             meta={
                                 'PageType': 'bd_url',
                                 "Record_Data": record_dict
                             },
                             dont_filter=True)
            outcome_list.append(re_get)
        return outcome_list
    else:
        return out_come
def start_requests(self):
    for i in range(5):
        start_url = ('http://zhannei.baidu.com/cse/search'
                     '?q=&p={0}&s=920895234054625192&srt=def&nsid=0&entry=1').format(i)
        yield Request(start_url, callback=self.get_novel_url)
def process_spider_output(self, response, result, spider):
    out_come = cheack_response(pagetype=['pl_url'],
                               response=response,
                               result=result)
    if out_come == 'right':
        outcome_list, pagetype = [], response.meta['PageType']
        print('ProjectGetFidMiddleware')
        if pagetype == 'pl_url':
            content = response.xpath('//*[@id="searchResult"]/dd/div')
            now_page = int(
                response.xpath(
                    '//*[@id="searchResult"]/dt[2]/h2/div/text()'
                ).extract_first())
            crawler_page = response.xpath(
                '//*[@id="searchResult"]/dt[2]/h2/a[last()]/text()').re(
                    r'\d+')[0]
            crawler_page = int(crawler_page)
            # Only the first page triggers pagination over the remaining pages.
            if now_page == 1:
                for page in range(2, crawler_page + 1):
                    next_url = ('http://newhouse.fz0752.com/project/list.shtml'
                                '?state=&key=&qy=&area=&danjia=&func=&fea=&type=&kp=&mj=&pageNO={0}').format(page)
                    list_req = Request(url=next_url,
                                       method='GET',
                                       headers=self.headers,
                                       meta={'PageType': 'pl_url'},
                                       dont_filter=True)
                    outcome_list.append(list_req)
            # Collect the projects listed on the current page.
            for i in content:
                item_pd = Project_Detail_Item()
                ProjectName = clean_rule1(
                    i.xpath('./h1/span[1]/a/text()').extract_first())
                ProjectUUID = uuid.uuid3(uuid.NAMESPACE_DNS, ProjectName)
                ProjectUrl = clean_rule1(
                    i.xpath('./h1/span[1]/a/@href').extract_first())
                Developer = clean_rule1(
                    i.xpath('./h2[2]/a/text()').extract_first())
                phone = clean_rule1(
                    i.xpath('./h4/span/text()').extract_first())
                if ProjectUrl:
                    item_pd['ProjectName'] = ProjectName
                    item_pd['ProjectUUID'] = ProjectUUID
                    item_pd['ProjectUrl'] = ProjectUrl
                    item_pd['Developer'] = Developer
                    item_pd['SaleTelphoneNumber'] = phone
                    re_get = Request(url=item_pd['ProjectUrl'],
                                     method='GET',
                                     headers=self.headers,
                                     meta={
                                         'PageType': 'pd_url',
                                         "item": item_pd
                                     },
                                     dont_filter=True)
                    outcome_list.append(re_get)
        return outcome_list
    else:
        return out_come
def get_media_requests(self, item, info):
    yield Request(item['url'])
log.warning("%s is not a news item" % response.url) return date_elem = soup.find('p', class_='noticia-data') if not hasattr(date_elem, "parent"): log.warning("Skipped %s" % response.url) return content_elem = date_elem.parent item['datetime'] = make_aware( datetime.strptime(date_elem.text.strip(), '%d-%m-%Y')) content = str(content_elem.find('div', class_="noticia-corpo")) content = mdconverter.handle(content).strip() item['content'] = content item['html'] = response.text item['source'] = response.url img_elem = content_elem.find('img', class_="imagem-noticia") if img_elem is None: yield item else: yield Request(img_elem.attrs['src'], self.parse_image, meta={'item': item}) def parse_image(self, response): body = response.body item = response.meta['item'] if body is not None and body != b'': item['image_data'] = body item['image_filename'] = response.url.split('/')[-1] yield item
def sub_category_parse(self, response):
    '''Parse a sub-category page and yield the parent category, the current
    category, and the URL and number of titles at this level. Then, if none
    of the sub-categories are in bold (indicating the end of the path), each
    sub-category is scraped and passed recursively to this function.
    '''
    audible_url = 'https://www.audible.com'
    # Attempt to get the parent category if available; otherwise tag with
    # Audible as the root.
    try:
        parent_category = response.xpath(
            ".//div[@id='center-0']//a[contains(@class,'parentCategoryUrl')]/text()"
        ).extract()[0]
        parent_url = audible_url + response.xpath(
            ".//div[@id='center-0']//a[contains(@class,'parentCategoryUrl')]/@href"
        ).extract()[0]
    except:
        parent_category = "Audible"
        parent_url = "https://www.audible.com/categories"
    # Get the category name.
    cat_name = response.xpath(
        ".//div[@id = 'center-0']//h1/text()").extract()[0]
    # Get the number of titles and clean the input.
    cat_number = response.xpath(
        ".//div[@id = 'center-0']//span/text()").extract()[0]
    cat_number = re.sub("[^0-9]", "", cat_number)
    # Get the Best Sellers list URL, or the "See all in __" URL as an alternative.
    try:
        title_list_link = audible_url + response.xpath(
            ".//a[contains(@aria-label, 'View all in Best sellers')][1]/@href"
        ).extract()[0]
    except:
        title_list_link = response.xpath(
            ".//a[contains(@class,'allInCategoryPageLink')]/@href"
        ).extract()[0]
        title_list_link = audible_url + title_list_link
    # Detect whether any sub-category is in bold text, which indicates that
    # there are no further sub-categories.
    sub_cat_bold = response.xpath(
        ".//div[@id='center-0']//ul[contains(@class,'bc-list')]//span[contains(@class,'bc-text-bold')]"
    )
    leaf_flag = bool(sub_cat_bold)
    # Create the item and store the information for CSV storage.
    category_entry = CategoryItem()
    category_entry['parent_category'] = parent_category
    category_entry['parent_url'] = parent_url
    category_entry['self_url'] = response.url
    category_entry['category_name'] = cat_name
    category_entry['category_numb_title'] = cat_number
    category_entry['title_list_url'] = title_list_link
    category_entry['leaf_flag'] = leaf_flag
    yield category_entry
    # Sub-category URL list for the current page.
    sub_cat_list = response.xpath(
        ".//div[@id='center-0']//ul[contains(@class,'bc-list')]//li[@class='bc-list-item']/a/@href"
    ).extract()
    # If there are no bold sub-categories, request each sub-category page and
    # parse it recursively with sub_category_parse.
    if not sub_cat_bold:
        for sub_cat in sub_cat_list:
            url = audible_url + sub_cat
            yield Request(url=url, callback=self.sub_category_parse)
def get_media_requests(self, item, info):
    for image_url in item['image_urls']:
        yield Request(image_url)
def start_requests(self):
    url_str = 'https://www.51job.com/zhengzhou/'
    yield Request(url=url_str,
                  callback=self.parse,
                  dont_filter=True,
                  meta={'page': '0'})
def process_spider_output(self, response, result, spider):
    def get_total_page(response):
        total_page = 1
        if debug:
            return 1
        try:
            t = response.xpath(
                '//*[@id="PageNavigator1_LblPageCount"]/text()'
            ).extract_first()
            total_page = int(t)
        except:
            import traceback
            traceback.print_exc()
        return total_page

    if not (200 <= response.status < 300):  # common case
        return result if result else []
    if response.meta.get('PageType') != 'ProjectBase':
        return result if result else []
    result = list(result)
    if response.request.method == 'GET':
        total_page = get_total_page(response)
        for page in range(1, total_page + 1):
            req_dict = {
                'PageNavigator1$txtNewPageIndex': str(page),
                'txtPrjName': '',
                'txtYsxkz': '',
                'txtKfsName': '',
                'txtPrjAdress': '',
                '__EVENTARGUMENT': '',
                '__EVENTTARGET': 'PageNavigator1$LnkBtnGoto',
                '__EVENTVALIDATION': response.xpath(
                    '//*[@id="__EVENTVALIDATION"]/@value').extract_first(),
                '__VIEWSTATE': response.xpath(
                    '//*[@id="__VIEWSTATE"]/@value').extract_first(),
            }
            req = Request(
                url='http://www.ytfcjy.com/public/project/ProjectList.aspx',
                headers=self.settings.get('POST_DEFAULT_REQUEST_HEADERS'),
                body=urlparse.urlencode(req_dict),
                method='POST',
                dont_filter=True,
                meta={'PageType': 'ProjectBase'})
            result.append(req)
    else:
        tr_arr = response.xpath('//tr[@class="TR_BG_list"]')
        for tr in tr_arr:
            projectBaseItem = ProjectBaseItem()
            href = ('http://www.ytfcjy.com/public/project/' +
                    tr.xpath('td[2]/a/@href').extract_first())
            projectBaseItem['SourceUrl'] = href
            projectBaseItem['ProjectUUID'] = uuid.uuid3(uuid.NAMESPACE_DNS, href)
            projectBaseItem['ProjectName'] = tr.xpath(
                'td[2]/a/u/text()').extract_first()
            projectBaseItem['Developer'] = tr.xpath(
                'td[3]/text()').extract_first()
            projectBaseItem['ProjectAddress'] = tr.xpath(
                'td[4]/text()').extract_first()
            projectBaseItem['PresalePermitNumber'] = tr.xpath(
                'td[5]/text()').extract_first()
            projectBaseItem['SoldAmount'] = tr.xpath(
                'td[6]/text()').extract_first()
            projectBaseItem['UnsoldAmount'] = tr.xpath(
                'td[7]/text()').extract_first()
            result.append(projectBaseItem)
            projectInfo_req = Request(
                url=projectBaseItem['SourceUrl'],
                headers=self.settings.get('DEFAULT_REQUEST_HEADERS'),
                dont_filter=True,
                meta={'PageType': 'ProjectInfo'})
            result.append(projectInfo_req)
    return result
def start_requests(self):
    for url in self.start_urls:
        yield Request(url, headers=self.headers, dont_filter=True)
def next_request(self, message: BaseRmqMessage) -> Request:
    return Request('https://httpstat.us/200', dont_filter=True)
def start_requests(self):
    for base_url in self.base_urls:
        yield Request(url=base_url, callback=self.parse)
def parse_first_callback(self, response, index: int):
    self.logger.info(f'INDEX - {index}')
    yield Request('https://httpstat.us/201',
                  callback=self.parse_second_callback,
                  dont_filter=True)
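parse_second_callback is referenced above but not included in the source; a minimal sketch of what such a callback could look like, with the body assumed purely for illustration:

def parse_second_callback(self, response):
    # Hypothetical follow-up: just record that the second request completed.
    self.logger.info(f'SECOND - {response.status}')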
def process_spider_output(self, response, result, spider):
    out_come = cheack_response(pagetype=['pd_url', 'pd_url2'],
                               response=response,
                               result=result)
    outcome_list, pagetype = [], response.meta['PageType']

    def clean_district(status):
        # Map a sub-district / town name to the administrative district it belongs to.
        if status in ("河南岸街道办", "江北街道办", "江南街道办", '桥东街道办', '桥西街道办',
                      '水口街道办', '小金口街道办', '龙丰街道办', '汝湖镇', '马安镇', '三栋镇',
                      '横沥镇', '芦洲镇'):
            return '惠城区'
        elif status in ("惠环街道办", "陈江街道办", "潼湖镇", "潼侨镇", "沥林镇", "东江科技园",
                        "惠南科技园"):
            return '仲恺区'
        elif status in ("澳头街道办", "西区街道办", "霞涌街道办", "大亚湾中心区"):
            return "大亚湾"
        elif status in ("罗阳镇", "石湾镇", "罗浮山", "园洲镇", "龙溪镇", "杨村镇", "泰美镇",
                        "长宁镇", "观音阁镇", "石坝镇", "麻陂镇", "公庄镇", "湖镇镇", "横河镇",
                        "龙华镇", "福田镇", "柏塘镇", "杨侨镇"):
            return "博罗县"
        elif status in ("平山街道", "巽寮滨海旅游度假区", "港口滨海旅游度假区", "稔山镇", "平海镇",
                        "黄埠镇", "多祝镇", "白花镇", "安墩镇", "大岭镇", "梁化镇", "铁涌镇",
                        "吉隆镇", "宝口镇", "高潭镇", "白盆珠镇"):
            return "惠东县"
        else:
            return "龙门县"

    if (out_come == 'right') and (pagetype == 'pd_url'):
        item_pd = response.meta['item']
        url_next = response.xpath(
            '//*[@id="house"]/div[7]/dl[3]/dt/div/a/@href').extract_first()
        re_get = Request(url=url_next,
                         method='GET',
                         headers=self.headers,
                         meta={
                             'PageType': 'pd_url2',
                             "item": item_pd
                         },
                         dont_filter=True)
        outcome_list.append(re_get)
        return outcome_list
    elif (out_come == 'right') and (pagetype == 'pd_url2'):
        item_pd = response.meta['item']
        item_pd["RealEstateProjectID"] = re.findall(r'num\=(.+)', response.url)[0]
        item_pd['RegionName'] = clean_rule1(response.xpath(
            '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[1]/td[1]/text()').extract_first())
        item_pd['ProjectBlock'] = clean_rule1(response.xpath(
            '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[1]/td[2]/text()').extract_first())
        item_pd['BuildingType'] = clean_rule1(response.xpath(
            '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[2]/td[1]/text()').extract_first())
        item_pd['ProjectHouseType'] = clean_rule1(response.xpath(
            '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[2]/td[2]/text()').extract_first())
        item_pd["AveragePrice"] = clean_rule1(response.xpath(
            '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[3]/td[1]/text()').extract_first())
        item_pd['ProjectMainShape'] = clean_rule1(response.xpath(
            '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[3]/td[2]/text()').extract_first())
        item_pd['FloorArea'] = clean_rule1(response.xpath(
            '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[4]/td[1]/text()').extract_first())
        item_pd['TotalBuidlingArea'] = clean_rule1(response.xpath(
            '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[4]/td[2]/text()').extract_first())
        item_pd['HouseBuildingCount'] = clean_rule1(response.xpath(
            '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[5]/td[1]/text()').extract_first())
        item_pd['HousingCount'] = clean_rule1(response.xpath(
            '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[5]/td[2]/text()').extract_first())
        item_pd['FloorAreaRatio'] = clean_rule1(response.xpath(
            '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[6]/td[1]/text()').extract_first())
        item_pd['GreeningRate'] = clean_rule1(response.xpath(
            '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[6]/td[2]/text()').extract_first())
        item_pd['PropertyRightsDescription'] = clean_rule1(response.xpath(
            '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[7]/td[2]/text()').extract_first())
        item_pd['ParkingSpaceAmount'] = clean_rule1(response.xpath(
            '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[8]/td[1]/text()').extract_first())
        item_pd['ParkingSpaceMatching'] = clean_rule1(response.xpath(
            '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[8]/td[2]/text()').extract_first())
        item_pd['ProjectAddress'] = clean_rule1(response.xpath(
            '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[19]/td/text()').extract_first())
        item_pd['TotalBuidlingArea'] = clean_rule1(response.xpath(
            '//*[@id="house"]/div[3]/dl[1]/dd/tr[4]/td[2]/text()').extract_first())
        item_pd['ProjectLandNumber'] = clean_rule1(response.xpath(
            '//*[@id="house"]/div[2]/div[5]/table/tr[10]/td[2]/text()').extract_first())
        item_pd['ManagementFees'] = clean_rule1(response.xpath(
            '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[9]/td[2]/text()').extract_first())
        item_pd['ManagementCompany'] = clean_rule1(response.xpath(
            '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[9]/td[1]/text()').extract_first())
        item_pd['EarliestOpeningTime'] = clean_rule1(response.xpath(
            '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[10]/td[1]/text()').extract_first())
        item_pd['Project_LivingTime'] = clean_rule1(response.xpath(
            '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[10]/td[2]/text()').extract_first())
        item_pd['Decoration'] = clean_rule1(response.xpath(
            '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[12]/td[1]/text()').extract_first())
        item_pd['ProjectLandNumber'] = clean_rule1(response.xpath(
            '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[17]/td/text()').extract_first())
        item_pd['Project_Feature'] = clean_rule1(response.xpath(
            '//*[@id="house"]/div[3]/dl[1]/dd/table/tr[20]/td/text()').extract_first())
        item_pd['Project_Traffic'] = clean_rule2(response.xpath(
            '//*[@id="house"]/div[3]/dl[4]/dd/p[1]/text()').extract_first())
        item_pd['Project_Introduce'] = clean_rule2(response.xpath(
            '//*[@id="house"]/div[3]/dl[2]/dd/p/text()').extract_first())
        item_pd['Project_Surround'] = clean_rule2(response.xpath(
            '//*[@id="house"]/div[3]/dl[3]/dd/p[1]/text()').extract_first())
        item_pd['ProjectUrl'] = response.url
        item_pd['DistrictName'] = clean_district(item_pd['RegionName'])
        outcome_list.append(item_pd)
        return outcome_list
    else:
        return out_come
def nextPage(self, response):  # parse the intermediate page
    tempUrl = "https://yz.chsi.com.cn" + response.xpath(
        "//div[@class='yxk-index-con']/div[2]/ul/li[last()-2]/a/@href"
    ).extract()[0]
    item = response.meta["item"]
    url = tempUrl
    yield Request(url, meta={"item": item}, callback=self.typePase)
        'KINGFISHER_PLUCK_MAX_BYTES': 10
    }, release_pointer='/date')
    extension = KingfisherPluck.from_crawler(spider.crawler)
    request = Request('http://example.com', meta={'file_name': 'test.json'})

    extension.bytes_received(data=b'12345', spider=spider, request=request)

    assert extension.total_bytes_received == 5
    assert extension.max_bytes == 10


@pytest.mark.parametrize('test_request,spider_class,attributes', [
    (Request('http://example.com', callback=lambda item: item,
             meta={'file_name': 'test.json'}), BaseSpider, {}),
    (Request('http://example.com', meta={'file_name': 'test.rar'}),
     CompressedFileSpider, {}),
    (Request('http://example.com', meta={'file_name': 'test.zip'}),
     CompressedFileSpider, {}),
    (Request('http://example.com', meta={'file_name': 'test.xlsx'}),
     BaseSpider, {'unflatten': True}),
    (Request('http://example.com', meta={'file_name': 'test.json'}),
     BaseSpider, {'root_path': 'item'}),
    (Request('http://example.com', meta={'file_name': 'test.json'}),
     BaseSpider, {