def parse(self, response):
    bs = bs4.BeautifulSoup(response.text, "lxml")
    # page-data is a JSON-style attribute such as {"totalPage":100,"curPage":1};
    # read it out as a string first and take totalPage from it
    page_data = response.xpath('//div[@class="page-box house-lst-page-box"]/../@page-data').get()
    number = eval(page_data)['totalPage'] if page_data else 0

    items_div = bs.find("div", attrs={"class": "content"})
    assert isinstance(items_div, Tag)
    for item_li in items_div.find_all('li', attrs={'class': 'clear'}, recursive=True):
        assert isinstance(item_li, Tag)
        item = BeikeItem()  # one fresh item per listing
        details = item_li.find('div', attrs={'class': 'info clear'})
        name = details.find('a', attrs={'class': "VIEWDATA CLICKDATA maidian-detail"}).text  # community name
        type = details.find('div', attrs={'class': 'resblock-name'}).select('span')[-1].text
        location = details.find("div", attrs={"class": "address"})
        address = location.find('a').text
        prices = details.find('div', attrs={'class': 'priceInfo'}).select('span')[-1].text
        price_match = re.match(r'\d+', prices)
        price = price_match.group() if price_match else prices
        try:
            # keep the last number in the houseInfo text (the floor area)
            area = re.findall(r"\d+", details.find('div', attrs={'class': 'houseInfo'}).text)[-1]
        except (AttributeError, IndexError):
            area = 0
        total = details.find('div', attrs={'class': 'priceInfo'}).select('span')[0].text
        print('-------------')
        item['name'] = name
        item['type'] = type
        item['address'] = address
        item['area'] = area
        item['price'] = price
        item['total'] = total
        yield item
        time.sleep(0.6)

    # move on to the next page
    self.y += 1
    if self.y <= number:  # the Beike new-homes site only exposes about 100 pages
        parse_url = self.data_url.format(self.start_pgs[self.x], self.y)
        yield Request(url=parse_url,
                      meta={'dont_redirect': True, 'handle_httpstatus_list': [302]},
                      callback=self.parse,
                      errback=self.err_callback)
    else:
        # this start page is exhausted: reset the page counter and move to the next one
        self.y = 1
        self.x += 1
        if self.x <= len(self.start_pgs) - 1:
            parse_url = self.data_url.format(self.start_pgs[self.x], self.y)
            yield Request(url=parse_url,
                          meta={'dont_redirect': True, 'handle_httpstatus_list': [302]},
                          callback=self.parse,
                          errback=self.err_callback,
                          dont_filter=True)
        else:
            return
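
# Both pagination branches above pass errback=self.err_callback, but that callback is
# not included in this file. A minimal sketch of what it could look like (an assumption,
# not the original author's implementation) simply logs the failed request:
def err_callback(self, failure):
    # failure is a Twisted Failure; Scrapy attaches the originating request to it
    self.logger.error("request failed: %s", failure.request.url)
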
def parse(self, response):
    bs = bs4.BeautifulSoup(response.text, "lxml")
    items_div = bs.find('div', attrs={'class': "resblock-list-container clearfix"})
    assert isinstance(items_div, Tag)
    for item_li in items_div.find_all('li', attrs={'class': 'resblock-list post_ulog_exposure_scroll has-results'}, recursive=True):
        assert isinstance(item_li, Tag)
        item = BeikeItem()  # one fresh item per listing
        details = item_li.find('div', attrs={'class': 'resblock-desc-wrapper'})
        name = details.find('a', attrs={'class': "name"}).text  # community name
        type = details.find('div', attrs={'class': 'resblock-name'}).select('span')[-1].text
        location = details.find("a", attrs={"class": "resblock-location"}).text
        district = location.split('/')[0]
        district = district.split('\n')[1]  # district
        address = location.split('/')[-1]
        address = address.split('\n\t\t')[0]  # street address
        try:
            area = details.find('a', attrs={'class': 'resblock-room'}).select('span')[-1].text
            area = area.split(' ')[1]
        except (AttributeError, IndexError):
            area = '待定'
        price = details.find('span', attrs={'class': 'number'}).text
        try:
            # keep the last number in the "second" line (the total price)
            total = re.findall(r"\d+", details.find('div', attrs={'class': 'second'}).text)[-1]
        except (AttributeError, IndexError):
            total = 0
        print('-------------')
        item['name'] = name
        item['type'] = type
        item['district'] = district
        item['address'] = address
        item['area'] = area
        item['price'] = price
        item['total'] = total
        yield item
        time.sleep(0.2)

    # move on to the next page
    self.start_pg += 1
    if self.start_pg <= 100:  # the Beike new-homes site only has 100 pages
        parse_url = self.data_url.format(self.start_pg)
        yield Request(url=parse_url, callback=self.parse, errback=self.err_callback)
    else:
        return
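
# The two parse() variants above fill name/type/district/address/area/price/total on
# BeikeItem; the project's items.py is not shown in this file. A minimal sketch that
# would accept those fields (an assumption about the real class, which presumably also
# declares the extra fields used by the later parse methods, e.g. detailurl, location,
# href, province, city, rooms):
import scrapy

class BeikeItem(scrapy.Item):
    name = scrapy.Field()      # community name
    type = scrapy.Field()      # property type
    district = scrapy.Field()  # district
    address = scrapy.Field()   # street address
    area = scrapy.Field()      # floor area
    price = scrapy.Field()     # unit price
    total = scrapy.Field()     # total price
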
def parse_data(self, response):
    for each in response.xpath("//div[@class='resblock-name']"):
        item = BeikeItem()
        name = each.xpath("./a/text()").extract()[0]
        detailurl = each.xpath("./a/@href").extract()[0]
        # following-sibling::div[1] is the 1st <div> sibling after the current node
        location_area = each.xpath("./following-sibling::div[1]/span/text()").extract()[0]
        location_block = each.xpath("./following-sibling::div[1]/span/text()").extract()[1]
        location_road = each.xpath("./following-sibling::div[1]/a/text()").extract()[0]
        location = location_area + location_block + location_road
        area = each.xpath("./following-sibling::div[2]/span/text()").extract()
        price_list = each.xpath("./following-sibling::div[5]/div[@class='main-price']/span/text()").extract()
        price = ''
        if len(price_list) > 1:
            price_number = price_list[0]
            price_unit = price_list[1]
            price = price_number + price_unit
        total = each.xpath("./following-sibling::div[5]/div[@class='second']/text()").extract()
        print("debug:", name, location, price, area, total, detailurl)
        item['name'] = name
        item['detailurl'] = "https://ty.fang.ke.com" + detailurl
        item['location'] = location
        item['area'] = area
        item['price'] = price
        item['total'] = total
        yield item
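
# Editorial note: parse_data() above builds the absolute detail URL by prepending
# "https://ty.fang.ke.com" by hand. Scrapy's response.urljoin() resolves the relative
# href against the page that was actually crawled, which avoids hard-coding the host
# (a suggested alternative, not the original code):
#
#     item['detailurl'] = response.urljoin(detailurl)
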
def parse(self, response):
    for each in response.xpath("//div[@class='resblock-name']"):
        item = BeikeItem()
        name = each.xpath("./a/text()").extract()[0]
        detailurl = each.xpath("./a/@href").extract()[0]
        # following-sibling::div[1] is the 1st <div> sibling after the current node
        location_area = each.xpath("./following-sibling::div[1]/span/text()").extract()[0]
        location_block = each.xpath("./following-sibling::div[1]/span/text()").extract()[1]
        location_road = each.xpath("./following-sibling::div[1]/a/text()").extract()[0]
        location = location_area + location_block + location_road
        area = each.xpath("./following-sibling::div[2]/span/text()").extract()
        price_list = each.xpath("./following-sibling::div[5]/div[@class='main-price']/span/text()").extract()
        price = ''
        if len(price_list) > 1:
            price_number = price_list[0]
            price_unit = price_list[1]
            price = price_number + price_unit
        total = each.xpath("./following-sibling::div[5]/div[@class='second']/text()").extract()
        item['name'] = name
        item['detailurl'] = "https://ty.fang.ke.com" + detailurl
        item['location'] = location
        item['area'] = area
        item['price'] = price
        item['total'] = total
        yield item

    if self.page < 22:
        self.page += 1
        newurl = self.url + str(self.page)
        # call parse again to crawl the next page
        yield scrapy.Request(newurl, callback=self.parse, dont_filter=True)
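
# The parse() above relies on the spider attributes self.url and self.page that are
# defined elsewhere in the class. A minimal skeleton they could hang off of is sketched
# below; the class name, spider name, and list-page URL prefix are illustrative
# assumptions, not taken from the original project.
class TyNewHouseSpider(scrapy.Spider):           # hypothetical class name
    name = "beike_ty_newhouse"                   # hypothetical spider name
    page = 1
    url = "https://ty.fang.ke.com/loupan/pg"     # assumed list-page URL prefix
    start_urls = [url + str(page)]
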
def parse(self, response):
    lj = response.meta["url"]
    daqu = response.meta["daqu"]
    qu = response.meta["qu"]
    soup = BeautifulSoup(response.text, "lxml")
    lis = soup.find('div', attrs={"data-component": "list"}).find_all('li', attrs={"class": "clear"})
    for li in lis:
        item = BeikeItem()
        title_a = li.find('div', attrs={"class": "info clear"}).find('div', attrs={"class": "title"}).find('a')
        item['href'] = title_a.get('href')
        item['title'] = title_a.get('title')
        item['totalPrice'] = li.find('div', attrs={"class": "totalPrice"}).find('span').text
        item['unitPrice'] = li.find('div', attrs={"class": "unitPrice"}).get("data-price")
        item['address'] = li.find('div', attrs={"class": "positionInfo"}).find('a').text
        item['address_uri'] = li.find('div', attrs={"class": "positionInfo"}).find('a').get('href')
        item['houseInfo'] = li.find('div', attrs={"class": "houseInfo"}).text.replace('\n', '').replace(' ', '').split("|")
        item['followInfo'] = li.find('div', attrs={"class": "followInfo"}).text.replace('\n', '').replace(' ', '')
        tags = []
        tag_cont = li.find('div', attrs={"class": "tag"}).find_all('span')
        for tag in tag_cont:
            tags.append(tag.text)
        item['tags'] = tags
        item['lj'] = lj
        item['daqu'] = daqu
        item['qu'] = qu
        yield item

    # page-data is a JSON-style attribute such as {"totalPage":100,"curPage":1}
    pages = eval(soup.find('div', attrs={"class": "page-box house-lst-page-box"}).get('page-data'))
    if pages['totalPage'] != pages['curPage']:
        yield scrapy.Request(url=lj + "/pg" + str(int(pages['curPage'] + 1)),
                             callback=self.parse,
                             meta={"url": lj, "qu": qu, "daqu": daqu})
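
# Editorial note: the page-data attribute read above is a JSON string such as
# '{"totalPage":100,"curPage":1}', so json.loads is a safer way to parse it than eval(),
# which executes arbitrary expressions. A drop-in sketch (the helper name is hypothetical):
import json

def read_page_data(soup):
    # return the pagination dict, e.g. {"totalPage": 100, "curPage": 1}
    box = soup.find('div', attrs={"class": "page-box house-lst-page-box"})
    return json.loads(box.get('page-data'))
# usage: pages = read_page_data(soup), then compare pages['curPage'] with pages['totalPage']
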
def parse_zhengzu(self, response):
    province, city, city_part = response.meta.get("info")
    divs = response.xpath("//div[@class='content__article']//div[@class='content__list']/div")
    for div in divs:
        name = div.xpath(".//div/p[1]/a/text()").get()
        if name:
            name = re.sub(r"\s", "", name)
        info_list = div.xpath("./div/p[2]/text()").getall()
        info_list = list(map(lambda x: re.sub(r"\s", "", x), info_list))
        area = "".join(list(filter(lambda x: x.endswith("㎡"), info_list)))
        rooms = "".join(list(filter(lambda x: x.endswith("卫"), info_list)))
        price = "".join(div.xpath("./div/span//text()").getall())
        time = div.xpath("./div/p[4]/text()").get()
        origin_url = div.xpath("./a[1]/@href").get()
        # the href is a path, so the scheme needs "//" after "https:"
        origin_url = "https://" + city_part + ".zu.ke.com" + origin_url
        item = BeikeItem(province=province, city=city, name=name, area=area, rooms=rooms,
                         price=price, time=time, origin_url=origin_url)
        yield item

    # the next-page link is rendered by JavaScript, so load the page in a headless browser;
    # note that webdriver.PhantomJS() is deprecated in recent Selenium releases and
    # headless Chrome/Firefox is the usual replacement
    driver = webdriver.PhantomJS()
    driver.get(response.url)
    sleep(5)
    # elem = WebDriverWait(driver=driver, timeout=20).until(
    #     EC.presence_of_element_located((By.XPATH, "//a[@class='next']"))
    # )
    # print(elem)
    # driver.implicitly_wait(20)
    source = driver.page_source
    print(source)
    html = etree.HTML(source)
    print(html)
    # there may be no "next" link on the last page
    next_links = html.xpath("//a[@class='next']/@href")
    next_url = next_links[0] if next_links else None
    print(next_url)
    driver.quit()
    if next_url:
        print('*' * 100)
        print(next_url)
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_zhengzu,
                             meta={'info': (province, city, city_part)})
        print('*' * 100)


# from scrapy.spiders import CrawlSpider, Rule
# from jianshu.items import JianshuItem
#
# class JianshuSpiderSpider(CrawlSpider):
#     name = 'jianshu_spider'
#     allowed_domains = ['jianshu.com']
#     start_urls = ['http://jianshu.com/']
#
#     rules = (
#         Rule(LinkExtractor(allow=r'.*/p/[0-9a-z][12].*'), callback='parse_detail', follow=True),
#     )
#
#     def parse_detail(self, response):
#         title = response.xpath("//h1[@class='title']/text()").get()
#         avatar = response.xpath("//a[@class='avatar']/img/@src").get()
#         author = response.xpath("//span[@class='name']/a/text()").get()
#         pub_time = response.xpath("//span[@class='publish-time']/text()").get().replace("*", "")
#         # get the article id from the URL
#         url = response.url
#         url1 = url.split("?")[0]
#         article_id = url1.split("/")[-1]
#         # article content including HTML tags, not just the plain text
#         content = response.xpath("//div[@class='show-content']").get()
#         # word_count = response.xpath("//span[@class='wordage']/text()").get()
#         # comment_count = response.xpath("//span[@class='comments-count']/text()").get()
#         # read_count = response.xpath("//span[@class='views-count']/text()").get()
#         # like_count = response.xpath("//span[@class='likes-count']/text()").get()
#         # subjects = ",".join(response.xpath("//div[@class='include-collection']/a/div/text()").getall())
#
#         item = JianshuItem(
#             title=title,
#             avatar=avatar,
#             pub_time=pub_time,
#             author=author,
#             origin_url=response.url,
#             content=content,
#             article_id=article_id,
#             # subjects=subjects,
#             # word_count=word_count,
#             # comment_count=comment_count,
#             # like_count=like_count,
#             # read_count=read_count
#         )
#         yield item


# selenium
# from selenium import webdriver
# from lxml import etree
# import re
# import time
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.common.by import By
#
# class LagouSpider(object):
#     def __init__(self):
#         self.driver = webdriver.Chrome()
#         # Python positions on Lagou
#         self.url = 'https://www.lagou.com/jobs/list_python?city=%E5%8C%97%E4%BA%AC&cl=false&fromSearch=true&labelWords=&suginput='
#         self.position = []
#
#     def run(self):
#         self.driver.get(self.url)
#         while True:
#             source = self.driver.page_source
#             WebDriverWait(driver=self.driver, timeout=20).until(
#                 EC.presence_of_element_located((By.XPATH, "//div[@class='pager_container']/span[last()]"))
#             )
#             self.parse_list_page(source)
#             # click the "next page" button
#             next_btn = self.driver.find_element_by_xpath(
#                 "//div[@class='pager_container']/span[last()]")
#             if "pager_next_disabled" in next_btn.get_attribute("class"):
#                 break
#             else:
#                 next_btn.click()
#             time.sleep(1)
#
#     def parse_list_page(self, source):
#         html = etree.HTML(source)
#         links = html.xpath("//a[@class='position_link']/@href")
#         # detail URL of every position on the current page
#         for link in links:
#             self.request_detail_page(link)
#             time.sleep(1)
#
#     def request_detail_page(self, url):
#         # self.driver.get(url)
#         self.driver.execute_script("window.open('%s')" % url)
#         self.driver.switch_to.window(self.driver.window_handles[1])
#         WebDriverWait(driver=self.driver, timeout=20).until(
#             EC.presence_of_element_located((By.XPATH, "//div[@class='job-name']/span[@class='name']"))
#         )
#         # get the source of the position detail page
#         source = self.driver.page_source
#         self.parse_detail_page(source)
#         # close the current detail tab and switch back to the list page
#         self.driver.close()
#         self.driver.switch_to.window(self.driver.window_handles[0])
#
#     def parse_detail_page(self, source):
#         html = etree.HTML(source)
#         position_name = html.xpath("//span[@class='name']/text()")[0]
#         job_request_spans = html.xpath("//dd[@class='job_request']//span")
#         salary = job_request_spans[0].xpath('.//text()')[0].strip()
#         city = job_request_spans[1].xpath('.//text()')[0].strip()
#         city = re.sub(r"[\s/]", "", city)
#         work_years = job_request_spans[2].xpath('.//text()')[0].strip()
#         work_years = re.sub(r"[\s/]", "", work_years)
#         education = job_request_spans[3].xpath('.//text()')[0].strip()
#         education = re.sub(r"[\s/]", "", education)
#         desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
#         company_name = html.xpath("//h2[@class='fl']/text()")[0].strip()
#         position = {
#             'name': position_name,
#             'company_name': company_name,
#             'salary': salary,
#             'city': city,
#             'work_years': work_years,
#             'education': education,
#             'desc': desc,
#         }
#         self.position.append(position)
#         print(position)
#         print('-' * 200)
#
# if __name__ == '__main__':
#     spider = LagouSpider()
#     spider.run()


# threading
# import requests
# from lxml import etree
# from urllib import request
# import os
# import re
# import threading
# from queue import Queue
#
# class Producer(threading.Thread):
#     headers = {
#         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
#         'Referer': 'https://movie.douban.com/'
#     }
#
#     def __init__(self, page_queue, img_queue, *args, **kwargs):
#         super(Producer, self).__init__(*args, **kwargs)
#         self.page_queue = page_queue
#         self.img_queue = img_queue
#
#     def run(self):
#         while True:
#             if self.page_queue.empty():
#                 break
#             url = self.page_queue.get()
#             self.parse_page(url)
#
#     def parse_page(self, url):
#         response = requests.get(url, headers=self.headers)
#         text = response.text
#         html = etree.HTML(text)
#         imgs = html.xpath("//div[@class='page-content text-center']//img[@class!='gif']")
#         for img in imgs:
#             # print(etree.tostring(img))
#             # image URL
#             img_url = img.get('data-original')
#             # image name
#             alt = img.get('alt')
#             # strip special characters from the name
#             alt = re.sub(r'[\??\.,。!!\*]', '', alt)
#             # file extension of the image (.gif, .jpg)
#             suffix = os.path.splitext(img_url)[1]
#             # full filename used when saving
#             filename = alt + suffix
#             self.img_queue.put((img_url, filename))
#
# class Consumer(threading.Thread):
#     def __init__(self, page_queue, img_queue, *args, **kwargs):
#         super(Consumer, self).__init__(*args, **kwargs)
#         self.page_queue = page_queue
#         self.img_queue = img_queue
#
#     def run(self):
#         while True:
#             if self.img_queue.empty() and self.page_queue.empty():
#                 break
#             img_url, filename = self.img_queue.get()
#             request.urlretrieve(img_url, 'C:/Users/Administrator/Desktop/images/' + filename)
#             print("finished downloading one image")
#
# def main():
#     page_queue = Queue(1000)
#     img_queue = Queue(10000)
#
#     for x in range(1, 1000):
#         url = 'http://www.doutula.com/photo/list/?page=%d' % x
#         page_queue.put(url)
#
#     for x in range(10):
#         t = Producer(page_queue, img_queue)
#         t.start()
#
#     for x in range(10):
#         t = Consumer(page_queue, img_queue)
#         t.start()
#
# if __name__ == '__main__':
#     main()

# download a remote file
# request.urlretrieve(img_url, 'C:/Users/Administrator/Desktop/images/' + filename)