Example #1
    def parse(self, response):
        bs = bs4.BeautifulSoup(response.text, 'lxml')  # specify a parser explicitly
        # numbers = bs.find('div',attrs={'class':'page-box house-lst-page-box'})
        # number = response.css('.page-box house-lst-page-box div::attr(page-data)').extract()
        # the page-box div carries a page-data attribute holding a dict like {"totalPage": ..., "curPage": ...}
        page_data = response.xpath('//div[@class="page-box house-lst-page-box"]/@page-data').get()
        number = json.loads(page_data)['totalPage'] if page_data else 1  # total page count; requires import json at module level

        # find('div', attrs={'class': 'se-link-container'})
        item = BeikeItem()
        items_div = bs.find("div", attrs={"class": "content"})  # find() returns a single Tag, so find_all() can be chained on it
        # assert isinstance(items_div, Tag)
        listings = items_div.find_all('li', attrs={'class': 'clear'}, recursive=True)
        for item_li in listings:
            assert isinstance(item_li, Tag)
            details = item_li.find('div', attrs={'class': 'info clear'})
            name = details.find('a', attrs={'class': "VIEWDATA CLICKDATA maidian-detail"}).text  # community name
            type = details.find('div', attrs={'class': 'resblock-name'}).select('span')[-1].text
            location = details.find("div", attrs={"class": "address"})  # keep the Tag so the <a> inside can still be queried
            address = location.select_one('a').text
            prices = details.find('div', attrs={'class': 'priceInfo'}).select('span')[-1].text
            price_match = re.match(r'\d+', prices)
            price = price_match.group() if price_match else 0  # re.match returns a Match object, not the digits themselves
            try:
                house_info = details.find('div', attrs={'class': 'houseInfo'}).text
                area = re.findall(r"\d+", house_info)[-1]  # take the last number in the house info (the original loop only kept its last digit)
            except (AttributeError, IndexError):
                area = 0
            total = details.find('div', attrs={'class': 'priceInfo'}).select('span')[0].text
            print('-------------')
            item['name'] = name
            item['type'] = type

            item['address'] = address
            item['area'] = area
            item['price'] = price
            item['total'] = total

            yield item
        time.sleep(0.6)

        # start crawling the following pages

        self.y += 1
        if self.y <= number:  # the Beike new-homes listing shows at most 100 pages

            parse_url = self.data_url.format(self.start_pgs[self.x], self.y)
            yield Request(url=parse_url, meta={'dont_redirect': True, 'handle_httpstatus_list': [302]},
                          callback=self.parse, errback=self.err_callback,
                          )

        else:
            self.y = 1
            self.x += 1
            if self.x <= len(self.start_pgs) - 1:
                parse_url = self.data_url.format(self.start_pgs[self.x], self.y)
                yield Request(url=parse_url, meta={'dont_redirect': True, 'handle_httpstatus_list': [302]},
                              callback=self.parse, errback=self.err_callback, dont_filter=True)
            else:
                return
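Example #1 drives its pagination with two counters, self.x (which entry of start_pgs) and self.y (the page within that entry), plus a data_url template and an err_callback, none of which appear in the excerpt. A minimal sketch of the surrounding spider, with the URL template and the start_pgs values as placeholder assumptions, could look like this:

import scrapy
from scrapy import Request


class BeikeSpider(scrapy.Spider):
    name = "beike"

    # hypothetical values: the real URL template and region list are not shown in the example
    data_url = "https://bj.ke.com/ershoufang/{}/pg{}/"
    start_pgs = ["dongcheng", "xicheng", "chaoyang"]

    def start_requests(self):
        self.x = 0  # index into start_pgs (which region is being crawled)
        self.y = 1  # page number inside that region
        first_url = self.data_url.format(self.start_pgs[self.x], self.y)
        yield Request(first_url, callback=self.parse, errback=self.err_callback)

    def err_callback(self, failure):
        # the original err_callback is not shown; logging the failure is a reasonable minimum
        self.logger.error(repr(failure))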
Example #2
    def parse(self, response):
        bs = bs4.BeautifulSoup(response.text, 'lxml')  # specify a parser explicitly
        item = BeikeItem()
        items_div = bs.find('div', attrs={'class':"resblock-list-container clearfix"})
        assert isinstance(items_div,Tag)
        for item_li in items_div.find_all('li',attrs={'class':'resblock-list post_ulog_exposure_scroll has-results'},recursive=True):
            assert isinstance(item_li,Tag)
            details = item_li.find('div',attrs={'class':'resblock-desc-wrapper'})
            name = details.find('a', attrs={'class': "name"}).text  # community name
            type = details.find('div',attrs = {'class':'resblock-name'}).select('span')[-1].text
            location = details.find("a", attrs={"class": "resblock-location"}).text
            district = location.split('/')[0]
            district = district.split('\n')[1]  # district

            address = location.split('/')[-1]
            address = address.split('\n\t\t')[0]  # address
            try:
                area = details.find('a', attrs={'class': 'resblock-room'}).select('span')[-1].text
                area = area.split(' ')[1]
            except (AttributeError, IndexError):
                area = '待定'  # "to be determined"
            price = details.find('span',attrs = {'class':'number'}).text
            try:
                total_text = details.find('div', attrs={'class': 'second'}).text
                total = re.findall(r"\d+", total_text)[-1]  # keep the last number in the text
            except (AttributeError, IndexError):
                total = 0
            print('-------------')
            item['name'] = name
            item['type'] = type
            item['district'] = district
            item['address'] = address
            item['area'] = area
            item['price'] = price
            item['total'] = total

            yield item
        time.sleep(0.2)

        # start crawling the following pages
        self.start_pg += 1
        if self.start_pg <= 100:  # the Beike new-homes listing shows at most 100 pages
            parse_url = self.data_url.format(self.start_pg)
            yield Request(url=parse_url, callback=self.parse, errback=self.err_callback)
        else:
            return
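Both examples above populate a BeikeItem whose definition is not part of the excerpt. Assuming only the fields that are actually assigned here, the matching entry in items.py could be as small as:

import scrapy


class BeikeItem(scrapy.Item):
    # only the fields used in Examples #1 and #2; the real items.py may define more
    name = scrapy.Field()      # community name
    type = scrapy.Field()      # property type
    district = scrapy.Field()  # district (filled in Example #2 only)
    address = scrapy.Field()
    area = scrapy.Field()
    price = scrapy.Field()
    total = scrapy.Field()     # total price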
Example #3
    def parse_data(self, response):
        for each in response.xpath("//div[@class='resblock-name']"):
            item = BeikeItem()

            name = each.xpath("./a/text()").extract()[0]
            detailurl = each.xpath("./a/@href").extract()[0]

            # the 1st div among the current node's following siblings
            location_area = each.xpath(
                "./following-sibling::div[1]/span/text()").extract()[0]
            location_block = each.xpath(
                "./following-sibling::div[1]/span/text()").extract()[1]
            location_road = each.xpath(
                "./following-sibling::div[1]/a/text()").extract()[0]
            location = location_area + location_block + location_road

            area = each.xpath(
                "./following-sibling::div[2]/span/text()").extract()

            price_list = each.xpath(
                "./following-sibling::div[5]/div[@class='main-price']/span/text()"
            ).extract()
            price = ''
            if len(price_list) > 1:
                price_number = price_list[0]
                price_unit = price_list[1]
                price = price_number + price_unit

            total = each.xpath(
                "./following-sibling::div[5]/div[@class='second']/text()"
            ).extract()
            print("testttttt", name, location, price, area, total, detailurl)
            item['name'] = name
            item['detailurl'] = "https://ty.fang.ke.com" + detailurl
            item['location'] = location
            item['area'] = area
            item['price'] = price
            item['total'] = total

            yield item  # yield inside the loop so every listing is emitted, not just the last one
Example #4
    def parse(self, response):
        for each in response.xpath("//div[@class='resblock-name']"):
            item = BeikeItem()

            name = each.xpath("./a/text()").extract()[0]
            detailurl = each.xpath("./a/@href").extract()[0]


            # the 1st div among the current node's following siblings
            location_area = each.xpath("./following-sibling::div[1]/span/text()").extract()[0]
            location_block = each.xpath("./following-sibling::div[1]/span/text()").extract()[1]
            location_road = each.xpath("./following-sibling::div[1]/a/text()").extract()[0]
            location = location_area + location_block + location_road

            area = each.xpath("./following-sibling::div[2]/span/text()").extract()

            price_list= each.xpath("./following-sibling::div[5]/div[@class='main-price']/span/text()").extract()
            price = ''
            if len(price_list) > 1:
                price_number = price_list[0]
                price_unit = price_list[1]
                price = price_number + price_unit

            total = each.xpath("./following-sibling::div[5]/div[@class='second']/text()").extract()

            item['name'] = name
            item['detailurl'] = "https://ty.fang.ke.com"+detailurl
            item['location'] = location
            item['area'] = area
            item['price'] = price
            item['total'] = total

            yield item

        if self.page < 22:
            self.page += 1

            newurl = self.url + str(self.page)
            # call back into parse to crawl the next page
            yield scrapy.Request(newurl, callback = self.parse, dont_filter=True)
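Examples #3 and #4 index straight into extract()[0], which raises an IndexError the moment a listing lacks one of the expected sub-elements. Scrapy selectors also provide get() and getall() (the style used in Example #6), which return None or an empty list instead; a more defensive version of the first two extractions might read:

            name = each.xpath("./a/text()").get()
            detailurl = each.xpath("./a/@href").get()
            if not name or not detailurl:
                continue  # skip listings that do not have the expected anchor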
Example #5
 def parse(self, response):
     lj = response.meta["url"]
     daqu = response.meta["daqu"]
     qu = response.meta["qu"]
     soup = BeautifulSoup(response.text, "lxml")
     lis = soup.find('div', attrs={
         "data-component": "list"
     }).find_all('li', attrs={"class": "clear"})
     for li in lis:
         item = BeikeItem()
         item['href'] = li.find('div', attrs={
             "class": "info clear"
         }).find('div', attrs={
             "class": "title"
         }).find('a').get('href')
         item['totalPrice'] = li.find('div', attrs={
             "class": "totalPrice"
         }).find('span').text
         item['unitPrice'] = li.find('div', attrs={
             "class": "unitPrice"
         }).get("data-price")
         item['title'] = li.find('div', attrs={
             "class": "info clear"
         }).find('div', attrs={
             "class": "title"
         }).find('a').get('title')
         item['address'] = li.find('div', attrs={
             "class": "positionInfo"
         }).find('a').text
         item['address_uri'] = li.find('div',
                                       attrs={
                                           "class": "positionInfo"
                                       }).find('a').get('href')
         item['houseInfo'] = li.find('div', attrs={
             "class": "houseInfo"
         }).text.replace('\n', '').replace(' ', '').split("|")
         item['followInfo'] = li.find('div', attrs={
             "class": "followInfo"
         }).text.replace('\n', '').replace(' ', '')
         tags = []
         tag_cont = li.find('div', attrs={"class": "tag"}).find_all('span')
         for tag in tag_cont:
             tags.append(tag.text)
         item['tags'] = tags
         item['lj'] = lj
         item['daqu'] = daqu
         item['qu'] = qu
         yield item
     # page-data holds a dict literal like {"totalPage": ..., "curPage": ...}; json.loads would be safer than eval here
     pages = eval(
         soup.find('div', attrs={
             "class": "page-box house-lst-page-box"
         }).get('page-data'))
     if pages['totalPage'] != pages['curPage']:
         yield scrapy.Request(url=lj + "/pg" +
                              str(int(pages['curPage'] + 1)),
                              callback=self.parse,
                              meta={
                                  "url": lj,
                                  "qu": qu,
                                  "daqu": daqu
                              })
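Example #5 reads url, daqu and qu out of response.meta, so some earlier callback has to seed them. A hedged sketch of such a seeding request, with the district list as purely placeholder data, might be:

    def start_requests(self):
        # placeholder tuples of (listing base URL, larger district, sub-district); the real spider builds these elsewhere
        districts = [("https://bj.ke.com/ershoufang/dongcheng", "Dongcheng", "Andingmen")]
        for lj, daqu, qu in districts:
            yield scrapy.Request(url=lj + "/pg1",
                                 callback=self.parse,
                                 meta={"url": lj, "daqu": daqu, "qu": qu})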
Example #6
    def parse_zhengzu(self, response):
        province, city, city_part = response.meta.get("info")
        divs = response.xpath(
            "//div[@class='content__article']//div[@class='content__list']/div"
        )
        for div in divs:
            name = div.xpath(".//div/p[1]/a/text()").get()
            if name:
                name = re.sub(r"\s", "", name)  # raw string avoids the invalid-escape warning
                info_list = div.xpath("./div/p[2]/text()").getall()
                info_list = list(map(lambda x: re.sub(r"\s", "", x), info_list))
                # the entry ending in ㎡ is the floor area, the one ending in 卫 describes the room layout
                area = "".join(
                    list(filter(lambda x: x.endswith("㎡"), info_list)))
                rooms = "".join(
                    list(filter(lambda x: x.endswith("卫"), info_list)))
                price = "".join(div.xpath("./div/span//text()").getall())
                time = div.xpath("./div/p[4]/text()").get()
                origin_url = div.xpath("./a[1]/@href").get()
                origin_url = "https:" + city_part + ".zu.ke.com" + origin_url
                item = BeikeItem(province=province,
                                 city=city,
                                 name=name,
                                 area=area,
                                 rooms=rooms,
                                 price=price,
                                 time=time,
                                 origin_url=origin_url)
                yield item

        driver = webdriver.PhantomJS()
        driver.get(response.url)
        sleep(5)
        # elem = WebDriverWait(driver=driver, timeout=20).until(
        #     EC.presence_of_element_located((By.XPATH, "//a[@class='next']"))
        # )
        # print(elem)
        # driver.implicitly_wait(20)
        source = driver.page_source
        print(source)
        html = etree.HTML(source)
        print(html)
        next_urls = html.xpath("//a[@class='next']/@href")
        next_url = next_urls[0] if next_urls else None  # avoid an IndexError when there is no next page
        print(next_url)
        driver.quit()
        if next_url:
            print('*' * 100)
            print(next_url)
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_zhengzu,
                                 meta={'info': (province, city, city_part)})
            print('*' * 100)
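Example #6 launches a fresh PhantomJS instance inside every callback. PhantomJS support has been removed from current Selenium releases, and restarting a browser per page is slow; a possible replacement, assuming Chrome and a matching chromedriver are installed, is a single headless Chrome driver created once and reused:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options


def make_headless_driver():
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    return webdriver.Chrome(options=options)  # reuse this driver across callbacks instead of rebuilding it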


# from scrapy.spiders import CrawlSpider, Rule
# from jianshu.items import JianshuItem
#
# class JianshuSpiderSpider(CrawlSpider):
#     name = 'jianshu_spider'
#     allowed_domains = ['jianshu.com']
#     start_urls = ['http://jianshu.com/']
#
#     rules = (
#         Rule(LinkExtractor(allow=r'.*/p/[0-9a-z][12].*'), callback='parse_detail', follow=True),
#     )
#
#     def parse_detail(self, response):
#         title = response.xpath("//h1[@class='title']/text()").get()
#         avatar = response.xpath("//a[@class='avatar']/img/@src").get()  # likely selector for the author avatar
#         author = response.xpath("//span[@class='name']/a/text()").get()
#         pub_time = response.xpath("//span[@class='publish-time']/text()").get().replace("*","")
#         # get the article id
#         url = response.url
#         url1 = url.split("?")[0]
#         article_id = url1.split("/")[-1]
#         # article content, keeping the HTML tags rather than storing plain text
#         content = response.xpath("//div[@class='show-content']").get()
#         # word_count = response.xpath("//span[@class='wordage']/text()").get()
#         # comment_count = response.xpath("//span[@class='comments-count']/text()").get()
#         # read_count = response.xpath("//span[@class='views-count']/text()").get()
#         # like_count = response.xpath("//span[@class='likes-count']/text()").get()
#         # subjects = ",".join(response.xpath("//div[@class='include-collection']/a/div/text()").getall())
#
#         item = JianshuItem(
#             title=title,
#             avatar=avatar,
#             pub_time=pub_time,
#             author=author,
#             origin_url=response.url,
#             content=content,
#             article_id=article_id,
#             # subjects=subjects,
#             # word_count=word_count,
#             # comment_count=comment_count,
#             # like_count=like_count,
#             # read_count=read_count
#         )
#         yield item

#selenium
# from selenium import webdriver
# from lxml import etree
# import re
# import time
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.common.by import By
#
# class LagouSpider(object):
#     def __init__(self):
#         self.driver = webdriver.Chrome()
#         # python job listings
#         self.url = 'https://www.lagou.com/jobs/list_python?city=%E5%8C%97%E4%BA%AC&cl=false&fromSearch=true&labelWords=&suginput='
#         self.position = []
#
#     def run(self):
#         self.driver.get(self.url)
#         while True:
#             source = self.driver.page_source
#             WebDriverWait(driver=self.driver,timeout=20).until(
#                 EC.presence_of_element_located((By.XPATH,"//div[@class='pager_container']/span[last()]"))
#             )
#             self.parse_list_page(source)
#             # click "next page"
#             next_btn = self.driver.find_element_by_xpath(
#                 "//div[@class='pager_container']/span[last()]")
#             if "pager_next_disabled" in next_btn.get_attribute("class"):
#                 break
#             else:
#                 next_btn.click()
#             time.sleep(1)
#
#
#     def parse_list_page(self,source):
#         html = etree.HTML(source)
#         links = html.xpath("//a[@class='position_link']/@href")
#         # detail URLs of every position on the page
#         for link in links:
#             self.request_detail_page(link)
#             time.sleep(1)
#
#     def request_detail_page(self,url):
#         # self.driver.get(url)
#         self.driver.execute_script("window.open('%s')"%url)
#         self.driver.switch_to.window(self.driver.window_handles[1])
#
#         WebDriverWait(driver=self.driver,timeout=20).until(
#             EC.presence_of_element_located((By.XPATH,"//div[@class='job-name']/span[@class='name']"))
#         )
#         # grab the source of the job detail page
#         source = self.driver.page_source
#         self.parse_detail_page(source)
#         # close the current detail tab and switch back to the list page
#         self.driver.close()
#         self.driver.switch_to.window(self.driver.window_handles[0])
#
#     def parse_detail_page(self,source):
#         html = etree.HTML(source)
#         position_name = html.xpath("//span[@class='name']/text()")[0]
#         job_request_spans = html.xpath("//dd[@class='job_request']//span")
#         salary = job_request_spans[0].xpath('.//text()')[0].strip()
#         city = job_request_spans[1].xpath('.//text()')[0].strip()
#         city = re.sub(r"[\s/]","",city)
#         work_years = job_request_spans[2].xpath('.//text()')[0].strip()
#         work_years = re.sub(r"[\s/]","",work_years)
#         education = job_request_spans[3].xpath('.//text()')[0].strip()
#         education = re.sub(r"[\s/]","",education)
#         desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
#         company_name = html.xpath("//h2[@class='fl']/text()")[0].strip()
#         position = {
#             'name':position_name,
#             'company_name':company_name,
#             'salary':salary,
#             'city': city,
#             'work_years': work_years,
#             'education': education,
#             'desc': desc,
#         }
#         self.position.append(position)
#         print(position)
#         print('-'*200)
#
# if __name__ == '__main__':
#     spider = LagouSpider()
#     spider.run()

#threading
# import requests
# from lxml import etree
# from urllib import request
# import os
# import re
# import threading
# from queue import Queue
#
# class Producer(threading.Thread):
#     headers = {
#         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
#         'Referer': 'https://movie.douban.com/'
#     }
#
#     def __init__(self, page_queue, img_queue, *args, **kwargs):
#         super(Producer, self).__init__(*args, **kwargs)
#         self.page_queue = page_queue
#         self.img_queue = img_queue
#
#     def run(self):
#         while True:
#             if self.page_queue.empty():
#                 break
#             url = self.page_queue.get()
#             self.parse_page(url)
#
#     def parse_page(self,url):
#         response = requests.get(url,headers=self.headers)
#         text = response.text
#         html = etree.HTML(text)
#         imgs = html.xpath("//div[@class='page-content text-center']//img[@class!='gif']")
#         for img in imgs:
#             # print(etree.tostring(img))
#             # image URL
#             img_url = img.get('data-original')
#             # image name
#             alt = img.get('alt')
#             # strip special characters from the name
#             alt = re.sub(r'[\??\.,。!!\*]','',alt)
#             # get the image file extension (.gif, .jpg)
#             suffix = os.path.splitext(img_url)[1]
#             # full filename used when saving
#             filename = alt + suffix
#             self.img_queue.put((img_url,filename))
#
#
# class Consumer(threading.Thread):
#     def __init__(self,page_queue,img_queue,*args,**kwargs):
#         super(Consumer, self).__init__(*args,**kwargs)
#         self.page_queue = page_queue
#         self.img_queue = img_queue
#
#     def run(self):
#         while True:
#             if self.img_queue.empty() and self.page_queue.empty():
#                 break
#             img_url,filename = self.img_queue.get()
#             request.urlretrieve(img_url, 'C:/Users/Administrator/Desktop/images/' + filename)
#             print("finished downloading one image")
#
#
# def main():
#     page_queue = Queue(1000)
#     img_queue = Queue(10000)
#
#     for x in range(1,1000):
#         url = 'http://www.doutula.com/photo/list/?page=%d'%x
#         page_queue.put(url)
#
#     for x in range(10):
#         t = Producer(page_queue,img_queue)
#         t.start()
#
#     for x in range(10):
#         t = Consumer(page_queue,img_queue)
#         t.start()
#
# if __name__ == '__main__':
#     main()

# download remote data
#request.urlretrieve(img_url,'C:/Users/Administrator/Desktop/images/'+filename)
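The closing note downloads images with urllib.request.urlretrieve into a hard-coded Windows desktop path. A more portable sketch of the same download step using requests (the library already used in the threading example), with the output directory as an assumption, would be:

import requests
from pathlib import Path


def download_image(img_url, filename, out_dir="images"):
    Path(out_dir).mkdir(parents=True, exist_ok=True)  # create the target folder if it does not exist
    resp = requests.get(img_url, timeout=30)
    resp.raise_for_status()
    (Path(out_dir) / filename).write_bytes(resp.content)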