Example #1
    def manHuan_Title(self, response):

        manHuna_Book_Name = response.meta['manHuna_Book_Name']
        manHuna_Book_Type = response.meta['manHuna_Book_Type']
        manHuna_Book_Auth = response.meta['manHuna_Book_Auth']
        manHuna_Book_profile = response.meta['manHuna_Book_profile']
        manHuna_Book_cover_url = response.meta['manHuna_Book_cover_url']

        manHuna_Book_title = response.xpath('//div[@class="wrap"]/span/text()').extract()[0]
        manHuna_Book_img_url = self.get_section_page(response.url)

        if manHuna_Book_title.strip() == '':
            print('manHuna_Book_title is null')
            manHuna_Book_title = 'null'

        i = MyprojectItem()

        i['manHuna_Book_Name'] = manHuna_Book_Name
        i['manHuna_Book_Type'] = manHuna_Book_Type
        i['manHuna_Book_Auth'] = manHuna_Book_Auth
        i['manHuna_Book_profile'] = manHuna_Book_profile
        i['manHuna_Book_title'] = manHuna_Book_title
        i['manHuna_Book_img_url'] = manHuna_Book_img_url
        i['manHuna_Book_cover_url'] = manHuna_Book_cover_url

        yield i
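
Every example on this page assumes a MyprojectItem class declared in the project's items.py, each project with its own field set. A minimal sketch matching the fields used in Example #1 (the field names come from the code above; the declaration itself is an assumption):

    import scrapy

    class MyprojectItem(scrapy.Item):
        # one Field per value stored by the spider
        manHuna_Book_Name = scrapy.Field()
        manHuna_Book_Type = scrapy.Field()
        manHuna_Book_Auth = scrapy.Field()
        manHuna_Book_profile = scrapy.Field()
        manHuna_Book_title = scrapy.Field()
        manHuna_Book_img_url = scrapy.Field()
        manHuna_Book_cover_url = scrapy.Field()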
Example #2
    def parse(self, response):
        job_list = response.css('.sight_item')
        for page in job_list:
            # instantiate a fresh item per row; reusing a single instance
            # across yields would make every yielded item share the same data
            item = MyprojectItem()
            item['name'] = page.xpath(
                ".//a[@class='name']/text()").extract()[0]
            level_handler = page.xpath(
                ".//span[@class='level']/text()").extract()
            item['level'] = level_handler[0] if level_handler else ''
            item['hot'] = page.xpath(
                ".//span[@class='product_star_level']/em/span/text()").extract(
                )[0][3:]
            item['area'] = "[" + page.css('.area').xpath(
                ".//a/text()").extract()[0] + "]"
            item['address'] = page.css(".address").xpath(
                ".//span/text()").extract()[0]
            price_temp = page.css(".sight_item_price").xpath(
                ".//em/text()").extract()
            item['price'] = price_temp[0] if price_temp else 0
            item['info'] = page.css(".intro").xpath("./text()").extract()[0]
            num_handler = page.xpath(
                ".//span[@class='hot_num']/text()").extract()
            item['num'] = num_handler[0] if num_handler else 0
            yield item

        for key in position:
            print("Crawling {}".format(key))
            # crawl pages 1-13
            for page in range(1, 14):
                print("Crawling page {}".format(page))
                yield scrapy.Request(url_template.format(key, page),
                                     callback=self.parse)
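
position and url_template are not defined inside this snippet; they are presumably module- or class-level constants. A hypothetical sketch of their shape (names from the code above, values invented for illustration):

    # hypothetical definitions; the real project supplies its own keywords and URL
    position = ['python', 'java', 'c++']
    url_template = 'https://example.com/search?keyword={}&page={}'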
Example #3
 def parse(self, response):
     movie_list = response.xpath(
         "//div[@class='article']//ol[@class='grid_view']/li")
     for i_item in movie_list:
         douban_item = MyprojectItem()
         # detailed xpath expressions to parse each field
         douban_item['serial_number'] = i_item.xpath(
             ".//div[@class='item']//em/text()").extract_first()
         douban_item['movie_name'] = i_item.xpath(
             ".//div[@class='info']/div[@class='hd']/a/span[1]/text()"
         ).extract_first()
         content = i_item.xpath(
             ".//div[@class='info']//div[@class='bd']/p[1]/text()").extract()
         # the description spans several text nodes; collapse the whitespace
         # in each and join them into a single string
         douban_item['introduce'] = "".join(
             "".join(line.split()) for line in content)
         douban_item['star'] = i_item.xpath(
             ".//span[@class='rating_num']/text()").extract_first()
         douban_item['evaluate'] = i_item.xpath(
             ".//div[@class='star']//span[4]/text()").extract_first()
         douban_item['describe'] = i_item.xpath(
             ".//p[@class='quote']/span/text()").extract_first()
         # hand the item to the pipelines for cleaning and storage
         yield douban_item
     # parse the next-page link to reach the remaining pages
     next_link = response.xpath(
         "//span[@class='next']/link/@href").extract()
     if next_link:
         next_link = next_link[0]
         yield scrapy.Request("https://movie.douban.com/top250" + next_link,
                              callback=self.parse)
Example #4
File: wenyi.py Project: wutonghua/pachong
 def parse_question(self, response):
     print(
         response.xpath(
             '//div[@class="ask_cont"]/div[1]/p/text()').extract())
     print(
         response.xpath(
             '//div[@class="sele_all marg_top"]/div[1]/p/text()').extract())
     item = MyprojectItem()
     # extract_first(default='') returns the first match, or '' when nothing matches
     item['title'] = response.xpath(
         '//div[@class="ask_cont"]/div[1]/p/text()').extract_first(default='')
     item['dafu1'] = response.xpath(
         '//div[@class="sele_all marg_top"]/p/text()').extract_first(default='')
     item['dafu2'] = response.xpath(
         '//div[@class="sele_all"]/p/text()').extract_first(default='')
     yield item
Example #5
    def parse(self, response):  # response carries the fetched page data
        item = MyprojectItem()  # the item class declared in items.py
        item['title'] = response.xpath(self.xpath['title']).extract()[0]
        item['link'] = response.url
        item['date'] = datetime.datetime.utcnow() + datetime.timedelta(
            hours=9)  # the current time, shifted to JST (UTC+9) before storing

        yield item
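
self.xpath is not defined in this snippet; it is presumably a dict attribute on the spider class mapping field names to XPath strings. A hypothetical sketch:

    # hypothetical spider attribute; the real expression depends on the target site
    xpath = {
        'title': '//title/text()',
    }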
Example #6
	def parse_detail(self, response):
		x = HtmlXPathSelector(response)
		item = MyprojectItem()
		item['link'] = response.url
		item['title'] = x.select('//*[@id="BookCon"]/h1/text()').extract()
		#item['desc'] = x.select('//*[@id="pagecontent"]/text()').extract()
		item['desc'] = x.select('//div[contains(@id, "pagecontent")]/text()').extract()
		return item
Example #7
    def parse_detail(self, response):
        x = HtmlXPathSelector(response)
        item = MyprojectItem()
        item['link'] = response.url
        item['title'] = x.select('//title/text()').extract()
        #item['desc'] = x.select('//*[@id="pagecontent"]/text()').extract()
        item['desc'] = x.select('//*[@id="content"]/p').extract()
        return item
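
Examples #6, #7, and #9 use HtmlXPathSelector and .select(), the selector API of early Scrapy (it needs from scrapy.selector import HtmlXPathSelector and has since been removed). In current Scrapy the same extraction is written against the response directly; a sketch of Example #7 in the modern API:

    def parse_detail(self, response):
        item = MyprojectItem()
        item['link'] = response.url
        item['title'] = response.xpath('//title/text()').getall()
        item['desc'] = response.xpath('//*[@id="content"]/p').getall()
        return item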
Example #8
 def parse(self, response):
     # extract the image URLs via xpath
     images = response.selector.xpath(
         "//img[@class='illustration']/@src").extract()
     items = MyprojectItem()
     for i in images:
         # write the URL into the item and request the image itself
         items['image_urls'] = ['http:' + i.strip()]
         yield scrapy.Request(url=items['image_urls'][0],
                              callback=self.parse_image)
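
parse_image is not shown; a minimal sketch of what such a callback might do, assuming the goal is simply saving the downloaded bytes (real projects often use Scrapy's ImagesPipeline instead):

    def parse_image(self, response):
        # name the file after the last path segment of the image url
        filename = response.url.split('/')[-1]
        with open(filename, 'wb') as f:
            f.write(response.body)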
Example #9
 def parse(self, response):
     hxs = HtmlXPathSelector(response)
     sites = hxs.select('//ul/li')
     items = []
     for site in sites:
         item = MyprojectItem()
         # note: these two expressions start with //, so they search the
         # whole page rather than the current li
         item['title'] = site.select(
             '//*[@id="BookCon"]/h1/text()').extract()
         item['link'] = site.select('a/@href').extract()
         item['desc'] = site.select(
             '//*[@id="pagecontent"]/text()').extract()
         items.append(item)
     return items
Example #10
 def parse(self, response):
     jsondata = json.loads(response.text)
     result = jsondata["data"]["results"]
     for item in result:
         myitem = MyprojectItem()
         deurl = item["positionURL"]
         myitem["jobname"] = item["jobName"]
         myitem["salary"] = item["salary"]
         myitem["companyName"] = item["company"]["name"]
         # yield myitem
         yield scrapy.Request(deurl,
                              callback=self.parse22,
                              meta={"istem": myitem})
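
parse22 is referenced here (and in Examples #11 and #14) but not shown; a minimal sketch of how such a callback could pick the item back out of meta (only the name parse22 and the meta key "istem" come from the code above; the field and selector are hypothetical):

    def parse22(self, response):
        # recover the partially filled item handed over via meta
        myitem = response.meta["istem"]
        # hypothetical field and selector for the job-description detail page
        myitem["description"] = " ".join(
            response.xpath('//div[@class="job-detail"]//text()').getall()).strip()
        yield myitem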
Example #11
File: zhilian.py Project: zxk1994/Project
 def parse(self, response):
     jsondata = json.loads(response.text)
     result = jsondata["data"]["results"]
     for item in result:
         myitem = MyprojectItem()  # instantiate the item class from items.py
         deurl = item["positionURL"]
         myitem["jobname"] = item["jobName"]
         myitem["salary"] = item["salary"]
         myitem["companyName"] = item["company"]["name"]
         # the fields above are meant for the database; the request below goes
         # on to crawl the job-description page, with pipelines handling storage
         # yield myitem  # not yielded here; meta hands myitem to the next callback instead
         yield scrapy.Request(deurl,
                              callback=self.parse22,
                              meta={"istem": myitem})
Example #12
    def parse(self, response):
        for sel in response.xpath('//article'):
            item = MyprojectItem()
            item['title'] = sel.xpath('h1/a/text()').extract()
            item['link'] = sel.xpath('h1/a/@href').extract()
            yield item

        # next page
        next_uri = response.xpath('//li[@class="next"]/a/@href').extract()
        if len(next_uri):
            # rewrite the url
            # e.g. http://fidding.me?page=2 => http://fidding.me/page?=2
            uri = next_uri[0]
            uri = uri[:17] + '/' + uri[17:]
            yield scrapy.Request(uri, method='GET', callback=self.parse)
Example #13
    def parse(self, response):
        target = response.css("div.r-ent")

        for tag in target:
            try:
                # a fresh item per row, so each yield carries its own data
                item = MyprojectItem()
                item['title'] = tag.css("div.title a::text")[0].extract()
                item['author'] = tag.css('div.author::text')[0].extract()
                item['date'] = tag.css('div.date::text')[0].extract()
                item['push'] = tag.css('span::text')[0].extract()
                item['url'] = tag.css('div.title a::attr(href)')[0].extract()

                yield item

            except IndexError:
                # skip rows with missing fields (e.g. deleted posts)
                continue
Example #14
File: zhilian.py Project: ksq1807/replit
 def parse(self, response):
     jsondata = json.loads(response.text)
     result = jsondata["data"]["results"]
     for item in result:
         # build the item that will later be saved to the database
         myitem = MyprojectItem()
         # the detail-page url to crawl next
         deurl = item["positionURL"]
         # the fields to store
         myitem["salary"] = item["salary"]
         myitem["jobName"] = item["jobName"]
         myitem["companyName"] = item["company"]["name"]
         # asynchronous: yield hands the item back to the engine for storage
         yield myitem
         # when there is a next-level url, keep digging
         yield scrapy.Request(deurl, callback=self.parse22, meta={"istem": myitem})
Example #15
    def parse(self, response):
        selector = response.css(".mctable1 tr td::text").getall()
        selector1 = list(
            map(lambda x: x.strip().lower().replace(',', ''), selector))

        lst1 = []
        lst2 = []
        lst3 = []
        lst4 = []
        years = []

        l = ItemLoader(item=MyprojectItem(), selector=response)

        # each label cell is followed by five yearly values
        for ind, line in enumerate(selector1):
            if line == 'total share capital':
                for i in range(1, 6):
                    lst1.append(float(selector1[ind + i]))
            elif line in ('networth', 'net worth'):
                for i in range(1, 6):
                    lst2.append(float(selector1[ind + i]))
            elif line == 'reserves':
                for i in range(1, 6):
                    lst3.append(float(selector1[ind + i]))
            elif line == 'total debt':
                for i in range(1, 6):
                    lst4.append(float(selector1[ind + i]))

        for i in range(1, 6):
            data = response.css("tr.lightbg td::text")[i].get()
            years.append(data)

        # companies with no debt rows get five zeroes
        if len(lst4) == 0:
            lst4.extend(itertools.repeat(0.0, 5))

        l.add_value('cmp_name', response.css(".pcstname::text").get())
        l.add_value('tsc', lst1[::-1])
        l.add_value('net', lst2[::-1])
        l.add_value('res', lst3[::-1])
        l.add_value('debt', lst4[::-1])
        l.add_value('years', years[::-1])
        yield l.load_item()
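
ItemLoader collects every add_value call into a per-field list, and how the loaded values come out depends on the processors declared on MyprojectItem. A hypothetical declaration matching this usage (TakeFirst for the scalar company name, Identity to keep the yearly lists as lists; the field names come from the code above, the processors are an assumption):

    import scrapy
    from itemloaders.processors import Identity, TakeFirst

    class MyprojectItem(scrapy.Item):
        cmp_name = scrapy.Field(output_processor=TakeFirst())
        tsc = scrapy.Field(output_processor=Identity())
        net = scrapy.Field(output_processor=Identity())
        res = scrapy.Field(output_processor=Identity())
        debt = scrapy.Field(output_processor=Identity())
        years = scrapy.Field(output_processor=Identity())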
Example #16
def parse(self, response):
    # extract the post text via xpath
    contents = response.selector.xpath(
        "//div[@class='content']/span/text()").extract()

    for i in contents:
        # a fresh item per entry, used as a temporary data container
        item = MyprojectItem()
        item['content'] = i.strip()
        # yield sends the data on to the pipeline for further processing
        yield item

    self.log('A response from %s just arrived!' % response.url)
Example #17
 def parse(self, response):
     myselector = response.css("div.dataList a::attr(title)").getall()
     l = ItemLoader(item=MyprojectItem(), selector=response)
     l.add_value('cmp_list', myselector)
     yield l.load_item()
Example #18
 def parse(self, response):
     item = MyprojectItem()
     item['name'] = response.xpath('//title/text()').extract()
     return item
Example #19
 def parse(self, response):
     for pdf in response.css('div.body'):
         item = MyprojectItem()  # a fresh item per body block
         #item['link'] = pdf.css('table span::text').extract()
         item['url'] = pdf.css('table a::attr(href)').extract()  # PATH
         yield item
Example #20
    def parse1(self, response):

        urls = response.css(
            'a.reference.download.internal::attr(href)').extract()
        for url in urls:
            yield ExamplesItem(file_urls=[response.urljoin(url)])
        # count the processed pages
        autohomeSpider.count += 1
        print("Review page no.:", autohomeSpider.count)
        # print(AutohomeSpider.count)
        # grab all review divs //*[@id="maodian"]/div/div/div[2]/div[4]
        divs = response.xpath(
            '//*[@id="maodian"]/div/div/div[2]/div[@class="mouthcon"]')
        mcount = 0
        for div in divs:
            print("----------------------------------")
            item = MyprojectItem()
            # car ID //*[@id="maodian"]/div/div/div[2]/div[4]/div/div[1]/div[2]/dl[1]/dd/a[1]
            item['CAR_ID'] = div.xpath(
                'div/div[1]/div[2]/dl[1]/dd/a[1]/@href')[0].extract().replace(
                    '/', '')
            print(item['CAR_ID'])
            # car name
            item['CAR_NAME'] = div.xpath(
                'div/div[1]/div[2]/dl[1]/dd/a[1]/text()')[0].extract()
            # user ID  //*[@id="maodian"]/div/div/div[2]/div[4]/div/div[1]/div[1]/div/div[1]/div[2]/p/a
            USER_ID1 = div.xpath(
                'div/div[1]/div[1]/div/div[1]/div[2]/p/a/@href')[0].extract()
            item['USER_ID'] = re.findall(r'\d{1,15}', USER_ID1)[0]
            item['USER_NAME'] = div.xpath(
                'div/div[1]/div[1]/div/div[1]/div[2]/p/a/text()')[0].extract(
                ).strip()
            # purchase place //*[@id="maodian"]/div/div/div[2]/div[4]/div/div[1]/div[2]/dl[2]/dd
            PURCHASE_PLACE = div.xpath('div/div[1]/div[2]/dl[2]/dd')[0]
            item['PURCHASE_PLACE'] = PURCHASE_PLACE.xpath(
                'string(.)').extract()[0].strip()
            # the dl entries share the same markup and their count varies, so branch on it
            dls = div.xpath('div/div[1]/div[2]/dl')
            # the normal case has 7 entries
            if len(dls) == 7:
                # purchase time //*[@id="maodian"]/div/div/div[2]/div[4]/div/div[1]/div[2]/dl[4]/dd
                item['PURCHASE_TIME'] = div.xpath(
                    'div/div[1]/div[2]/dl[4]/dd/text()')[0].extract().strip()
                # bare-car purchase price //*[@id="maodian"]/div/div/div[2]/div[4]/div/div[1]/div[2]/dl[5]/dd
                CAR_PRICE = div.xpath('div/div[1]/div[2]/dl[5]/dd')[0]
                item['CAR_PRICE'] = CAR_PRICE.xpath(
                    'string(.)').extract()[0].strip().replace('\xa0', '')
                # purchase purpose //*[@id="maodian"]/div/div/div[2]/div[4]/div/div[1]/div[2]/dl[7]/dd
                PURCHASE_PURPOSE = div.xpath('div/div[1]/div[2]/dl[7]/dd')[0]
                item['PURCHASE_PURPOSE'] = PURCHASE_PURPOSE.xpath(
                    'string(.)').extract()[0].strip().replace(
                        '\r\n', '').replace('                                ',
                                            ';')
            # the irregular case has 6, in two variants: missing dealer or missing fuel consumption
            elif len(dls) == 6:
                p = div.xpath('div/div[1]/div[2]/dl[5]/dt/p')
                # a p tag here means fuel consumption is present and the dealer is missing
                if p:
                    # purchase time
                    item['PURCHASE_TIME'] = div.xpath(
                        'div/div[1]/div[2]/dl[3]/dd/text()')[0].extract(
                        ).strip()
                    # bare-car purchase price
                    CAR_PRICE = div.xpath('div/div[1]/div[2]/dl[4]/dd')[0]
                    item['CAR_PRICE'] = CAR_PRICE.xpath(
                        'string(.)').extract()[0].strip().replace('\xa0', '')
                    # purchase purpose
                    PURCHASE_PURPOSE = div.xpath(
                        'div/div[1]/div[2]/dl[6]/dd')[0]
                    item['PURCHASE_PURPOSE'] = PURCHASE_PURPOSE.xpath(
                        'string(.)').extract()[0].strip().replace(
                            '\r\n',
                            '').replace('                                ',
                                        ';')
                # no p tag means no fuel consumption but a dealer is listed
                else:
                    # purchase time
                    item['PURCHASE_TIME'] = div.xpath(
                        'div/div[1]/div[2]/dl[4]/dd/text()')[0].extract(
                        ).strip()
                    # bare-car purchase price
                    CAR_PRICE = div.xpath('div/div[1]/div[2]/dl[5]/dd')[0]
                    item['CAR_PRICE'] = CAR_PRICE.xpath(
                        'string(.)').extract()[0].strip().replace('\xa0', '')
                    # purchase purpose
                    PURCHASE_PURPOSE = div.xpath(
                        'div/div[1]/div[2]/dl[6]/dd')[0]
                    item['PURCHASE_PURPOSE'] = PURCHASE_PURPOSE.xpath(
                        'string(.)').extract()[0].strip().replace(
                            '\r\n',
                            '').replace('                                ',
                                        ';')
            # score - space //*[@id="maodian"]/div/div/div[2]/div[4]/div/div[1]/div[2]/div[1]/dl/dd/span[2]
            item['SCORE_SPACE'] = div.xpath(
                'div/div[1]/div[2]/div[1]/dl/dd/span[2]/text()')[0].extract()
            # score - power //*[@id="maodian"]/div/div/div[2]/div[4]/div/div[1]/div[2]/div[2]/dl/dd/span[2]
            item['SCORE_POWER'] = div.xpath(
                'div/div[1]/div[2]/div[2]/dl/dd/span[2]/text()')[0].extract()
            # score - handling
            item['SCORE_CONTROL'] = div.xpath(
                'div/div[1]/div[2]/div[3]/dl/dd/span[2]/text()')[0].extract()
            # score - fuel consumption
            item['SCORE_FUEL_CONSUMPTION'] = div.xpath(
                'div/div[1]/div[2]/div[4]/dl/dd/span[2]/text()')[0].extract()
            # score - comfort
            item['SCORE_COMFORT'] = div.xpath(
                'div/div[1]/div[2]/div[5]/dl/dd/span[2]/text()')[0].extract()
            # score - exterior
            item['SCORE_EXTERIOR'] = div.xpath(
                'div/div[1]/div[2]/div[6]/dl/dd/span[2]/text()')[0].extract()
            # score - interior
            item['SCORE_INTERIOR'] = div.xpath(
                'div/div[1]/div[2]/div[7]/dl/dd/span[2]/text()')[0].extract()
            # score - value for money //*[@id="maodian"]/div/div/div[2]/div[4]/div/div[1]/div[2]/div[8]/dl/dd/span[2]
            item['SCORE_COST_EFFECTIVE'] = div.xpath(
                'div/div[1]/div[2]/div[8]/dl/dd/span[2]/text()')[0].extract()
            # the review's url //*[@id="maodian"]/div/div/div[2]/div[4]/div/div[1]/div[1]/div/div[2]/div[2]
            #url_id_pre = div.xpath('div/div[1]/div[1]/div/div[2]/div[2]/@id')[0].extract()  # yields e.g. DivRelatedTopics_1565672
            url_id_pre = div.xpath(
                '//div[@class="allcont border-b-solid"]/a[1]/@href'
            )[mcount].extract()
            # slice out the id
            #url_id = re.findall(r'\d{1,20}', url_id_pre)[0]
            # store the review url
            item['COMMENT_URL'] = url_id_pre
            # "http://k.autohome.com.cn/FrontAPI/GetFeelingByEvalId?evalId=" + url_id
            COMMENT_URL = 'https:' + item['COMMENT_URL']
            mcount += 1
            print(item)
            yield SplashRequest(url=COMMENT_URL,
                                callback=self.parse_recommand,
                                magic_response=True,
                                args={
                                    'timeout': 8,
                                    'wait': 0.5
                                })
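
parse_recommand is not shown, and the item assembled above is printed but never yielded or attached to the SplashRequest, so it would be lost unless the request passed it along via meta. A hypothetical sketch of the callback under that assumption (only the name parse_recommand comes from the code above; the meta key, field, and selector are invented for illustration):

    def parse_recommand(self, response):
        # assumes the SplashRequest above was given meta={'item': item}
        item = response.meta['item']
        # hypothetical field and selector; the real page structure differs
        item['RECOMMEND'] = response.xpath(
            'string(//div[@class="recommend"])').get()
        yield item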