def download_picture(pic_url):
    try:
        if not os.path.exists('img_picture'):
            os.mkdir('img_picture')
        picture = requests.get(pic_url, headers=headers, stream=True)
        filename = (
            'E:/pythonwork/spiderworks/scraping/articlespider/articlespider/picture_spider/img_picture/'
            + get_md5(pic_url) + '.jpg')
        with open(filename, 'wb') as f:
            f.write(picture.content)
    except Exception:
        print('Failed to download the picture!', get_md5(pic_url))
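Every example on this page calls a get_md5 helper that is not shown. A minimal sketch of such a helper, assuming it simply returns the hex MD5 digest of the URL string (the real project utility may differ):

import hashlib

def get_md5(url):
    # Hash the URL into a fixed-length hex digest, e.g. for use as url_object_id
    if isinstance(url, str):
        url = url.encode('utf-8')
    m = hashlib.md5()
    m.update(url)
    return m.hexdigest()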
Example #2
 def parse_detail(self, response):
     """
     爬取新闻详情页
     :param response:
     :return:
     """
     item = hangjian_Item()
     title = response.css(".zixun h1::text").extract_first("")
     create_date = response.xpath(
         '/html/body/div/div[2]/div[1]/div[2]/div[1]/div[1]/em/text()'
     ).extract_first("")
     author = response.xpath(
         '/html/body/div/div[2]/div[1]/div[2]/div[1]/div[6]/span/em/text()'
     ).extract_first("")
     from_web = response.xpath(
         '/html/body/div/div[2]/div[1]/div[2]/div[1]/div[5]/span/em/text()'
     ).extract_first("")
     content = response.css(".zixun").extract_first("")
     url = response.url
     crawl_time = datetime.datetime.now()
     url_object_id = get_md5(response.url)
     item['title'] = title
     item['create_date'] = create_date
     item['author'] = author
     item['from_web'] = from_web
     item['content'] = content
     item['url'] = url
     item['crawl_time'] = crawl_time
     item['url_object_id'] = url_object_id
     yield item
Example #3
 def parse_job(self, response):
     item_loader = LagouJobItemLoader(item=LagouJobItem(),
                                      response=response)
     # i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
     # i['name'] = response.xpath('//div[@id="name"]').extract()
     # i['description'] = response.xpath('//div[@id="description"]').extract()
     item_loader.add_css("title", ".job-name::attr(title)")
     item_loader.add_value("url", response.url)
     item_loader.add_value("url_object_id", get_md5(response.url))
     item_loader.add_css("salary", ".job_request p span.salary::text")
     item_loader.add_xpath("job_city",
                           "//dd[@class='job_request']/p/span[2]/text()")
     item_loader.add_xpath("work_years",
                           "//dd[@class='job_request']/p/span[3]/text()")
     item_loader.add_xpath("degree_need",
                           "//dd[@class='job_request']/p/span[4]/text()")
     item_loader.add_xpath("job_type",
                           "//dd[@class='job_request']/p/span[5]/text()")
     item_loader.add_css("publish_time",
                         ".job_request p.publish_time::text")
     item_loader.add_css("job_advantage", ".job-advantage p::text")
     item_loader.add_css("job_desc", ".job_bt div p")
     item_loader.add_css("job_addr", ".work_addr")
     item_loader.add_css("tags", ".position-label.clearfix li::text")
     item_loader.add_css("company_name", ".job_company dt a img::attr(alt)")
     item_loader.add_css("company_url", ".job_company dt a::attr(href)")
     item_loader.add_value("crawl_time", datetime.datetime.now())
     # item_loader.add_css("crawl_update_time",".work_addr")
     lagou_item = item_loader.load_item()
     return lagou_item
Example #4
    def get_detail_use_item_loader(self, response):
        '''
            Use an ItemLoader; every field obtained here comes back as a list
        :return: 
        '''
        article_item = JobBoleArticleItem()
        item_loader = ArticleItemLoader(item=article_item, response=response)

        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))

        front_image_url = response.meta.get("front_image_url", "")  # article cover image
        item_loader.add_value("front_image_url", [front_image_url])

        item_loader.add_xpath("title",
                              "//div[@class = 'entry-header']/h1/text()")  # 标题
        item_loader.add_xpath("create_date",
                              "//div[@class='entry-meta']/p/text()")
        item_loader.add_xpath("praise_nums",
                              "//div[@class='post-adds']//h10/text()")  # 点赞数
        item_loader.add_xpath(
            "fav_nums", "//div[@class='post-adds']/span[2]/text()")  # 收藏数
        item_loader.add_xpath(
            "comment_nums",
            "//span[@class='btn-bluet-bigger href-style hide-on-480']/text()"
        )  # comment count
        item_loader.add_xpath("content", "//div[@class='entry']")  # 内容
        item_loader.add_xpath(
            "tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()")  # 内容

        article_item = item_loader.load_item()

        yield article_item
Example #5
    def parse_job(self, response):
        item_loader = LagouItemLoader(item=LagouJobItem(), response=response)
        item_loader.add_css('title', '.job-name .name::text')
        item_loader.add_value('url', response.url)
        item_loader.add_value('url_md5', get_md5(response.url))
        item_loader.add_css('salary', '.job_request .salary::text')
        item_loader.add_xpath('job_city',
                              '//dd[@class="job_request"]/p/span[2]/text()')
        item_loader.add_xpath('work_years',
                              '//dd[@class="job_request"]/p/span[3]/text()')
        item_loader.add_xpath('degree_need',
                              '//dd[@class="job_request"]/p/span[4]/text()')
        item_loader.add_xpath('job_type',
                              '//dd[@class="job_request"]/p/span[5]/text()')
        item_loader.add_css('publish_time', '.publish_time::text')
        item_loader.add_css('job_advantage', '.job-advantage p::text')
        item_loader.add_css('job_desc', '.job_bt div')
        # job_desc = response.css('.job_bt')[0].xpath('string()').extract_first()  # how you could write it without an ItemLoader
        item_loader.add_css('job_addr', '.work_addr')
        item_loader.add_css('company_name', '.job_company dt a img::attr(alt)')
        item_loader.add_css('company_url', '.job_company dt a::attr(href)')
        item_loader.add_css('tags', '.position-label li::text')
        item_loader.add_value('crawl_time', datetime.datetime.now())

        job_item = item_loader.load_item()

        return job_item
Example #6
    def parse_job(self, response):
        item_loader = LagouJobItemLoader(item=LagouJobItem(),
                                         response=response)

        item_loader.add_css("title", ".job-name::attr(title)")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("salary", ".job_request .salary::text")
        item_loader.add_xpath("job_city",
                              "//*[@class='job_request']/p/span[2]/text()")
        item_loader.add_xpath("work_years",
                              "//*[@class='job_request']/p/span[3]/text()")
        item_loader.add_xpath("degree_need",
                              "//*[@class='job_request']/p/span[4]/text()")
        item_loader.add_xpath("job_type",
                              "//*[@class='job_request']/p/span[5]/text()")

        item_loader.add_css("tags", '.position-label li::text')
        item_loader.add_css("publish_time", ".publish_time::text")
        item_loader.add_css("job_advantage", ".job-advantage p::text")
        item_loader.add_css("job_desc", ".job_bt div")
        item_loader.add_css("job_addr", ".work_addr")
        item_loader.add_css("company_name", "#job_company dt a img::attr(alt)")
        item_loader.add_css("company_url", "#job_company dt a::attr(href)")
        item_loader.add_value("crawl_time", datetime.now())
        item_job = item_loader.load_item()

        return item_job
Example #7
    def parse_job(self, response):
        item_loader = LagouJobItemLoader(item=LagouJobItem(),
                                         response=response)

        item_loader.add_css('title', '.job-name::attr("title")')
        item_loader.add_value('url', response.url)
        item_loader.add_value('url_object_id', get_md5(response.url))
        item_loader.add_css('salary', '.job_request .salary::text')
        item_loader.add_css(
            'job_city',
            '.job_request > p:nth-child(1) > span:nth-child(2)::text')
        item_loader.add_css(
            'work_years',
            '.job_request > p:nth-child(1) > span:nth-child(3)::text')
        item_loader.add_css(
            'degree_need',
            '.job_request > p:nth-child(1) > span:nth-child(4)::text')
        item_loader.add_css(
            'job_type',
            '.job_request > p:nth-child(1) > span:nth-child(5)::text')
        item_loader.add_css('publish_time', '.publish_time::text')
        item_loader.add_css('tags', '.position-label .labels::text')
        item_loader.add_css('job_advantage', '.job-advantage p::text')
        item_loader.add_css('job_desc', '.job_bt div')
        item_loader.add_css('job_addr', '.work_addr')
        item_loader.add_css('company_name', '.b2::attr("alt")')
        item_loader.add_css('company_url', '#job_company dt a::attr("href")')
        item_loader.add_value('crawl_time',
                              datetime.now().strftime(SQL_DATETIME_FORMAT))
        job_item = item_loader.load_item()
        print('parse_job returned:', job_item)
        return job_item
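Example #7 above formats crawl_time with a SQL_DATETIME_FORMAT constant, and Example #17 below reads settings.SQL_DATE_FORMAT. Neither is shown here; plausible definitions would be:

# Hypothetical settings values assumed by these examples; the real project may differ
SQL_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"
SQL_DATE_FORMAT = "%Y-%m-%d"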
Example #8
 def parse_job(self, response):
     # Parse a Liepin job posting
     item_loader = LiepinJobItemLoader(item=LiepinJobItem(),
                                       response=response)
     item_loader.add_css("title", ".title-info h1::attr(title)")
     item_loader.add_value("url", response.url)
     item_loader.add_value("url_object_id", get_md5(response.url))
     salary = response.css(".job-item-title::text").extract()[0].strip()
     item_loader.add_value("salary", salary)
     item_loader.add_css("job_addr", ".basic-infor span a::text")
     publish_time = response.css(
         ".basic-infor span::text").extract()[4].strip()
     item_loader.add_value("publish_time", publish_time)
     degree_need = response.css(
         ".job-qualifications span::text").extract()[0]
     item_loader.add_value("degree_need", degree_need)
     work_years = response.css(
         ".job-qualifications span::text").extract()[1]
     item_loader.add_value("work_years", work_years)
     tags = response.css(".tag-list span::text").extract()
     tag = ",".join(tags)
     item_loader.add_value("tags", tag)
     item_loader.add_css("company_name", ".title-info h3 a::text")
     item_loader.add_css("company_url", ".word::attr(href)")
     job_descs = response.css(".content.content-word::text").extract()
     job_desc = "".join(job_descs)
     item_loader.add_value("job_desc", job_desc)
     item_loader.add_value("crawl_time", datetime.now())
     job_item = item_loader.load_item()
     return job_item
Example #9
    def parse_detail(self, response):
        if response.url == 'http://www.xunyingwang.com/movie/':
            print(
                "------------------url为www.xunyingwang.com/movie------------------"
            )
            return None
        # Load the item through an item loader
        front_image_url = response.meta.get("front_image_url", "")  # article cover image
        tags = response.meta.get("tags", "")
        item_loader = XiaojianrenItemLoader(
            item=xunyingItem(),
            response=response)  # the default ItemLoader returns lists; the custom loader applies TakeFirst()
        item_loader.add_css("title", ".movie-info h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_xpath(
            "create_date",
            '/html/body/div[2]/div/div/div[1]/div[1]/div[2]/table/tbody/tr[7]/td[2]'
        )
        item_loader.add_value("front_image_url", front_image_url)
        item_loader.add_value("tags", ','.join(str(n) for n in tags))
        item_loader.add_xpath(
            "duration",
            '/html/body/div[2]/div/div/div[1]/div[1]/div[2]/table/tbody/tr[8]/td[2]/text()'
        )
        item_loader.add_css("score", '.score::text')
        item_loader.add_xpath(
            "description",
            '/html/body/div[2]/div/div/div[1]/div[2]/div[2]/p/text()')

        movie_item = item_loader.load_item()

        yield movie_item  # passed on to the pipelines
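Several of these spiders (e.g. Examples #9, #21 and #24) rely on a custom ItemLoader subclass so that each field comes back as a single value rather than a list. A minimal sketch of such a loader, assuming it only overrides the default output processor:

from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst  # itemloaders.processors in newer Scrapy

class ArticleItemLoader(ItemLoader):
    # Keep only the first extracted value for every field instead of a list
    default_output_processor = TakeFirst()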
Example #10
    def parse_job(self, response):
        # Parse a Lagou job posting
        item_loader = LagouJobItemLoader(item=LagouJobItem(),
                                         response=response)
        item_loader.add_css("title", ".job-name::attr(title)")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("salary", ".job_request .salary::text")
        # The next four fields come from <span> elements, so XPath is easier to write here
        item_loader.add_xpath("job_city",
                              "//*[@class='job_request']/p/span[2]/text()")
        item_loader.add_xpath("work_years",
                              "//*[@class='job_request']/p/span[3]/text()")
        item_loader.add_xpath("degree_need",
                              "//*[@class='job_request']/p/span[4]/text()")
        item_loader.add_xpath("job_type",
                              "//*[@class='job_request']/p/span[5]/text()")

        item_loader.add_css("tags", '.position-label li::text')
        item_loader.add_css("publish_time", ".publish_time::text")
        item_loader.add_css("job_advantage", ".job-advantage p::text")
        # Extract the full HTML of the job description here
        item_loader.add_css("job_desc", ".job_bt div")
        # Some addresses sit inside <a> tags and cannot be taken as text directly; grab everything first and clean it up later
        item_loader.add_css("job_addr", ".work_addr")
        # Note: job_company is an id, so use "#" rather than "."
        item_loader.add_css("company_name", "#job_company dt a img::attr(alt)")
        item_loader.add_css("company_url", "#job_company dt a::attr(href)")
        item_loader.add_value("crawl_time", datetime.now())

        job_item = item_loader.load_item()

        # TODO: shouldn't this be yield?
        return job_item
Example #11
    def parse_detail(self, response):
        youchong_item = PetItem()
        # Extract the pet name
        kind = response.css(".basic").xpath("h1/text()").extract()[0]

        # Hard to filter precisely, so extract everything for now and pull the specific values out later with a regex
        base_info = response.css(".basic").extract()[0]
        # name_en = re.match(".*英文名.*", base_info)

        # Extract the pet introduction
        intro1 = response.css(".j-pedia").xpath("div[1]").extract()[0]
        intro2 = response.css(".j-pedia").xpath("div[2]").extract()[0]
        intro3 = response.css(".j-pedia").xpath("div[3]").extract()[0]
        intro4 = response.css(".j-pedia").xpath("div[4]").extract()[0]
        introall = intro1+intro2+intro3+intro4
        # Extract the pet image
        image_url = response.css(".pet-desc-l img").xpath("@src").extract()[0]

        # Fill in youchong_item
        youchong_item["url_object_id"] = get_md5(response.url)
        youchong_item["kind"] = kind
        youchong_item["url"] = response.url
        youchong_item["base_info"] = base_info
        youchong_item["intro"] = introall
        # This must be written as a list, because a list is expected when it reaches the pipeline
        youchong_item["image_url"] = [image_url]

        # Pass youchong_item on to the pipelines; the template already generated the pipeline file, but the "ITEM_PIPELINES" setting must be enabled in settings
        yield youchong_item

        # During debugging you can target a single page first: run scrapy shell <url> in the virtualenv to download just that page and verify that the extraction logic is correct
Example #12
    def parse_job(self, response):
        """
        解析职位信息页面
        """
        item_loader = LagouJobItemLoader(item=LagouJobItem(),
                                         response=response)
        item_loader.add_css("title", '.job-name::attr(title)')
        # Splitting min/max salary and min/max years of experience is deferred to the item
        item_loader.add_css("salary", '.job_request .salary::text')
        item_loader.add_xpath("job_city",
                              '//*[@class="job_request"]/p/span[2]/text()')
        item_loader.add_xpath("work_years",
                              '//*[@class="job_request"]/p/span[3]/text()')
        item_loader.add_xpath("degree_need",
                              '//*[@class="job_request"]/p/span[4]/text()')
        item_loader.add_xpath("job_type",
                              '//*[@class="job_request"]/p/span[5]/text()')
        item_loader.add_css("tags", '.position-label li::text')
        item_loader.add_css("publish_time", '.publish_time::text')
        item_loader.add_css("job_advantage", '.job-advantage p::text')
        item_loader.add_css("job_desc", '.job_bt div')
        # Extraction and cleanup of the address is finished in the item.
        item_loader.add_css("job_address", '.work_addr')
        item_loader.add_css("company_name", '.job_company dt a img::attr(alt)')
        item_loader.add_css("company_url", '.job_company dt a::attr(href)')
        item_loader.add_value("crawl_time", datetime.now())
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_value("url", response.url)

        job_item = item_loader.load_item()

        return job_item
Example #13
    def parse_job(self, response):
        """
        提取数据,填充到itemloader
        :param response: 
        :return: 
        """
        # Instantiate an ItemLoader; note that an item instance is passed below, not the class
        job_itemloader = LagoujobItemLoader(item=LagoujobItem(),
                                            response=response)

        # Extract the data
        job_itemloader.add_value('url', response.url)
        job_itemloader.add_value('url_object_id', get_md5(response.url))
        job_itemloader.add_css('title', '.job-name::attr(title)')
        job_itemloader.add_css('salary_min', '.job_request .salary::text')
        job_itemloader.add_css('salary_max', '.job_request .salary::text')
        job_itemloader.add_css('work_years_min', '.job_request span::text')
        job_itemloader.add_css('work_years_max', '.job_request span::text')
        job_itemloader.add_css('job_city', '.job_request span::text')
        job_itemloader.add_css('job_type', '.job_request span::text')
        job_itemloader.add_css('degree_need', '.job_request span::text')
        job_itemloader.add_css('publish_time', '.publish_time::text')
        job_itemloader.add_css('tags', '.position-label li::text')
        job_itemloader.add_css('job_advantage', '.job-advantage p::text')
        job_itemloader.add_css(
            'job_descript', '.job_bt div')  # best to store the content together with its HTML; it is easier to query later
        job_itemloader.add_css('job_address', '.work_addr a::text')
        job_itemloader.add_css('company_name', '#job_company a img::attr(alt)')
        job_itemloader.add_css('company_url', '#job_company a::attr(href)')
        job_itemloader.add_value('crawl_time', datetime.now())

        job_item = job_itemloader.load_item()

        return job_item
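Examples #12 and #13 note that splitting the salary (and the years of experience) is deferred to the item. A sketch of what such a field could look like, using a MapCompose input processor; the field name and helper below are hypothetical:

import re
import scrapy
from scrapy.loader.processors import MapCompose

def get_salary_min(value):
    # e.g. "15k-25k" -> "15k"; fall back to the raw text if the pattern is absent
    match = re.match(r"\s*(\d+k?)\s*-\s*(\d+k?)", value)
    return match.group(1) if match else value

class LagoujobItem(scrapy.Item):
    # hypothetical field; the real item defines many more
    salary_min = scrapy.Field(input_processor=MapCompose(get_salary_min))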
Example #14
    def parse_detail(self, response):
        item_loader = QianchengJobItemLoader(item=QianchengJobItem(),
                                             response=response)
        item_loader.add_css('title', '.in .cn h1::attr(title)')
        item_loader.add_value('url', response.url)
        item_loader.add_value('url_object_id', get_md5(response.url))
        item_loader.add_css('salary', '.in .cn strong::text')
        detail = response.css('.in .cn .msg.ltype::text').extract()
        if len(detail) < 5:
            item_loader.add_value('job_city', detail[0])
            item_loader.add_value('work_years', detail[1])
            item_loader.add_value('degree_need', '无学历要求')
            item_loader.add_value('people_need', detail[2])
            item_loader.add_value('publish_time', detail[3])
        else:
            item_loader.add_value('job_city', detail[0])
            item_loader.add_value('work_years', detail[1])
            item_loader.add_value('degree_need', detail[2])
            item_loader.add_value('people_need', detail[3])
            item_loader.add_value('publish_time', detail[4])
        item_loader.add_css('job_advantage', '.in .cn .jtag .t1 span::text')
        item_loader.add_css('job_desc', '.bmsg.job_msg.inbox')
        item_loader.add_xpath('job_addr',
                              '//div[@class="bmsg inbox"]/p[1]/text()')
        item_loader.add_css('company_name', '.com_msg .com_name p::text')
        item_loader.add_css('company_url', '.com_msg .com_name::attr(href)')
        item_loader.add_value('crawl_time', datetime.now())
        qiancheng_item = item_loader.load_item()

        return qiancheng_item
Example #15
    def parse_job(self, response):
        # item = {}
        #item['domain_id'] = response.xpath('//input[@id="sid"]/@value').get()
        #item['name'] = response.xpath('//div[@id="name"]').get()
        #item['description'] = response.xpath('//div[@id="description"]').get()

        item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
        item_loader.add_css("title", ".job-name::attr(title)")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("salary", ".job_request .salary::text")
        # item_loader.add_css("job_city", ".job_request span:nth-child(2)::text")
        # item_loader.add_css("work_years", ".job_request span:nth-child(3)::text")
        # item_loader.add_css("degree_need", ".job_request span:nth-child(4)::text")
        # item_loader.add_css("job_type", ".job_request span:nth-child(5)::text")
        # item_loader.add_css("tags", ".position-label li::text")
        # item_loader.add_css("publish_time", ".publish_time::text")
        # item_loader.add_css("job_advantage", ".job-advantage p::text")
        item_loader.add_css("job_desc", ".job-detail")
        item_loader.add_css("job_addr", ".work_addr")
        # item_loader.add_css("company_name", "#job_company img::attr(alt)")
        # item_loader.add_xpath("company_url", "//*[@id='job_company']/dt/a/@href")
        # item_loader.add_value("crawl_time", datetime.now())
        job_item = item_loader.load_item()

        return job_item
Example #16
    def parse_detail(self, response):
        """
                企業名 name
                ポジション  position
                年収  income
                画像 images
                特徴 content
                url
                url_object_id(プレミアムキー代わり)

                :return:
                """

        article_item = PaizaArticleItem()

        title = response.xpath("//h2[@class='ttl mt0 mb0']/text()").get()
        position = response.xpath("//td[@class='font16']/strong/text()").get()
        income = response.xpath(
            "//div[@class='strong font18 color_blue']/text()").get()
        images = response.meta.get('front_img_url', '')
        content = response.xpath(
            "//div[@class='rBox font13 lineHeight17']/p/text()").getall()
        content = ''.join(content)

        article_item['url_object_id'] = get_md5(response.url)
        article_item['url'] = response.url
        article_item['name'] = title
        article_item['position'] = position
        article_item['income'] = income
        article_item['images'] = [images]
        article_item['content'] = content
        article_item['create_date'] = datetime.now().date()
        yield article_item
Example #17
    def parse_detail(self,response):
        item_loader = XmrcItemLoader(item=XmrcItem(), response=response)
        item_loader.add_value('zhuanye',response.meta.get('zhuanye'))
        item_loader.add_value('job_type',response.meta.get('job_type'))
        item_loader.add_value('object_id',get_md5(response.url))
        item_loader.add_value('link',response.url)
        item_loader.add_value('addr','厦门')
        item_loader.add_value('select_time',time.strftime(settings.TIME_SELECT_FORMAT))
        item_loader.add_value('crawl_name',self.name)
        item_loader.add_value('crawl_time',time.strftime(settings.SQL_DATE_FORMAT))
        item_loader.add_value('ident',settings.IDENT)
        item_loader.add_value('company_type','')
        item_loader.add_value('company_size','')
        item_loader.add_value('company_industry','')
        item_loader.add_xpath('title','//tr[1]/td/font[1]/a/u/text()')
        item_loader.add_xpath('company_name','//*[@id="container"]/table[2]/tr/td[3]/table[4]/tr[1]/td[2]/table[1]/tr[2]/td[2]/table/tr/td[contains(text(),"招聘单位")]/text()')
        item_loader.add_xpath('salarys','//tr[1]/td[2]/table[1]/tr[last()-3]/td/table/tr/td[contains(text(),"参考月薪")]/text()')
        item_loader.add_xpath('experience','//tr[1]/td[2]/table[1]/tr[last()-3]/td/table/tr/td[contains(text(),"招聘对象")]/text()')
        item_loader.add_xpath('education','//tr[1]/td[2]/table[1]/tr[last()-3]/td/table/tr/td[contains(text(),"学历要求")]/text()')
        item_loader.add_xpath('job_nums','//tr[1]/td/font[1]/font/b/text()')
        item_loader.add_xpath('job_desc','//*[@id="container"]/table[2]/tr/td[3]/table[4]/tr[1]/td[2]/table[1]/tr[last()-1]/td[2]/text()')
        item_loader.add_xpath('company_addr','//*[@id="ctl00_Body_Repeater1_ctl00_ctl02_Repeater1_ctl00_ctl03_ctl00_Tr2"]/td[2]/text()')
        item_loader.add_xpath('phone',"//tr[@id='ctl00_Body_Repeater1_ctl00_ctl02_Repeater1_ctl00_ctl03_ctl00_Tr1']/following-sibling::*[1]/td[2]/text()")
        item_loader.add_xpath('contact','//*[@id="ctl00_Body_Repeater1_ctl00_ctl02_Repeater1_ctl00_ctl03_ctl00_Tr1"]/td[2]/text()')
        item_loader.add_xpath('release_time','//*[@id="container"]/table[2]/tr/td[3]/table[4]/tr[1]/td[2]/table[1]/tr/td[contains(text(),"招聘期限")]/text()')
        item_loader.add_xpath('max_salary','//tr[1]/td[2]/table[1]/tr[last()-3]/td/table/tr/td[contains(text(),"参考月薪")]/text()')
        item_loader.add_xpath('min_salary','//tr[1]/td[2]/table[1]/tr[last()-3]/td/table/tr/td[contains(text(),"参考月薪")]/text()')

        yield item_loader.load_item()
Example #18
    def parse_detail(self, response):
        item_loader = ShixisengJobItemLoader(item=ShixisengJobItem(),
                                             response=response)
        item_loader.add_css('title', '.job-header .new_job_name span::text')
        item_loader.add_value('url', response.url)
        item_loader.add_value('url_object_id', get_md5(response.url))
        item_loader.add_css('salary', '.job_money.cutom_font::text')
        item_loader.add_css('job_city', '.job_msg .job_position::attr(title)')
        item_loader.add_css('work_days', '.job_msg .job_week.cutom_font::text')
        item_loader.add_css('degree_need', '.job_msg .job_academic::text')
        item_loader.add_xpath('shixi_needed',
                              '//div[@class="job_msg"]/span[5]/text()')
        item_loader.add_css('publish_time', '.job_date .cutom_font::text')
        item_loader.add_css('job_advantage', '.job_good_list span::text')
        item_loader.add_xpath('job_desc',
                              '//div[@class="content_left"]/div[1]')
        item_loader.add_css('job_addr',
                            '.con-job.job_city .com_position::text')
        item_loader.add_css('company_name', '.com-name::text')
        company_post_url = response.css(
            '.com-name::attr(href)').extract_first()
        item_loader.add_value('company_url',
                              'www.shixiseng.com{}'.format(company_post_url))
        item_loader.add_value('crawl_time', datetime.now())
        shixiseng_item = item_loader.load_item()

        return shixiseng_item
Example #19
    def parse_job(self, response):
        """ 解析拉钩网的职位 """
        item_loader = LaGouItemLoad(item=LaGouItem(), response=response)
        item_loader.add_value('url', response.url)
        item_loader.add_value('url_object_id', get_md5(response.url))
        item_loader.add_css('title', '.job-name::attr(title)')
        item_loader.add_css('salary', '.salary::text')
        item_loader.add_xpath('job_city',
                              "//*[@class='job_request']/p/span[2]/text()")
        item_loader.add_xpath('work_years',
                              "//*[@class='job_request']/p/span[3]/text()")
        item_loader.add_xpath('degree_need',
                              "//*[@class='job_request']/p/span[4]/text()")
        item_loader.add_xpath('job_type',
                              "//*[@class='job_request']/p/span[5]/text()")
        item_loader.add_css('publish_time', '.publish_time::text')
        item_loader.add_css('tags', ".position-label.clearfix li::text")
        item_loader.add_css('job_advantage', ".job-advantage p::text")
        item_loader.add_css('job_desc', ".job_bt div")
        item_loader.add_css('job_addr', ".work_addr")
        item_loader.add_css('company_url', "#job_company dt a img::attr(src)")
        item_loader.add_css('company_name', "#job_company dt a img::attr(alt)")
        item_loader.add_value('crawl_time', datetime.now())

        job_item = item_loader.load_item()

        return job_item
Example #20
    def parse_item(self, response):
        """解析拉勾网的职位"""
        loader = LagouLoader(item=LagouItem(), response=response)
        loader.add_xpath('title', '//div[@class="job-name"]/@title')
        loader.add_value('url', response.url)
        loader.add_value('url_object_id', get_md5(response.url))
        loader.add_xpath(
            'salary',
            '//dd[@class="job_request"]//span[@class="salary"]/text()')
        loader.add_xpath('job_city',
                         '//dd[@class="job_request"]//span[2]/text()')
        loader.add_xpath('work_year',
                         '//dd[@class="job_request"]//span[3]/text()')
        loader.add_xpath('degree_need',
                         '//dd[@class="job_request"]//span[4]/text()')
        loader.add_xpath('job_type',
                         '//dd[@class="job_request"]//span[5]/text()')
        loader.add_xpath('publish_time', '//p[@class="publish_time"]/text()')
        loader.add_xpath('tags',
                         '//ul[contains(@class, "position-label")]/li/text()')
        loader.add_xpath('job_advantage',
                         '//dd[@class="job-advantage"]/p/text()')
        loader.add_xpath('job_desc', '//dd[@class="job_bt"]/div')
        loader.add_xpath('job_addr', '//div[@class="work_addr"]')
        loader.add_xpath('company_name',
                         '//dl[@id="job_company"]/dt/a/img/@alt')
        loader.add_xpath('company_url', '//dl[@id="job_company"]/dt/a/@href')
        loader.add_value('crawl_time', datetime.now())

        job_item = loader.load_item()

        return job_item
Example #21
    def parse_detail(self, response):

        #article_item=JobBoleArticleItem()

        # article_item['title']=response.xpath('//div[@class="entry-header"]/h1/text()').extract()[0]
        # #
        # # # Read values from meta with .get() so that a missing key does not raise; the default is empty. A tuple is used here, not []
        # # # The image url must be changed to a list, otherwise the automatic downloader raises an error (IMAGES_URLS_FIELD in settings)
        # article_item['front_image_url']=[response.meta.get('front_image_url','')]
        # date_time=re.match('.*?(\d{4}/\d+/\d+).*',response.xpath('//div/p[@class="entry-meta-hide-on-mobile"]/text()[1]').extract()[0])
        # try:
        #     article_item['create_date']=datetime.datetime.strptime(date_time,'%Y/%m/%d').date()
        # except Exception as e:
        #     article_item['create_date']=datetime.datetime.now().date()
        # article_item['tag']=','.join(response.xpath('//div/p[@class="entry-meta-hide-on-mobile"]/a/text()').extract())
        # article_item['content']=''.join(response.xpath('//div[@class="entry"]/p/text()').extract())
        # article_item['praise_nums']=response.xpath('//div[@class="post-adds"]/span/h10/text()').extract()[0]
        # fav_num=response.xpath('//div[@class="post-adds"]/span[2]/text()').extract()[0]
        # match_re=re.match(".*?(\d+).*",fav_num)
        # if match_re:
        #     article_item['fav_nums']=match_re.group(1)
        # else:
        #     article_item['fav_nums']=0
        # comment_num=response.xpath('//div[@class="post-adds"]/a/span/text()').extract()[0]
        # match_re = re.match(".*?(\d+).*", comment_num)
        # if match_re:
        #     article_item['comment_nums'] = match_re.group(1)
        # else:
        #     article_item['comment_nums']=0
        # article_item['url_object_id'] =common.get_md5(response.url)

        # Load the item with an ItemLoader
        #item_loader=ItemLoader(item=JobBoleArticleItem(),response=response)
        # Use the custom ArticleItemLoader
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                        response=response)
        item_loader.add_xpath('title',
                              '//div[@class="entry-header"]/h1/text()')
        item_loader.add_xpath(
            'tag', '//div/p[@class="entry-meta-hide-on-mobile"]/a/text()')
        item_loader.add_value('front_image_url',
                              [response.meta.get('front_image_url', '')])
        item_loader.add_value('url', response.url)
        item_loader.add_xpath('content', '//div[@class="entry"]/p/text()')
        item_loader.add_xpath('praise_nums',
                              '//div[@class="post-adds"]/span/h10/text()')
        item_loader.add_xpath('comment_nums',
                              '//div[@class="post-adds"]/a/span/text()')
        item_loader.add_xpath('fav_nums',
                              '//div[@class="post-adds"]/span[2]/text()')
        item_loader.add_xpath(
            'create_date',
            '//div/p[@class="entry-meta-hide-on-mobile"]/text()[1]')
        item_loader.add_value('url_object_id', common.get_md5(response.url))

        article_item = item_loader.load_item()

        yield article_item
Example #22
    def parse_detail(self, response):
        article_item = JobboleArticleItem()

        # title = response.css('div.entry-header > h1::text').extract_first()
        # create_date = response.css('.entry-meta-hide-on-mobile::text').extract_first().replace('·','').strip()
        # praise_num = response.css('.post-adds .vote-post-up h10::text').extract_first(0)
        front_img_url = response.meta.get('front_img_url', '')
        #
        # fav_num_info = response.css('.post-adds .bookmark-btn::text').extract_first()
        # fav_num_re = re.match(".*(\d+).*", fav_num_info)
        # if fav_num_re:
        #     fav_num = fav_num_re.group(1)
        # else:
        #     fav_num = 0
        # comment_num_info = response.css('a[href="#article-comment"] span::text').extract_first()
        # comment_num_re = re.findall("\d+",comment_num_info)
        # if comment_num_re:
        #     comment_num = comment_num_re[0]
        # else:
        #     comment_num = 0
        #
        # tag_list = response.css('.entry-meta .entry-meta-hide-on-mobile a::text').extract()
        # tags = ','.join([tag for tag in set(tag_list) if not tag.strip().endswith('评论')])
        # content = response.css('.entry').extract_first()
        #
        # article_item['url_object_id'] = get_md5(response.url)
        # article_item['url'] = response.url
        # article_item['title'] = title
        # try:
        #     create_date = datetime.strptime(create_date,'%Y/%m/%d').date()
        # except Exception as e:
        #     create_date = datetime.now()
        # article_item['create_date'] = create_date
        # article_item['praise_num'] = praise_num
        # article_item['fav_num'] = fav_num
        # article_item['comment_num'] = comment_num
        # article_item['front_img_url'] = [front_img_url]
        # article_item['tags'] = tags
        # article_item['content'] = content

        # Load the item through an item loader
        item_loader = ArticleItemLoader(item=JobboleArticleItem(),
                                        response=response)
        item_loader.add_css('title', 'div.entry-header > h1::text')
        item_loader.add_css('create_date', '.entry-meta-hide-on-mobile::text')
        item_loader.add_css('praise_num', '.post-adds .vote-post-up h10::text')
        item_loader.add_css('fav_num', '.post-adds .bookmark-btn::text')  #re
        item_loader.add_css('comment_num',
                            'a[href="#article-comment"] span::text')  #re
        item_loader.add_css(
            'tag', '.entry-meta .entry-meta-hide-on-mobile a::text')  # processor function
        item_loader.add_css('content', '.entry')
        item_loader.add_value('front_img_url', [front_img_url])
        item_loader.add_value('url', response.url)
        item_loader.add_value('url_object_id', get_md5(response.url))

        article_item = item_loader.load_item()

        yield article_item
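The "#re" and processor-function comments in Example #22 hint that the favourites and comment counts are cleaned up by a processor rather than in the spider. A sketch of such a helper, modelled on the commented-out regex logic above (the names below are hypothetical):

import re
import scrapy
from scrapy.loader.processors import MapCompose

def extract_num(text):
    # Pull the first integer out of a string such as "2 收藏"; default to 0
    match = re.match(r".*?(\d+).*", text)
    return int(match.group(1)) if match else 0

# e.g. in the item definition:
# fav_num = scrapy.Field(input_processor=MapCompose(extract_num))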
Example #23
    def parse_detail(self, response):

        item_loader = FundsciencenetItemLoader(item=FundsciencenetItem(),
                                               response=response)

        item_loader.add_css('title', '.v_con h1::text')
        item_loader.add_value('url', response.url)
        item_loader.add_value('url_object_id', get_md5(response.url))
        content_vcon = response.xpath('//*[@class="v_con"]/table/tr/td')
        item_loader.add_value(
            'approval_number',
            content_vcon[0].xpath('text()').extract_first(''))
        item_loader.add_value(
            'subject_classification',
            content_vcon[1].xpath('text()').extract_first(''))
        item_loader.add_value(
            'project_leader',
            content_vcon[2].xpath('text()').extract_first(''))
        item_loader.add_value(
            'title_of_leader',
            content_vcon[3].xpath('text()').extract_first('NA'))
        item_loader.add_value(
            'dependent_unit',
            content_vcon[4].xpath('text()').extract_first(''))
        item_loader.add_value(
            'subsidized_amount',
            content_vcon[5].xpath('text()').extract_first(''))
        item_loader.add_value(
            'project_category',
            content_vcon[6].xpath('text()').extract_first('NA'))
        item_loader.add_value(
            'time_start',
            response.xpath('//*[@class="v_con"]/table/tr[3]/td[3]/text()[1]').
            extract_first(''))
        item_loader.add_value(
            'time_end',
            response.xpath('//*[@class="v_con"]/table/tr[3]/td[3]/text()[2]').
            extract_first(''))
        item_loader.add_value(
            'chinese_keywords',
            content_vcon[8].xpath('text()').extract_first('NA'))
        item_loader.add_value(
            'english_keywords',
            content_vcon[9].xpath('text()').extract_first('NA'))

        content_usual = response.xpath('//*[@class="usual"]/div/table/tr/td')
        item_loader.add_value(
            'chinese_abstract',
            content_usual[0].xpath('text()').extract_first('NA'))
        item_loader.add_value(
            'english_abstract',
            content_usual[1].xpath('text()').extract_first('NA'))
        item_loader.add_value(
            'summary_abstract',
            content_usual[2].xpath('text()').extract_first('NA'))
        fundsciencenet_item = item_loader.load_item()
        yield fundsciencenet_item
Example #24
    def parse_content(self, response):
        # Extract the data with CSS selectors
        # front_image_url = response.meta.get("front_image_url", "") # article cover image
        # title = response.css('.entry-header h1::text').extract_first()
        # create_date = response.css('p.entry-meta-hide-on-mobile::text').extract_first().replace("·","").strip()
        # praise_num = response.css('.vote-post-up h10::text').extract_first() # upvote count
        # fav_num = response.css('.bookmark-btn::text').extract_first() # favorite count
        # match_re = re.match(".*?(\d+).*", fav_num)
        # if match_re:
        #     fav_num = int(match_re.group(1))
        # else:
        #     fav_num = 0
        # comments_num = response.css('a[href="#article-comment"] span::text').extract_first() # comment count
        # match_re = re.match(".*?(\d+).*", comments_num)  # use a regex to pull the number out of the string
        # if match_re:
        #     comments_num = int(match_re.group(1))
        # else:
        #     comments_num = 0
        # content = response.css('div.entry').extract_first() # body content
        # tag_selecter = response.css('p.entry-meta-hide-on-mobile a::text').extract()
        # tag_list = [element for element in tag_selecter if not element.strip().endswith('评论')]
        # tags = ",".join(tag_list)  # tags
        #
        # article_item = JobboleArticleItem()
        # article_item["title"] = title
        # try:
        #     create_date = datetime.strptime(create_date, '%Y/%m/%d').date()
        # except Exception as e:
        #     create_date = datetime.now().date()
        # article_item["create_date"] = create_date
        # article_item["url"] = response.url
        # article_item["url_object_id"] = get_md5(response.url)
        # article_item["front_image_url"] = [front_image_url]
        # article_item["praise_nums"] = praise_num
        # article_item["comment_nums"] = comments_num
        # article_item["fav_nums"] = fav_num
        # article_item["tags"] = tags
        # article_item["content"] = content

        # Load the item through an item loader; the custom ArticleItemLoader turns each list into a str
        front_image_url = response.meta.get("front_image_url", "")  # article cover image
        item_loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")

        article_item = item_loader.load_item()

        yield article_item
Example #25
    def parse_detail(self, response):
        article_item = JobBoleArticleItem()
        # Extract the article's individual fields
        front_image_url = response.meta.get("front_image_url", "")
        # title = response.css(".entry-header h1::text").extract()[0]
        # create_date = response.css("p.entry-meta-hide-on-mobile::text").extract()[0].replace('·', '').strip()
        # praise_nums = response.css(".vote-post-up h10::text").extract()[0]
        # fav_nums = response.css(".bookmark-btn::text").extract()[0]
        # match_re1 = re.match(".*?(\d+).*", fav_nums)
        # if match_re1:
        #     fav_nums = int(match_re1.group(1))
        # else:
        #     fav_nums = 0
        #
        # comment_nums = response.css("a[href='#article-comment'] span::text").extract()[0]
        # match_re2 = re.match(".*?(\d+).*", comment_nums)
        # if match_re2:
        #     comment_nums = int(match_re2.group(1))
        # else:
        #     comment_nums = 0
        # content = response.css("div.entry").extract()[0]
        # tag_list = response.css("p.entry-meta-hide-on-mobile a::text").extract()
        # tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
        # tags = ",".join(tag_list)
        #
        #
        # article_item["url_object_id"] = get_md5(response.url)
        # article_item["title"] = title
        # article_item["url"] = response.url
        #
        # article_item["create_date"] = create_date
        # article_item["front_image_url"] = [front_image_url]
        # article_item["praise_nums"] = praise_nums
        # article_item["comment_nums"] = comment_nums
        # article_item["fav_nums"] = fav_nums
        # article_item["tags"] = tags
        # article_item["content"] = content

        # Load the item through an item loader
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        # item_loader.add_css("content", "div.entry")

        article_item = item_loader.load_item()


        yield article_item
Example #26
    def parse_job(self, response):
        item_loader = JobItemLoader(item=JobItem(), response=response)
        item_loader.add_css("title", ".job-title.clearfix .job-name::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("salary", ".job-brief .job-info .salary::text")
        item_loader.add_css("job_desc", ".job-desc")
        item_loader.add_css("job_addr", ".job-brief .job-info .where::text")
        job_item = item_loader.load_item()

        return job_item     # the returned job_item is passed into the pipeline
Example #27
 def parse_detail(self, response):
     item_loader = archinaItemLoader(item=archina_Item(), response=response)
     item_loader.add_css('title', ".col-left h1::text")  # first element of the list
     item_loader.add_xpath('create_date',
                           '//*[@id="Article"]/h1/span/text()')
     item_loader.add_css('content', ".col-left .content")
     item_loader.add_value('url', response.url)
     item_loader.add_value("crawl_time", datetime.now())
     item_loader.add_value('url_object_id', get_md5(response.url))
     ar_item = item_loader.load_item()
     yield ar_item
Example #28
File: boss.py  Project: evahere/JobSpider
    def parse_job(self, response):
        item_loader = JobItemLoader(item=JobItem(), response=response)
        item_loader.add_css("title", ".job-primary .info-primary .name h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("salary", ".job-primary .info-primary .name .salary::text")
        item_loader.add_css("job_desc", ".detail-content .job-sec .text")
        item_loader.add_css("job_addr", ".job-primary .info-primary p")
        job_item = item_loader.load_item()

        return job_item     # the returned job_item is passed into the pipeline
Example #29
    def parse_page(self, response):
        # Use the custom ArticleItemLoader

        item_loader = TianYanChaItemLoader(item=TianYanChaItem(), response=response)

        item_loader.add_xpath('compname',"//div[@class='header']/h1/text()")
        item_loader.add_xpath('phone',"//div[@class='detail ']/div[1]/div[1]/span[2]/text()|//div[@class='detail']/div[1]/div[1]/span[2]/text()")
        item_loader.add_xpath('email',"//div[@class='detail']/div/div[2]/span[@class='email']/text()")
        item_loader.add_value('url',response.url)
        item_loader.add_value('object_id',common.get_md5(response.url))
        # Legal representative
        item_loader.add_xpath('fddb',"//div[@id='_container_baseInfo']/table/tbody/tr/td[1]/div[1]/div/div[2]/div/a/@title")
        # Registered capital
        item_loader.add_xpath('zczb',"//div[@id='_container_baseInfo']/table/tbody/tr/td[2]/div[2]/@title")
        # Registration date
        item_loader.add_xpath('zctime',"//div[@id='_container_baseInfo']/table[1]/tbody/tr[2]/td[1]/div[2]/text/text()")
        # Company status
        item_loader.add_xpath('gszt',"//div[@id='_container_baseInfo']/table/tbody/tr[3]/td/div[2]/@title")
        # Business registration id
        item_loader.add_xpath('gsid',"//div[@id='_container_baseInfo']/table[2]/tbody/tr[1]/td[2]/text()")
        # Organization code id
        item_loader.add_xpath('orgid',"//div[@id='_container_baseInfo']/table[2]/tbody/tr[1]/td[4]/text()")
        # Credit code id
        item_loader.add_xpath('xyid',"//div[@id='_container_baseInfo']/table[2]/tbody/tr[2]/td[2]/text()")
        # Company type
        item_loader.add_xpath('gstype',"//div[@id='_container_baseInfo']/table[2]/tbody/tr[2]/td[4]/text()")
        # Taxpayer id
        item_loader.add_xpath('nsrid',"//div[@id='_container_baseInfo']/table[2]/tbody/tr[3]/td[2]/text()")
        # Industry
        item_loader.add_xpath('hy',"//div[@id='_container_baseInfo']/table[2]/tbody/tr[3]/td[4]/text()")
        # Business term
        item_loader.add_xpath('yyqx',"//div[@id='_container_baseInfo']/table[2]/tbody/tr[4]/td[2]/span/text()")
        # Approval date
        item_loader.add_xpath('hzrq',"//div[@id='_container_baseInfo']/table[2]/tbody/tr[4]/td[4]/text/text()")
        # Company size
        item_loader.add_xpath('size',"//div[@id='_container_baseInfo']/table[2]/tbody/tr[5]/td[4]/text()")
        # Paid-in capital
        item_loader.add_xpath('sjzb',"//div[@id='_container_baseInfo']/table[2]/tbody/tr[6]/td[2]/text()")
        # Registration authority
        item_loader.add_xpath('djjg',"//div[@id='_container_baseInfo']/table[2]/tbody/tr[6]/td[4]/text()")
        # Address
        item_loader.add_xpath('addr',"//div[@id='_container_baseInfo']/table[2]/tbody/tr[8]/td[2]/text()")
        # Business scope
        item_loader.add_xpath('jyfw',"//div[@id='_container_baseInfo']/table[2]/tbody/tr[9]/td[2]/span/span/span[1]/text()")
        # Executive names
        item_loader.add_xpath('zyry',"//div[@id='_container_staff']/div/table/tbody/tr/td/div/a[1]/text()")
        # Shareholder information
        item_loader.add_xpath('gdxx',"//div[@id='_container_holder']/table/tbody/tr/td/div/div[2]/a/text()")

        tianyancha_item = item_loader.load_item()


        yield tianyancha_item
Example #30
    def parse_job(self, response):
        item_loader = ZhilianJobItemLoader(item=ZhilianJobItem(),
                                           response=response)
        item_loader.add_css("title", ".job-title.clearfix .job-name::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("salary", ".job-brief .job-info .salary::text")

        item_loader.add_value("crawl_time", datetime.now())
        job_item = item_loader.load_item()

        return job_item
Example #31
File: xdg.py  Project: Astonyang/ticket
def generate_time_md5():
    t = str(time.time())
    return common.get_md5(t)
Example #32
File: xdg.py  Project: Astonyang/ticket
def get_user_file(username, filename):
    d = os.path.join(xdg_cache_home, PROGRAM_NAME, common.get_md5(username))
    if not os.path.isdir(d):
        os.makedirs(d)
    return os.path.join(d, filename)