Пример #1
0
    def process_item(self, item, spider):
        """Convert a scraped item into a ``LagouType`` document and save it.

        A subset of the item's fields is copied onto a fresh document; any
        field missing from the item is stored as an empty string. The item's
        ``url_object_id`` is also used as the document id.

        Args:
            item: Scraped item, read with mapping-style access.
            spider: Spider that produced the item (not used here).

        Returns:
            The same ``item`` that was passed in.

        Raises:
            KeyError: Presumably, when ``item`` has no ``url_object_id``
                (it is read with ``item[...]`` for the document id).
        """
        # Only these fields are copied at the moment; job_city, work_years,
        # degree_need, job_type, publish_time, job_advantage, company_name,
        # company_url and tags are intentionally left out.
        copied_fields = (
            "title",
            "url",
            "url_object_id",
            "salary",
            "job_desc",
            "job_addr",
        )

        doc = LagouType()
        for field_name in copied_fields:
            setattr(doc, field_name, item.get(field_name, ""))

        # Unlike the copies above, the id lookup has no default.
        doc.meta.id = item["url_object_id"]
        doc.save()

        return item
Пример #2
0
    def save_to_es(self):
        """Copy this item's fields onto a ``LagouType`` document and save it.

        The item's ``crawl_time`` is stored under ``create_date``; every other
        field keeps its own name. The ``suggest`` field is built by
        ``gen_suggests`` from the title and tags, paired with 10 and 7
        (presumably weights). After the save, the ``lagou_count`` counter in
        Redis is incremented.

        Returns:
            None.
        """
        doc = LagouType()
        doc.title = self["title"]
        # The only field whose name differs between item and document.
        doc.create_date = self["crawl_time"]

        same_name_fields = (
            "url",
            "url_object_id",
            "salary",
            "job_city",
            "work_years",
            "degree_need",
            "job_type",
            "tags",
            "publish_time",
            "job_advantage",
            "job_desc",
            "job_addr",
            "company_name",
            "company_url",
        )
        for field_name in same_name_fields:
            setattr(doc, field_name, self[field_name])

        index = LagouType._doc_type.index
        suggest_sources = ((doc.title, 10), (doc.tags, 7))
        doc.suggest = gen_suggests(index, suggest_sources)

        doc.save()
        redis_cli.incr("lagou_count")
 def save_to_es(self):
     """Turn this item into a ``LagouType`` document and save it.

     Each listed field is copied from the item under the same name. The
     ``suggest`` field is built by ``get_suggests`` from the title, tags,
     job description and job type, paired with 10, 7, 4 and 3 (presumably
     weights). After the save, the ``job_count`` counter in Redis is
     incremented.

     Returns:
         None.
     """
     copied_fields = (
         "title",
         "url",
         "url_object_id",
         "salary",
         "job_city",
         "degree_need",
         "job_type",
         "job_advantage",
         "job_desc",
         "job_addr",
         "company_name",
         "company_url",
         "tags",
     )

     doc = LagouType()
     for field_name in copied_fields:
         setattr(doc, field_name, self[field_name])

     index = LagouType._doc_type.index
     suggest_sources = (
         (doc.title, 10),
         (doc.tags, 7),
         (doc.job_desc, 4),
         (doc.job_type, 3),
     )
     doc.suggest = get_suggests(index, suggest_sources)

     doc.save()
     redis_cli.incr("job_count")
Пример #4
0
    def save_to_es(self):
        """Copy this item's fields onto a ``LagouType`` document and save it.

        The document id is the value ``extract_num`` returns for the item's
        URL. ``crawl_time`` and ``crawl_update_time`` are both set to the
        current local time formatted with ``SQL_DATE_FORMAT``. The ``suggest``
        field is built by ``gen_suggest`` from the title, company name, job
        description and job address, paired with 10, 9, 8 and 7 (presumably
        weights).

        The ``lagou_count`` counter in Redis is incremented only after the
        document has been saved, so a failed save does not inflate the count.

        Returns:
            None.

        Raises:
            KeyError: Presumably, when one of the copied fields is missing
                from the item (fields are read with ``self[...]``).
        """
        crawl_time = datetime.datetime.now().strftime(SQL_DATE_FORMAT)
        job_id = extract_num(self["url"])

        article = LagouType()
        article.meta.id = job_id

        copied_fields = (
            "title",
            "url",
            "salary",
            "job_city",
            "work_years",
            "degree_need",
            "job_type",
            "publish_time",
            "job_advantage",
            "job_desc",
            "job_addr",
            "company_name",
        )
        for field_name in copied_fields:
            setattr(article, field_name, self[field_name])

        # Both timestamps start out identical on this write.
        article.crawl_time = crawl_time
        article.crawl_update_time = crawl_time

        article.suggest = gen_suggest(
            LagouType._doc_type.index,
            (
                (article.title, 10),
                (article.company_name, 9),
                (article.job_desc, 8),
                (article.job_addr, 7),
            ),
        )

        # Save first: if the save raises, the counter must stay untouched.
        article.save()
        redis_cli.incr("lagou_count")
Пример #5
0
    def save_to_es(self):
        """Copy this item's fields onto a ``LagouType`` document and save it.

        Fields are copied under their own names, with two special cases:
        ``job_desc`` is passed through ``remove_tags`` before being stored,
        and ``tags`` is copied only when the item contains it. The ``suggest``
        field is built by ``gen_suggests`` from the title, tags and job
        description, paired with 10, 7 and 5 (presumably weights).

        Returns:
            None.
        """
        doc = LagouType()

        for field_name in (
            "title",
            "url",
            "url_object_id",
            "salary",
            "job_city",
            "work_years",
            "degree_need",
            "job_type",
            "publish_time",
            "job_advantage",
        ):
            setattr(doc, field_name, self[field_name])

        # The description is run through remove_tags before it is stored.
        doc.job_desc = remove_tags(self["job_desc"])

        for field_name in ("job_addr", "company_name", "company_url"):
            setattr(doc, field_name, self[field_name])

        # tags is the one field that is allowed to be absent from the item.
        if "tags" in self:
            doc.tags = self["tags"]
        doc.crawl_time = self["crawl_time"]

        index = LagouType._doc_type.index
        suggest_sources = ((doc.title, 10), (doc.tags, 7), (doc.job_desc, 5))
        doc.suggest = gen_suggests(index, suggest_sources)

        doc.save()