def save_to_es(self):
    """Build a ``ZhihuAnswerType`` document from this item and save it.

    ``create_time`` and ``update_time`` are read from the item as
    timestamps and rendered with ``SQL_DATE_FORMAT``; the crawl time is the
    current local time rendered the same way. The item's ``content`` is
    passed through ``remove_tags`` before being stored, and the item's
    ``zhihu_id`` is used as the document id.
    """
    def _render(moment):
        # Single place where the output date format is applied.
        return moment.strftime(SQL_DATE_FORMAT)

    from_timestamp = datetime.datetime.fromtimestamp
    crawled_at = _render(datetime.datetime.now())
    created_at = _render(from_timestamp(self["create_time"]))
    updated_at = _render(from_timestamp(self["update_time"]))

    doc = ZhihuAnswerType()
    doc.meta.id = self["zhihu_id"]
    for key in ("url", "question_id"):
        setattr(doc, key, self[key])
    doc.content = remove_tags(self["content"])
    for key in ("praise_num", "comments_num"):
        setattr(doc, key, self[key])
    doc.create_time = created_at
    doc.update_time = updated_at
    doc.crawl_time = crawled_at
    doc.save()
def save_to_es(self):
    """Build a ``ZhihuAnswerType`` document from this item and save it.

    The item's ``zhihu_id`` doubles as the document id. ``praise_num`` and
    ``comments_num`` default to 0 when absent. ``create_time`` and
    ``update_time`` are converted from timestamps when present; otherwise
    today's date is stored instead. After saving, the
    ``"zhihuanswer_count"`` counter is incremented through ``redis_cli``.
    """
    doc = ZhihuAnswerType()
    doc.meta.id = self['zhihu_id']

    # Fields copied verbatim from the item.
    for key in ('zhihu_id', 'url', 'question_id', 'author_id', 'content'):
        setattr(doc, key, self[key])

    # Counters fall back to zero when the item lacks them.
    for key in ('praise_num', 'comments_num'):
        setattr(doc, key, self.get(key, 0))

    # NOTE(review): the fallback is a ``date`` while the normal path is a
    # ``datetime`` -- confirm the index mapping accepts both.
    for key in ('create_time', 'update_time'):
        value = (
            datetime.datetime.fromtimestamp(self[key])
            if key in self
            else datetime.datetime.now().date()
        )
        setattr(doc, key, value)

    doc.crawl_time = self['crawl_time']

    weighted_texts = [(doc.content, 5)]
    doc.suggest = gen_suggests(ZhihuAnswerType._index._name, weighted_texts)

    doc.save()

    # Record the number of crawled documents in redis.
    redis_cli.incr("zhihuanswer_count")
def save_to_es(self):
    """Convert this item into a ``ZhihuAnswerType`` document and save it.

    ``create_time`` and ``update_time`` are read as timestamps and stored
    as strings in ``SQL_DATETIME_FORMAT``; ``crawl_time`` is stored as-is.
    The ``suggest`` field is produced by ``get_suggests`` from the content
    with weight 5. After saving, the ``"ans_count"`` counter is incremented
    through ``redis_cli``.
    """
    doc = ZhihuAnswerType()

    # Fields copied verbatim from the item.
    copied_keys = (
        "zhihu_id",
        "url",
        "question_id",
        "author_id",
        "answer_excerpt",
        "content",
        "praise_num",
        "comments_num",
    )
    for key in copied_keys:
        setattr(doc, key, self[key])

    # Timestamps are stored as formatted strings.
    for key in ("create_time", "update_time"):
        moment = datetime.datetime.fromtimestamp(self[key])
        setattr(doc, key, moment.strftime(SQL_DATETIME_FORMAT))

    doc.crawl_time = self["crawl_time"]

    index_name = ZhihuAnswerType._doc_type.index
    doc.suggest = get_suggests(index_name, ((doc.content, 5),))

    doc.save()
    redis_cli.incr("ans_count")
def save_to_es(self):
    """Convert this item into a ``ZhihuAnswerType`` document and save it.

    ``create_time`` and ``update_time`` are read as timestamps, and
    ``crawl_time`` as an object with ``strftime``; all three are stored as
    strings in ``SQL_DATETIME_FORMAT``.

    NOTE: suggest generation and the redis counter are not performed for
    this item (both were disabled in the original code).
    """
    def _render(moment):
        # Single place where the output datetime format is applied.
        return moment.strftime(SQL_DATETIME_FORMAT)

    doc = ZhihuAnswerType()

    # Fields copied verbatim from the item.
    copied_keys = (
        "zhihu_id",
        "url",
        "question_id",
        "author_id",
        "content",
        "praise_num",
        "comments_num",
    )
    for key in copied_keys:
        setattr(doc, key, self[key])

    for key in ("create_time", "update_time"):
        setattr(doc, key, _render(datetime.datetime.fromtimestamp(self[key])))
    doc.crawl_time = _render(self["crawl_time"])

    doc.save()
def save_to_es(self):
    """Convert this item into a ``ZhihuAnswerType`` document and save it.

    ``create_time`` and ``update_time`` are read as timestamps, and
    ``crawl_time`` as an object with ``strftime``; all three are stored as
    ``YYYY-MM-DD`` strings. The ``suggest`` field is produced by
    ``gen_suggest`` from the content with weight 10. After saving, the
    ``"ZhihuAnswer_count"`` counter is incremented through ``redis_cli``.
    """
    day_format = "%Y-%m-%d"
    from_timestamp = datetime.datetime.fromtimestamp

    # Format both timestamps up front, before the document is created.
    created_on = from_timestamp(self["create_time"]).strftime(day_format)
    updated_on = from_timestamp(self["update_time"]).strftime(day_format)

    doc = ZhihuAnswerType()
    copied_keys = (
        "zhihu_id",
        "question_id",
        "author_id",
        "content",
        "praise_num",
        "comments_num",
    )
    for key in copied_keys:
        setattr(doc, key, self[key])

    doc.create_time = created_on
    doc.update_time = updated_on
    doc.crawl_time = self["crawl_time"].strftime(day_format)

    index_name = ZhihuAnswerType._doc_type.index
    doc.suggest = gen_suggest(index_name, ((doc.content, 10),))

    doc.save()
    redis_cli.incr("ZhihuAnswer_count")
def save_to_es(self):
    """Copy this item's fields into a ``ZhihuAnswerType`` document and save it.

    Every field is read with ``self.get`` so a missing key falls back to a
    default: an empty string for text/id fields, 0 for counters, and
    ``"1970-01-01 00:00:00"`` for the three time fields.

    NOTE: no ``suggest`` field is generated here (it was disabled in the
    original code).
    """
    epoch_text = "1970-01-01 00:00:00"

    # (field name, fallback value) in the order the fields are assigned.
    field_defaults = (
        ("title", ""),
        ("answer_id", ""),
        ("question_id", ""),
        ("url", ""),
        ("author_id", ""),
        ("praise_num", 0),
        ("comments_num", 0),
        ("create_time", epoch_text),
        ("update_time", epoch_text),
        ("crawl_time", epoch_text),
        ("content", ""),
    )

    doc = ZhihuAnswerType()
    for name, fallback in field_defaults:
        setattr(doc, name, self.get(name, fallback))
    doc.save()