示例#1
0
    def save_to_es(self):
        """Copy this item's fields onto a ZhihuQuestion document and save it.

        After saving, the "zhihu_question_count" counter in Redis is
        incremented. Returns None.
        """
        zhihu_question = ZhihuQuestion()
        zhihu_question.title = self['title']
        zhihu_question.content = self["content"]
        zhihu_question.url = self["url"]
        zhihu_question.question_id = self["zhihu_id"][0]
        zhihu_question.answer_num = extract_num("".join(self["answer_num"]))
        zhihu_question.comments_num = extract_num("".join(
            self["comments_num"]))

        # Strip thousands separators on local copies so the item itself is
        # not mutated, and so the single-value case does not fail in int().
        raw_counts = self["watch_user_num"]
        zhihu_question.watch_user_num = int(raw_counts[0].replace(",", ""))
        if len(raw_counts) == 2:
            zhihu_question.click_num = int(raw_counts[1].replace(",", ""))
        else:
            zhihu_question.click_num = 0

        zhihu_question.topics = self["topics"]
        zhihu_question.meta.id = self["zhihu_id"][0]

        # Suggestions are derived from title and topics, so they must be
        # generated only after those fields have been assigned.
        zhihu_question.title_suggest = gen_suggests(
            ZhihuQuestion._doc_type.index,
            ((zhihu_question.title, 10), (zhihu_question.topics, 7)))

        zhihu_question.save()
        redis_cli.incr("zhihu_question_count")
        return
示例#2
0
    def get_insert_sql(self):
        """Build the upsert statement for zhihu_question and its parameters.

        Returns:
            A ``(sql, params)`` pair; ``params`` holds the ten values in the
            same order as the column list of the statement.
        """
        insert_sql = '''
            insert into zhihu_question(zhihu_id, topics, url, title, content, answer_num, comments_num, watch_user_num, 
            click_num, crawl_time) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) 
            on duplicate key UPDATE content=VALUES(content), answer_num=VALUES(answer_num), 
            comments_num=VALUES(comments_num), watch_user_num=VALUES(watch_user_num), click_num=VALUES(click_num),
            crawl_update_time=VALUES(crawl_time)
        '''

        def _number(text):
            # Remove thousands separators before extracting the number.
            return extract_num(text.replace(',', ''))

        # The item was filled through an ItemLoader, so every field is a
        # list; each one is unpacked by hand here.
        question_id = self['zhihu_id'][0]
        topic_text = ','.join(self['topics'])
        link = self['url'][0]
        heading = self['title'][0]
        body = self['content'][0]
        answers = _number(''.join(self['answer_num']))
        comments = _number(''.join(self['comments_num']))
        watchers = _number(self['watch_user_num'][0])
        clicks = _number(self['click_num'][1])
        crawled_at = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)

        return insert_sql, (question_id, topic_text, link, heading, body,
                            answers, comments, watchers, clicks, crawled_at)
示例#3
0
    def get_insert_sql(self):
        """Build the upsert statement for zhihu_question and its parameters.

        Returns:
            A ``(sql, params)`` pair with ``params`` ordered like the
            statement's column list.
        """
        insert_sql = """
            insert into zhihu_question(zhihu_id, topics, url, title, content, answer_num, comments_num, 
              watch_user_num, click_num, crawl_time
              ) 
            VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE content=VALUES(content),answer_num=VALUES(answer_num),
               comments_num=VALUES(comments_num),watch_user_num=VALUES(watch_user_num),click_num=VALUES(click_num)
        """

        # ''.join() cannot be used for zhihu_id because the list holds ints.
        question_id = self['zhihu_id'][0]
        topic_text = ','.join(self['topics'])
        link = ''.join(self['url'])
        heading = ''.join(self['title'])
        body = ''.join(self['content'])

        # answer_num may be absent or empty; fall back to zero in that case.
        if self.get('answer_num', ''):
            answers = extract_num(''.join(self['answer_num']))
        else:
            answers = 0
        comments = extract_num(''.join(self['comments_num']))

        # NOTE(review): watch_user_num is read from index 1 and click_num
        # from index 0 -- confirm this matches the order of scraped values.
        counts = self['watch_user_num']
        watchers = int(counts[1].replace(',', ''))
        clicks = int(counts[0].replace(',', ''))
        crawled_at = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)

        return insert_sql, (question_id, topic_text, link, heading, body,
                            answers, comments, watchers, clicks, crawled_at)
示例#4
0
    def get_insert_sql(self):
        """Build the upsert statement for the zhihu_question table.

        Returns:
            A ``(sql, params)`` pair with ``params`` ordered like the
            statement's column list.
        """
        insert_sql = 'insert into zhihu_question(zhihu_id, topics, url, title, content, answer_num, comments_num, watch_user_num, click_num, crawl_time) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE content=VALUES(content), answer_num=VALUES(answer_num), comments_num=VALUES(comments_num), watch_user_num=VALUES(watch_user_num), click_num=VALUES(click_num)'

        def _plain(text):
            # Drop thousands separators.
            return text.replace(',', '')

        question_id = self['zhihu_id'][0]
        topic_text = ','.join(self['topics'])
        link = self['url'][0]
        heading = self['title'][0]
        body = self['content'][0]
        answers = extract_num(_plain(''.join(self['answer_num'])))
        comments = extract_num(_plain(self['comments_num'][1]))

        # The first entry is always the watcher count; a click count is only
        # taken when exactly two entries are present.
        counts = self['watch_user_num']
        has_clicks = len(counts) == 2
        watchers = int(_plain(counts[0]))
        clicks = int(_plain(counts[1])) if has_clicks else 0

        crawled_at = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)

        return insert_sql, (question_id, topic_text, link, heading, body,
                            answers, comments, watchers, clicks, crawled_at)
示例#5
0
文件: items.py 项目: abcd567/spiders
    def get_insert_sql(self):
        """Build the upsert statement for zhihu_question and its parameters.

        Returns:
            A ``(sql, params)`` pair with ``params`` ordered like the
            statement's column list.
        """
        insert_sql = """
            insert into zhihu_question(zhihu_id, topics, url, title, content, 
                answer_num, comments_num, watch_user_num, click_num, crawl_time
                )
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE content=VALUES(content), answer_num=VALUES(answer_num),
            comments_num=VALUES(comments_num), watch_user_num=VALUES(watch_user_num),
            click_num=VALUES(click_num)
        """
        # Alternative way of handling ItemLoader output, whose raw values
        # are lists: unpack each field by hand.
        question_id = self['zhihu_id'][0]
        topic_text = ','.join(self['topics'])
        link = self['url'][0]
        heading = self['title'][0]
        body = self['content'][0]
        answers = extract_num(''.join(self['answer_num']))
        comments = extract_num(''.join(self['comments_num']))

        # Both counters come from the same field; commas are removed first.
        raw_watchers = self['watch_user_num'][0].replace(',', '')
        watchers = extract_num(raw_watchers)
        raw_clicks = self['watch_user_num'][1].replace(',', '')
        clicks = extract_num(raw_clicks)
        crawled_at = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)

        return insert_sql, (question_id, topic_text, link, heading, body,
                            answers, comments, watchers, clicks, crawled_at)
示例#6
0
    def get_insert_sql(self):
        """Build the upsert statement for zhihu_question and its parameters.

        Every item class exposes this same method so that the pipeline's
        do_insert() can call it uniformly (as JobboleArticleItem does).

        Returns:
            A ``(sql, params)`` pair; ``params`` has ten values, one per
            placeholder, in the order of the statement's column list.
        """
        # Ten columns, so exactly ten placeholders. The comment-count column
        # is spelled the same in the INSERT list and the UPDATE clause.
        # NOTE(review): the column is assumed to be named comments_num (as
        # the item field is) -- confirm against the table schema.
        insert_sql = """
            insert into zhihu_question(zhihu_id, topics, url, title, content, answer_num, comments_num,
                watch_user_num, click_num, crawl_time
            )VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE content = VALUES(content), answer_num = VALUES(answer_num), comments_num = VALUES(comments_num),
            watch_user_num = VALUES(watch_user_num), click_num = VALUES(click_num), crawl_time = VALUES(crawl_time)
        """
        zhihu_id = self["zhihu_id"][0]
        topics = ",".join(self["topics"])
        url = self["url"][0]
        title = "".join(self["title"])
        content = "".join(self["content"])
        # Convert answer_num to a number like the other counters instead of
        # passing the raw scraped text to the database.
        answer_num = extract_num("".join(self["answer_num"]))
        comments_num = extract_num("".join(self["comments_num"]))
        watch_user_num = extract_num("".join(self["watch_user_num"]))
        click_num = extract_num("".join(self["click_num"]))
        # strftime turns the datetime into a str.
        crawl_time = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)

        params = (zhihu_id, topics, url, title, content, answer_num,
                  comments_num, watch_user_num, click_num, crawl_time)
        return insert_sql, params
示例#7
0
    def save_to_es(self):
        """Copy this item's fields onto a QuestionType document and save it.

        After saving, the "zhihu_count" counter in Redis is incremented.
        Returns None.
        """
        doc = QuestionType()
        doc.title = "".join(self["title"])
        doc.topics = ",".join(self["topics"])

        doc.url = self["url"][0]
        doc.zhihu_id = self["zhihu_id"][0]
        doc.answer_num = extract_num("".join(self["answer_num"]))
        doc.comment_nums = extract_num("".join(self["comments_num"]))

        # The first entry is always the watcher count; the second, when
        # exactly two entries exist, is the click count.
        counts = self["watch_user_num"]
        has_clicks = len(counts) == 2
        doc.watch_user_num = counts[0]
        doc.click_num = counts[1] if has_clicks else 0
        doc.content = remove_tags("".join(self["content"]))

        # Suggestions are built from the already-populated title and topics.
        weighted_fields = ((doc.title, 10), (doc.topics, 7))
        doc.suggest = gen_suggests(QuestionType._doc_type.index,
                                   weighted_fields)

        doc.save()

        redis_cli.incr("zhihu_count")

        return
示例#8
0
    def get_insert_sql(self):
        """Build the upsert statement for zhihu_question and its parameters.

        Returns:
            A ``(sql, params)`` pair; ``params`` is ordered exactly like the
            statement's column list (note: title comes before url).
        """
        insert_sql = """
            insert into zhihu_question 
            (zhihu_id,topics,title,url,content,answer_num,comments_num,watch_user_num,click_num,crawl_time) 
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE content=VALUES(content), answer_num=VALUES(answer_num), comments_num=VALUES(comments_num),
              watch_user_num=VALUES(watch_user_num), click_num=VALUES(click_num)
        """
        zhihu_id = self["zhihu_id"][0]
        topics = ",".join(self["topics"])
        url = self["url"][0]
        title = "".join(self["title"])
        # Best-effort defaults for content and answer_num. Catch Exception
        # rather than BaseException so KeyboardInterrupt / SystemExit are
        # not swallowed.
        try:
            content = "".join(self["content"])
        except Exception:
            content = "无"
        try:
            answer_num = extract_num("".join(self["answer_num"]))
        except Exception:
            answer_num = 0
        comments_num = extract_num("".join(self["comments_num"]))

        # The first entry is the watcher count; a click count is only taken
        # when exactly two entries are present.
        watch_user_num = extract_num(self["watch_user_num"][0])
        if len(self["watch_user_num"]) == 2:
            click_num = extract_num(self["watch_user_num"][1])
        else:
            click_num = 0

        crawl_time = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)

        # The parameter order must follow the column list above, which puts
        # title before url; otherwise the two values land in swapped columns.
        params = (zhihu_id, topics, title, url, content, answer_num,
                  comments_num, watch_user_num, click_num, crawl_time)

        return insert_sql, params
示例#9
0
    def get_insert_sql(self):
        """Build the upsert statement for the zhihu_question table.

        Returns:
            A ``(sql, params)`` pair with ``params`` ordered like the
            statement's column list.
        """
        insert_sql = """
            insert into zhihu_question(zhihu_id, topics, url, title, content, answer_num, comments_num,
              watch_user_num, click_num, crawl_time
              )
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE content=VALUES(content), answer_num=VALUES(answer_num), comments_num=VALUES(comments_num),
              watch_user_num=VALUES(watch_user_num), click_num=VALUES(click_num)
        """
        zhihu_id = self["zhihu_id"][0]
        topics = ",".join(self["topics"])
        url = self["url"][0]
        title = "".join(self["title"])
        content = "".join(self["content"])
        answer_num = extract_num("".join(self["answer_num"]))
        comments_num = extract_num("".join(self["comments_num"]))

        # Strip thousands separators on local copies: the item is left
        # untouched, and the single-value case no longer fails in int() on
        # text such as "1,234".
        raw_counts = self["watch_user_num"]
        watch_user_num = int(raw_counts[0].replace(",", ""))
        if len(raw_counts) == 2:
            click_num = int(raw_counts[1].replace(",", ""))
        else:
            click_num = 0

        crawl_time = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)

        params = (zhihu_id, topics, url, title, content, answer_num,
                  comments_num, watch_user_num, click_num, crawl_time)

        return insert_sql, params
示例#10
0
    def get_insert_sql(self):
        """Build the upsert statement for zhihu_question and its parameters.

        Returns:
            A ``(sql, params)`` pair with ``params`` ordered like the
            statement's column list.
        """
        insert_sql = """
                            insert into zhihu_question(zhihu_id, topics, url, title, content, answer_num, comments_num, 
                            watch_user_num, click_num, crawl_time)
                            VALUES (%s, %s, %s, %s, %s, %s, %s, %s,%s, %s)
                            ON DUPLICATE KEY 
                            UPDATE content=VALUES(content),answer_num=VALUES(answer_num), 
                            comments_num=VALUES (comments_num),watch_user_num=VALUES (watch_user_num), 
                            click_num=VALUES (click_num)  
                            """
        zhihu_id = self["zhihu_id"][0]
        topics = ",".join(self["topics"])
        url = self["url"][0]
        title = "".join(self["title"])
        content = "".join(self["content"])
        answer_num = extract_num("".join(self["answer_num"]))
        comments_num = extract_num("".join(self["comments_num"]))
        watch_user_num = extract_num("".join(self["watch_user_num"]))
        click_num = extract_num("".join(self["click_num"]))
        # An empty format string makes strftime return "", which would store
        # a blank crawl_time; format the timestamp as "YYYY-MM-DD HH:MM:SS".
        crawl_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        params = (zhihu_id, topics, url, title, content, answer_num,
                  comments_num, watch_user_num, click_num, crawl_time)

        return insert_sql, params
示例#11
0
    def save_to_es(self):
        """Store this item in Elasticsearch as a ZhiHuQuestionType document.

        After saving, the "question_count" counter in Redis is incremented.
        Returns None.
        """
        question = ZhiHuQuestionType()
        question.zhihu_id = self['zhihu_id'][0]
        question.topics = ",".join(self["topics"])
        question.url = self["url"][0]
        question.title = "".join(self["title"])
        question.content = "".join(self["content"])
        # A missing or empty answer_num falls back to zero. Only lookup
        # failures are caught, so unrelated errors are not hidden.
        try:
            question.answer_num = self["answer_num"][0]
        except (KeyError, IndexError, TypeError):
            question.answer_num = 0
        question.comment_num = extract_num("".join(self["comment_num"]))

        # The first entry is the watcher count; a click count is only taken
        # when exactly two entries are present.
        question.watch_user_num = self["watch_user_num"][0]
        if len(self["watch_user_num"]) == 2:
            question.click_num = self["watch_user_num"][1]
        else:
            question.click_num = 0
        crawl_time = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)
        question.crawl_time = crawl_time

        # Suggestions are built from the already-populated text fields.
        question.suggest = gen_suggests(es_question,
                                        ZhiHuQuestionType._doc_type.index,
                                        ((question.title, 10),
                                         (question.topics, 7),
                                         (question.content, 3)))

        question.save()
        redis_cli.incr("question_count")
        return
示例#12
0
 def get_insert_sql(self):
     """Build the upsert statement for zhihu_question and its parameters.

     Returns:
         A ``(sql, params)`` pair; ``params`` is ordered like the
         statement's column list (note: content comes before title).
     """
     # The UPDATE clause needs "content = VALUES(content)"; without the
     # "=" the statement is not valid SQL.
     insert_sql = '''
                             insert into zhihu_question(zhihu_id,topics,url,content,title,answer_num,comments_num,watch_user_num
                             ,click_num,crawl_time) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE content = VALUES
                             (content),comments_num = VALUES (comments_num), answer_num=VALUES (answer_num), watch_user_num
                             =VALUES (watch_user_num),click_num = VALUES (click_num),crawl_update_time = VALUES (crawl_time)
     '''
     zhihu_id = int(''.join(self['zhihu_id']))
     topics = ','.join(self['topics'])
     url = ''.join(self['url'])
     # Empty content is stored as a placeholder string.
     content = ''.join(self['content']) if self['content'] else 'nothing'
     title = ''.join(self['title'])
     answer_num = extract_num(''.join(self['answer_num']))
     comments_num = extract_num(''.join(self['comments_num']))
     # Both counters come from the same field, with commas removed.
     watch_user_num = self['watch_user_num'][0].replace(',', '')
     click_num = self['watch_user_num'][1].replace(',', '')
     crawl_time = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)
     params = (zhihu_id, topics, url, content, title, answer_num,
               comments_num, watch_user_num, click_num, crawl_time)
     return insert_sql, params
示例#13
0
    def get_insert_sql(self):
        """Build the insert statement for the zhihu_question table.

        Returns:
            A ``(sql, params)`` pair with ``params`` ordered like the
            statement's column list.
        """
        insert_sql = """
                        insert into zhihu_question(zhihu_id,topics,url,title,content,answer_num,comments_num,watch_user_num,click_num,crawl_time)
                        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                        """

        def _count(field):
            # Join the field's pieces and extract the number from the text.
            return extract_num("".join(self[field]))

        question_id = self["zhihu_id"][0]
        topic_text = ",".join(self["topics"])
        link = self["url"][0]
        heading = self["title"][0]
        body = self["content"][0]
        answers = _count("answer_num")
        comments = _count("comments_num")
        watchers = _count("watch_user_num")
        clicks = _count("click_num")
        crawled_at = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)

        return insert_sql, (question_id, topic_text, link, heading, body,
                            answers, comments, watchers, clicks, crawled_at)
示例#14
0
    def get_insert_sql(self):
        """Build the insert statement for zhihu_question and its parameters.

        Returns:
            A ``(sql, params)`` pair; ``params`` holds nine values ordered
            like the statement's column list.
        """
        insert_sql = """
            insert into zhihu_question
            (question_id, topics, url, title, content, answers_num, comments_num, watch_users_num, clicked_num)
            values 
            (%s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        qid = int("".join(self['question_id']))
        topic_text = ",".join(self['topics'])
        link = self['url'][0]
        heading = self['title'][0]
        body = self['content'][0]
        answers = extract_num("".join(self['answers_num']))
        comments = extract_num("".join(self['comments_num']))

        # One field carries both counters: watchers first, clicks second.
        watchers = int(self['watch_users_num'][0])
        clicks = int(self['watch_users_num'][1])

        return insert_sql, (qid, topic_text, link, heading, body, answers,
                            comments, watchers, clicks)
示例#15
0
 def get_insert_sql(self):
     """Build the upsert statement for the zhihu_question table.

     Returns:
         A ``(sql, params)`` pair with ``params`` ordered like the
         statement's column list.
     """
     # The keyword must be spelled "DUPLICATE"; the misspelling made the
     # statement invalid SQL.
     insert_sql = '''
         insert into zhihu_question(zhihu_id, topics, url, title, content, answer_num, comments_num, 
             watch_user_num, click_num, crawl_time            
         )VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
         ON DUPLICATE KEY UPDATE content=VALUES(content), answer_num=VALUES(answer_num),
         comments_num=VALUES(comments_num), watch_user_num=VALUES(watch_user_num),
         click_num=VALUES(click_num)
     '''
     zhihu_id = self['zhihu_id'][0]
     topics = ",".join(self['topics'])
     # Same as "".join(self['url']) when the list holds a single element.
     url = self['url'][0]
     title = "".join(self['title'])
     content = "".join(self['content'])
     answer_num = extract_num("".join(self['answer_num']))
     comments_num = extract_num("".join(self['comments_num']))
     watch_user_num = extract_num("".join(self['watch_user_num']))
     click_num = extract_num("".join(self['click_num']))
     crawl_time = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)
     params = (zhihu_id, topics, url, title, content, answer_num,
               comments_num, watch_user_num, click_num, crawl_time)
     return insert_sql, params
    def get_insert_sql(self):
        """Build the upsert statement for the lagou_job table.

        Returns:
            A ``(sql, params)`` pair; ``params`` holds the thirteen item
            fields followed by the job id extracted from the URL.
        """
        insert_sql = """
            insert into lagou_job(title, url, salary, job_city, work_years, degree_need,
            job_type, publish_time, job_advantage, job_desc, job_addr, company_url, company_name, job_id)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE job_desc=VALUES(job_desc)
        """

        # The job id is derived from the URL by extract_num.
        job_id = extract_num(self["url"])

        # Field names listed in the same order as the statement's columns.
        field_order = ("title", "url", "salary", "job_city", "work_years",
                       "degree_need", "job_type", "publish_time",
                       "job_advantage", "job_desc", "job_addr",
                       "company_url", "company_name")
        params = tuple(self[name] for name in field_order) + (job_id,)

        return insert_sql, params
示例#17
0
    def get_insert_sql(self):
        """Build the upsert statement for zhihu_question and its parameters.

        Returns:
            A ``(sql, params)`` pair with ``params`` ordered like the
            statement's column list.
        """
        insert_sql = """
        insert into zhihu_question(zhihu_id,topics,url,title,content,answer_num,
        comments_num,watch_user_num,click_num,crawl_time) 
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON DUPLICATE KEY UPDATE content=VALUES(content),
        comments_num=VALUES(comments_num), answer_num=VALUES(answer_num), click_num=VALUES(click_num)
        """
        # Items produced by an ItemLoader hold lists, so each field is
        # converted to a scalar here.
        question_id = self['zhihu_id'][0]
        topic_text = ','.join(self['topics'])
        link = self['url'][0]
        heading = ''.join(self['title'])
        body = '\n'.join(self['content'])
        answers = extract_num(''.join(self['answer_num']))
        comments = extract_num(''.join(self['comments_num']))
        watchers = extract_num(self['watch_user_num'][0])

        # A click count is only present when the field holds exactly two
        # entries; otherwise the text '0' is parsed instead.
        if len(self['watch_user_num']) == 2:
            raw_clicks = self['watch_user_num'][1]
        else:
            raw_clicks = '0'
        clicks = extract_num(raw_clicks)
        crawled_at = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)

        return insert_sql, (question_id, topic_text, link, heading, body,
                            answers, comments, watchers, clicks, crawled_at)
示例#18
0
    def get_insert_sql(self):
        """Build the upsert statement for the zhihu_question table.

        Returns:
            A ``(sql, params)`` pair with ``params`` ordered like the
            statement's column list.
        """
        # The UPDATE list must not end with a comma; the dangling one made
        # the statement invalid SQL.
        insert_sql = '''
                        insert into zhihu_question(zhihu_id,topics,url,title,content,answer_num,comment_num,
                        watch_user_num,click_num,crawl_time) 
                        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                        ON duplicate key UPDATE content = VALUES (content),
                        answer_num = VALUES(answer_num),comment_num = VALUES (comment_num)
                        ,watch_user_num = VALUES (watch_user_num),click_num = VALUES (click_num)
                    '''

        zhihu_id = self['zhihu_id'][0]
        topics = ','.join(self['topics'])
        url = self['url'][0]
        title = ''.join(self['title'])
        content = ''.join(self['content'])
        answer_num = extract_num(''.join(self['answer_num']))
        comment_num = extract_num(''.join(self['comment_num']))
        watch_user_num = extract_num(''.join(self['watch_user_num']))
        click_num = extract_num(''.join(self['click_num']))
        # strftime formats the timestamp as text. strptime parses text and
        # needs two arguments, so calling it with one raised TypeError.
        crawl_time = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)

        params = (zhihu_id, topics, url, title, content, answer_num,
                  comment_num, watch_user_num, click_num, crawl_time)
        return insert_sql, params
示例#19
0
    def get_insert_sql(self):
        """Build the upsert statement for zhihu_question and its parameters.

        Returns:
            A ``(sql, params)`` pair with ``params`` ordered like the
            statement's column list.
        """
        insert_sql = """
            insert into zhihu_question(zhihu_id, topics, url, title, content, answer_num, comments_num, watch_user_num, click_num, crawl_time)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE content=VALUES(content), answer_num=VALUES(answer_num), comments_num=VALUES(comments_num), watch_user_num=VALUES(watch_user_num), click_num=VALUES(click_num)
        """
        question_id = self['zhihu_id'][0]
        topic_text = ",".join(self['topics'])
        link = self['url'][0]
        heading = self['title'][0]
        body = self['content'][0]
        answers = extract_num(self['answer_num'][0])

        # One field carries both counters: watchers first, clicks second.
        watchers = int(self['watch_user_num'][0])
        clicks = int(self['watch_user_num'][1])
        # The comment count is passed through as scraped, without parsing.
        comments = self['comments_num'][0]

        crawled_at = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)

        return insert_sql, (question_id, topic_text, link, heading, body,
                            answers, comments, watchers, clicks, crawled_at)