Exemplo n.º 1
0
 def save(self, device, keyword_id, rank):
     """Insert or refresh today's ``rank_daily`` row for one keyword.

     A new row is inserted with ``platform=device``, ``out_id=keyword_id``
     and the given ``rank`` for the current date.  When the insert hits a
     duplicate key, ``updated_at`` is refreshed and the stored rank is
     replaced only if the new rank is between 1 and 10.

     Failures are printed and swallowed (best-effort write).

     :param device: value stored in the ``platform`` column.
     :param keyword_id: value stored in the ``out_id`` column.
     :param rank: rank value to store.
     :return: None
     """
     # Bind the name up front so cleanup is safe even when connecting fails.
     db = None
     try:
         db = StoreMysql(**config.baidu_spider_move)
         # NOTE(review): values are interpolated straight into the SQL text;
         # switch to bound parameters if StoreMysql.do supports them.
         sql = """insert into rank_daily(platform, out_id, rank, date, created_at, updated_at) values('{}', {}, {}, date(now()), now(), now())
 on duplicate key update updated_at = now(), rank = case when values(rank) between 1 and 10 then values(rank) else rank end""".format(
             device, keyword_id, rank)
         db.do(sql)
     except Exception:
         # print(...) with one argument behaves the same on Python 2 and 3.
         print(traceback.format_exc())
     finally:
         # Close exactly once, and only if the connection was actually opened.
         if db is not None:
             db.close()
Exemplo n.º 2
0
    def to_store_results(self, results, stores):
        """Persist one batch of crawl results.

        ``results["type"]`` selects the handling:

        * ``1`` - list page: for every entry the MD5 of ``title`` +
          ``we_name`` is inserted into ``self.md5_table`` with
          ``insert ignore``.  When ``insert_row`` returns a positive value
          the entry's URL is sent on via ``send_get_spider``; otherwise
          ``repeat_count`` is incremented.
        * ``2`` - detail page: entries rejected by ``judge_china`` are only
          counted; the rest are written to ``news_<table_index>``.  When the
          value returned by ``store_table_one`` is a multiple of
          ``table_count`` the current table is marked ``status = 1`` in
          ``spider_table``, ``table_index`` is incremented and the next
          table is created and registered.

        Any other type is ignored.  Errors are printed, never raised.

        :param results: dict with the keys ``"result"`` and ``"type"``.
        :param stores: sequence of stores; only ``stores[0]`` is used.
        :return: None
        """
        try:
            result = results["result"]
            result_type = results["type"]
            if result_type == 1:
                for info in result:
                    log_md5 = UtilMD5.md5(info["title"] + info["we_name"])
                    sql = "insert ignore into {}(md5) values('{}')".format(
                        self.md5_table, str(log_md5))
                    s_id = stores[0].insert_row(sql)
                    if s_id > 0:
                        urls = [{
                            "url": info['url'],
                            "type": 1,
                            "ext_type": 2,
                            'info': info,
                            'unique_key': self.get_unique_key()
                        }]
                        self.send_get_spider(urls)
                    else:
                        self.repeat_count += 1

                # Report duplicates in bulk, then restart the counter.
                if self.repeat_count > 1000:
                    self.log_record.info("repeat_count:{}".format(
                        self.repeat_count))
                    self.repeat_count = 0

            elif result_type == 2:
                data = result
                if not self.judge_china(data["content"]):
                    # No Chinese text: count the entry and skip storing it.
                    self.no_china_count += 1
                    if self.no_china_count > 1000:
                        self.log_record.info("no_china_count:{}".format(
                            self.no_china_count))
                        self.no_china_count = 0
                    return

                weixin_content = {
                    "summary": data.get("summary", ""),
                    "content": data.get("content", ""),
                    "keyword": data.get("keyword", ""),
                    "title": data.get("title", ""),
                    "wechat_name": data.get("wechat_name", ""),
                    "wechat_num": data.get("wechat_num", "")
                }
                s_id = stores[0].store_table_one(
                    weixin_content, "news_{}".format(self.table_index))
                if s_id > 0:
                    if s_id % self.table_count == 0:
                        db = StoreMysql(**config.weixin_content)
                        try:
                            update_sql = "update spider_table set status = 1 where table_name = 'news_{}'".format(
                                self.table_index)
                            db.do(update_sql)

                            self.table_index += 1
                            db.do(self.create_table_sql.format(self.table_index))

                            insert_sql = "insert into spider_table(table_name) values('news_{}')".format(
                                self.table_index)
                            db.do(insert_sql)

                            time.sleep(1)
                        finally:
                            # Release the connection even if a statement fails.
                            db.close()
                else:
                    time.sleep(0.1)
                time.sleep(2)
        except Exception:
            # Best-effort: report the failure; Ctrl-C is no longer swallowed.
            print(traceback.format_exc())
Exemplo n.º 3
0
    def to_store_results(self, results, stores):
        """Persist one batch of crawl results.

        ``results["type"]`` selects the handling:

        * ``1`` - list page: for every entry the MD5 of ``title`` is
          inserted into ``self.md5_table`` with ``insert ignore``.  When
          ``insert_row`` returns a positive value the entry's URL is sent on
          via ``send_get_spider``; otherwise ``repeat_count`` is incremented.
        * ``2`` - detail page: entries rejected by ``judge_china`` are only
          counted; the rest are written to ``news_<table_index>``.  When the
          value returned by ``store_table_one`` is a multiple of
          ``table_count`` the current table is marked ``status = 1`` in
          ``spider_table``, ``table_index`` is incremented and the next
          table is created and registered.

        Any other type is ignored.  Errors are printed, never raised.

        :param results: wrapped dict read from the store, e.g.
            ``{'result': {'title': '...', 'abstract': '...'}, 'type': 2}``.
        :param stores: sequence of stores; only ``stores[0]`` is used.
        :return: None
        """
        try:
            result = results["result"]  # the actual payload
            result_type = results["type"]  # 1: list page, 2: detail page
            if result_type == 1:
                for info in result:
                    log_md5 = UtilMD5.md5(info["title"])
                    sql = "insert ignore into {}(md5) values('{}')".format(
                        self.md5_table, str(log_md5))
                    # Presumably a row count (affected rows) - TODO confirm.
                    s_id = stores[0].insert_row(sql)
                    if s_id > 0:
                        urls = [{
                            "url": info['url'],
                            "type": 1,
                            "ext_type": 2,
                            'info': info,
                            'unique_key': self.get_unique_key()
                        }]
                        # Wrap the list-page URL and send it off to be fetched.
                        self.send_get_spider(urls)
                    else:
                        self.repeat_count += 1

                # Report duplicates in bulk, then restart the counter.
                if self.repeat_count > 1000:
                    self.log_record.info("repeat_count:{}".format(
                        self.repeat_count))
                    self.repeat_count = 0

            elif result_type == 2:
                data = result
                if not self.judge_china(data["content"]):
                    # No Chinese text: count the entry and skip storing it.
                    self.no_china_count += 1
                    if self.no_china_count > 1000:
                        self.log_record.info("no_china_count:{}".format(
                            self.no_china_count))
                        self.no_china_count = 0
                    return

                toutiao_content = {
                    "category": data.get("category", ""),
                    "content": data.get("content", ""),
                    "publish_time": data.get("publish_time", ""),
                    "title": data.get("title", ""),
                    "abstract": data.get("abstract", ""),
                    "tags": data.get("tags", ""),
                    'url': data.get('url', ''),
                    'keyword': data.get('keyword', '')
                }
                s_id = stores[0].store_table_one(
                    toutiao_content, "news_{}".format(self.table_index))
                if s_id > 0:
                    if s_id % self.table_count == 0:
                        db = StoreMysql(**config.toutiao_content)
                        try:
                            update_sql = "update spider_table set status = 1 where table_name = 'news_{}'".format(
                                self.table_index)
                            db.do(update_sql)

                            self.table_index += 1
                            db.do(self.create_table_sql.format(self.table_index))

                            insert_sql = "insert into spider_table(table_name) values('news_{}')".format(
                                self.table_index)
                            db.do(insert_sql)

                            time.sleep(1)
                        finally:
                            # Release the connection even if a statement fails.
                            db.close()
                else:
                    time.sleep(0.1)
                time.sleep(4)
        except Exception:
            # Best-effort: report the failure; Ctrl-C is no longer swallowed.
            print(traceback.format_exc())