Exemplo n.º 1
0
 def query_status(self, id):
     """Return the ``status`` column of the row with the given ``id``.

     The row is looked up in the table named by ``self.rank_store.table``
     using a fresh ``StoreMysql`` connection that is always closed before
     returning.

     :param id: row id interpolated into the ``where id = ...`` clause.
     :return: the first column of the first result row, or ``None`` when the
         query fails or no row matches (the traceback is printed).
     """
     # NOTE(review): both values are formatted straight into the SQL text;
     # this is only safe while ``id`` comes from trusted, numeric input.
     query_status_sql = 'select `status` from {} where id = {}'.format(
         self.rank_store.table, id)
     db = StoreMysql(**config.baidu_spider_move)
     try:
         result = db.query(query_status_sql)
         # An empty result raises IndexError here and is reported below.
         return result[0][0]
     except Exception:
         # Parenthesised single-argument print works on Python 2 and 3.
         print("query_status error")
         traceback.print_exc()
         return None
     finally:
         # Release the connection on the failure path too.
         db.close()
Exemplo n.º 2
0
    def start_requests(self):
        """Poll the ``keywords`` table forever and enqueue Sogou search tasks.

        Each round, when ``self.is_get_tasks()`` is true, up to ``self.step``
        keywords with ``status = 1`` are read (oldest ``update_time`` first).
        For every keyword, result pages 1..10 are sent through
        ``self.send_get_spider`` and the keyword's ``update_time`` is written
        back through ``self.stores[0].store_table``. When no keyword is
        returned the loop sleeps ten extra minutes. Every round ends with a
        one-minute sleep.

        :return: ``None``. Any ``Exception`` ends the loop; its traceback is
            printed instead of being raised.
        """
        try:
            while 1:
                if self.is_get_tasks():
                    update_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    sql = "select id, keyword from keywords where status = 1 order by update_time asc, priority desc  limit 0, {}".format(
                        self.step)
                    db = StoreMysql(**config.local_content)
                    try:
                        rows = db.query(sql)
                    finally:
                        # The connection is only needed for this one query;
                        # close it before sleeping or sending, and on error.
                        db.close()
                    self.log_record.info(
                        "datetime:{},task_results length:{}".format(
                            datetime.now(), len(rows)))
                    if rows:
                        ids = []
                        for task_id, keyword in (
                                (row[0], row[1]) for row in rows):
                            ids.append({
                                "id": task_id,
                                "update_time": update_time,
                            })
                            # NOTE(review): the keyword is placed in the URL
                            # without escaping -- confirm the sender encodes it.
                            for page in range(1, 11):
                                send_url = "http://weixin.sogou.com/weixin?query={}&_sug_type_=&s_from=input&_sug_=n&type=2&page={}&ie=utf8".format(
                                    keyword, page)
                                self.send_get_spider([{
                                    "url": send_url,
                                    "type": 1,
                                    "ext_type": 1,
                                    'keyword': keyword,
                                    'task_id': task_id,
                                    'unique_key': self.get_unique_key(),
                                }])

                        self.stores[0].store_table(ids,
                                                   "keywords",
                                                   type=2,
                                                   field="id")
                    else:
                        # Nothing to do: back off for ten minutes.
                        time.sleep(60 * 10)
                time.sleep(60 * 1)
        except Exception:
            # Parenthesised single-argument print works on Python 2 and 3.
            print(traceback.format_exc())
Exemplo n.º 3
0
 def save(self, device, keyword_id, rank):
     """Insert or refresh today's ``rank_daily`` row for one keyword.

     A row ``(platform, out_id, rank, date)`` is inserted for the current
     date. On a duplicate key, ``updated_at`` is refreshed and ``rank`` is
     overwritten only when the new value lies between 1 and 10.

     :param device: value stored in the ``platform`` column.
     :param keyword_id: value stored in the ``out_id`` column.
     :param rank: value stored in the ``rank`` column.
     :return: ``None``. Failures are printed, not raised.
     """
     # Start unbound-safe: if the connection itself cannot be created there
     # is nothing to close, and cleanup must not raise a second error.
     db = None
     try:
         db = StoreMysql(**config.baidu_spider_move)
         # NOTE(review): the values are formatted straight into the SQL
         # text; this is only safe for trusted input.
         sql = """insert into rank_daily(platform, out_id, rank, date, created_at, updated_at) values('{}', {}, {}, date(now()), now(), now())
 on duplicate key update updated_at = now(), rank = case when values(rank) between 1 and 10 then values(rank) else rank end""".format(
             device, keyword_id, rank)
         db.do(sql)
     except Exception:
         # Parenthesised single-argument print works on Python 2 and 3.
         print(traceback.format_exc())
     finally:
         # Close exactly once, whether or not the statement succeeded.
         if db is not None:
             db.close()
Exemplo n.º 4
0
 def exec_sql(self, sql):
     """Run ``sql`` on a fresh connection and return the query result.

     :param sql: complete SQL text passed unchanged to ``db.query``.
     :return: whatever ``db.query`` returns, or ``None`` when the query
         raises (the traceback is printed so the failure can be diagnosed).
     """
     db = StoreMysql(**config.baidu_spider_move)
     try:
         return db.query(sql)
     except Exception:
         # Still best-effort (callers get None), but no longer silent.
         import traceback
         print(traceback.format_exc())
         return None
     finally:
         # Single cleanup point for both the success and failure paths.
         db.close()
Exemplo n.º 5
0
    def start_requests(self):
        """Poll ``content_center.keywords`` forever and enqueue search tasks.

        A round only fetches keywords while the sended, sending, response and
        store queues are all below ``self.sended_queue_maxsize``. Up to
        ``self.step`` keywords are read (oldest ``update_time`` first). One
        Sogou search URL per keyword is sent through ``self.send_get_spider``,
        and the keywords' ``update_time`` is written back through a
        ``SourceStore``. Every round ends with a two-minute sleep.

        :return: ``None``. Any ``Exception`` ends the loop; its traceback is
            printed instead of being raised.
        """
        try:
            while 1:
                queues = (self.sended_queue, self.sending_queue,
                          self.response_queue, self.store_queue)
                if all(q.qsize() < self.sended_queue_maxsize for q in queues):
                    update_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    sql = "select id, keyword from content_center.keywords order by update_time asc,  priority desc  limit 0, {};".format(
                        self.step)
                    db = StoreMysql(**config.weixin_content)
                    try:
                        rows = db.query(sql)
                    finally:
                        # Only this query needs the connection; close it even
                        # when the query raises.
                        db.close()
                    self.log.info("datetime:{},task_results length:{}".format(
                        datetime.now(), len(rows)))
                    if rows:
                        ids = []
                        for task_id, keyword in (
                                (row[0], row[1]) for row in rows):
                            ids.append({
                                "id": task_id,
                                "update_time": update_time,
                            })
                            # NOTE(review): the keyword is placed in the URL
                            # without escaping -- confirm the sender encodes it.
                            send_url = 'http://weixin.sogou.com/weixin?type=2&s_from=input&query={}&ie=utf8&_sug_=y&_sug_type_='.format(
                                keyword)
                            self.send_get_spider([{
                                "url": send_url,
                                "type": 1,
                                "ext_type": 3,
                                'keyword': keyword,
                                'task_id': task_id,
                                'unique_key': self.get_unique_key(),
                            }])

                        # Build the store only when there is something to
                        # write, rather than on every poll.
                        source = SourceStore(config.weixin_content)
                        source.store_table(ids, "keywords", type=2, field="id")
                time.sleep(60 * 2)
        except Exception:
            # Parenthesised single-argument print works on Python 2 and 3.
            print(traceback.format_exc())
Exemplo n.º 6
0
    def to_store_results(self, results, stores):
        """Persist one parsed result, dispatching on ``results["type"]``.

        type 1 -- list page: every entry is de-duplicated through an
        ``insert ignore`` of the MD5 of its title plus ``we_name`` into
        ``self.md5_table``. New entries have their detail URL sent through
        ``self.send_get_spider``; the others increment ``self.repeat_count``.

        type 2 -- detail page: content rejected by ``self.judge_china`` is
        only counted. Otherwise the article is stored in
        ``news_<self.table_index>``. When the returned id is a multiple of
        ``self.table_count`` the current table is marked finished and the
        next one is created and registered in ``spider_table``.

        Any other type is ignored.

        :param results: dict with the keys ``"result"`` and ``"type"``.
        :param stores: sequence of store objects; only ``stores[0]`` is used.
        :return: ``None``. Errors are printed, not raised.
        """
        try:
            result = results["result"]
            result_type = results["type"]
            if result_type == 1:
                for info in result:
                    log_md5 = UtilMD5.md5(info["title"] + info["we_name"])
                    sql = "insert ignore into {}(md5) values('{}')".format(
                        self.md5_table, str(log_md5))
                    s_id = stores[0].insert_row(sql)
                    if s_id > 0:
                        # Not seen before: request the detail page.
                        self.send_get_spider([{
                            "url": info['url'],
                            "type": 1,
                            "ext_type": 2,
                            'info': info,
                            'unique_key': self.get_unique_key(),
                        }])
                    else:
                        self.repeat_count += 1

                # Log the duplicate counter in batches, then reset it.
                if self.repeat_count > 1000:
                    self.log_record.info("repeat_count:{}".format(
                        self.repeat_count))
                    self.repeat_count = 0

            elif result_type == 2:
                data = result
                if not self.judge_china(data["content"]):
                    # No Chinese text: count it and skip storing.
                    self.no_china_count += 1
                    if self.no_china_count > 1000:
                        self.log_record.info("no_china_count:{}".format(
                            self.no_china_count))
                        self.no_china_count = 0
                    return

                weixin_content = {
                    "summary": data.get("summary", ""),
                    "content": data.get("content", ""),
                    "keyword": data.get("keyword", ""),
                    "title": data.get("title", ""),
                    "wechat_name": data.get("wechat_name", ""),
                    "wechat_num": data.get("wechat_num", ""),
                }
                s_id = stores[0].store_table_one(
                    weixin_content, "news_{}".format(self.table_index))
                if s_id > 0:
                    if s_id % self.table_count == 0:
                        # Current table is full: roll over to the next one.
                        db = StoreMysql(**config.weixin_content)
                        try:
                            update_sql = "update spider_table set status = 1 where table_name = 'news_{}'".format(
                                self.table_index)
                            db.do(update_sql)

                            self.table_index += 1
                            db.do(self.create_table_sql.format(
                                self.table_index))

                            insert_sql = "insert into spider_table(table_name) values('news_{}')".format(
                                self.table_index)
                            db.do(insert_sql)

                            time.sleep(1)
                        finally:
                            # Do not leak the connection if a statement fails.
                            db.close()
                else:
                    time.sleep(0.1)
                time.sleep(2)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt/SystemExit can still stop the worker.
            print(traceback.format_exc())
Exemplo n.º 7
0
    def to_store_results(self, results, stores):
        """Persist one parsed result, dispatching on ``results["type"]``.

        type 1 -- list page: every entry is de-duplicated through an
        ``insert ignore`` of the MD5 of its title into ``self.md5_table``.
        New entries have their detail URL sent through
        ``self.send_get_spider``; the others increment ``self.repeat_count``.

        type 2 -- detail page: content rejected by ``self.judge_china`` is
        only counted. Otherwise the article is stored in
        ``news_<self.table_index>``. When the returned id is a multiple of
        ``self.table_count`` the current table is marked finished and the
        next one is created and registered in ``spider_table``.

        Any other type is ignored.

        :param results: wrapped dict read from the store, e.g.
            ``{'result': {'title': '...', 'abstract': '...'}, 'type': 2}``.
        :param stores: sequence of store objects; only ``stores[0]`` is used.
        :return: ``None``. Errors are printed, not raised.
        """
        try:
            result = results["result"]  # the actual payload
            result_type = results["type"]  # 1: list page, 2: detail page
            if result_type == 1:
                for info in result:
                    log_md5 = UtilMD5.md5(info["title"])
                    sql = "insert ignore into {}(md5) values('{}')".format(
                        self.md5_table, str(log_md5))
                    # Presumably the affected-row count; > 0 is treated as
                    # "newly inserted" below -- TODO confirm.
                    s_id = stores[0].insert_row(sql)
                    if s_id > 0:
                        # Wrap the URL found on the list page and request it.
                        self.send_get_spider([{
                            "url": info['url'],
                            "type": 1,
                            "ext_type": 2,
                            'info': info,
                            'unique_key': self.get_unique_key(),
                        }])
                    else:
                        self.repeat_count += 1

                # Log the duplicate counter in batches, then reset it.
                if self.repeat_count > 1000:
                    self.log_record.info("repeat_count:{}".format(
                        self.repeat_count))
                    self.repeat_count = 0

            elif result_type == 2:
                data = result
                if not self.judge_china(data["content"]):
                    # No Chinese text: count it and skip storing.
                    self.no_china_count += 1
                    if self.no_china_count > 1000:
                        self.log_record.info("no_china_count:{}".format(
                            self.no_china_count))
                        self.no_china_count = 0
                    return

                toutiao_content = {
                    "category": data.get("category", ""),
                    "content": data.get("content", ""),
                    "publish_time": data.get("publish_time", ""),
                    "title": data.get("title", ""),
                    "abstract": data.get("abstract", ""),
                    "tags": data.get("tags", ""),
                    'url': data.get('url', ''),
                    'keyword': data.get('keyword', ''),
                }
                s_id = stores[0].store_table_one(
                    toutiao_content, "news_{}".format(self.table_index))
                if s_id > 0:
                    if s_id % self.table_count == 0:
                        # Current table is full: roll over to the next one.
                        db = StoreMysql(**config.toutiao_content)
                        try:
                            update_sql = "update spider_table set status = 1 where table_name = 'news_{}'".format(
                                self.table_index)
                            db.do(update_sql)

                            self.table_index += 1
                            db.do(self.create_table_sql.format(
                                self.table_index))

                            insert_sql = "insert into spider_table(table_name) values('news_{}')".format(
                                self.table_index)
                            db.do(insert_sql)

                            time.sleep(1)
                        finally:
                            # Do not leak the connection if a statement fails.
                            db.close()
                else:
                    time.sleep(0.1)
                time.sleep(4)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt/SystemExit can still stop the worker.
            print(traceback.format_exc())