def update_train_rank(user_id):
    """Recompute and persist the train_rank for one user.

    For every account the user owns, a per-OJ rank in [0, 1000] is computed
    as (solved / best_solved_on_that_OJ) * 1000; train_rank is the average
    of those per-account ranks.
    """
    logger.info('[Account] update_train_rank <User #{}>'.format(user_id))
    user_info = session.query(UserInfo).filter_by(user_id=user_id).first()
    if user_info is None:
        logger.warn(
            "[Account] update_train_rank => UserInfo of User #{} doesn't exist"
            .format(user_id))
        return
    accounts = session.query(Account).filter_by(user_id=user_id)
    ranks = []
    for account in accounts:
        solved, submitted = account.solved, account.submitted
        oj_name = account.oj_name
        # best account on the same OJ, ties broken by submissions
        top_account = session.query(Account).filter_by(oj_name=oj_name)\
            .order_by(Account.solved.desc(), Account.submitted.desc())\
            .first()
        # max() guarantees the denominator is at least our own solved count
        max_solved = max(top_account.solved, solved)
        if max_solved == 0:
            this_rank = 1000
        else:
            this_rank = (solved / max_solved) * 1000
        ranks.append(this_rank)
        logger.debug("[Account] update_train_rank <User #{}> {} => {}".format(
            user_id, oj_name, this_rank))
    if not ranks:
        # user has no accounts yet: nothing to average, and dividing by
        # len(ranks) == 0 would raise ZeroDivisionError
        logger.info(
            '[Account] update_train_rank skipped <User #{}>: no accounts'
            .format(user_id))
        return
    end_rank = sum(ranks) / len(ranks)
    user_info.train_rank = end_rank
    user_info.save()
    logger.info("[Account] update_train_rank success <User #{}> => {}".format(
        user_id, end_rank))
def push_submit_to_queue(submit_id):
    """Serialize a submit reference and LPUSH it onto the achieve queue."""
    logger.info('[redis] push submit #{} to queue'.format(submit_id))
    payload = {'type': 'submit', 'id': submit_id}
    redis.lpush(RedisKey.achieve_mq, json.dumps(payload))
def set_general(self, solved, submitted):
    """Persist new solved/submitted counters, then refresh the train rank."""
    self.solved = solved
    self.submitted = submitted
    self.save()
    # recompute train_rank on the thread pool so the caller is not blocked
    yield ThreadPool.submit(update_train_rank, self.user_id)
    message = '{} 更新 solved: {} / submitted: {}'
    logger.info(message.format(self, solved, submitted))
def init_http_client():
    """Switch tornado's AsyncHTTPClient implementation to the curl backend."""
    backend = "tornado.curl_httpclient.CurlAsyncHTTPClient"
    try:
        httpclient.AsyncHTTPClient.configure(backend)
        logger.info('[ACM-Spider] 配置 CurlAsyncHTTPClient 成功')
    except Exception as ex:
        # pycurl may be missing; log and keep the default client
        logger.error('[ACM-Spider] 配置 CurlAsyncHTTPClient 失败: {}'.format(ex))
async def account_producer():
    """Producer coroutine: keep feeding runnable accounts into AccountQueue."""
    logger.info('[AccountProducer] 开始获取可用账号放入队列 ...')
    while True:
        candidate = account.get_available_account()
        # nothing runnable (or spider switched off): back off and retry
        if not candidate or not is_spider_open(candidate.oj_name):
            await gen.sleep(10)
            continue
        await AccountQueue.put(candidate)
        logger.info('{0} ===> 账号入队列 AccountQueue(size={1})'.format(
            candidate, AccountQueue.qsize()))
async def fetch_cookie(self):
    """Obtain a session cookie from the index page.

    Returns True when a cookie is already cached or was freshly fetched,
    False when the index page failed to load or set no cookie.
    """
    if self.cookie:
        return True
    response = await self.load_page(self.index_url)
    if not response:
        return False
    set_cookie = response.headers.get('Set-Cookie')
    if not set_cookie:
        # no Set-Cookie header: report failure instead of raising KeyError
        return False
    # keep only the first "name=value" pair of the header
    self.cookie = set_cookie.split(';')[0] + ';'
    logger.info('{} fetch cookie success {}'.format(self.TAG, self.account))
    return True
async def fetch_cookie(self):
    """Fetch the session cookie (plus a username cookie) from the index page.

    Returns True when a cookie is already cached or was freshly fetched,
    False when the index page failed to load or set no cookie.
    """
    if self.cookie:
        return True
    response = await self.load_page(self.index_url)
    if not response:
        return False
    set_cookie = response.headers.get('Set-Cookie')
    if not set_cookie:
        # no Set-Cookie header: report failure instead of raising KeyError
        return False
    # first "name=value" pair of the header, plus the account's username
    self.cookie = set_cookie.split(';')[0] + '; username={};'.format(
        self.account.nickname)
    logger.info('{} fetch cookie success'.format(self.TAG))
    return True
def spider_init():
    """Pre-populate every OJ queue in SpiderFactory with spider instances."""
    logger.info('[SpiderInit] 生成 SpiderRunner 缓存 ...')
    for oj, oj_queue in SpiderFactory.items():
        spider_name = settings.SUPPORT_OJ[oj] + 'Spider'
        # resolve the spider class from its already-imported module
        module = sys.modules['app.spiders.' + spider_name]
        spider_class = getattr(module, spider_name)
        while oj_queue.qsize() < oj_queue.maxsize:
            oj_queue.put_nowait(spider_class())
        logger.info('[{0}] 缓存池初始化 OK => size {1}'.format(
            spider_name, oj_queue.qsize()))
async def account_producer():
    """Endless producer that enqueues crawlable accounts onto AccountQueue."""
    logger.info('[AccountProducer] 开始获取可用账号放入队列 ...')
    while True:
        nxt = account.get_available_account()
        if nxt and is_spider_open(nxt.oj_name):
            await AccountQueue.put(nxt)
            queue_size = AccountQueue.qsize()
            logger.info('{0} ===> 账号入队列 AccountQueue(size={1})'.format(
                nxt, queue_size))
        else:
            # no work right now: poll again in ten seconds
            await gen.sleep(10)
def spider_init():
    """Fill each OJ queue in SpiderFactory up to its maxsize with spiders."""
    logger.info('[SpiderInit] 生成 SpiderRunner 缓存 ...')
    for oj, oj_queue in SpiderFactory.items():
        spider_name = settings.SUPPORT_OJ[oj] + 'Spider'
        spider_class = getattr(
            sys.modules['app.spiders.' + spider_name], spider_name)
        # top the queue up to capacity
        missing = oj_queue.maxsize - oj_queue.qsize()
        for _ in range(missing):
            oj_queue.put_nowait(spider_class())
        logger.info('[{0}] 缓存池初始化 OK => size {1}'.format(
            spider_name, oj_queue.qsize()))
async def fetch_cookie(self):
    """Obtain a session cookie from the index page.

    Returns True when a cookie is already cached or was freshly fetched,
    False when the index page failed to load or set no (or an empty) cookie.
    """
    if self.cookie:
        return True
    response = await self.load_page(self.index_url)
    if not response:
        return False
    # .get instead of []: the original emptiness check below could never
    # fire for a missing header because [] raises KeyError first
    raw_cookie = response.headers.get('Set-Cookie')
    if raw_cookie:
        # keep only the first "name=value" pair of the header
        self.cookie = raw_cookie.split(';')[0] + ';'
        logger.info('{} {} fetch cookie success'.format(self.TAG, self.account))
        return True
    else:
        self.cookie = ''
        return False
def update_train_rank(user_id):
    """Recompute a user's train_rank as the sum of per-category ranks.

    Two scoring rules, each producing ranks scaled to [0, 1000]:
      * normal OJs (bnu/hdu/poj/vj): one rank from the user's total solved
        count versus the site-wide best total.
      * rating OJs (cf/bc): one rank per account from its rating (stored in
        the ``solved`` column) versus the best value on that OJ.
    """
    user = session.query(User).filter_by(id=user_id).first()
    logger.info('[Account] update_train_rank #{}'.format(user))
    if user is None:
        logger.warn("[Account] update_train_rank => UserInfo of #{} doesn't exist".format(user))
        return
    ranks = []
    # normal_oj => sum(solved) / sum(top_solved)
    normal_oj = ['bnu', 'hdu', 'poj', 'vj']
    accounts = session.query(Account).filter_by(user_id=user_id)\
        .filter(Account.oj_name.in_(normal_oj))
    solved_sum = sum(account.solved for account in accounts)
    # user with the largest combined solved count across the normal OJs
    top_account = session.query(func.sum(Account.solved), Account.user_id)\
        .filter(Account.oj_name.in_(normal_oj))\
        .group_by(Account.user_id)\
        .order_by(func.sum(Account.solved).desc())\
        .first()
    if top_account:
        top_solved_sum = int(top_account[0])
        this_rank = (solved_sum / top_solved_sum) * 1000 if top_solved_sum > 0 else 1000
        ranks.append(this_rank)
    else:
        ranks.append(1000)
    # rating_oj => rating / top_rating for every account
    rating_oj = ['cf', 'bc']
    accounts = session.query(Account).filter_by(user_id=user_id) \
        .filter(Account.oj_name.in_(rating_oj))
    for account in accounts:
        rating = account.solved  # rating is stored in the solved column
        oj_name = account.oj_name
        top_account = session.query(Account).filter_by(oj_name=oj_name)\
            .order_by(Account.solved.desc(), Account.submitted.desc())\
            .first()
        if not top_account:
            this_rank = 1000
        else:
            top_rating = max(top_account.solved, rating)
            # guard: both ratings can be zero, avoid ZeroDivisionError
            this_rank = (rating / top_rating) * 1000 if top_rating > 0 else 1000
        ranks.append(this_rank)
    # replaced a leftover debug print(ranks) with proper logging
    logger.debug('[Account] update_train_rank #{} ranks: {}'.format(user, ranks))
    user.train_rank = sum(ranks)
    user.save()
    logger.info("[Account] update_train_rank success #{} => sum({}) => {}".format(user, ranks, sum(ranks)))
async def login(self):
    """Log into the judge; idempotent, returns True on success."""
    if self.has_login:
        return True
    form = {
        'username': self.account.nickname,
        'userpass': self.account.password,
        'login': '******'
    }
    response = await self.fetch(self.login_url, method=HttpMethod.POST,
                                headers={'Cookie': self.cookie},
                                body=parse.urlencode(form))
    page = response.body.decode('gb2312')
    # success == 2xx/302 response whose page shows a "Sign Out" link
    if response.code not in (200, 302) or 'Sign Out' not in page:
        return False
    logger.info('{} {} login success'.format(self.TAG, self.account))
    self.has_login = True
    return True
async def login(self):
    """Log into bestcoder.hdu.edu.cn with the cached cookie; True on success."""
    if self.has_login:
        return True
    form = {
        'username': self.account.nickname,
        'password': self.account.password,
        'remember': 'on'
    }
    headers = dict(Host='bestcoder.hdu.edu.cn', Cookie=self.cookie)
    response = await self.fetch(self.login_url, method=HttpMethod.POST,
                                headers=headers, body=parse.urlencode(form))
    page = response.body.decode('gb2312')
    # success == 2xx/302 response whose page shows a "Logout" link
    if response.code not in (200, 302) or 'Logout' not in page:
        return False
    self.has_login = True
    logger.info('{} login success {}'.format(self.TAG, self.account))
    return True
async def login(self):
    """Log into the judge using the cached index cookie; True on success."""
    if self.has_login:
        return True
    form = {
        'user_id1': self.account.nickname,
        'password1': self.account.password,
        'B1': 'login',
        'url': '/'
    }
    response = await self.fetch(self.login_url, method=HttpMethod.POST,
                                body=parse.urlencode(form),
                                headers=dict(Cookie=self.cookie))
    page = response.body.decode()
    # success == 2xx/302 response whose page shows a "Log Out" link
    if response.code not in (200, 302) or 'Log Out' not in page:
        return False
    self.has_login = True
    logger.info('{} login success {}'.format(self.TAG, self.account))
    return True
async def login(self):
    """Log into vjudge.net and cache the fresh session cookie on success."""
    if self.has_login:
        return True
    form = {
        'username': self.account.nickname,
        'password': self.account.password
    }
    headers = dict(Host='vjudge.net', Origin=self.domain,
                   Referer='http://vjudge.net/index')
    response = await self.fetch(self.login_url, method=HttpMethod.POST,
                                body=parse.urlencode(form),
                                headers=headers, validate_cert=False)
    body = response.body.decode()
    # the endpoint answers with the literal string 'success'
    if response.code not in (200, 302) or body != 'success':
        return False
    self.cookie = response.headers['Set-Cookie']
    self.has_login = True
    logger.info('{} login success {}'.format(self.TAG, self.account))
    return True
async def spider_runner(idx):
    """Worker coroutine: take an account, run its OJ spider, recycle the spider."""
    logger.info('[SpiderRunner #{0}] 开始运行 ...'.format(idx))
    while True:
        cur_account = await AccountQueue.get()
        logger.info('[SpiderRunner #{0}] {1} <=== account_queue(size={2})'
                    .format(idx, cur_account, AccountQueue.qsize()))
        # borrow a spider instance for this account's OJ
        pool = SpiderFactory[cur_account.oj_name]
        spider = await pool.get()
        spider.account = cur_account
        try:
            cur_account.set_status(account.AccountStatus.UPDATING)
            cur_account.save()
            await spider.run()
            cur_account.set_status(account.AccountStatus.NORMAL)
        except LoginException as ex:
            logger.error(ex)
            cur_account.set_status(account.AccountStatus.ACCOUNT_ERROR)
            # back off after a failure before taking more work
            await gen.sleep(60 * 2)
        except Exception as ex:
            logger.error(ex)
            logger.error(traceback.format_exc())
            cur_account.set_status(account.AccountStatus.UPDATE_ERROR)
            await gen.sleep(60 * 2)
        finally:
            # persist whatever status was set above
            cur_account.save()
        logger.info('[SpiderRunner #{0}] {1} work done'.format(idx, cur_account))
        pool.task_done()
        AccountQueue.task_done()
        await pool.put(spider)
async def spider_runner(idx):
    """Spider worker loop: consume AccountQueue and run the matching spider."""
    logger.info('[SpiderRunner #{0}] 开始运行 ...'.format(idx))
    while True:
        cur_account = await AccountQueue.get()
        logger.info(
            '[SpiderRunner #{0}] {1} <=== account_queue(size={2})'.format(
                idx, cur_account, AccountQueue.qsize()))
        # borrow a spider instance for this account's OJ
        oj_pool = SpiderFactory[cur_account.oj_name]
        worker = await oj_pool.get()
        worker.account = cur_account
        try:
            cur_account.set_status(account.AccountStatus.UPDATING)
            cur_account.save()
            await worker.run()
            cur_account.set_status(account.AccountStatus.NORMAL)
        except LoginException as ex:
            logger.error(ex)
            cur_account.set_status(account.AccountStatus.ACCOUNT_ERROR)
        except Exception as ex:
            logger.error(ex)
            logger.error(traceback.format_exc())
            cur_account.set_status(account.AccountStatus.UPDATE_ERROR)
        finally:
            # persist whatever status was set above
            cur_account.save()
        logger.info('[SpiderRunner #{0}] {1} work done'.format(
            idx, cur_account))
        oj_pool.task_done()
        AccountQueue.task_done()
        await oj_pool.put(worker)
async def data_pool_consumer():
    """Consumer coroutine: drain DataPool and persist crawled submit data."""
    logger.info('[DataPoolConsumer] 数据消费协程开启 ... ')
    while True:
        # idle-poll until there is data
        while DataPool.empty():
            await gen.sleep(10)
        item = await DataPool.get()
        kind = item['type']
        if kind == DataType.Submit:
            # new submit record
            if submit.create_submit(item):
                logger.info('[DataPoolConsumer] 存入新提交 for <{} {} {}>'.format(
                    item['account'].oj_name,
                    item['run_id'],
                    item['account'].nickname))
        elif kind == DataType.Code:
            # attach source code to an existing submit
            if submit.update_code(item):
                logger.info('[DataPoolConsumer] 更新代码 for <{} {} {}>'.format(
                    item['account'].oj_name,
                    item['run_id'],
                    item['account'].nickname))
            else:
                # update failed (e.g. submit row not there yet): requeue
                await DataPool.put(item)
        DataPool.task_done()
async def data_pool_consumer():
    """Consume crawled data from DataPool and write it to storage."""
    logger.info('[DataPoolConsumer] 数据消费协程开启 ... ')
    while True:
        while DataPool.empty():
            # nothing queued: sleep and re-check
            await gen.sleep(10)
        new_data = await DataPool.get()
        acc = new_data['account']
        if new_data['type'] == DataType.Submit:
            # new submit record
            if submit.create_submit(new_data):
                logger.info('[DataPoolConsumer] 存入新提交 for <{} {} {}>'.format(
                    acc.oj_name, new_data['run_id'], acc.nickname))
        elif new_data['type'] == DataType.Code:
            # attach source code to an existing submit
            if submit.update_code(new_data):
                logger.info('[DataPoolConsumer] 更新代码 for <{} {} {}>'.format(
                    acc.oj_name, new_data['run_id'], acc.nickname))
            else:
                # update failed (e.g. submit row not there yet): requeue
                await DataPool.put(new_data)
        DataPool.task_done()
def init_all():
    """Reset every account that is not NOT_INIT/STOP back to NORMAL status."""
    logger.info("[AccountInit] 所有非 [NOT_INIT, STOP] 账号已经重置为 NORMAL")
    excluded = [AccountStatus.NOT_INIT, AccountStatus.STOP]
    query = session.query(Account).filter(~Account.status.in_(excluded))
    # bulk UPDATE; synchronize_session=False skips in-session reconciliation
    query.update({Account.status: AccountStatus.NORMAL},
                 synchronize_session=False)
    session.commit()
def set_general(self, solved, submitted):
    """Save updated counters and schedule a train-rank recalculation."""
    self.solved, self.submitted = solved, submitted
    self.save()
    # rank recomputation runs on the thread pool
    yield ThreadPool.submit(update_train_rank, self.user_id)
    logger.info(
        '{} 更新 solved: {} / submitted: {}'.format(self, solved, submitted))
def log_spider_status():
    """Log which OJ spiders are currently switched on."""
    open_spiders = get_all_open_spider()
    logger.info('[OPEN Spider] {0}'.format(open_spiders))
def setup_redis():
    """Ensure the spider on/off switch hash exists in redis, then log it."""
    if not redis.exists(RedisKey.switch):
        # every supported OJ starts enabled (1)
        # NOTE(review): hmset is deprecated in redis-py >= 3.5 — consider
        # hset(..., mapping=...) when the client version allows it
        initial = {oj: 1 for oj in SUPPORT_OJ}
        if redis.hmset(RedisKey.switch, initial):
            logger.info('[redis] setup switch key success')
    log_spider_status()
def set_general(self, solved, submitted):
    """Update the in-memory solved/submitted counters and log the change."""
    self.solved, self.submitted = solved, submitted
    logger.info(
        '{} 更新 solved: {} / submitted: {}'.format(self, solved, submitted))
from tornado import ioloop
from app import make_spider_app
from app.api import make_api_app
from app.helpers.logger import setup_logger, logger
from app.helpers.redis_utils import setup_redis
from config import settings


# Entry point: wire up logging, the spider app, the HTTP API, and redis,
# then hand control to the tornado IOLoop.
if __name__ == '__main__':
    setup_logger(settings.log_level, settings.log_dir)
    logger.info('--------------------------------------')
    logger.info('--------------------------------------')
    logger.info('[ACM-Spider] 程序启动,初始化中 .........')
    # register SpiderApp and ApiApp on the IOLoop
    io_loop = ioloop.IOLoop().current()
    make_spider_app(io_loop)
    api_app = make_api_app()
    # configure redis (creates the spider switch hash if missing)
    setup_redis()
    # start serving: API listens on the configured port, then the loop runs
    api_app.listen(settings.app_port)
    io_loop.start()
def log_spider_status():
    """Write the currently enabled spiders to the log."""
    status = get_all_open_spider()
    logger.info('[OPEN Spider] {0}'.format(status))
def push_submit_to_queue(submit_id):
    """Enqueue a submit id onto the achievement message queue in redis."""
    logger.info('[redis] push submit #{} to queue'.format(submit_id))
    message = json.dumps({'type': 'submit', 'id': submit_id})
    redis.lpush(RedisKey.achieve_mq, message)
def init_http_client():
    """Configure tornado to use the curl-based AsyncHTTPClient backend."""
    curl_client = "tornado.curl_httpclient.CurlAsyncHTTPClient"
    try:
        httpclient.AsyncHTTPClient.configure(curl_client)
        logger.info('[ACM-Spider] 配置 CurlAsyncHTTPClient 成功')
    except Exception as ex:
        # configuration can fail (e.g. pycurl unavailable); log and continue
        logger.error(
            '[ACM-Spider] 配置 CurlAsyncHTTPClient 失败: {}'.format(ex))