class SdutInitSpider(CrawlSpider):
    """Crawl every problem on SDUT OJ and yield one ProblemItem per problem."""
    name = 'sdut_init_spider'
    allowed_domains = ['acm.sdut.edu.cn']
    start_urls = ['http://acm.sdut.edu.cn/sdutoj/problem.php']
    # Raw strings: '\?' is not a valid string-escape and only works by accident.
    rules = [
        Rule(link(allow=(r'problem.php\?page=[0-9]+'), unique=True)),
        Rule(link(allow=(r'problem.php\?action\S*[0-9]+')),
             callback='problem_item'),
    ]

    def problem_item(self, response):
        """Parse a single problem page into a ProblemItem.

        Assumes the problem id is the last 4 characters of the URL and that
        the page carries three '.pro_desc' sections and two sample <pre>
        blocks — an IndexError here means the site layout changed.
        """
        sel = Selector(response)
        item = ProblemItem()
        item['origin_oj'] = 'sdut'
        item['problem_id'] = response.url[-4:]
        item['problem_url'] = response.url
        item['title'] = sel.xpath('//center/h2/text()').extract()[0]
        # Hoist: the same selector was evaluated three times.
        descs = sel.css('.pro_desc').extract()
        item['description'] = descs[0]
        item['input'] = descs[1]
        item['output'] = descs[2]
        # Slices strip the "Time Limit: " / "Memory Limit: " labels.
        item['time_limit'] = sel.xpath('//a/h5/text()').re(
            r'T[\S*\s]*s')[0][12:]
        item['memory_limit'] = sel.xpath('//a/h5/text()').re(
            r'M[\S*\s]*K')[0][14:]
        samples = sel.xpath('//div[@class="data"]/pre').extract()
        item['sample_input'] = samples[0]
        item['sample_output'] = samples[1]
        item['update_time'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        return item
class FzuInitSpider(CrawlSpider):
    """Crawl the FZU OJ problem volumes and yield one ProblemItem per problem."""
    name = 'fzu_init'
    allowed_domains = ['acm.fzu.edu.cn']
    start_urls = ['http://acm.fzu.edu.cn/list.php']
    rules = [
        Rule(link(allow=(r'list.php\?vol=[0-9]+'), unique=True)),
        Rule(link(allow=(r'problem.php\?pid=[0-9]+')), callback='problem_item'),
    ]

    def problem_item(self, response):
        """Parse one FZU problem page into a ProblemItem.

        Some problems have no separate input/output sections, hence the
        IndexError fallbacks (kept as [] for parity with the original).
        """
        # Normalize comparison operators so the math in problem statements
        # survives later rendering.
        html = (response.body
                .replace(' <= ', ' ≤ ')
                .replace(' < ', ' < ')
                .replace(' > ', ' > ')
                .replace(' >= ', ' ≥ ')
                .replace(' << ', ' << ')
                .replace(' >> ', ' >> '))
        sel = Selector(text=html)
        item = ProblemItem()
        item['origin_oj'] = 'fzu'
        item['problem_id'] = response.url[-4:]
        item['problem_url'] = response.url
        # [14:] strips the "Problem XXXX " prefix from the title.
        item['title'] = sel.xpath(
            '//div[contains(@class, "problem_title")]/b/text()'
        ).extract()[0][14:].rstrip()
        # Hoist: the same selector was evaluated three times.
        descs = sel.css('.pro_desc').extract()
        # [22:-6] strips the wrapping div; inner data divs become <pre>.
        item['description'] = (descs[0][22:-6]
                               .replace('<div class="data">', '<pre>')
                               .replace('</div>', '</pre>'))
        try:
            item['input'] = descs[1]
        except IndexError:
            item['input'] = []
        try:
            item['output'] = descs[2]
        except IndexError:
            item['output'] = []
        # Slices strip the "Time Limit: " / "Memory Limit: " labels.
        item['time_limit'] = sel.css('.problem_desc').re(
            r'T[\S*\s]*c')[0][12:]
        item['memory_limit'] = sel.css('.problem_desc').re(
            r'M[\S*\s]*B')[0][15:]
        item['accept'] = sel.css('.problem_desc').re(
            r'Accept:*\s[0-9]+')[0][8:]
        item['submit'] = sel.css('.problem_desc').re(
            r'Submit:*\s[0-9]+')[0][8:]
        # The last two .data divs are sample input / sample output.
        data_divs = sel.css('.data').extract()
        item['sample_input'] = (data_divs[-2]
                                .replace('<div class="data">', '<pre>')
                                .replace('</div>', '</pre>'))
        item['sample_output'] = (data_divs[-1]
                                 .replace('<div class="data">', '<pre>')
                                 .replace('</div>', '</pre>'))
        item['update_time'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        return item
class PojInitSpider(CrawlSpider):
    """Crawl the POJ problem list and yield one ProblemItem per problem."""
    name = 'poj_init'
    allowed_domains = ['poj.org']
    start_urls = ['http://poj.org/problemlist']
    download_delay = 5
    rules = [
        Rule(link(allow=(r'problemlist\?volume=[0-9]+'), unique=True)),
        Rule(link(allow=(r'problem\?id=[0-9]+')), callback='problem_item'),
    ]

    def problem_item(self, response):
        """Parse one POJ problem page into a ProblemItem."""
        # Normalize comparison operators so the math in problem statements
        # survives later rendering.
        html = (response.body
                .replace('<=', ' ≤ ')
                .replace(' < ', ' < ')
                .replace(' > ', ' > ')
                .replace('>=', ' ≥ '))
        sel = Selector(text=html)
        item = ProblemItem()
        # NOTE(review): every other spider fills 'origin_oj'; 'oj' here may
        # be a field-name bug — verify against ProblemItem / pipelines.
        item['oj'] = 'poj'
        item['problem_id'] = response.url[-4:]
        item['problem_url'] = response.url
        item['title'] = sel.css('.ptt').xpath('./text()').extract()[0]
        # Hoist: the same selector was evaluated three times.
        ptx = sel.css('.ptx').extract()
        item['description'] = ptx[0]
        item['input'] = ptx[1]
        item['output'] = ptx[2]
        # Special-judge problems carry a "Case Time Limit"; fall back to the
        # plain "Time Limit" label when that pattern is absent.
        try:
            item['time_limit'] = sel.css('.plm').re(
                r'Case\sT[\S*\s]*MS')[0][21:]
        except IndexError:
            item['time_limit'] = sel.css('.plm').re(r'T[\S*\s]*MS')[0][16:]
        item['memory_limit'] = sel.css('.plm').re(r'Me[\S*\s]*K')[0]
        sio = sel.css('.sio').extract()
        item['sample_input'] = sio[0]
        item['sample_output'] = sio[1]
        item['update_time'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        return item
class PojSolvedSpider(CrawlSpider):
    """Walk one POJ user's accepted-submission pages and emit an AccountItem.

    Instantiated with ``username``; follows the status pagination until an
    empty page is reached, then yields the accumulated solved map and stops.
    """
    name = 'poj_solved_spider'
    allowed_domains = ['poj.org']
    status_url = 'http://poj.org/status'
    download_delay = 10
    rules = [
        Rule(link(allow=(RULE_REGEX), restrict_xpaths=RESTRICT_XPATHS,
                  unique=True),
             follow=True, callback='parse_item')
    ]
    solved = {}

    def __init__(self, username, *args, **kwargs):
        super(PojSolvedSpider, self).__init__(*args, **kwargs)
        self.origin_oj = 'poj'
        self.username = username
        # Per-instance dict: the class-level `solved` would otherwise be
        # shared (and accumulate) across spider instances.
        self.solved = {}
        # result=0 restricts the listing to Accepted submissions.
        self.start_urls = [
            '{0}?{1}'.format(self.status_url,
                             urlencode(dict(user_id=username, result=0)))
        ]

    def parse_start_url(self, response):
        # The first status page is not matched by the rules; parse it too.
        return self.parse_item(response)

    def parse_item(self, response):
        """Collect problem_id -> first-seen submit date for this user."""
        sel = Selector(response)
        items = sel.xpath(TABLE_TR_XPATH)
        for item in items:
            nickname = item.xpath(NICKNAME_XPATH).extract()[0]
            problem_id = item.xpath(PROBLEM_ID_XPATH).extract()[0].strip()
            # Keep only the date part of "YYYY-MM-DD HH:MM:SS".
            submit_time = item.xpath(SUBMIT_TIME_XPATH).extract()[0].split(
                ' ')[0]
            if nickname == self.username:
                self.solved[problem_id] = submit_time
        if not items:
            # Empty page => pagination exhausted; emit the summary once.
            yield AccountItem(**dict(origin_oj=self.origin_oj,
                                     username=self.username,
                                     solved=self.solved))
            raise CloseSpider('Crawl finished')
        return
class HduInitSpider(CrawlSpider):
    """Crawl the HDU problem volumes and yield one ProblemItem per problem."""
    name = 'hdu_init_spider'
    allowed_domains = ['acm.hdu.edu.cn']
    problem_base_url = 'http://acm.hdu.edu.cn/showproblem.php?pid='
    start_urls = [
        'http://acm.hdu.edu.cn/listproblem.php'
    ]
    rules = [
        Rule(
            link(
                allow=(r'listproblem.php\?vol=[0-9]+'),
                unique=True,
            ),
            callback='problem_list'
        )
    ]

    def problem_list(self, response):
        """Extract problem ids from the JS problem table and request each page.

        The list page embeds problems as script tuples "(type,id,title,...)";
        the id is the second comma-separated field.
        """
        sel = Selector(response)
        problems = sel.xpath('//script')[4].re(r'\(.+?\)')
        for problem in problems:
            problem_id = problem.split(',')[1]
            yield Request(
                self.problem_base_url + problem_id,
                callback=self.problem_item)

    def problem_item(self, response):
        """Parse one HDU problem page into a ProblemItem."""
        sel = Selector(response)
        item = ProblemItem()
        item['origin_oj'] = 'hdu'
        item['problem_id'] = response.url[-4:]
        item['problem_url'] = response.url
        item['title'] = sel.xpath('//h1/text()').extract()[0]
        # Hoist: the same selector was evaluated three times.
        panels = sel.css('.panel_content').extract()
        item['description'] = panels[0]
        item['input'] = panels[1]
        item['output'] = panels[2]
        # Slices strip the "Time Limit: " / "Memory Limit: " labels.
        item['time_limit'] = \
            sel.xpath('//b/span/text()').re(r'T[\S*\s]*S')[0][12:]
        item['memory_limit'] = \
            sel.xpath('//b/span/text()').re(r'Me[\S*\s]*K')[0][14:]
        samples = sel.xpath('//pre').extract()
        item['sample_input'] = samples[0]
        item['sample_output'] = samples[1]
        item['update_time'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        return item
class HduSolvedSpider(CrawlSpider):
    """Walk one HDU user's accepted-submission pages and emit an AccountItem.

    status=5 restricts the status listing to Accepted runs; crawling stops
    when no further pagination link matches RULE_REGEX.
    """
    name = 'hdu_solved_spider'
    allowed_domains = ['acm.hdu.edu.cn']
    status_url = 'http://acm.hdu.edu.cn/status.php'
    rules = [Rule(link(allow=(RULE_REGEX),
                       restrict_xpaths=RESTRICT_XPATHS,
                       unique=True),
                  follow=True, callback='parse_item')]
    solved = {}

    def __init__(self, username, *args, **kwargs):
        super(HduSolvedSpider, self).__init__(*args, **kwargs)
        self.origin_oj = 'hdu'
        self.username = username
        # Per-instance dict: the class-level `solved` would otherwise be
        # shared (and accumulate) across spider instances.
        self.solved = {}
        self.start_urls = [
            '{0}?{1}'.format(
                self.status_url,
                urlencode(dict(user=username, status=5)))]

    def parse_start_url(self, response):
        # The first status page is not matched by the rules; parse it too.
        return self.parse_item(response)

    def parse_item(self, response):
        """Collect problem_id -> submit date; finish when pagination ends."""
        sel = Selector(response)
        items = sel.xpath(TABLE_TR_XPATH)
        for item in items:
            problem_id = item.xpath(
                PROBLEM_ID_XPATH).extract()[0].strip()
            # Keep only the date part of "YYYY-MM-DD HH:MM:SS".
            submit_time = item.xpath(
                SUBMIT_TIME_XPATH).extract()[0].split(' ')[0]
            self.solved[problem_id] = submit_time
        if not sel.re(RULE_REGEX):
            # No "next page" link left: emit the summary once and stop.
            yield AccountItem(**dict(
                origin_oj=self.origin_oj,
                username=self.username,
                solved=self.solved
            ))
            raise CloseSpider('Crawl finished')
        return
class PojSubmitSpider(CrawlSpider):
    """Log in to POJ, submit one solution, then scrape its judge result."""
    name = 'poj_submit'
    allowed_domains = ['poj.org']
    login_url = 'http://poj.org/login'
    submit_url = 'http://poj.org/submit'
    login_verify_url = 'http://poj.org/loginlog'
    # Base64 of a trivial A+B program; used when no source is supplied.
    source = \
        'I2luY2x1ZGUgPHN0ZGlvLmg+CgppbnQgbWFpbigpCnsKICAgIGludCBhLGI7CiAgICBzY2FuZigiJWQgJWQiLCZhLCAmYik7CiAgICBwcmludGYoIiVkXG4iLGErYik7CiAgICByZXR1cm4gMDsKfQ=='
    start_urls = ["http://poj.org/status"]
    download_delay = 0.5
    rules = [
        Rule(link(allow=(r'/status\?top=[0-9]+'),
                  deny=(r'status\?bottom=[0-9]+')),
             follow=True, callback='parse_start_url')
    ]
    is_login = False

    def __init__(self, solution_id='None', problem_id='1000', language='g++',
                 source=None, username='******', password='******',
                 *args, **kwargs):
        super(PojSubmitSpider, self).__init__(*args, **kwargs)
        self.solution_id = solution_id
        self.username = username
        self.password = password
        self.problem_id = problem_id
        self.language = language
        if source is not None:
            self.source = source

    def start_requests(self):
        """Begin with the login POST instead of fetching start_urls."""
        return [
            FormRequest(
                self.login_url,
                formdata={
                    'user_id1': self.username,
                    'password1': self.password,
                    'B1': 'login',
                },
                callback=self.after_login,
            )
        ]

    def after_login(self, response):
        # loginlog is only reachable when authenticated — use it to verify.
        return [Request(self.login_verify_url, callback=self.login_verify)]

    def login_verify(self, response):
        """Submit the solution if login succeeded, else go scrape an error."""
        if response.url == self.login_verify_url:
            self.is_login = True
            # Server Date header is GMT; +8h converts to the site's local time.
            self.login_time = time.mktime(time.strptime(
                response.headers['Date'],
                '%a, %d %b %Y %H:%M:%S %Z')) + (8 * 60 * 60)
            # NOTE(review): time.sleep blocks the reactor; kept for parity.
            time.sleep(1)
            return [
                FormRequest(self.submit_url,
                            formdata={
                                'problem_id': self.problem_id,
                                'language': LANGUAGE.get(self.language, '0'),
                                'source': self.source,
                                'submit': 'Submit',
                                'encoded': '1'
                            },
                            callback=self.after_submit,
                            dont_filter=True)
            ]
        else:
            return Request(self.start_urls[0], callback=self.parse_start_url)

    def after_submit(self, response):
        # Give the judge a moment before polling the status page.
        time.sleep(3)
        for url in self.start_urls:
            yield self.make_requests_from_url(url)

    def parse_start_url(self, response):
        """Find this user's newest run in the status table and build the item."""
        sel = Selector(response)
        item = SolutionItem()
        # NOTE(review): the other submit spiders fill 'origin_oj'; confirm
        # the SolutionItem field name before unifying.
        item['oj'] = 'poj'
        item['problem_id'] = self.problem_id
        item['language'] = self.language
        item['solution_id'] = self.solution_id
        if self.is_login:
            for tr in sel.xpath('//table')[-1].xpath('.//tr')[1:]:
                user = tr.xpath('.//td/a/text()').extract()[0]
                # Hoist: the plain-text cells are read three times.
                tds = tr.xpath('.//td/text()').extract()
                _submit_time = tds[-1]
                if user == self.username:
                    item['submit_time'] = _submit_time
                    item['run_id'] = tds[0]
                    try:
                        item['memory'] = \
                            tr.xpath('.//td')[4].xpath('./text()').extract()[0]
                        item['time'] = \
                            tr.xpath('.//td')[5].xpath('./text()').extract()[0]
                    except IndexError:
                        # Cells are empty while the run is still judging.
                        pass
                    item['code_length'] = tds[-2]
                    item['result'] = tr.xpath('.//td').xpath(
                        './/font/text()').extract()[0]
                    # Found the row: stop following status pagination.
                    self._rules = []
                    return item
        else:
            item['result'] = 'Submit Error'
            self._rules = []
            return item
class FzuSubmitSpider(CrawlSpider):
    """Log in to FZU OJ, submit one solution, then scrape its judge result."""
    name = 'fzu_submit_spider'
    allowed_domains = ['acm.fzu.edu.cn']
    login_url = 'http://acm.fzu.edu.cn/login.php?act=1&dir='
    submit_url = 'http://acm.fzu.edu.cn/submit.php?act=5'
    login_verify_url = 'http://acm.fzu.edu.cn/mail.php'
    # Base64 of a trivial A+B program; used when no source is supplied.
    # (Implicit concatenation replaces the old backslash-continued literal,
    # which embedded whitespace; b64decode ignores it either way.)
    source = ('I2luY2x1ZGUgPHN0ZGlvLmg+CgppbnQgbWFpbigpCnsKI'
              'CAgIGludCBhLGI7CiAgICBzY2FuZigiJWQgJWQiLCZhLCA'
              'mYik7CiAgICBwcmludGYoIiVkXG4iLGErYik7CiAgICByZXR1cm4gMDsKfQ==')
    start_urls = [
        "http://acm.fzu.edu.cn/log.php"
    ]
    download_delay = 0.5
    is_login = False
    rules = [
        Rule(
            link(
                allow=(r'log.php\?&page=[0-9]+'),
                deny=(r'log.php\?&page=1$')
            ),
            follow=True,
            callback='parse_start_url')
    ]

    def __init__(self, solution_id=1, problem_id='1000', language='g++',
                 source=None, username='******', password='******',
                 *args, **kwargs):
        super(FzuSubmitSpider, self).__init__(*args, **kwargs)
        self.solution_id = solution_id
        self.problem_id = problem_id
        self.language = language
        self.username = username
        self.password = password
        if source is not None:
            self.source = source

    def start_requests(self):
        """Begin with the login POST instead of fetching start_urls."""
        return [FormRequest(
            self.login_url,
            formdata={
                'uname': self.username,
                'upassword': self.password,
                'submit': 'Submit',
            },
            callback=self.after_login,
            dont_filter=True
        )]

    def after_login(self, response):
        # The mail page is only reachable when authenticated.
        return [Request(
            self.login_verify_url,
            callback=self.login_verify
        )]

    def login_verify(self, response):
        """Submit the solution if login succeeded, else go scrape an error."""
        if re.search('Write New Mail', response.body):
            self.is_login = True
            # Server Date header is GMT; +8h converts to the site's local time.
            self.login_time = time.mktime(time.strptime(
                response.headers['Date'],
                '%a, %d %b %Y %H:%M:%S %Z')) + (8 * 60 * 60)
            # NOTE(review): time.sleep blocks the reactor; kept for parity.
            time.sleep(1)
            return [FormRequest(
                self.submit_url,
                formdata={
                    'pid': self.problem_id,
                    'lang': LANGUAGE.get(self.language, '0'),
                    # FZU wants plain source, so decode the stored base64.
                    'code': b64decode(self.source),
                    'submit': 'Submit',
                },
                callback=self.after_submit,
                dont_filter=True
            )]
        else:
            return Request(self.start_urls[0], callback=self.parse_start_url)

    def after_submit(self, response):
        # Give the judge a moment before polling the status log.
        time.sleep(10)
        for url in self.start_urls:
            yield self.make_requests_from_url(url)

    def parse_start_url(self, response):
        """Find this user's post-login run in the log and build the item."""
        sel = Selector(response)
        item = SolutionItem()
        item['solution_id'] = self.solution_id
        item['origin_oj'] = 'fzu'
        item['problem_id'] = self.problem_id
        item['language'] = self.language
        if self.is_login:
            for tr in sel.xpath('//table/tr')[1:]:
                user = tr.xpath('.//td/a/text()').extract()[-1]
                # Hoist: the plain-text cells are read three times.
                tds = tr.xpath('.//td/text()').extract()
                _submit_time = tds[1]
                submit_time = time.mktime(
                    time.strptime(_submit_time, '%Y-%m-%d %H:%M:%S'))
                # Only accept rows submitted after we logged in.
                if submit_time > self.login_time and \
                        user == self.username:
                    item['submit_time'] = _submit_time
                    item['run_id'] = tds[0]
                    try:
                        item['memory'] = \
                            tr.xpath('.//td')[5].xpath('./text()').extract()[0]
                        item['time'] = \
                            tr.xpath('.//td')[6].xpath('./text()').extract()[0]
                    except IndexError:
                        # Cells are empty while the run is still judging.
                        pass
                    item['code_length'] = tds[-1]
                    try:
                        item['result'] = tr.xpath(
                            './/td/font/text()').extract()[0]
                    except IndexError:
                        # Some verdicts (e.g. CE) are rendered as links.
                        item['result'] = tr.xpath(
                            './/td/font/a/text()').extract()[0]
                    # Found the row: stop following log pagination.
                    self._rules = []
                    return item
        else:
            item['result'] = 'Submit Error'
            self._rules = []
            return item
class HduSubmitSpider(CrawlSpider):
    """Log in to HDU, submit one solution, then scrape its judge result."""
    name = 'hdu_submit_spider'
    allowed_domains = ['acm.hdu.edu.cn']
    login_url = 'http://acm.hdu.edu.cn/userloginex.php?action=login'
    submit_url = 'http://acm.hdu.edu.cn/submit.php?action=submit'
    login_verify_url = 'http://acm.hdu.edu.cn/control_panel.php'
    # Base64 of a trivial A+B program; used when no source is supplied.
    # (Implicit concatenation replaces the old backslash-continued literal,
    # which embedded whitespace; b64decode ignores it either way.)
    source = ('I2luY2x1ZGUgPHN0ZGlvLmg+CgppbnQgbWFpbigpCnsK'
              'ICAgIGludCBhLGI7CiAgICBzY2FuZigiJWQgJWQiLCZhL'
              'CAmYik7CiAgICBwcmludGYoIiVkXG4iLGErYik7CiAgICByZXR1cm4gMDsKfQ==')
    start_urls = [
        'http://acm.hdu.edu.cn/status.php'
    ]
    download_delay = 0.5
    is_login = False
    rules = [
        Rule(link(
            allow=(r'/status.php\?first\S*status')),
            follow=True,
            callback='parse_start_url')
    ]

    def __init__(self, solution_id=1, problem_id='1000', language='g++',
                 source=None, username='******', password='******',
                 *args, **kwargs):
        super(HduSubmitSpider, self).__init__(*args, **kwargs)
        self.solution_id = solution_id
        self.username = username
        self.password = password
        self.problem_id = problem_id
        self.language = language
        if source is not None:
            self.source = source

    def start_requests(self):
        """Begin with the login POST instead of fetching start_urls."""
        return [FormRequest(
            self.login_url,
            formdata={
                'username': self.username,
                'userpass': self.password,
                'login': '******',
            },
            callback=self.after_login,
            dont_filter=True
        )]

    def after_login(self, response):
        """Submit the solution if login succeeded, else go scrape an error."""
        if not re.search(r'No such user or wrong password.', response.body):
            self.is_login = True
            # Server Date header is GMT; +8h converts to the site's local time.
            self.login_time = time.mktime(time.strptime(
                response.headers['Date'],
                '%a, %d %b %Y %H:%M:%S %Z')) + (8 * 60 * 60)
            # NOTE(review): time.sleep blocks the reactor; kept for parity.
            time.sleep(1)
            return [FormRequest(
                self.submit_url,
                formdata={
                    'problemid': self.problem_id,
                    'language': LANGUAGE.get(self.language, '0'),
                    # HDU wants plain source, so decode the stored base64.
                    'usercode': b64decode(self.source),
                    'check': '0'
                },
                callback=self.after_submit,
                dont_filter=True
            )]
        else:
            return Request(self.start_urls[0], callback=self.parse_start_url)

    def after_submit(self, response):
        # Give the judge a moment before polling the status page.
        time.sleep(3)
        for url in self.start_urls:
            yield self.make_requests_from_url(url)

    def parse_start_url(self, response):
        """Find this user's post-login run in the status table, build the item."""
        sel = Selector(response)
        item = SolutionItem()
        item['solution_id'] = self.solution_id
        item['origin_oj'] = 'hdu'
        item['problem_id'] = self.problem_id
        item['language'] = self.language
        if self.is_login:
            for tr in sel.xpath('//table[@class="table_text"]/tr')[1:]:
                user = tr.xpath('.//td/a/text()').extract()[-1]
                # Hoist: the plain-text cells are read twice.
                tds = tr.xpath('.//td/text()').extract()
                _submit_time = tds[1]
                submit_time = time.mktime(
                    time.strptime(_submit_time, '%Y-%m-%d %H:%M:%S'))
                # Only accept rows submitted after we logged in.
                if submit_time > self.login_time and \
                        user == self.username:
                    item['submit_time'] = _submit_time
                    item['run_id'] = tds[0]
                    try:
                        item['memory'] = \
                            tr.xpath('.//td')[4].xpath('./text()').extract()[0]
                        item['time'] = \
                            tr.xpath('.//td')[5].xpath('./text()').extract()[0]
                    except IndexError:
                        # Cells are empty while the run is still judging.
                        pass
                    item['code_length'] = tr.xpath(
                        './/td/a/text()').extract()[-2]
                    item['result'] = tr.xpath(
                        './/td').xpath('.//font/text()').extract()[0]
                    # Found the row: stop following status pagination.
                    self._rules = []
                    return item
        else:
            item['result'] = 'Submit Error'
            self._rules = []
            return item
class SdutSubmitSpider(CrawlSpider):
    """Log in to SDUT OJ, submit one solution, then scrape its judge result."""
    name = 'sdut_submit'
    allowed_domains = ['acm.sdut.edu.cn']
    login_url = 'http://acm.sdut.edu.cn/sdutoj/login.php?action=login'
    submit_url = 'http://acm.sdut.edu.cn/sdutoj/submit.php?action=submit'
    # Base64 of a trivial A+B program; used when no source is supplied.
    source = \
        'I2luY2x1ZGUgPHN0ZGlvLmg+CgppbnQgbWFpbigpCnsKICAgIGludCBhLGI7CiAgICBzY2FuZigiJWQgJWQiLCZhLCAmYik7CiAgICBwcmludGYoIiVkXG4iLGErYik7CiAgICByZXR1cm4gMDsKfQ=='
    start_urls = [
        "http://acm.sdut.edu.cn/sdutoj/status.php"
    ]
    download_delay = 0.5
    rules = [
        Rule(
            link(
                allow=(r'status.php\?page=[0-9]+\S*'),
                deny=(r'status.php\?page=1&\S*'),
                unique=True
            ),
            follow=True,
            callback='parse_start_url')
    ]
    is_login = False

    def __init__(self, solution_id=1, problem_id='1000', language='g++',
                 source=None, username='******', password='******',
                 *args, **kwargs):
        super(SdutSubmitSpider, self).__init__(*args, **kwargs)
        self.solution_id = solution_id
        self.problem_id = problem_id
        # NOTE(review): unlike the other submit spiders, the mapped language
        # code (not the raw name) is stored and later put on the item.
        self.language = LANGUAGE.get(language, 'g++')
        self.username = username
        self.password = password
        if source is not None:
            self.source = source

    def start_requests(self):
        """Begin with the login POST instead of fetching start_urls."""
        return [FormRequest(
            self.login_url,
            formdata={
                'username': self.username,
                'password': self.password,
                # URL-encoded Chinese "log in" button label.
                'submit': '++%E7%99%BB+%E5%BD%95++'
            },
            callback=self.after_login,
            dont_filter=True
        )]

    def after_login(self, response):
        """Submit the solution if login succeeded, else go scrape an error."""
        # The page echoes "username or password wrong" (Chinese) on failure.
        if not re.search(r'用户名或密码错误!', response.body):
            self.is_login = True
            # Server Date header is GMT; +8h converts to the site's local time.
            self.login_time = time.mktime(time.strptime(
                response.headers['Date'],
                '%a, %d %b %Y %H:%M:%S %Z')) + (8 * 60 * 60)
            # NOTE(review): time.sleep blocks the reactor; kept for parity.
            time.sleep(1)
            return [FormRequest(
                self.submit_url,
                formdata={
                    'Sub[problem_id]': self.problem_id,
                    'Sub[pro_lang]': self.language,
                    # SDUT wants plain source, so decode the stored base64.
                    'Sub[code]': b64decode(self.source)
                },
                callback=self.after_submit,
                dont_filter=True
            )]
        else:
            return Request(self.start_urls[0], callback=self.parse_start_url)

    def after_submit(self, response):
        # Give the judge a moment before polling the status page.
        time.sleep(3)
        for url in self.start_urls:
            yield self.make_requests_from_url(url)

    def parse_start_url(self, response):
        """Find this user's post-login run in the status table, build the item."""
        sel = Selector(response)
        item = SolutionItem()
        item['solution_id'] = self.solution_id
        item['origin_oj'] = 'sdut'
        item['problem_id'] = self.problem_id
        item['language'] = self.language
        if self.is_login:
            for tr in sel.xpath('//table[@class="tablelist"]/tr')[1:]:
                user = tr.xpath('.//td/a/xmp/text()').extract()[0]
                # Hoist: the plain-text cells are read three times.
                tds = tr.xpath('.//td/text()').extract()
                _submit_time = tds[-1]
                submit_time = time.mktime(
                    time.strptime(_submit_time, '%Y-%m-%d %H:%M:%S'))
                # Only accept rows submitted after we logged in.
                if submit_time > self.login_time and \
                        user == self.username:
                    item['submit_time'] = _submit_time
                    item['run_id'] = tds[0]
                    try:
                        item['memory'] = \
                            tr.xpath('.//td')[5].xpath('./text()').extract()[0]
                        item['time'] = \
                            tr.xpath('.//td')[4].xpath('./text()').extract()[0]
                    except IndexError:
                        # Cells are empty while the run is still judging.
                        pass
                    item['code_length'] = tds[-2]
                    item['result'] = tr.xpath('.//td').xpath(
                        './/font/text()').extract()[0]
                    # Found the row: stop following status pagination.
                    self._rules = []
                    return item
        else:
            item['result'] = 'Submit Error'
            self._rules = []
            return item