示例#1
0
文件: wust.py 项目: KinKir/spider
    def problem_parse(self, response, pid, url):
        """Turn a WUST problem page response into a ``Problem``.

        Args:
            response: HTTP response object exposing ``text`` and
                ``status_code``, or ``None`` when the fetch produced nothing.
            pid: Remote problem id, stored as ``remote_id``.
            url: Page address, stored as ``remote_url``.

        Returns:
            A ``Problem`` whose ``status`` is ``STATUS_SUBMIT_FAILED`` (no
            response or non-200), ``STATUS_PROBLEM_NOT_EXIST`` (the page says
            the problem is unavailable) or ``STATUS_CRAWLING_SUCCESS``.

        Raises:
            AttributeError: if the page has no ``div.rich_text`` container.
        """
        problem = Problem()
        problem.remote_id = pid
        problem.remote_url = url
        problem.remote_oj = 'WUST'

        if response is None:
            problem.status = Problem.Status.STATUS_SUBMIT_FAILED
            return problem

        page = response.text
        if response.status_code != 200:
            problem.status = Problem.Status.STATUS_SUBMIT_FAILED
            return problem
        if re.search('Problem is not Available', page):
            problem.status = Problem.Status.STATUS_PROBLEM_NOT_EXIST
            return problem

        # Header metadata: each field is only assigned when its pattern hits.
        for field, pattern in (
                ('title', r'[\d]{4,}: ([\s\S]*?)</h2>'),
                ('time_limit', r'(\d* Sec)'),
                ('memory_limit', r'(\d* MB)'),
        ):
            hit = re.search(pattern, page)
            if hit:
                setattr(problem, field, hit.group(1))
        special = re.search(r'class=red>Special Judge</span>', page)
        problem.special_judge = special is not None

        soup = BeautifulSoup(page, 'lxml')
        container = soup.find('div', attrs={'class': 'rich_text'})

        fragments = []
        for node in container.children:
            # Only direct <h2>/<div> element children contribute to the body.
            if type(node) is not element.Tag:
                continue
            if node.name not in ('h2', 'div'):
                continue
            if not node.get('class'):
                node['class'] = ()

            if node.name == 'h2':
                # Section headings: strip a nested <div>/<img> before styling.
                if node.div:
                    node.div.decompose()
                if node.img:
                    node.img.decompose()
                style = HtmlTag.TagStyle.TITLE.value
                desc = HtmlTag.TagDesc.TITLE.value
            else:
                style = HtmlTag.TagStyle.CONTENT.value
                desc = HtmlTag.TagDesc.CONTENT.value

            node['style'] = style
            node['class'] += (desc, )
            rendered = HtmlTag.update_tag(
                node, self._static_prefix, update_style=style)
            fragments.append(str(rendered))

        problem.html = '<body>' + ''.join(fragments) + '</body>'
        problem.status = Problem.Status.STATUS_CRAWLING_SUCCESS
        return problem
示例#2
0
    def problem_parse(self, response, pid, url):
        """Turn an FZU problem page response into a ``Problem``.

        Args:
            response: HTTP response object exposing ``text`` and
                ``status_code``, or ``None`` when the fetch produced nothing.
            pid: Remote problem id, stored as ``remote_id``.
            url: Page address, stored as ``remote_url``.

        Returns:
            A ``Problem`` whose ``status`` is ``STATUS_SUBMIT_FAILED`` (no
            response or non-200), ``STATUS_PROBLEM_NOT_EXIST`` (page contains
            "No Such Problem!") or ``STATUS_CRAWLING_SUCCESS``.

        Raises:
            AttributeError: if the page has no ``div.problem_content``.
        """
        problem = Problem()
        problem.remote_id = pid
        problem.remote_url = url
        problem.remote_oj = 'FZU'

        if response is None:
            problem.status = Problem.Status.STATUS_SUBMIT_FAILED
            return problem

        page = response.text
        if response.status_code != 200:
            problem.status = Problem.Status.STATUS_SUBMIT_FAILED
            return problem
        if re.search('No Such Problem!', page):
            problem.status = Problem.Status.STATUS_PROBLEM_NOT_EXIST
            return problem

        soup = BeautifulSoup(page, 'lxml')

        # Header metadata: each field is only assigned when its pattern hits.
        for field, pattern in (
                ('title', r'<b> Problem [\d]* ([\s\S]*?)</b>'),
                ('time_limit', r'(\d* mSec)'),
                ('memory_limit', r'(\d* KB)'),
        ):
            hit = re.search(pattern, page)
            if hit:
                setattr(problem, field, hit.group(1))
        special = re.search(r'<font color="blue">Special Judge</font>', page)
        problem.special_judge = special is not None

        content = soup.find('div', attrs={'class': 'problem_content'})
        pieces = []
        for node in content.children:
            marking = None
            if node.name == 'h2':
                # Headings lose their embedded image before being tagged.
                if node.img:
                    node.img.decompose()
                marking = (HtmlTag.TagDesc.TITLE.value,
                           HtmlTag.TagStyle.TITLE.value)
            elif node.name == 'div':
                marking = (HtmlTag.TagDesc.CONTENT.value,
                           HtmlTag.TagStyle.CONTENT.value)

            if marking is not None:
                desc, style = marking
                if node.get('class'):
                    node['class'] += (desc,)
                else:
                    node['class'] = (desc,)
                node['style'] = style

            # Every child, marked or not, is passed through update_tag and
            # appended to the output.
            pieces.append(str(HtmlTag.update_tag(node, self._static_prefix)))

        problem.html = ('<body>' + self._global_style + ''.join(pieces)
                        + '</body>')
        problem.status = Problem.Status.STATUS_CRAWLING_SUCCESS
        return problem
示例#3
0
文件: hdu.py 项目: KinKir/spider
    def problem_parse(self, response, pid, url):
        """Turn an HDU problem page response into a ``Problem``.

        Args:
            response: HTTP response object exposing ``text`` and
                ``status_code``, or ``None`` when the fetch produced nothing.
            pid: Remote problem id, stored as ``remote_id``.
            url: Page address, stored as ``remote_url``.

        Returns:
            A ``Problem`` whose ``status`` is ``STATUS_SUBMIT_FAILED`` (no
            response or non-200), ``STATUS_PROBLEM_NOT_EXIST`` (page contains
            "No such problem") or ``STATUS_CRAWLING_SUCCESS``.

        Raises:
            AttributeError: if the page contains no ``<h1>`` element.
        """
        problem = Problem()
        problem.remote_id = pid
        problem.remote_url = url
        problem.remote_oj = 'HDU'

        if response is None:
            problem.status = Problem.Status.STATUS_SUBMIT_FAILED
            return problem

        page = response.text
        if response.status_code != 200:
            problem.status = Problem.Status.STATUS_SUBMIT_FAILED
            return problem
        if re.search('No such problem', page):
            problem.status = Problem.Status.STATUS_PROBLEM_NOT_EXIST
            return problem

        soup = BeautifulSoup(page, 'lxml')

        # Header metadata: each field is only assigned when its pattern hits.
        for field, pattern in (
                ('title', r'color:#1A5CC8\'>([\s\S]*?)</h1>'),
                ('time_limit', r'(\d* MS)'),
                ('memory_limit', r'/(\d* K)'),
        ):
            hit = re.search(pattern, page)
            if hit:
                setattr(problem, field, hit.group(1))
        special = re.search(r'color=red>Special Judge</font>', page)
        problem.special_judge = special is not None

        panel_classes = {'panel_title', 'panel_content', 'panel_bottom'}
        pieces = []
        # The statement panels are the siblings of the page's <h1> title.
        for node in soup.find('h1').parent.children:
            if type(node) is not element.Tag:
                continue
            if not node.get('class'):
                continue
            if not set(node['class']) & panel_classes:
                continue

            if set(node['class']) & {'panel_title'}:
                node['class'] += (HtmlTag.TagDesc.TITLE.value,)
                node['style'] = HtmlTag.TagStyle.TITLE.value
            else:
                node['class'] += (HtmlTag.TagDesc.CONTENT.value,)
                node['style'] = HtmlTag.TagStyle.CONTENT.value
            pieces.append(str(HtmlTag.update_tag(node, self._static_prefix)))

        problem.html = ''.join(pieces) + self._script
        problem.status = Problem.Status.STATUS_CRAWLING_SUCCESS
        return problem
示例#4
0
    def get_problem(self, *args, **kwargs):
        """Fetch POJ problem ``kwargs['pid']`` and scrape it into a ``Problem``.

        The page is requested through ``self.request`` and its fields are
        pulled out with regular expressions. Title, limits, description,
        input and output are mandatory; the sample, hint and source are
        optional.

        Args:
            **kwargs: must contain ``pid``, the remote problem id.

        Returns:
            The populated ``Problem``, or ``None`` when the request fails or
            a mandatory field cannot be located in the page.

        Raises:
            KeyError: if ``pid`` is not supplied.
        """
        url = 'http://poj.org/problem?id=' + str(kwargs['pid'])
        problem = Problem()
        try:
            res = self.request.get(url=url)
            page = res.text

            def sample_text(section_pattern):
                # Return the <pre class="sio"> payload of a sample section,
                # or '' when the section heading is absent from the page.
                section = re.search(section_pattern, page)
                if not section:
                    return ''
                return re.search(r'"sio">([\s\S]*?)</pre>',
                                 section.group(1)).group(1)

            problem.remote_id = kwargs['pid']
            problem.remote_url = url
            problem.remote_oj = 'POJ'

            # Mandatory fields: a miss makes ``.group`` fail on ``None`` and
            # the whole fetch is reported as ``None`` by the handler below.
            problem.title = re.search(
                r'ptt" lang="en-US">([\s\S]*?)</div>', page).group(1)
            problem.time_limit = re.search(r'(\d*MS)', page).group(1)
            problem.memory_limit = re.search(
                r'Memory Limit:</b> ([\s\S]*?)</td>', page).group(1)
            problem.special_judge = re.search(
                r'red;">Special Judge</td>', page) is not None
            problem.description = re.search(
                r'>Description</p>[\s\S]*?lang="en-US">([\s\S]*?)</div>',
                page).group(1)
            problem.input = re.search(
                r'>Input</p>[\s\S]*?lang="en-US">([\s\S]*?)</div>',
                page).group(1)
            problem.output = re.search(
                r'>Output</p>[\s\S]*?lang="en-US">([\s\S]*?)</div>',
                page).group(1)

            input_data = sample_text(r'>Sample Input</p>([\s\S]*?)<p class')
            output_data = sample_text(r'>Sample Output</p>([\s\S]*?)<p class')
            problem.sample = [
                {'input': input_data,
                 'output': output_data}]

            # Optional trailing sections.
            hint = re.search(
                r'>Hint</p>[\s\S]*?"en-US">([\s\S]*?)</div>', page)
            if hint:
                problem.hint = hint.group(1)
            source = re.search(
                r'>Source</p>[\s\S]*?"en-US">([\s\S]*?)</div>', page)
            if source:
                problem.source = source.group(1)
        except Exception:
            # Best-effort scrape: any request or parsing failure yields None.
            # Unlike a bare ``except``, this lets KeyboardInterrupt and
            # SystemExit propagate.
            return None
        return problem
示例#5
0
    def get_problem(self, *args, **kwargs):
        """Fetch HDU problem ``kwargs['pid']`` and scrape it into a ``Problem``.

        The page is requested through ``self.request``; the response cookies
        are stored on ``self.cookies``. Title, limits, description, input and
        output are mandatory; the sample, author and hint are optional.

        Args:
            **kwargs: must contain ``pid``, the remote problem id.

        Returns:
            The populated ``Problem``, or ``None`` when the request fails or
            a mandatory field cannot be located in the page.

        Raises:
            KeyError: if ``pid`` is not supplied.
        """
        url = 'http://acm.hdu.edu.cn/showproblem.php?pid=' + str(kwargs['pid'])
        problem = Problem()
        try:
            response = self.request.get(url)
            self.cookies = response.cookies
            page = response.text

            problem.remote_id = kwargs['pid']
            problem.remote_url = url
            problem.remote_oj = 'HDU'

            # Mandatory fields: a miss makes ``.group`` fail on ``None`` and
            # the whole fetch is reported as ``None`` by the handler below.
            problem.title = re.search(
                r'color:#1A5CC8\'>([\s\S]*?)</h1>', page).group(1)
            problem.time_limit = re.search(r'(\d* MS)', page).group(1)
            problem.memory_limit = re.search(r'/(\d* K)', page).group(1)
            problem.special_judge = re.search(
                r'color=red>Special Judge</font>', page) is not None
            problem.description = re.search(
                r'>Problem Description</div>[\s\S]*?panel_content>([\s\S]*?)</div>',
                page).group(1)
            problem.input = re.search(
                r'>Input</div>[\s\S]*?panel_content>([\s\S]*?)</div>',
                page).group(1)
            problem.output = re.search(
                r'>Output</div>[\s\S]*?panel_content>([\s\S]*?)</div>',
                page).group(1)

            # Sample blocks may open with a "<pre><div ...>" wrapper, which
            # the optional first group swallows; group 2 is the payload.
            input_data = ''
            section = re.search(
                r'>Sample Input</div>[\s\S]*?panel_content>([\s\S]*?)</div',
                page)
            if section:
                input_data = re.search(
                    r'(<pre><div[\s\S]*?>)?([\s\S]*)',
                    section.group(1)).group(2)

            output_data = ''
            section = re.search(
                r'>Sample Output</div>[\s\S]*?panel_content>([\s\S]*?)</div',
                page)
            if section:
                output_data = re.search(
                    r'(<pre><div[\s\S]*?>)?([\s\S]*)',
                    section.group(1)).group(2)
                # Keep only the text before the first nested "<div", if any.
                output_data = output_data.partition('<div')[0]
            problem.sample = [
                {'input': input_data,
                 'output': output_data}]

            # Optional trailing sections.
            author = re.search(
                r'>Author</div>[\s\S]*?panel_content>([\s\S]*?)</div>', page)
            if author:
                problem.author = author.group(1)
            hint = re.search(
                r'<i>Hint</i>[\s\S]*?/div>[\s]*([\s\S]+?)</div>', page)
            if hint:
                problem.hint = hint.group(1)
        except Exception:
            # Best-effort scrape: any request or parsing failure yields None.
            # Unlike a bare ``except``, this lets KeyboardInterrupt and
            # SystemExit propagate.
            return None
        return problem
示例#6
0
文件: aizu.py 项目: KinKir/spider
    def problem_parse(self, response, pid, url):
        """Turn an Aizu problem API response into a ``Problem``.

        The response body is decoded as JSON; its ``html`` entry is parsed
        for the statement and its ``time_limit`` / ``memory_limit`` entries
        supply the limits.

        Args:
            response: HTTP response object exposing ``text`` and
                ``status_code``, or ``None`` when the fetch produced nothing.
            pid: Remote problem id, stored as ``remote_id``.
            url: Page address, stored as ``remote_url``.

        Returns:
            A ``Problem`` whose ``status`` is ``STATUS_SUBMIT_FAILED`` (no
            response, or a status other than 200/401/404),
            ``STATUS_PROBLEM_NOT_EXIST`` (401 or 404) or
            ``STATUS_CRAWLING_SUCCESS``.
        """
        problem = Problem()
        problem.remote_id = pid
        problem.remote_oj = 'Aizu'
        problem.remote_url = url

        if response is None:
            problem.status = Problem.Status.STATUS_SUBMIT_FAILED
            return problem

        body_text = response.text
        code = response.status_code
        if code in (401, 404):
            problem.status = Problem.Status.STATUS_PROBLEM_NOT_EXIST
            return problem
        if code != 200:
            problem.status = Problem.Status.STATUS_SUBMIT_FAILED
            return problem

        payload = json.loads(body_text)
        soup = BeautifulSoup(payload.get('html'), 'lxml')
        heading = soup.find('h1')
        problem.title = str(heading.get_text())
        problem.time_limit = str(payload.get('time_limit')) + ' sec'
        problem.memory_limit = str(payload.get('memory_limit')) + ' KB'
        problem.special_judge = False

        kept_names = ('p', 'h2', 'pre', 'center')
        pieces = []
        for node in soup.body:
            # Only selected direct element children of <body> are kept.
            if type(node) is not element.Tag:
                continue
            if node.name not in kept_names:
                continue
            if not node.get('class'):
                node['class'] = ()

            if node.name == 'h2':
                node['style'] = HtmlTag.TagStyle.TITLE.value
                node['class'] += (HtmlTag.TagDesc.TITLE.value, )
            else:
                node['style'] = HtmlTag.TagStyle.CONTENT.value
                node['class'] += (HtmlTag.TagDesc.CONTENT.value, )
            pieces.append(str(HtmlTag.update_tag(node, self._static_prefix)))

        problem.html = ''.join(pieces) + self._script
        problem.status = Problem.Status.STATUS_CRAWLING_SUCCESS
        return problem
示例#7
0
文件: poj.py 项目: KinKir/spider
    def problem_parse(self, response, pid, url):
        """Turn a POJ problem page response into a ``Problem``.

        Args:
            response: HTTP response object exposing ``text`` and
                ``status_code``, or ``None`` when the fetch produced nothing.
            pid: Remote problem id, stored as ``remote_id``.
            url: Page address, stored as ``remote_url``.

        Returns:
            A ``Problem`` whose ``status`` is ``STATUS_SUBMIT_FAILED`` (no
            response or non-200), ``STATUS_PROBLEM_NOT_EXIST`` (page contains
            "Can not find problem") or ``STATUS_CRAWLING_SUCCESS``.

        Raises:
            AttributeError: if the page has no ``div.ptt`` title element.
        """
        problem = Problem()

        problem.remote_id = pid
        problem.remote_url = url
        problem.remote_oj = 'POJ'
        if response is None:
            problem.status = Problem.Status.STATUS_SUBMIT_FAILED
            return problem
        website_data = response.text
        status_code = response.status_code
        if status_code != 200:
            problem.status = Problem.Status.STATUS_SUBMIT_FAILED
            return problem
        if re.search('Can not find problem', website_data):
            problem.status = Problem.Status.STATUS_PROBLEM_NOT_EXIST
            return problem
        soup = BeautifulSoup(website_data, 'lxml')

        # Header metadata: each field is only assigned when its pattern hits.
        match_groups = re.search(r'ptt" lang="en-US">([\s\S]*?)</div>', website_data)
        if match_groups:
            problem.title = match_groups.group(1)
        match_groups = re.search(r'(\d*MS)', website_data)
        if match_groups:
            problem.time_limit = match_groups.group(1)
        match_groups = re.search(r'Memory Limit:</b> ([\s\S]*?)</td>', website_data)
        if match_groups:
            problem.memory_limit = match_groups.group(1)
        problem.special_judge = re.search(r'red;">Special Judge</td>', website_data) is not None

        statement_classes = {'ptx', 'pst', 'sio'}
        fragments = []
        # The statement sections are the siblings following the title div.
        for tag in soup.find('div', attrs={'class': 'ptt'}).next_siblings:
            if not isinstance(tag, Tag):
                continue
            # ``get('class')`` is None for tags without a class attribute;
            # fall back to an empty tuple so such siblings are skipped
            # instead of raising TypeError from ``set(None)``.
            classes = set(tag.get('class') or ())
            if not classes & statement_classes:
                continue
            if 'pst' in classes:
                tag['style'] = HtmlTag.TagStyle.TITLE.value
                tag['class'] += (HtmlTag.TagDesc.TITLE.value,)
            else:
                tag['style'] = HtmlTag.TagStyle.CONTENT.value
                tag['class'] += (HtmlTag.TagDesc.CONTENT.value,)
            fragments.append(str(HtmlTag.update_tag(tag, self._static_prefix)))
        problem.html = ''.join(fragments)
        problem.status = Problem.Status.STATUS_CRAWLING_SUCCESS
        return problem
示例#8
0
    def problem_parse(self, response, pid, url):
        """Turn a ZOJ problem page response into a ``Problem``.

        Args:
            response: HTTP response object exposing ``text`` and
                ``status_code``; any falsy value is treated as a failed fetch.
            pid: Remote problem id, stored as ``remote_id``.
            url: Page address, stored as ``remote_url``.

        Returns:
            A ``Problem`` whose ``status`` is ``STATUS_SUBMIT_FAILED`` (falsy
            response or non-200), ``STATUS_PROBLEM_NOT_EXIST`` (page contains
            "No such problem") or ``STATUS_CRAWLING_SUCCESS``.

        Raises:
            AttributeError: if the page lacks the ``span.bigProblemTitle``
                title or the ``div#content_body`` container.
        """

        def add_class(node, desc):
            # Append ``desc`` to the node's class list, creating it if absent.
            if node.get('class'):
                node['class'] += (desc, )
            else:
                node['class'] = (desc, )

        problem = Problem()

        problem.remote_id = pid
        problem.remote_url = url
        problem.remote_oj = 'ZOJ'
        # NOTE(review): this is a truthiness test, not ``is None``; if
        # ``response`` is a requests.Response, error statuses are falsy too
        # and end up here as well -- confirm against the caller.
        if not response:
            problem.status = Problem.Status.STATUS_SUBMIT_FAILED
            return problem
        website_data = response.text
        status_code = response.status_code

        if status_code != 200:
            problem.status = Problem.Status.STATUS_SUBMIT_FAILED
            return problem
        if re.search('No such problem', website_data):
            problem.status = Problem.Status.STATUS_PROBLEM_NOT_EXIST
            return problem

        soup = BeautifulSoup(website_data, 'lxml')
        problem.title = str(
            soup.find('span', attrs={
                'class': 'bigProblemTitle'
            }).get_text())
        match_groups = re.search(r'(\d* Second)', website_data)
        if match_groups:
            problem.time_limit = match_groups.group(1)
        match_groups = re.search(r'(\d* KB)', website_data)
        if match_groups:
            problem.memory_limit = match_groups.group(1)
        problem.special_judge = re.search(
            r'<font color="blue">Special Judge</font>',
            website_data) is not None

        fragments = [self._script]
        raw_html = soup.find('div', attrs={'id': 'content_body'})
        section_titles = ('Input', 'Output', 'Sample Input', 'Sample Output')
        for tag in raw_html.children:
            # Exact type test on purpose: NavigableString subclasses (for
            # example comments) are not copied into the output.
            if type(tag) == element.NavigableString:
                fragments.append(str(tag))
                continue
            if not isinstance(tag, element.Tag) or tag.name in ('center', 'hr'):
                continue
            # Drop the link to the judge's sample FAQ.
            if tag.name == 'a' and tag.get(
                    'href') == '/onlinejudge/faq.do#sample':
                continue

            if tag.name == 'h2':
                tag['style'] = HtmlTag.TagStyle.TITLE.value
                add_class(tag, HtmlTag.TagDesc.TITLE.value)
            elif tag.name == 'p' and tag.b and tag.b.string in section_titles:
                # Section captions are bold text inside a paragraph; the
                # <b> element itself is styled, so its own class attribute
                # (not the paragraph's) decides between append and create.
                tag.b['style'] = HtmlTag.TagStyle.TITLE.value
                add_class(tag.b, HtmlTag.TagDesc.TITLE.value)
            else:
                tag['style'] = HtmlTag.TagStyle.CONTENT.value
                add_class(tag, HtmlTag.TagDesc.CONTENT.value)
                # Return value unused, as in the original code.
                HtmlTag.update_tag(tag, self._static_prefix)
            fragments.append(str(tag))
        problem.html = ''.join(fragments)
        problem.status = Problem.Status.STATUS_CRAWLING_SUCCESS
        return problem