Python parse_html示例，webalerts.parse_html Python示例

示例#1

0

显示文件

文件： clien.py 项目： clee704/WebAlerts

    def get_post_ids(self, board_id, page_no):
        """
        Collect the IDs of the posts listed on one page of a board.

        The 'board' page is requested through the browser and every
        ``tr.mytr`` row of the parsed document is inspected.  A row is
        skipped when it has no direct ``td`` children (a warning is logged),
        when the text of its last cell is exactly ``'-'`` (a deleted post),
        or when the text of its first cell is not an integer (a warning is
        logged).

        Returns a list of integer post IDs in the order they appear in the
        markup, which is assumed to be descending order, newest first.

        """
        page = self._browser.get_page('board', board_id=board_id, page_no=page_no)
        collected = []
        for row in parse_html(page.content).cssselect('tr.mytr'):
            cells = row.findall('td')
            if not cells:
                self.logger.warning('no td elements in tr.mytr')
                continue
            # The last cell holds the view count; '-' marks a deleted post.
            if cells[-1].text_content() == '-':
                continue
            raw_id = cells[0].text_content()
            try:
                collected.append(int(raw_id))
            except ValueError:
                self.logger.warning('Post ID is not an integer: %s', raw_id)
        return collected

示例#2

0

显示文件

文件： cgv.py 项目： clee704/WebAlerts

 def get_times(self, mid, fid, tid, date):
     """Return the plays listed on the "time_list" page as PlayState objects.

     ``mid``, ``fid``, ``tid`` and ``date`` are passed unchanged to the
     browser's "time_list" page request.

     For every ``ul.time-list > li.time-list > a`` link, the screen code and
     play number are read from the link's ``href``; the start time, screen
     name and seat counts are read from the link text.  A link that does not
     have the expected shape is logged with a warning and skipped.

     Returns a list of ``PlayState(Play(screen_id, play_no, screen_name,
     start_time), current_seats, total_seats)`` in markup order.
     """
     # The patterns are constant, so build them once instead of per link.
     screen_code_re = re.compile(r"ScreenCd=(\d+)", re.I)
     play_num_re = re.compile(r"PlayNum=(\d+)", re.I)
     # Matches link text shaped like
     #   HH:MM - <screen name>/<n>회 (좌석:<current>/<total>)
     # A plain u"" literal with doubled backslashes is used because the
     # ur"" prefix is a syntax error on Python 3; the pattern is unchanged.
     label_re = re.compile(
         u"""
         (?P<time>\\d{2}:\\d{2})
         \\s*-\\s*
         (?P<screen_name>[^/]+)/(?P<play_no>\\d+)회
         \\s*
         \\(좌석:(?P<current_seats>\\d+)/(?P<total_seats>\\d+)\\)
         """,
         re.VERBOSE | re.I,
     )

     resp = self._browser.get_page("time_list", mid=mid, fid=fid, tid=tid, date=date)
     doc = parse_html(resp.content)
     items = []
     for a in doc.cssselect("ul.time-list > li.time-list > a"):
         try:
             href = a.attrib["href"]
             # A failed search returns None, so .group() raises the
             # AttributeError handled below.
             screen_id = screen_code_re.search(href).group(1)
             play_no = int(play_num_re.search(href).group(1))
             m = label_re.search(a.text_content())
             screen_name = m.group("screen_name")
             start_time = m.group("time")
             current_seats = int(m.group("current_seats"))
             total_seats = int(m.group("total_seats"))
         except (IndexError, KeyError, AttributeError, ValueError):
             self.logger.warning("Markup is not as expected", exc_info=True)
         else:
             play = Play(screen_id, play_no, screen_name, start_time)
             items.append(PlayState(play, current_seats, total_seats))
     return items

示例#3

0

显示文件

文件： clien.py 项目： clee704/WebAlerts

 def get_post(self, board_id, post_id):
     """Fetch a single post and return it as a ``Post``.

     Returns ``None`` when the page carries the "deleted post" script (an
     inline script containing the invalid-access message together with
     ``history.go(-1)``), or when the expected elements cannot be found in
     the markup (a warning is logged in that case).

     Author name and ID stay ``None`` unless the author link is present;
     that link is only shown when logged in.
     """
     response = self._browser.get_page('post', board_id=board_id, post_id=post_id)
     doc = parse_html(response.content)

     for script_text in doc.xpath('//script/text()'):
         # Whitespace is removed before looking for the message so that
         # spacing differences in the script do not matter.
         squeezed = re.sub(r'\s', '', script_text)
         if u'잘못된접근입니다' in squeezed and 'history.go(-1)' in script_text:
             # post is deleted
             return None

     try:
         title_node = doc.cssselect('.board_main > .view_title h4')[0]
         title = title_node.text_content()

         body_node = doc.cssselect('.board_main > .view_content #writeContents')[0]
         content = body_node.text_content()
         raw_html = lxml.html.tostring(body_node, encoding='utf-8')
         content_html = raw_html.decode('utf-8', 'replace')

         # author_info is only shown when logged in
         author = author_id = None
         author_links = doc.cssselect('.board_main > .view_head > p.user_info > a[onclick]')
         if author_links:
             onclick = author_links[0].attrib['onclick']
             author, author_id = self._get_author_info(onclick)

         info_node = doc.cssselect('.board_main > .view_head > p.post_info')[0]
         published = self._get_published_time(info_node.text_content())
     except (IndexError, AttributeError):
         self.logger.warning('Markup is not as expected', exc_info=True)
         return None

     # Built outside the try block so errors raised by Post itself are not
     # mistaken for markup problems.
     return Post(response.url, title, content, content_html, author, author_id, published)

示例#4

0

显示文件

文件： clien.py 项目： clee704/WebAlerts

 def _login(self, username, password):
     """Post the credentials to the login URL and verify the result.

     After the login POST, the site's front page is requested again; if it
     contains no ``#account .uid`` element the login is treated as failed.

     Raises:
         LoginError: if ``#account .uid`` is missing from the front page.
     """
     form = {'mb_id': username, 'mb_password': password, 'url': '/'}
     self._request(ClienBrowser.BASE_URL + ClienBrowser.LOGIN_PATH,
                   method='post', data=form)
     front_page = self._request(ClienBrowser.BASE_URL + '/')
     account_nodes = parse_html(front_page.content).cssselect('#account .uid')
     if not account_nodes:
         raise LoginError()

示例#5

0

显示文件

文件： cgv.py 项目： clee704/WebAlerts

 def _login(self, username, password):
     """Log in by submitting the login form with its hidden tokens.

     The login page is fetched first so that the ``__VIEWSTATE`` and
     ``__EVENTVALIDATION`` field values can be echoed back in the POST.

     Raises:
         LoginError: if either token field (or its ``value`` attribute) is
             missing from the login page, or if the response to the login
             POST contains no ``body`` element.
     """
     login_url = CGVBrowser.BASE_URL + CGVBrowser.LOGIN_PATH
     login_page = self._request(login_url)
     doc = parse_html(login_page.content)
     try:
         viewstate = doc.cssselect('[name="__VIEWSTATE"]')[0].attrib["value"]
         eventvalidation = doc.cssselect('[name="__EVENTVALIDATION"]')[0].attrib["value"]
     except (IndexError, KeyError, AttributeError):
         # IndexError: the field is absent; KeyError: the field exists but
         # has no "value" attribute.  Both mean the tokens are unavailable.
         raise LoginError("Cannot find necessary tokens")
     data = {
         "__VIEWSTATE": viewstate,
         "__EVENTVALIDATION": eventvalidation,
         "Login$tbUserID": username,
         "Login$tbPassword": password,
         # NOTE(review): presumably the click coordinates of the image
         # submit button; the exact values look arbitrary -- confirm.
         "Login$ibLogin.x": 45,
         "Login$ibLogin.y": 37,
     }
     resp = self._request(login_url, method="post", data=data)
     doc = parse_html(resp.content)
     if not doc.cssselect("body"):
         raise LoginError()

示例#6

0

显示文件

文件： cgv.py 项目： clee704/WebAlerts

 def get_seats(self, tid, date, sid, play_no):
     """Return the ``seatname`` values found on the "seat_list" page.

     All arguments are passed unchanged to the browser's "seat_list" page
     request.  Every ``#seat_table td.pointer`` cell contributes its
     ``seatname`` attribute; a cell without that attribute is logged with a
     warning and skipped.
     """
     page = self._browser.get_page("seat_list", tid=tid, date=date, sid=sid, play_no=play_no)
     seat_names = []
     for cell in parse_html(page.content).cssselect("#seat_table td.pointer"):
         try:
             seat_names.append(cell.attrib["seatname"])
         except KeyError:
             self.logger.warning("Markup is not as expected", exc_info=True)
     return seat_names

示例#7

0

显示文件

文件： cgv.py 项目： clee704/WebAlerts

 def get_dates(self, mid, fid, tid):
     """Return the dates offered on the "date_list" page.

     All arguments are passed unchanged to the browser's "date_list" page
     request.  Each link inside a non-disabled cell of ``table.month`` must
     carry a ``PlayYmd=<8 digits>`` parameter in its ``href``; the eight
     digits are returned as a string.  Links that do not match are logged
     with a warning and skipped.
     """
     page = self._browser.get_page("date_list", mid=mid, fid=fid, tid=tid)
     dates = []
     for link in parse_html(page.content).cssselect("table.month td:not(.disabled) a"):
         try:
             href = link.attrib["href"]
             found = re.search(r"PlayYmd=(\d{8})", href, re.I)
             # found is None when the parameter is absent, which surfaces
             # as the AttributeError handled below.
             ymd = found.group(1)
         except (IndexError, KeyError, AttributeError):
             self.logger.warning("Markup is not as expected", exc_info=True)
             continue
         dates.append(ymd)
     return dates

示例#8

0

显示文件

文件： cgv.py 项目： clee704/WebAlerts

 def get_formats(self, mid):
     """Return the ``Format`` entries listed on the "format_list" page.

     ``mid`` is passed unchanged to the browser's "format_list" page
     request.  For each ``ul.movielist2 > li.movielist2 > a`` link, the ID
     is the number after ``CgvCode=`` in the ``href`` and the name is the
     first parenthesised part of the stripped link text.  Links that do not
     match are logged with a warning and skipped.
     """
     page = self._browser.get_page("format_list", mid=mid)
     formats = []
     for link in parse_html(page.content).cssselect("ul.movielist2 > li.movielist2 > a"):
         try:
             code_match = re.search(r"CgvCode=(\d+)", link.attrib["href"], re.I)
             format_id = code_match.group(1)
             label = link.text_content().strip()
             format_name = re.search(r"\(([^)]+)\)", label, re.I).group(1)
         except (IndexError, KeyError, AttributeError):
             self.logger.warning("Markup is not as expected", exc_info=True)
             continue
         formats.append(Format(id=format_id, name=format_name))
     return formats

示例#9

0

显示文件

文件： cgv.py 项目： clee704/WebAlerts

 def get_movies(self):
     """Return the ``Movie`` entries listed on the "movie_list" page.

     For each ``table.list tbody tr .subject a`` link, the ID is the number
     after ``MovieIdx=`` in the ``href`` and the name is the stripped link
     text.  Links that do not match are logged with a warning and skipped.
     """
     page = self._browser.get_page("movie_list")
     movies = []
     for link in parse_html(page.content).cssselect("table.list tbody tr .subject a"):
         try:
             idx_match = re.search(r"MovieIdx=(\d+)", link.attrib["href"], re.I)
             movie_id = idx_match.group(1)
             title = link.text_content().strip()
         except (IndexError, KeyError, AttributeError):
             self.logger.warning("Markup is not as expected", exc_info=True)
             continue
         movies.append(Movie(id=movie_id, name=title))
     return movies

示例#10

0

显示文件

文件： cgv.py 项目： clee704/WebAlerts

 def get_theaters(self, mid, fid):
     """Return the ``Theater`` entries listed on the "theater_list" page.

     ``mid`` and ``fid`` are passed unchanged to the browser's
     "theater_list" page request.  For each
     ``#areaTheater ul.theaterlist > li.theaterlist > a`` link, the ``href``
     must contain ``'<4 digits>', '<fid>', '<mid>'``; the four digits become
     the theater ID and the link text becomes its name.  Links that do not
     match are logged with a warning and skipped.
     """
     # fid and mid are matched literally: escaping keeps any regex
     # metacharacters in them from changing the pattern.  The pattern does
     # not depend on the link, so it is compiled once before the loop.
     fid_literal = re.escape("{0}".format(fid))
     mid_literal = re.escape("{0}".format(mid))
     href_re = re.compile(
         r"'(\d{4})'\s*,\s*'" + fid_literal + r"'\s*,\s*'" + mid_literal + r"'",
         re.I,
     )

     resp = self._browser.get_page("theater_list", mid=mid, fid=fid)
     doc = parse_html(resp.content)
     items = []
     for a in doc.cssselect("#areaTheater ul.theaterlist > li.theaterlist > a"):
         try:
             # A failed search returns None, so .group() raises the
             # AttributeError handled below.
             theater_id = href_re.search(a.attrib["href"]).group(1)
             name = a.text_content()
         except (IndexError, KeyError, AttributeError):
             self.logger.warning("Markup is not as expected", exc_info=True)
         else:
             items.append(Theater(id=theater_id, name=name))
     return items

示例#11

0

显示文件

文件： clien.py 项目： clee704/WebAlerts

 def _login_required(self, response):
     """Tell whether *response* is the "please log in" redirect page.

     The page is recognised by its first two inline scripts: the first must
     contain ``alert`` followed by the Korean word for "login", and the
     second must contain ``location.replace``.

     Returns a truthy value (the match object of the second search) when
     both conditions hold, and a falsy value (``False`` or ``None``)
     otherwise.
     """
     doc = parse_html(response.content)
     scripts = doc.xpath('//script/text()')
     # u'' literals replace the original ur'' ones, which are a syntax error
     # on Python 3; the pattern text itself is unchanged.
     return (len(scripts) >= 2 and
             re.search(u'alert.*로그인', scripts[0]) and
             re.search(u'location\\.replace', scripts[1]))

示例#12

0

显示文件

文件： cgv.py 项目： clee704/WebAlerts

 def _login_required(self, resp):
     """Return the ``.member_login`` elements found in *resp*.

     The result of ``cssselect`` is returned as-is; callers presumably treat
     a non-empty result as "login required" -- verify against the caller.
     """
     login_nodes = parse_html(resp.content).cssselect(".member_login")
     return login_nodes