def get_post_ids(self, board_id, page_no):
    """
    Returns the post IDs of one board page, in the order they appear in
    the markup, which is assumed to be descending order (newest first).

    Rows whose last cell reads '-' are treated as deleted posts and are
    skipped. Rows without any cell, or whose first cell is not an integer,
    are logged as warnings and skipped.
    """
    page = self._browser.get_page('board', board_id=board_id, page_no=page_no)
    tree = parse_html(page.content)

    collected = []
    for row in tree.cssselect('tr.mytr'):
        cells = row.findall('td')
        if not cells:
            self.logger.warning('no td elements in tr.mytr')
            continue

        # Deleted post
        if cells[-1].text_content() == '-':
            continue

        raw_id = cells[0].text_content()
        try:
            collected.append(int(raw_id))
        except ValueError:
            self.logger.warning('Post ID is not an integer: %s', raw_id)
    return collected
def get_times(self, mid, fid, tid, date): resp = self._browser.get_page("time_list", mid=mid, fid=fid, tid=tid, date=date) doc = parse_html(resp.content) items = [] for a in doc.cssselect("ul.time-list > li.time-list > a"): try: href = a.attrib["href"] id = re.search(r"ScreenCd=(\d+)", href, re.I).group(1) play_no = int(re.search(r"PlayNum=(\d+)", href, re.I).group(1)) m = re.search( ur""" (?P<time>\d{2}:\d{2}) \s*-\s* (?P<screen_name>[^/]+)/(?P<play_no>\d+)회 \s* \(좌석:(?P<current_seats>\d+)/(?P<total_seats>\d+)\) """, a.text_content(), re.VERBOSE | re.I, ) screen_name = m.group("screen_name") time = m.group("time") current_seats = int(m.group("current_seats")) total_seats = int(m.group("total_seats")) except (IndexError, KeyError, AttributeError, ValueError): self.logger.warning("Markup is not as expected", exc_info=True) else: items.append(PlayState(Play(id, play_no, screen_name, time), current_seats, total_seats)) return items
def get_post(self, board_id, post_id):
    """
    Fetches one post page and returns it as a Post.

    Returns None when an inline script on the page carries both the
    "invalid access" message and a history.go(-1) call (treated as a
    deleted post), or when the expected markup cannot be found (a warning
    is logged in that case).
    """
    page = self._browser.get_page('post', board_id=board_id, post_id=post_id)
    tree = parse_html(page.content)

    is_deleted = any(
        u'잘못된접근입니다' in re.sub(r'\s', '', text) and 'history.go(-1)' in text
        for text in tree.xpath('//script/text()')
    )
    if is_deleted:
        # post is deleted
        return None

    try:
        title = tree.cssselect('.board_main > .view_title h4')[0].text_content()

        body_node = tree.cssselect('.board_main > .view_content #writeContents')[0]
        content = body_node.text_content()
        raw_html = lxml.html.tostring(body_node, encoding='utf-8')
        content_html = raw_html.decode('utf-8', 'replace')

        # author info is only shown when logged in
        author = author_id = None
        author_links = tree.cssselect('.board_main > .view_head > p.user_info > a[onclick]')
        if author_links:
            onclick = author_links[0].attrib['onclick']
            author, author_id = self._get_author_info(onclick)

        post_info = tree.cssselect('.board_main > .view_head > p.post_info')[0]
        published = self._get_published_time(post_info.text_content())
    except (IndexError, AttributeError):
        self.logger.warning('Markup is not as expected', exc_info=True)
        return None

    return Post(page.url, title, content, content_html, author, author_id, published)
def _login(self, username, password):
    """
    Posts the credentials to the login path, then re-fetches the base page.

    Raises LoginError when the re-fetched page has no '#account .uid'
    element.
    """
    credentials = {'mb_id': username, 'mb_password': password, 'url': '/'}
    login_url = ClienBrowser.BASE_URL + ClienBrowser.LOGIN_PATH
    self._request(login_url, method='post', data=credentials)

    home = self._request(ClienBrowser.BASE_URL + '/')
    if not parse_html(home.content).cssselect('#account .uid'):
        raise LoginError()
def _login(self, username, password):
    """
    Logs in by submitting the login form.

    The login page is fetched first so that the hidden __VIEWSTATE and
    __EVENTVALIDATION field values can be posted back together with the
    credentials.

    Raises LoginError when either hidden field (or its "value" attribute)
    is missing from the login page, or when the response to the form post
    has no body element.
    """
    login_url = CGVBrowser.BASE_URL + CGVBrowser.LOGIN_PATH
    login_page = self._request(login_url)
    doc = parse_html(login_page.content)
    try:
        viewstate = doc.cssselect('[name="__VIEWSTATE"]')[0].attrib["value"]
        eventvalidation = doc.cssselect('[name="__EVENTVALIDATION"]')[0].attrib["value"]
    except (IndexError, KeyError, AttributeError):
        # IndexError: field not found. KeyError: field found but it has no
        # "value" attribute. Either way the form cannot be submitted.
        raise LoginError("Cannot find necessary tokens")

    data = {
        "__VIEWSTATE": viewstate,
        "__EVENTVALIDATION": eventvalidation,
        "Login$tbUserID": username,
        "Login$tbPassword": password,
        # Click coordinates sent for the image submit button.
        "Login$ibLogin.x": 45,
        "Login$ibLogin.y": 37,
    }
    resp = self._request(login_url, method="post", data=data)
    doc = parse_html(resp.content)
    # NOTE(review): this only checks that the response has a <body>; confirm
    # that this really distinguishes a successful login from a failed one.
    if not doc.cssselect("body"):
        raise LoginError()
def get_seats(self, tid, date, sid, play_no):
    """
    Returns the "seatname" attribute of every '#seat_table td.pointer'
    cell on the "seat_list" page.

    Cells without that attribute are logged as warnings and skipped.
    """
    page = self._browser.get_page("seat_list", tid=tid, date=date, sid=sid, play_no=play_no)
    tree = parse_html(page.content)

    seat_names = []
    for cell in tree.cssselect("#seat_table td.pointer"):
        try:
            seat_name = cell.attrib["seatname"]
        except KeyError:
            self.logger.warning("Markup is not as expected", exc_info=True)
            continue
        seat_names.append(seat_name)
    return seat_names
def get_dates(self, mid, fid, tid):
    """
    Returns the 8-digit PlayYmd values found in the links of the
    non-disabled cells of the "date_list" page's month table.

    Links without a matching href are logged as warnings and skipped.
    """
    page = self._browser.get_page("date_list", mid=mid, fid=fid, tid=tid)
    tree = parse_html(page.content)

    dates = []
    for anchor in tree.cssselect("table.month td:not(.disabled) a"):
        try:
            found = re.search(r"PlayYmd=(\d{8})", anchor.attrib["href"], re.I)
            play_date = found.group(1)
        except (IndexError, KeyError, AttributeError):
            self.logger.warning("Markup is not as expected", exc_info=True)
            continue
        dates.append(play_date)
    return dates
def get_formats(self, mid):
    """
    Returns a list of Format objects parsed from the "format_list" page.

    The id is the CgvCode number in each link's href; the name is the
    first parenthesised part of the link text. Links that do not match
    are logged as warnings and skipped.
    """
    page = self._browser.get_page("format_list", mid=mid)
    tree = parse_html(page.content)

    formats = []
    for anchor in tree.cssselect("ul.movielist2 > li.movielist2 > a"):
        try:
            format_id = re.search(r"CgvCode=(\d+)", anchor.attrib["href"], re.I).group(1)
            label = anchor.text_content().strip()
            format_name = re.search(r"\(([^)]+)\)", label, re.I).group(1)
        except (IndexError, KeyError, AttributeError):
            self.logger.warning("Markup is not as expected", exc_info=True)
            continue
        formats.append(Format(id=format_id, name=format_name))
    return formats
def get_movies(self):
    """
    Returns a list of Movie objects parsed from the "movie_list" page.

    The id is the MovieIdx number in each subject link's href; the name is
    the stripped link text. Links that do not match are logged as warnings
    and skipped.
    """
    page = self._browser.get_page("movie_list")
    tree = parse_html(page.content)

    movies = []
    for anchor in tree.cssselect("table.list tbody tr .subject a"):
        try:
            movie_id = re.search(r"MovieIdx=(\d+)", anchor.attrib["href"], re.I).group(1)
            movie_name = anchor.text_content().strip()
        except (IndexError, KeyError, AttributeError):
            self.logger.warning("Markup is not as expected", exc_info=True)
            continue
        movies.append(Movie(id=movie_id, name=movie_name))
    return movies
def get_theaters(self, mid, fid):
    """
    Returns a list of Theater objects parsed from the "theater_list" page.

    Each theater link's href is expected to contain three quoted,
    comma-separated arguments: a 4-digit theater id followed by fid and
    mid. The theater id is taken from the first argument and the name from
    the link text. Links that do not match are logged as warnings and
    skipped.
    """
    resp = self._browser.get_page("theater_list", mid=mid, fid=fid)
    doc = parse_html(resp.content)

    # The pattern does not depend on the link, so it is built once. The
    # IDs are escaped so that they are always matched literally, even if
    # they ever contain regex metacharacters.
    pattern = re.compile(
        r"'(\d{{4}})'\s*,\s*'{0}'\s*,\s*'{1}'".format(
            re.escape("{0}".format(fid)),
            re.escape("{0}".format(mid)),
        ),
        re.I,
    )

    items = []
    for a in doc.cssselect("#areaTheater ul.theaterlist > li.theaterlist > a"):
        try:
            theater_id = pattern.search(a.attrib["href"]).group(1)
            name = a.text_content()
        except (IndexError, KeyError, AttributeError):
            self.logger.warning("Markup is not as expected", exc_info=True)
        else:
            items.append(Theater(id=theater_id, name=name))
    return items
def _login_required(self, response): doc = parse_html(response.content) scripts = doc.xpath('//script/text()') return (len(scripts) >= 2 and re.search(ur'alert.*로그인', scripts[0]) and re.search(ur'location\.replace', scripts[1]))
def _login_required(self, resp):
    """
    Returns the '.member_login' elements found in the response.

    The list is non-empty (truthy) when such an element is present and
    empty (falsy) otherwise.
    """
    tree = parse_html(resp.content)
    login_markers = tree.cssselect(".member_login")
    return login_markers