def parse_movie_info(self, page, page_url):
    """Fetch one listing page and crawl every movie detail page it links to.

    The page markup is scanned line by line. Each line mentioning ``ulink``
    is parsed on its own with BeautifulSoup and the ``href`` of its first
    anchor is collected. The collected links are then prefixed with the part
    of ``page_url`` that precedes ``/html`` and handed to
    ``self.parse_movie_detail_info``.

    Any exception is logged together with ``page`` and ``page_url`` and is
    not propagated.

    Args:
        page: Page identifier; only used in the failure log message.
        page_url: URL of the listing page to download.
    """
    try:
        content = self.random_delay_get(page_url).content

        # Split the payload in memory instead of round-tripping it through a
        # temp file (which leaked whenever parsing raised). Separator and
        # marker follow the payload type so byte and text bodies both work.
        is_bytes = isinstance(content, bytes)
        newline = b"\n" if is_bytes else "\n"
        marker = b"ulink" if is_bytes else "ulink"

        movie_urls = []
        for line in content.split(newline):
            line = line.strip()
            if marker not in line:
                continue
            dom = BeautifulSoup(line, "lxml")
            anchor = dom.body.a if dom.body is not None else None
            href = anchor.get("href") if anchor is not None else None
            if href is None:
                # One malformed line should not abort the whole page.
                log.warning("no movie link found in line: %r", line)
                continue
            movie_urls.append(href)

        url_pre = page_url.split("/html", 1)[0]
        log.info("url_pre: %s", url_pre)
        for movie_url in movie_urls:
            self.parse_movie_detail_info("".join([url_pre, movie_url]))
    except Exception:
        log.error(
            "parse_movie_info failed! page: %s, page_url: %s",
            page,
            page_url,
            exc_info=True,
        )
def parse_movie_list(self, movie_type, url):
    """Crawl every paginated listing page that belongs to one menu entry.

    A ``url`` that does not start with ``http`` is prefixed with
    ``self.base_url``. The landing page is downloaded, its page selector is
    decoded by ``self._parse_list`` and each resulting page URL (resolved
    against the directory of ``url``) is passed to ``self.parse_movie_info``.

    Any exception is logged with ``movie_type`` and ``url`` and is not
    propagated.

    Args:
        movie_type: Label of the menu entry; only used in the failure log.
        url: Absolute or site-relative URL of the category landing page.
    """
    try:
        is_absolute = url.startswith("http")
        if not is_absolute:
            url = self.base_url + url

        # Everything up to the last "/" is the directory the pages live in.
        url_pre = url.rsplit("/", 1)[0]
        log.info("url_pre: %s", url_pre)

        response = self.random_delay_get(url)
        for page_no, relative_url in self._parse_list(response.content):
            self.parse_movie_info(page_no, url_pre + "/" + relative_url)
    except Exception:
        log.error(
            "parse_movie_list failed! movie_type: %s, url: %s",
            movie_type,
            url,
            exc_info=True,
        )
def _parse_list(self, content):
    """Extract ``[page, url]`` pairs from the ``option`` lines of a page.

    The markup is scanned line by line. Each line mentioning ``option`` is
    parsed on its own with BeautifulSoup: the parsed text is converted to an
    int page number and the ``value`` attribute of the ``<option>`` tag is
    taken as the page URL.

    Args:
        content: Markup of a listing page, as bytes or text.

    Returns:
        list: ``[page, url]`` lists in document order. Lines that cannot be
        decoded (non-numeric text, no ``<option>`` tag, no ``value``
        attribute, ...) are skipped.
    """
    # Split the payload in memory; the previous temp-file round trip added
    # disk I/O and left the file behind when writing or reading failed.
    # Separator and marker follow the payload type so byte and text bodies
    # both work.
    is_bytes = isinstance(content, bytes)
    newline = b"\n" if is_bytes else "\n"
    marker = b"option" if is_bytes else "option"

    page_urls = []
    for line in content.split(newline):
        line = line.strip()
        if marker not in line:
            continue
        try:
            tag_option = BeautifulSoup(line, "lxml")
            page = int(tag_option.text)
            url = tag_option.body.option["value"]
        except Exception:
            # Best effort: many "option" lines are not page entries. Unlike a
            # bare ``except``, this lets KeyboardInterrupt/SystemExit through.
            log.debug("skip unparsable option line: %r", line)
            continue
        log.info("page: %s, url: %s", page, url)
        page_urls.append([page, url])
    return page_urls
def parse_index(self):
    """Crawl the site starting from the menu on the index page.

    Downloads ``self.base_url`` through ``self.sess``, collects the anchors
    inside the ``<div id="menu">`` element into a ``{text: href}`` mapping
    and calls ``self.parse_movie_list`` for every entry whose href is not
    ``"#"`` or ``"index.html"``.

    If the menu element is missing, an error is logged and nothing is
    crawled.
    """
    index_content = self.sess.get(self.base_url).content
    dom = BeautifulSoup(index_content, "lxml")

    menu = dom.find("div", {"id": "menu"})
    if menu is None:
        log.error("parse_index failed! no menu div found at: %s", self.base_url)
        return

    menu_urls = {}
    for tag_a in menu.find_all("a"):
        href = tag_a.get("href")
        if href is None:
            # Anchors without a target cannot be crawled.
            continue
        menu_urls[tag_a.text] = href
        log.info("movie_type: %s, url: %s", tag_a.text, href)
    log.info(menu_urls)

    # ``items()`` works on both Python 2 and 3 (``iteritems`` is 2-only).
    for movie_type, url in menu_urls.items():
        if url in ("#", "index.html"):
            # ``url`` is a plain string here; the previous ``url.text`` /
            # ``url["href"]`` lookups raised and aborted the whole crawl.
            log.info("ignore movie_type: %s, url: %s", movie_type, url)
            continue
        self.parse_movie_list(movie_type, url)