def get_movie_information(link):
    """Scrape the movie page at *link* and return its details.

    Returns a dict with the keys ``title``, ``poster``, ``actors``,
    ``format``, ``release_date``, ``studio``, ``set`` and ``plot``.
    ``poster`` is ``None`` when the page has no ``dtMainPic`` element, and
    every field read from the information table is ``None`` when its row
    is absent from the page.

    Raises:
        AttributeError: if the page lacks an ``<h1>``, the ``dtSpecR``
            table or the ``dtSpec`` paragraph.
    """
    page = requests.get(link)
    soup = BeautifulSoup(page.text, "html.parser")

    try:
        title = clean_title(
            soup.find("div", {"id": "con20"}).find("h1").get_text())
    except AttributeError:
        # No "con20" container (or no <h1> in it): use the first <h1>.
        title = clean_title(soup.find("h1").get_text())

    try:
        poster = soup.find(class_="dtMainPic").get('href')
    except AttributeError:
        # No poster exists
        poster = None

    # Header text of a table row -> key of the result field it fills.
    text_fields = {
        "メディア": "format",
        "発売日": "release_date",
        "メーカー": "studio",
        "シリーズ": "set",
    }
    # Default every table-driven field so a page that omits a row yields
    # None instead of an UnboundLocalError when the result is built.
    details = {
        "actors": None,
        "format": None,
        "release_date": None,
        "studio": None,
        "set": None,
    }

    info_rows = soup.find("div", attrs={"id": "dtSpecR"}) \
        .find("table").find_all("tr")
    for row in info_rows:
        header_cell = row.find("th")
        value_cell = row.find("td")
        if header_cell is None or value_cell is None:
            # Row without a header/value pair carries nothing to read.
            continue
        header = header_cell.get_text()
        if header == "出演者":
            # The actors row lists one link per actor.
            details["actors"] = [
                actor.get_text() for actor in value_cell.find_all("a")
            ]
        elif header in text_fields:
            details[text_fields[header]] = \
                value_cell.get_text().replace("\n", "")

    plot = soup.find("div", attrs={"id": "dtSpec"}).find("p").get_text()
    # TODO: Refactor the synopsis cleaning
    if plot.startswith("\r\n"):
        plot = plot[2:]
    if plot.endswith("\r\n"):
        plot = plot[:-2]

    return {
        "title": title,
        "poster": poster,
        "actors": details["actors"],
        "format": details["format"],
        "release_date": details["release_date"],
        "studio": details["studio"],
        "set": details["set"],
        "plot": plot
    }
def _search_eic_av(self, title):
    """Search for *title* and append every hit to ``self.results``.

    Each appended entry is a dict with the keys ``title`` (first line of
    the result's ``<h2>`` text) and ``link`` (``EIC_AV_BASE_URL`` plus the
    ``href`` of the result's first anchor).

    When ``self._has_multiple_formats(title)`` is true, every first-pass
    result is searched again by its cleaned title and the hits of those
    second searches are recorded instead of the first-pass results.
    """
    def fetch_results(query):
        # Run one search and return its result containers.  The response
        # body (.text) is parsed: BeautifulSoup cannot parse a Response
        # object directly.
        page = requests.get(
            self.EIC_AV_SEARCH_URL + urllib.parse.quote(query))
        soup = BeautifulSoup(page.text, "html.parser")
        return soup.find_all("div", class_="list")

    def heading(result):
        # Only the first line of the <h2> text is used as the title.
        return result.find("h2").get_text().split('\n', 1)[0]

    def record(result):
        link = self.EIC_AV_BASE_URL + result.find("a").get('href')
        self.results.append({"title": heading(result), "link": link})

    result_list = fetch_results(title)
    if self._has_multiple_formats(title):
        for result in result_list:
            # TODO: Clean titles in a more generic fashion
            _title = clean_title(heading(result))
            for _result in fetch_results(_title):
                record(_result)
    else:
        for result in result_list:
            record(result)
def test_clean_title():
    """Check clean_title against a table of raw/expected title pairs.

    Covers bracketed format tags, the "版" format suffixes, "【...】" tags
    and a plain title that must come back unchanged.
    """
    expectations = (
        ("test [DVD]", "test"),
        ("test [Blu-ray]", "test"),
        ("test space", "test space"),
        ("test Blu-ray版", "test"),
        ("test DVD版", "test"),
        ("test 【特価】", "test"),
        ("test【10%POINTBACK】", "test"),
    )
    for raw_title, expected in expectations:
        assert utils.clean_title(raw_title) == expected
def get_movie_information(link):
    """Scrape the movie page at *link* and return its details.

    Returns a dict with the keys ``title``, ``poster``, ``actors``,
    ``format``, ``release_date``, ``studio``, ``set`` and ``plot``.
    ``set`` is always ``None``; any other field that is empty or missing
    from the page is ``None`` as well.

    Raises:
        AttributeError: if the page lacks an ``<h1>``, the ``description``
            block, or the ``pro_info`` / ``idol_info`` tables.
    """
    # Parse inside a context manager so the HTTP response is closed.
    with urllib.request.urlopen(link) as response:
        soup = BeautifulSoup(response, "html.parser")

    # The cleaned <h1> text is split on " | "; the first two parts form
    # the title and the third is the format.
    title_parts = clean_title(soup.find("h1").get_text()).split(' | ')
    if len(title_parts) >= 3:
        title = title_parts[0] + " " + title_parts[1]
        movie_format = title_parts[2]
    else:
        # Heading without all three parts: keep what is there as the
        # title rather than failing with an IndexError.
        title = " ".join(title_parts)
        movie_format = None

    try:
        poster = soup.find(class_="p_image").find("img").get('src')
    except AttributeError:
        # No poster exists
        poster = None

    plot = soup.find("div", class_="description").get_text()
    # TODO: Refactor the synopsis cleaning
    if plot.startswith("\r\n"):
        plot = plot[2:]
    if plot.endswith("\r\n"):
        plot = plot[:-2]

    # Header text of a production-info row -> key of the field it fills.
    # Defaults keep a missing row from raising UnboundLocalError below.
    production_fields = {"メーカー": "studio", "発売日": "release_date"}
    production = {"studio": None, "release_date": None}
    production_rows = soup.find("table", class_="pro_info").find_all("tr")
    for row in production_rows:
        header_cell = row.find("th")
        value_cell = row.find("td")
        if header_cell is None or value_cell is None:
            # Row without a header/value pair carries nothing to read.
            continue
        field = production_fields.get(header_cell.get_text())
        if field is not None:
            production[field] = value_cell.get_text().replace("\n", "")

    idol_rows = soup.find("table", class_="idol_info").find_all("tr")
    # The first row is skipped (presumably a header row -- confirm
    # against the page markup).
    actors = [row.find("td").get_text() for row in idol_rows[1:]]

    return {
        "title": title or None,
        "poster": poster or None,
        "actors": actors or None,
        "format": movie_format or None,
        "release_date": production["release_date"] or None,
        "studio": production["studio"] or None,
        "set": None,
        "plot": plot or None
    }