Пример #1
0
    def get_movie_information(link):
        """Scrape the movie page for information.

        Downloads ``link`` and extracts the title, poster link, the rows of
        the specification table and the synopsis.

        Args:
            link: URL of the movie page to scrape.

        Returns:
            A dict with the keys ``title``, ``poster``, ``actors``,
            ``format``, ``release_date``, ``studio``, ``set`` and ``plot``.
            ``poster`` is ``None`` when the page has no poster element.
            ``actors`` is an empty list and the other table-derived values
            are ``None`` when the matching table row is absent.

        Raises:
            AttributeError: If the page has no ``<h1>``, no ``dtSpecR``
                table or no ``dtSpec`` paragraph.
        """
        page = requests.get(link)
        soup = BeautifulSoup(page.text, "html.parser")

        try:
            title = clean_title(
                soup.find("div", {"id": "con20"}).find("h1").get_text())
        except AttributeError:
            # No "con20" container: fall back to the first <h1> of the page.
            title = clean_title(soup.find("h1").get_text())

        try:
            poster = soup.find(class_="dtMainPic").get('href')
        except AttributeError:
            # No poster exists
            poster = None

        # Table header text -> key of the returned dict for plain-text cells.
        text_fields = {
            "メディア": "format",
            "発売日": "release_date",
            "メーカー": "studio",
            "シリーズ": "set",
        }
        # Defaults so that a missing table row no longer leaves a value
        # undefined (which used to raise UnboundLocalError at the return).
        details = {
            "actors": [],
            "format": None,
            "release_date": None,
            "studio": None,
            "set": None,
        }

        rows = soup.find("div", attrs={"id": "dtSpecR"}).find("table") \
            .find_all("tr")
        for row in rows:
            header_cell = row.find("th")
            value_cell = row.find("td")
            if header_cell is None or value_cell is None:
                # Not a header/value row; nothing to extract.
                continue
            header = header_cell.get_text()
            if header == "出演者":
                # Actors are listed as one link per person.
                details["actors"] = [
                    anchor.get_text() for anchor in value_cell.find_all("a")
                ]
            elif header in text_fields:
                details[text_fields[header]] = \
                    value_cell.get_text().replace("\n", "")

        plot = soup.find("div", attrs={"id": "dtSpec"}).find("p").get_text()
        # TODO: Refactor the synopsis cleaning
        # Drop a single leading and a single trailing CRLF.
        if plot.startswith("\r\n"):
            plot = plot[2:]
        if plot.endswith("\r\n"):
            plot = plot[:-2]

        return {
            "title": title,
            "poster": poster,
            "actors": details["actors"],
            "format": details["format"],
            "release_date": details["release_date"],
            "studio": details["studio"],
            "set": details["set"],
            "plot": plot
        }
Пример #2
0
 def _search_eic_av(self, title):
     """Search EIC-AV for ``title`` and append the hits to ``self.results``.

     Every ``<div class="list">`` of the search page is one hit. A hit is
     recorded as ``{"title": ..., "link": ...}``: the title is the first
     line of its ``<h2>`` text and the link is ``self.EIC_AV_BASE_URL``
     plus the ``href`` of its first ``<a>``.

     When ``self._has_multiple_formats(title)`` is true, the hits of the
     first search are not recorded directly. Instead each hit's heading is
     passed through ``clean_title`` and searched again, and the hits of
     those follow-up searches are recorded.

     Args:
         title: Title to search for; it is URL-quoted before being
             appended to ``self.EIC_AV_SEARCH_URL``.
     """
     def fetch_hits(query):
         # Download one search page and return its result containers.
         page = requests.get(
             self.EIC_AV_SEARCH_URL + urllib.parse.quote(query))
         # BeautifulSoup must be given the markup text; the follow-up
         # search used to pass the Response object itself.
         listing = BeautifulSoup(page.text, "html.parser")
         return listing.find_all("div", class_="list")

     def record(hit):
         # Append one result container to self.results.
         result_title = hit.find("h2").get_text().split('\n', 1)[0]
         link = self.EIC_AV_BASE_URL + hit.find("a").get('href')
         self.results.append({"title": result_title, "link": link})

     result_list = fetch_hits(title)
     if not self._has_multiple_formats(title):
         for result in result_list:
             record(result)
         return

     # NOTE(review): hits that clean to the same title are searched again,
     # so entries may repeat in self.results - confirm this is intended.
     for result in result_list:
         # TODO: Clean titles in a more generic fashion
         _title = clean_title(result.find("h2")
                              .get_text()
                              .split('\n', 1)[0])
         for _result in fetch_hits(_title):
             record(_result)
Пример #3
0
def test_clean_title():
    """Check utils.clean_title against known raw/expected title pairs."""
    # (raw title, expected cleaned title)
    expectations = (
        ("test [DVD]", "test"),
        ("test [Blu-ray]", "test"),
        ("test space", "test space"),
        ("test Blu-ray版", "test"),
        ("test DVD版", "test"),
        ("test 【特価】", "test"),
        ("test【10%POINTBACK】", "test"),
    )
    for raw_title, expected in expectations:
        assert utils.clean_title(raw_title) == expected
Пример #4
0
    def get_movie_information(link):
        """Scrape the movie page for information.

        Opens ``link`` and extracts the title, format, poster link, synopsis,
        production details and the actor list.

        Args:
            link: URL of the movie page to scrape.

        Returns:
            A dict with the keys ``title``, ``poster``, ``actors``,
            ``format``, ``release_date``, ``studio``, ``set`` and ``plot``.
            Empty or missing values are reported as ``None``; ``set`` is
            always ``None``.

        Raises:
            AttributeError: If the page has no ``<h1>``, no ``description``
                block, no ``pro_info`` table or no ``idol_info`` table.
        """
        # Close the HTTP response once the markup has been parsed.
        with urllib.request.urlopen(link) as response:
            soup = BeautifulSoup(response, "html.parser")

        # The cleaned heading is split on " | "; the first two parts form
        # the title and the third is the format.
        title_parts = clean_title(soup.find("h1").get_text()).split(' | ')
        title = " ".join(title_parts[:2])
        # A heading with fewer than three parts has no format component
        # (indexing it unconditionally used to raise IndexError).
        movie_format = title_parts[2] if len(title_parts) > 2 else None

        try:
            poster = soup.find(class_="p_image").find("img").get('src')
        except AttributeError:
            # No poster exists
            poster = None

        plot = soup.find("div", class_="description").get_text()
        # TODO: Refactor the synopsis cleaning
        # Drop a single leading and a single trailing CRLF.
        if plot.startswith("\r\n"):
            plot = plot[2:]
        if plot.endswith("\r\n"):
            plot = plot[:-2]

        # Defaults so that a missing row no longer leaves these names
        # undefined (which used to raise UnboundLocalError at the return).
        maker = None
        release_date = None
        production_rows = soup.find("table", class_="pro_info") \
            .find_all("tr")
        for row in production_rows:
            header_cell = row.find("th")
            value_cell = row.find("td")
            if header_cell is None or value_cell is None:
                # Not a header/value row; nothing to extract.
                continue
            header = header_cell.get_text()
            if header == "メーカー":
                maker = value_cell.get_text().replace("\n", "")
            elif header == "発売日":
                release_date = value_cell.get_text().replace("\n", "")

        idol_rows = soup.find("table", class_="idol_info") \
            .find_all("tr")
        actors = []
        # The first row is skipped; the first cell of each remaining row is
        # taken as an actor entry.
        for row in idol_rows[1:]:
            cell = row.find("td")
            if cell is not None:
                actors.append(cell.get_text())

        return {
            "title": title or None,
            "poster": poster or None,
            "actors": actors or None,
            "format": movie_format or None,
            "release_date": release_date or None,
            "studio": maker or None,
            "set": None,
            "plot": plot or None
        }