def _search_movie(self, title, results, genre_id, search_type, start_year, end_year):
    """Yield ``(movieID, title, url)`` for each movie hit matching *title*.

    Optional filters (``genre_id``, ``start_year``, ``end_year``) are only
    added to the query string when supplied.  ``results`` is accepted for
    interface compatibility but not used here; only the first result page
    is scraped.
    """
    grabber = HTMLGrabber()
    query = {"q": title.encode("utf-8"), "page": 1}
    # Optional search filters: only sent when the caller supplied them.
    if genre_id:
        query['genreIds'] = genre_id
    if start_year:
        query['startYear'] = start_year
    if end_year:
        query['endYear'] = end_year
    type_path = "/" + search_type if search_type else ""
    full_url = filmweb_search_blank + type_path + "?" + urlencode(query)
    content = grabber.retrieve(full_url)
    # @Make search more pages not only 1
    soup = BeautifulSoup(content)
    desc_boxes = soup.findAll('div', {'class': 'hitDescWrapper'})
    image_boxes = soup.findAll('div', {'class': 'hitImage'})
    for idx, desc in enumerate(desc_boxes):
        anchor = desc.find('a', {'class': 'hdr hdr-medium hitTitle'})
        hit_title = anchor.text
        hit_url = anchor['href']
        # have to do another check because sometimes url doesnt provide movieID
        image_anchor = image_boxes[idx].find('a')
        if image_anchor is not None:
            img = image_anchor.find("img")
            yield get_real_id(hit_url, img['src']), hit_title, hit_url
def parse_episodes(self):
    """Scrape the title's ``/episodes`` page and return a list of episodes.

    Returns a list of dicts with keys ``'season'`` (int), ``'number'``
    (int), ``'date'`` (``datetime.datetime``) and ``'name'`` (str).
    Parsing stops at the first season heading that cannot be parsed,
    mirroring the original best-effort behaviour.
    """
    grabber = HTMLGrabber()
    content = grabber.retrieve(self.obj.url + "/episodes")
    soup = BeautifulSoup(content)
    seasons = soup.find('dl', {'class': 'episodesTable'})
    episodes_list = []
    # Initialised defensively so a malformed page (a <dd> before any <dt>)
    # cannot raise NameError below.
    season_number = None
    for element in seasons.children:
        if element.name == 'dt':
            # The <dt> holds a heading like "Sezon N"; take the number.
            h3 = element.next_element
            try:
                season_number = int(h3.text.split(" ")[1])
            except (AttributeError, IndexError, ValueError):
                # BUG FIX: was a bare ``except:`` which also swallowed
                # SystemExit/KeyboardInterrupt; catch only parse failures.
                break
        elif element.name == 'dd':
            for episode in element.find_all("li"):
                episode_number = int(
                    re.match(r'\d+', episode.contents[0].text).group())
                date_str = episode.find('div', {
                    'class': 'countryPremiereDate'
                }).text
                episode_name = episode.find('div', {'class': 'title'}).text
                episode_date = datetime.datetime.strptime(
                    date_str, '%d.%m.%Y')
                episodes_list.append({
                    'season': season_number,
                    'number': episode_number,
                    'date': episode_date,
                    'name': episode_name
                })
    return episodes_list
def parse_cast(self):
    """Scrape the actors and crew cast pages and return ``Person`` objects.

    The person id is taken from the profile link when possible; otherwise
    it is recovered from the thumbnail image URL.  Returns a flat list of
    ``Person`` instances for both ``/cast/actors`` and ``/cast/crew``.
    """
    # backward compatibility: map old section headings to role names.
    # Hoisted out of the loop — these are constants.
    person_types_change = {
        'obsada': 'aktor',
        'scenariusz': 'scenarzysta',
        'produkcja': 'producent'
    }
    pattern_images = [
        "http://1.fwcdn.pl/p/([0-9]{2})/([0-9]{2})/(?P<id>[0-9]*)/([0-9]*).([0-3]*).jpg",
        "http://1.fwcdn.pl/p/([0-9]{2})/([0-9]{2})/(?P<id>[0-9]*)/([0-9]*)_1.([0-3]*).jpg"
    ]
    pattern_link = "/person/(.+)-(?P<id>[0-9]*)"
    personList = []
    for path in ["/cast/actors", "/cast/crew"]:
        grabber = HTMLGrabber()
        content = grabber.retrieve(self.obj.url + path)
        soup = BeautifulSoup(content)
        for filmCastBox in soup.findAll("div", {'class': 'filmCastBox'}):
            personType = filmCastBox.previous
            # BUG FIX: the mapping assignment was commented out, so the
            # membership test had no effect; apply the mapping again.
            if personType in person_types_change:
                personType = person_types_change[personType]
            for cast in filmCastBox.findAll('tr', id=re.compile("role_")):
                url_html = cast.find("a", {'class': 'pImg49'})
                url = url_html['href']
                img_html = url_html.find("img")
                # Prefer the id embedded in the profile link; fall back to
                # the ids hidden in the thumbnail image URL.
                # (renamed from ``id`` — avoid shadowing the builtin)
                person_id = 0
                match = re.search(pattern_link, url_html['href'])
                if match:
                    person_id = match.group("id")
                else:
                    for pattern in pattern_images:
                        match = re.search(pattern, repr(img_html.extract()))
                        if match:
                            person_id = match.group("id")
                role_html = cast.find('a', {'rel': 'v:starring'})
                role = role_html.parent.nextSibling.nextSibling.text
                name = role_html.parent.nextSibling.text
                personList.append(
                    Person(person_id,
                           title=name,
                           roleType=personType,
                           roleName=role,
                           url=url))
    return personList
def get_list_genres():
    """Scrape the movie-search form and return the available genres.

    Returns a list of ``{'genre_id': ..., 'genre_name': ...}`` dicts, one
    per ``genreIds`` checkbox on the search page.
    """
    html = HTMLGrabber().retrieve(filmweb_search_blank + "/film")
    soup = BeautifulSoup(html)
    # The genre label lives two text nodes after each checkbox input.
    return [{
        'genre_id': checkbox.attrs['value'],
        'genre_name': checkbox.next_element.next_element.text
    } for checkbox in soup.findAll('input', {'name': 'genreIds'})]
def parse_posters(self):
    """Scrape the title's ``/posters`` page and return poster image dicts.

    Each dict carries ``'thumb'`` (the thumbnail URL as served) and
    ``'href'`` (the same URL rewritten to the larger ``.3.jpg`` variant).
    """
    grabber = HTMLGrabber()
    soup = BeautifulSoup(grabber.retrieve(self.obj.url + "/posters"))
    poster_list = soup.find("ul", 'block-list postersList')
    return [{
        'href': img['src'].replace(".2.jpg", '.3.jpg'),
        'thumb': img['src']
    } for img in poster_list("img", {'class': "lbProxy"})]
def parse_photos(self):
    """Scrape the title's ``/photos`` page and return a list of photo dicts.

    Each dict has ``'thumb'`` (the ``<img>`` src), ``'href'`` (the target
    of the surrounding anchor) and ``'image'`` (the anchor's
    ``data-photo`` attribute).
    """
    grabber = HTMLGrabber()
    content = grabber.retrieve(self.obj.url + "/photos")
    soup = BeautifulSoup(content)
    # BUG FIX: the attrs argument was the *set* {'class', 'photosList'}
    # (comma instead of colon), not a dict, so the class filter was not
    # applied as intended.
    photos_list = soup.find("ul", {'class': 'photosList'})
    images = []
    for photo in photos_list.findAll("img"):
        images.append({
            'href': photo.parent['href'],
            'thumb': photo['src'],
            'image': photo.parent['data-photo']
        })
    return images
def _search_person(self, title, results=20):
    # http://www.filmweb.pl/search/person?q=Tom+Cruise
    """Yield ``(personID, title, url)`` for each person hit matching *title*.

    ``results`` is accepted for interface compatibility but not used;
    only the first result page is scraped.
    """
    grabber = HTMLGrabber()
    encoded_title = grabber.encode_string(title)
    content = grabber.retrieve(filmweb_person_search % (encoded_title, 1))
    #@Make search more pages not only 1
    soup = BeautifulSoup(content)
    desc_boxes = soup.findAll('div', {'class': 'hitDescWrapper'})
    image_boxes = soup.findAll('div', {'class': 'hitImage'})
    for idx, desc in enumerate(desc_boxes):
        anchor = desc.find('a', {'class': 'hdr hdr-medium hitTitle'})
        hit_title = anchor.text
        hit_url = anchor['href']
        # have to do another check because sometimes url doesnt provide movieID
        image_anchor = image_boxes[idx].find('a')
        if image_anchor is not None:
            img = image_anchor.find('img')
            yield get_real_id(hit_url, img['src']), hit_title, hit_url
def _download_content(self, url):
    """Fetch *url* and cache both the raw HTML and its parsed soup.

    Side effects: populates ``self._content`` (raw page text) and
    ``self._soup`` (its ``BeautifulSoup`` parse).
    """
    from filmweb.parser.HTMLGrabber import HTMLGrabber
    grabber = HTMLGrabber()
    self._content = grabber.retrieve(url)
    # BUG FIX: was ``BeautifulSoup(self.content)`` — parse the freshly
    # downloaded ``self._content`` directly instead of depending on a
    # ``content`` property existing on the instance.
    self._soup = BeautifulSoup(self._content)