def delete(self):
    """ Permanently removes this list from Letterboxd.

    Prompts the user for confirmation first, since deletion is
    irreversible. On success the instance's soup is cleared, so the
    object must not be used afterwards.
    """
    confirmed = util.yn(
        "Are you sure you want to delete the list? This cannot be undone!"
    )
    if not confirmed:
        return
    SESSION.request("POST", self.suburl_delete)
    # Invalidate the instance — the list no longer exists server-side
    self.soup = None
def new(cls, name, **kwargs):
    """ :: Alternative Constructor ::

    Creates a brand-new list on Letterboxd first, then returns an
    instance the regular way by calling the normal constructor — which
    expects the list to already exist (by that point, it does).

    Parameters:
    - name (str) - the name of the list

    Optional keyword arguments:
    - tags (list) - e.g. [horror, 1980s]
    - public (bool)
    - ranked (bool)
    - description (str) - e.g. "These are my favourite films"
    - entries (list of dicts) - films in the list and any notes about them
    """
    # Guard: a list must be named with a non-empty string
    if not name or not isinstance(name, str):
        raise TypeError(f"name must be non-empty string, not {name}")

    # Fallback settings used for any option the caller did not supply
    defaults = {
        'tags': [],
        'public': False,
        'ranked': False,
        'description': '',
        'entries': []
    }
    list_data = {option: kwargs.get(option, fallback)
                 for option, fallback in defaults.items()}

    # The name is mandatory; the id is left blank because Letterboxd
    # generates it when the creation request is processed
    list_data['name'] = name
    list_data['list_id'] = ''

    # Convert the collected settings into request-friendly values
    post_data = cls.make_post_data(list_data)

    # Create the list, then build an instance as for any existing list
    SESSION.request("POST", suburl=cls.save_url, data=post_data)
    return cls(name)
def comment_soup(self):
    """ Fetches and returns the soup for the list's existing
    comments section. """
    suburl = f"csi/list/{self._id}/comments-section/?"
    response = SESSION.request("GET", suburl, params={'esiAllowUser': True})
    return make_soup(response)
def __get_rating_soup(self):
    """ The film's rating info lives on a separate page from the main
    film page, so request that page and return its soup.

    r-type: BeautifulSoup
    """
    response = SESSION.request("GET", f"csi/film/{self.path}/rating-histogram/")
    return make_soup(response)
def __get_info_soup(self):
    """ Requests the main film page and returns the soup of its
    page-wrapper div.

    r-type: BeautifulSoup
    """
    soup = make_soup(SESSION.request("GET", self.suburl))
    return soup.find('div', id='film-page-wrapper')
def load(self, *args):
    """ Overload of load from the parent class.

    Scrapes the edit view of the list (rather than the standard list
    view) and stores the resulting soup on the instance.
    """
    edit_suburl = f"{SESSION.username}/list/{self.get_formatted_name()}/edit"
    self.soup = make_soup(SESSION.request("GET", edit_suburl))
def get_film_names(self):
    """ Returns each id in the film list together with the
    corresponding film_name.

    NOTE(review): this definition appears truncated in this view —
    only the single-page branch is visible and nothing is returned yet.
    """
    response = SESSION.request("GET", self.view_list)
    soup = make_soup(response)

    # No pagination widget means the whole list fits on a single page
    if not (page_navigator := soup.find('div', class_='pagination')):
        last_page = 1
def get_blocked():
    """ Returns a list of the users in the session user's block list.

    NOTE: You can only see who *you* have blocked, hence — unlike
    following and followers — there is no username argument here.
    """
    suburl = f"{SESSION.username}/blocked/"
    soup = make_soup(SESSION.request("GET", suburl))
    return __get_people(soup)
def load(self, username):
    """ Populates self.soup from an existing list's page, given the
    username of the list's owner. """
    suburl = f"{username}/list/{self.get_formatted_name()}/"
    # Make request to list url on Letterboxd
    self.soup = make_soup(SESSION.request("GET", suburl))
def get_page_of_film_names(self, page_num):
    """ Scrapes a single page of the list.

    Returns a dict mapping film_id (int) -> film_name (str) for every
    film on the given page.

    Example: {film_id: film_name}
    """
    response = SESSION.request("GET", f"{self.view_list}page/{page_num}/")
    film_list = make_soup(response).find('ul', class_='film-list')

    results = {}
    for item in film_list.find_all('li'):
        film_id = int(item.find('div').get('data-film-id'))
        results[film_id] = item.find('img').get('alt')
    return results
def add_comment(self, comment):
    """ Posts the given comment text to the list. """
    payload = {'comment': comment}
    SESSION.request("POST", self.add_comment_url, data=payload)
def get_comment_text(suburl):
    """ Fetches a comment's page and returns its text body. """
    response = SESSION.request("GET", suburl)
    soup = make_soup(response)
    return soup.get_text()
# NOTE(review): this span begins mid-function — the enclosing `def`
# (apparently a comment-deletion method) is not visible in this view.

# Guard: cannot delete a comment if the list has none
if not (comments := self.comments):
    raise Exception("No comments to delete!")

# comment_id may arrive as str or int; anything else is rejected
if type(comment_id) not in (str, int):
    raise TypeError(
        f"Invalid type for comment_id: {type(comment_id)}. Should be int"
    )

# Normalise to int for the membership check below
if isinstance(comment_id, str):
    comment_id = int(comment_id)

# Ensure the id actually belongs to one of this list's comments
if comment_id not in [i['id'] for i in comments]:
    raise Exception(f"Unable to locate id: {comment_id}")

delete_comment_url = f"ajax/filmListComment:{comment_id}/delete-comment/"

# Make post request to delete comment
SESSION.request("POST", suburl=delete_comment_url)

""" ** Film names ** """

def get_page_of_film_names(self, page_num):
    """ Returns a dictionary
        key: film_id
        value: film_name
    for all the films on that page of the list.

    Example: {film_id: film_name}

    NOTE(review): this definition appears truncated in this view —
    it ends right after building the soup.
    """
    response = SESSION.request("GET", f"{self.view_list}page/{page_num}/")
    soup = make_soup(response)
def get_followers(username=None):
    """ Returns a list of the users a given user is followed by.

    Parameters:
    - username (str, optional) - defaults to the session user.

    NOTE: the default used to be `username=SESSION.username`, which is
    evaluated once at import time — so it went stale if the session's
    user changed (and forced SESSION to be ready at import). Resolving
    the default at call time fixes that while keeping the same
    behaviour for all callers.
    """
    if username is None:
        username = SESSION.username
    request = SESSION.request("GET", f"{username}/followers/")
    soup = make_soup(request)
    return __get_people(soup)
def __call__(self, **kwargs):
    """ Returns a list of film_ids that correspond with the given search
    parameters. If no parameters are given, all film_ids in the
    watched_list will be returned.

    Keyword Arguments:

    rated_only(bool)

    year(str or None):
        Options :-
        - 4 digits e.g. 1975
        - 4 digits + s e.g. 1970s # functions as decade

    genre(str or None):
        Constraints :-
        - must be in genre_list

    service(str or None):
        Constraints :-
        - must be in service_list

    rating(float or None):
        Constraints :-
        - must be in inclusive range (0.5, 5)
        - decimal must be 0.5 or 0, like Letterboxd ratings

    sort_by(str):
        How do you want the results sorted?
        Constraints :-
        - must be in sort_list
        Options :-
        - name
        - popular
        - date-earliest (release date)
        - date-latest
        - rating (average rating)
        - rating-lowest
        - your-rating (session user's rating)
        - your-rating-lowest
        - entry-rating (username's rating)
        - entry-rating-lowest
        - shortest (film length)
        - longest

    filters(list):
        Constraints :-
        - must be in SESSION's filters_dict
        Options :- (updated: 2020-11-20)
        - show-liked OR hide-liked
        - show-logged OR hide-logged
        - show-reviewed OR hide-reviewed
        - show-watchlisted OR hide-watchlisted
        - show-shorts OR hide-shorts
        - hide-docs
        - hide-unreleased

    Example suburl in full:
    - username/films/ratings/year(or decade)/2015/genre/horror/on/amazon-gbr/by/rating
    """
    # Get valid filters for the request
    if 'filters' in kwargs:
        filters = self.get_valid_filters(kwargs.pop('filters'))
    else:
        filters = ''

    # Filters are communicated to Letterboxd via the filmFilter cookie
    requests_jar = requests.cookies.RequestsCookieJar()
    requests_jar.set('filmFilter', filters)

    # Get the suburl for request
    suburl = self.build_suburl(**kwargs)

    film_ids = []
    page_num = 1
    # A full page holds 18 posters, so a partially-filled page means we
    # have just scraped the final page of results.
    # (Leftover debug print of the page number removed.)
    while len(film_ids) % 18 == 0:
        request = SESSION.request("GET", suburl + f"page/{page_num}/", cookies=requests_jar)
        soup = make_soup(request)

        films_on_page = [
            i.find('div').get('data-film-id')
            for i in soup.find_all('li', class_='poster-container')
        ]

        # Edge case: the last page has exactly 18 films. The scraper then
        # requests one page past the end, which is blank — an empty
        # result list is the signal to stop.
        if not films_on_page:
            break

        film_ids += films_on_page
        page_num += 1

    return film_ids
class FilmRaters():
    """ Scrapes the Letterboxd users who gave a given film a particular
    rating (1-10 half-star scale).

    NOTE(review): the class appears truncated at the end of this view —
    see the note at the bottom of __call__.
    """

    ## Max pages Letterboxd allows
    page_limit = 10

    ## Ratings per page
    ratings_per_page = 500

    ## Max results in total
    max_results = page_limit * ratings_per_page

    ## suburls for getting ratings
    suburl_rating_highest = 'ratings/'
    suburl_rating_lowest = 'ratings/by/entry-rating-lowest/'

    def __init__(self, film):
        """ Ensure film in correct format """
        # Film must be a non-empty string; spaces become hyphens to
        # match Letterboxd's url slug format
        if not film or type(film) is not str:
            raise Exception(f"Invalid film name {film}, must be string")
        self.film = film.replace(' ', '-')

        ## Get information about the film's overall rating and ratings spread
        # self.film_ratings becomes the count of ratings per score,
        # ordered from lowest score to highest
        try:
            film_ratings = FilmInfo(self.film).ratings
            self.film_ratings = [v for k,v in sorted(film_ratings.items())]
        # NOTE(review): bare except masks the real error — network,
        # parsing, or a bad film name all look identical here
        except:
            raise Exception("Failed to obtain film data for film:", self.film)

    def __repr__(self):
        """ Example: < FilmRaters Film: Citizen-kane > """
        return f"< {self.__class__.__name__}\tFilm: {self.film.title()} >"

    def __len__(self):
        """ Returns the total number of ratings a film has. """
        return sum(self.film_ratings)

    @property
    def suburl_film(self):
        # Base suburl for this film's pages
        return f"film/{self.film}/"

    def __get_route(self, target_rating, target_rating_count):
        """ Determines the method (in a non-pythonic sense) by which to
        sort the request url in order to make sure all results can be
        obtained.

        It will return the appropriate suburl, whether it be a regular
        or reverse (starting from the lowest rating) search. In
        addition, it will return the page range over which the target
        rating's users appear. Returns False when the rating cannot be
        reached within Letterboxd's page limit.
        """
        ## Total ratings lower and higher than the target rating.
        # For example if the target_rating is 3,
        # and there are 10 ratings of 1 and 5 ratings of 2,
        # the lower_ratings would be 15
        lower_ratings = sum([v for v in self.film_ratings[0:target_rating-1]])
        higher_ratings = sum([v for v in self.film_ratings[target_rating+1:len(self.film_ratings)]])

        # Cannot get users with this rating because there are not enough
        # pages to get to the middle ratings. Since you can only view
        # ratings from the top or bottom.
        if not any([i < self.max_results for i in (lower_ratings, higher_ratings)]):
            return False

        # There are less ratings above than below the target_rating
        # So we'll scrape by sorting ratings highest to lowest
        elif higher_ratings <= lower_ratings:
            page_start = ( higher_ratings // self.ratings_per_page ) + 1
            page_end = ( ( higher_ratings + target_rating_count ) // self.ratings_per_page ) + 1
            sort_by = self.suburl_rating_highest

        # The opposite is true: there are less ratings below than above
        # So we'll scrape by lowest to highest
        elif lower_ratings < higher_ratings:
            page_start = ( lower_ratings // self.ratings_per_page ) + 1
            page_end = ( ( lower_ratings + target_rating_count ) // self.ratings_per_page ) + 1
            sort_by = self.suburl_rating_lowest

        # Ensure that target_rating has not pushed us over maximum page limit
        # NOTE(review): hard-codes 10 — presumably should use
        # self.page_limit; confirm before changing
        if page_end > 10:
            page_end = 10

        return sort_by, page_start, page_end

    def __call__(self, target_rating=4, limit=None):
        """ Returns a list of users who've rated a film a given rating.

        In some instances there are too many ratings to obtain
        middle-ground ratings like 5 or 6. This is because Letterboxd
        limits the number of pages to 10, and you can only sort by
        highest or lowest. In such instances, the function will simply
        return False.

        r-type: list (or False, if could not get results)
        """
        ## Edge cases
        if type(target_rating) is not int or target_rating not in range(1,11):
            raise ValueError("Rating must be int value within inclusive range 1-10")
        target_rating_count = self.film_ratings[target_rating-1]

        ## Get route to getting results
        if not (route := self.__get_route(target_rating, target_rating_count)):
            # Could not get any results
            return False
        sort_by, page_start, page_end = route

        ## Begin scraping process
        users = [] # results list
        if not limit:
            limit = target_rating_count # loop will break at result limit
        suburl = f"{self.suburl_film}{sort_by}"

        page_num = page_start
        while page_num in range(page_start, page_end+1) and len(users) < limit:

            ## Make request to each page
            full_suburl = f"{suburl}page/{page_num}"
            request = SESSION.request("GET", full_suburl)
            soup = make_soup(request)

            ## Could not find tag associated with target_rating
            # NOTE(review): the walrus here clobbers the int
            # target_rating with a soup Tag, so on any later iteration
            # the f-string class lookup uses the Tag, not the rating —
            # looks like a bug; confirm against live pages
            if not (target_rating := soup.find('span', class_=f'rated-large-{target_rating}')):
                if not users:
                    # Failed to get any results
                    raise Exception("Could not get results")
                else:
                    # There is no section for the int(rating) on this page
                    break

            # Parent tag that contains the information on users listed under each rating
            rating_group = target_rating.parent.parent

            # Each avatar anchor's href is "/username/" — strip the slashes
            page_results = [i.get('href')[1:-1] for i in rating_group.find_all('a', class_='avatar')]
            users += page_results

            page_num += 1
        # NOTE(review): truncated in this view — a final `return users`
        # is likely missing past this point.