def get_playlist_info(playlist_id):
    """Get meta information from the playlist website."""
    url = get_playlist_video_url(playlist_id)
    soup = get_soup(url)
    if soup is None:
        logger.error(f"Got invalid response for playlist: {url}")
        sys.exit(1)

    header = soup.find(id="playlistTopHeader")
    if header is None:
        logger.info(f"Couldn't get info for playlist: {url}")
        check_logged_out(soup)
        sys.exit(1)

    title = header.find("span", {"id": "watchPlaylist"})
    name = title.text.strip()
    name = name.replace(" ", "_")
    name = re.sub(r"[\W]+", "_", name)

    return {"name": name}
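
# A minimal sketch of how the name sanitizing above behaves (illustrative
# only, not used by the scraper): spaces become underscores first, then any
# run of remaining non-word characters collapses into a single underscore,
# so the result is always safe to use as a directory name.
#
#     >>> name = "Best of 2021!".replace(" ", "_")
#     >>> re.sub(r"[\W]+", "_", name)
#     'Best_of_2021_'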


def download_playlist_videos(session, playlist):
    """Download all videos of a playlist."""
    viewkeys = set(get_playlist_video_viewkeys(playlist))
    if len(viewkeys) == 0:
        logger.error(f"Found 0 videos in playlist {playlist.id}. Aborting")
        sys.exit(1)

    full_success = True
    logger.info(f"Found {len(viewkeys)} videos.")
    for viewkey in viewkeys:
        clip = Clip.get_or_create(session, viewkey)

        # The clip has already been downloaded, skip it.
        if clip.completed:
            if clip.title is not None and clip.extension is not None:
                target_path = get_clip_path(playlist.name, clip.title,
                                            clip.extension)
                link_duplicate(clip, target_path)
            continue

        success, info = download_video(viewkey, f"playlists/{playlist.name}")
        if success:
            clip.title = info["title"]
            clip.tags = info["tags"]
            clip.categories = info["categories"]
            clip.completed = True
            clip.location = info["out_path"]
            clip.extension = info["ext"]
            logger.info(f"New video: {clip.title}")
        else:
            full_success = False

        session.commit()
        time.sleep(20)

    return full_success
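
# Usage sketch (hypothetical; assumes a Playlist row already exists in the
# database):
#
#     session = get_session()
#     playlist = session.query(Playlist).first()
#     if download_playlist_videos(session, playlist):
#         playlist.last_scan = datetime.now()
#         session.commit()
#
# Each clip is committed individually, so an aborted run can be resumed
# without re-downloading clips that are already marked as completed.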


def get_channel_info(channel_id):
    """Get meta information from the channel website."""
    url = get_channel_video_url(channel_id)
    soup = get_soup(url)
    if soup is None:
        logger.error(f"Got invalid response for channel: {url}")
        sys.exit(1)

    profile = soup.find(id="channelsProfile")
    if profile is None:
        logger.info(f"Couldn't get info for channel: {url}")
        check_logged_out(soup)
        sys.exit(1)

    # These sections exist on the profile page, but aren't used yet.
    header = profile.find("div", {"class": "header"})
    wrapper = profile.find("div", {"class": "bottomExtendedWrapper"})

    title = profile.find("div", {"class": "title"})
    name = title.find("h1").text.strip()
    name = name.replace(" ", "_")
    name = re.sub(r"[\W]+", "_", name)

    return {"name": name}


def update(args):
    """Get all information about a user and download their videos."""
    session = get_session()
    threshold = datetime.now() - timedelta(hours=8)

    # Go through all users
    users = (session.query(User).filter(User.last_scan <= threshold).order_by(
        User.key).all())
    for user in users:
        # Re-query the user type, since this can change over time
        logger.debug(user.key)
        info = get_user_info(user.key)
        user.user_type = info["type"]

        logger.info(f"\nStart downloading user: {user.name}")
        if download_user_videos(session, user):
            user.last_scan = datetime.now()
        session.commit()

    # Go through all playlists
    playlists = (session.query(Playlist).filter(
        Playlist.last_scan <= threshold).order_by(Playlist.name).all())
    for playlist in playlists:
        logger.info(f"\nStart downloading playlist: {playlist.name}")
        if download_playlist_videos(session, playlist):
            playlist.last_scan = datetime.now()
        session.commit()

    # Go through all channels
    channels = (session.query(Channel).filter(
        Channel.last_scan <= threshold).order_by(Channel.name).all())
    for channel in channels:
        logger.info(f"\nStart downloading channel: {channel.name}")
        if download_channel_videos(session, channel):
            channel.last_scan = datetime.now()
        session.commit()

    # Finish any clips that have a location but were never marked as completed.
    clips = (session.query(Clip).filter(Clip.completed.is_(False)).filter(
        Clip.location.isnot(None)).all())
    for clip in clips:
        download_video(clip.viewkey, name=os.path.dirname(clip.location))
        clip.completed = True
        session.commit()
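
# Usage sketch: `update` takes the parsed CLI arguments, so it is presumably
# wired up as an argparse handler along these lines (hypothetical wiring, not
# part of this file):
#
#     parser = argparse.ArgumentParser()
#     subparsers = parser.add_subparsers()
#     update_parser = subparsers.add_parser("update")
#     update_parser.set_defaults(func=update)
#     args = parser.parse_args()
#     args.func(args)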


def get_channel_viewkeys(channel):
    """Scrape all public viewkeys of the channel's videos."""
    is_premium = os.path.exists("premium")
    if is_premium:
        url = f"https://www.pornhubpremium.com/channels/{channel.id}/videos"
    else:
        url = f"https://www.pornhub.com/channels/{channel.id}/videos"

    soup = get_soup(url)
    if soup is None:
        logger.error(f"Failed to find video page for channel {channel.id}")
        sys.exit(1)

    pages = 1
    hasEndlessScrolling = False
    # Some sites have a navigation at the bottom
    navigation = soup.find("div", {"class": "pagination3"})
    if navigation is not None:
        children = navigation.findChildren("li", {"class": "page_number"})
        pages = len(children) + 1
    # Others have a button for "endless scrolling".
    # In that case we have to keep crawling for as long as the button shows up.
    elif soup.find(id="moreDataBtnStream"):
        hasEndlessScrolling = True

    keys = []
    current_page = 1
    next_url = url
    while current_page <= pages:
        # Check if the next site has another "endless scrolling" button as well.
        # If that's the case, increase the counter.
        if hasEndlessScrolling and soup.find(id="moreDataBtnStream"):
            pages += 1

        logger.info(f"Crawling {next_url}")
        # Channel with normal video upload list
        videos = soup.find(id="showAllChanelVideos")
        if videos is None:
            logger.error(f"Couldn't find channel videos in site: {next_url}")
            check_logged_out(soup)
            sys.exit(1)

        for video in videos.find_all("li"):
            if video.has_attr("_vkey"):
                keys.append(video["_vkey"])

        current_page += 1
        next_url = url + f"?page={current_page}"
        time.sleep(4)
        soup = get_soup(next_url)
        # We couldn't get the next url.
        if soup is None:
            break

    return keys
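
# The paging strategy above, boiled down (illustrative pseudocode, not
# runnable as-is): for "pagination3" sites the page count is known upfront,
# while for "endless scrolling" sites it grows by one for every page that
# still shows the moreDataBtnStream button, so the loop keeps going until the
# button disappears or a page fails to load.
#
#     pages, current_page = 1, 1
#     while current_page <= pages:
#         if hasEndlessScrolling and soup.find(id="moreDataBtnStream"):
#             pages += 1
#         ...collect viewkeys from soup...
#         current_page += 1
#         soup = get_soup(f"{url}?page={current_page}")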


def get_video_upload_viewkeys(user, public=False):
    """Scrape viewkeys from the user's user/videos/upload route."""
    is_premium = os.path.exists("premium")
    if is_premium:
        url = f"https://www.pornhubpremium.com/{user.user_type}/{user.key}/videos/premium"
    else:
        url = f"https://www.pornhub.com/{user.user_type}/{user.key}/videos/upload"

    if public:
        if is_premium:
            url = f"https://www.pornhubpremium.com/{user.user_type}/{user.key}/videos/upload"
        else:
            url = f"https://www.pornhub.com/{user.user_type}/{user.key}/videos/public"

    soup = get_soup(url)
    if soup is None:
        logger.info(f"Nothing on {url}")
        return []

    pages = 1
    hasEndlessScrolling = False
    # Some sites have a navigation at the bottom
    navigation = soup.find("div", {"class": "pagination3"})
    if navigation is not None:
        children = navigation.findChildren("li", {"class": "page_number"})
        pages = len(children) + 1
    # Others have a button for "endless scrolling".
    # In that case we have to keep crawling for as long as the button shows up.
    elif soup.find(id="moreDataBtnStream"):
        hasEndlessScrolling = True

    keys = []
    current_page = 1
    next_url = url
    while current_page <= pages:
        # Check if the next site has another "endless scrolling" button as well.
        # If that's the case, increase the counter.
        if hasEndlessScrolling and soup.find(id="moreDataBtnStream"):
            pages += 1

        logger.info(f"Crawling {next_url}")
        videoSection = soup.find("div", {"class": "videoUList"})
        pornstarVideoSection = soup.find(id="pornstarsVideoSection")
        claimedUploadedVideoSection = soup.find(id="claimedUploadedVideoSection")

        # Users with a normal video upload list
        if videoSection is not None:
            videos = videoSection.find(id="moreData")
        # Users with a pornstarVideoSection
        elif pornstarVideoSection is not None:
            videos = pornstarVideoSection
        # Some profiles have a claimedUploadedVideoSection instead.
        elif claimedUploadedVideoSection is not None:
            videos = claimedUploadedVideoSection
        else:
            logger.error(
                f"Couldn't find video section on {next_url}. Did we log out?")
            if check_logged_out(soup):
                sys.exit(1)
            return []

        for video in videos.find_all("li"):
            if video.has_attr("data-video-vkey"):
                keys.append(video["data-video-vkey"])

        current_page += 1
        next_url = url + f"?page={current_page}"
        time.sleep(4)
        soup = get_soup(next_url)
        # We couldn't get the next url.
        if soup is None:
            break

    return keys
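
# For reference, the URLs built above for a hypothetical user with
# user_type="users" and key="example":
#
#     premium:          https://www.pornhubpremium.com/users/example/videos/premium
#     default:          https://www.pornhub.com/users/example/videos/upload
#     public + premium: https://www.pornhubpremium.com/users/example/videos/upload
#     public:           https://www.pornhub.com/users/example/videos/public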


def download_video(viewkey, name="single_videos"):
    """Download the video."""
    # Decide which domain should be used, depending on whether the user has a
    # premium account.
    is_premium = os.path.exists("premium")
    if is_premium:
        video_url = f"https://www.pornhubpremium.com/view_video.php?viewkey={viewkey}"
    else:
        video_url = f"https://www.pornhub.com/view_video.php?viewkey={viewkey}"

    # youtube-dl's Python API expects underscore-style option names, not the
    # dash-style CLI flags.
    options = {
        "outtmpl": f"/data/Media/P**n/{name}/%(title)s.%(ext)s",
        "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
        "merge_output_format": "mp4",
        "quiet": True,
        "retries": 3,
        "nooverwrites": False,
        "continuedl": True,
        # Equivalent of --yes-playlist on the CLI.
        "noplaylist": False,
        # Equivalent of --add-metadata on the CLI.
        "postprocessors": [{"key": "FFmpegMetadata"}],
        "external_downloader": "aria2c",
        "external_downloader_args": [
            "--no-conf", "--file-allocation=none",
            "-x16", "-s16", "-j5", "-k5M", "-c", "-R",
        ],
    }
    if is_premium:
        options["cookiefile"] = "cookie_file"

    ydl = youtube_dl.YoutubeDL(options)

    tries = 0
    while True:
        try:
            logger.info(f"Start downloading: {video_url}")
            info = ydl.extract_info(video_url)
            info["out_path"] = f'/data/Media/P**n/{name}/{info["title"]}.{info["ext"]}'
            return True, info
        except TypeError:
            # This error seems to occur from time to time when pornhub doesn't
            # properly deliver the video. A short wait and a retry often fixes it.
            logger.info("Got TypeError bug")
            time.sleep(20)
            tries += 1
            # If this happens too many times, something else must be broken.
            if tries > 10:
                return False, None
        except DownloadError:
            # We got a download error.
            # Ignore it for now and continue downloading the other videos.
            logger.error(f"DownloadError: Failed to download video: {viewkey}.")
            return False, None
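
# Usage sketch (the viewkey is a made-up placeholder):
#
#     success, info = download_video("ph0123456789abcde")
#     if success:
#         print(info["out_path"])
#
# On success the returned info dict is youtube-dl's metadata for the clip,
# extended with the "out_path" key set above.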