def __init__(self, mongo_host='localhost', mongo_port=27017):
    """
    Open the MongoDB connection and create the helper objects used by this instance.

    :param mongo_host: hostname of the MongoDB server (defaults to the previously hard-coded 'localhost')
    :param mongo_port: TCP port of the MongoDB server (defaults to the previously hard-coded 27017)
    """
    #
    # MongoDB Configuration
    #
    # Host and Port
    self.client = MongoClient(mongo_host, mongo_port)
    # DB name
    self.db = self.client[Config.DB_NAME]
    # Collection that holds the audit framework videos
    self.audit_framework_videos_col = self.db[Config.AUDIT_FRAMEWORK_VIDEOS_COL]

    # Create a YouTube Video Downloader Object
    self.VIDEO_DOWNLOADER = YouTubeVideoDownloader()
    # Create Video Classifier Object
    self.VIDEO_ANNOTATOR = PseudoscienceClassifier()
def __init__(self, user_profile):
    """
    Configure and start the Selenium ChromeDriver session of the given User Profile.

    :param user_profile: nickname of the User Profile; it selects the proxy server, the Chrome
                         user-data directory and the per-profile copy of the ChromeDriver binary
    :raises SystemExit: when proxies are enabled but the profile's proxy is still the
                        'HOST:PORT' placeholder
    """
    # Initialize Variables
    self.USER_PROFILE = user_profile
    self.TIME_TO_SLEEP_BETWEEN_EACH_VIDEO = 5  # seconds

    #
    # Configure Selenium ChromeDriver
    #
    self.driverOptions = ChromeOptions()
    # Set User-Agent
    self.driverOptions.add_argument(Config.USER_AGENT)
    # Set whether headless or not
    if Config.HEADLESS:
        self.driverOptions.add_argument("--headless")
    self.driverOptions.headless = Config.HEADLESS

    # Set HTTPS Proxy Server
    if Config.USE_PROXY:
        user_proxy = self.get_user_proxy_server()
        if user_proxy == 'HOST:PORT':
            # 'HOST:PORT' is the untouched placeholder value, so no real proxy has been configured.
            # SystemExit is raised directly because the exit() helper is only injected by the
            # site module and is not guaranteed to exist.
            raise SystemExit('[ERROR] Please set correct HTTPS Proxies in: "youtubeauditframework/userprofiles/info/user_profiles_info.json"')
        self.driverOptions.add_argument('--proxy-server={}'.format(user_proxy))

    # Disable Automation Flags
    self.driverOptions.add_experimental_option("excludeSwitches", ["enable-automation"])
    self.driverOptions.add_experimental_option('useAutomationExtension', False)
    self.driverOptions.add_argument('--disable-web-security')
    self.driverOptions.add_argument('--allow-running-insecure-content')

    # Set User Profile unique Data directory
    self.driverOptions.add_argument("user-data-dir={0}/{1}-data".format(Config.USER_PROFILE_DATA_DIR, self.USER_PROFILE))

    # Find ChromeDriver (every User Profile gets its own copy of the base binary)
    base_webdriver_executable = '{0}/chromedriver'.format(Config.CHROMEDRIVER_BASE_DIR)
    self.webdriver_executable = '{0}/chromedriver_{1}'.format(Config.CHROMEDRIVER_BASE_DIR, self.USER_PROFILE)
    if not os.path.isfile(self.webdriver_executable):
        copyfile(src=base_webdriver_executable, dst=self.webdriver_executable)
        # copyfile() copies the contents only; carry over the permission bits (notably the
        # executable bit), otherwise the copy cannot be launched on POSIX systems
        os.chmod(self.webdriver_executable, os.stat(base_webdriver_executable).st_mode)

    # Create ChromeDriver
    self.driver = webdriver.Chrome(options=self.driverOptions, executable_path=self.webdriver_executable)
    # Maximize Window
    self.driver.maximize_window()
    self.wait = WebDriverWait(self.driver, Config.WEBDRIVER_ELEMENT_DELAY)

    # Create a YouTube Video Helper
    self.YOUTUBE_DOWNLOADER = YouTubeVideoDownloader()
class BuildUserWatchHistory(object):
    """
    Class that creates the Watch History of a given YouTube User Profile by
    watching a predefined number of YouTube Videos (at least
    Config.WATCH_HISTORY_VIDEOS_THRESHOLD videos must be provided).

    Instances own a Selenium browser; they can be used as a context manager so
    that the browser is closed deterministically.
    """

    def __init__(self, user_profile):
        """
        Configure and start the Selenium ChromeDriver session of the given User Profile.

        :param user_profile: nickname of the User Profile; it selects the proxy server, the Chrome
                             user-data directory and the per-profile copy of the ChromeDriver binary
        :raises SystemExit: when proxies are enabled and the profile's proxy is missing or is
                            still the 'HOST:PORT' placeholder
        """
        # Initialize Variables
        self.USER_PROFILE = user_profile
        self.TIME_TO_SLEEP_BETWEEN_EACH_VIDEO = 5  # seconds

        #
        # Configure Selenium ChromeDriver
        #
        self.driverOptions = ChromeOptions()
        # Set User-Agent
        self.driverOptions.add_argument(Config.USER_AGENT)
        # Set whether headless or not
        if Config.HEADLESS:
            self.driverOptions.add_argument("--headless")
        self.driverOptions.headless = Config.HEADLESS

        # Set HTTPS Proxy Server
        if Config.USE_PROXY:
            user_proxy = self.get_user_proxy_server()
            if user_proxy == 'HOST:PORT':
                # 'HOST:PORT' is the untouched placeholder value, so no real proxy has been configured
                sys.exit('[ERROR] Please set correct HTTPS Proxies in: "youtubeauditframework/userprofiles/info/user_profiles_info.json"')
            self.driverOptions.add_argument('--proxy-server={}'.format(user_proxy))

        # Disable Automation Flags
        self.driverOptions.add_experimental_option("excludeSwitches", ["enable-automation"])
        self.driverOptions.add_experimental_option('useAutomationExtension', False)
        self.driverOptions.add_argument('--disable-web-security')
        self.driverOptions.add_argument('--allow-running-insecure-content')

        # Set User Profile unique Data directory
        self.driverOptions.add_argument("user-data-dir={0}/{1}-data".format(Config.USER_PROFILE_DATA_DIR, self.USER_PROFILE))

        # Find ChromeDriver (every User Profile gets its own copy of the base binary)
        base_webdriver_executable = '{0}/chromedriver'.format(Config.CHROMEDRIVER_BASE_DIR)
        self.webdriver_executable = '{0}/chromedriver_{1}'.format(Config.CHROMEDRIVER_BASE_DIR, self.USER_PROFILE)
        if not os.path.isfile(self.webdriver_executable):
            copyfile(src=base_webdriver_executable, dst=self.webdriver_executable)
            # copyfile() copies the contents only; carry over the permission bits (notably the
            # executable bit), otherwise the copy cannot be launched on POSIX systems
            os.chmod(self.webdriver_executable, os.stat(base_webdriver_executable).st_mode)

        # Create ChromeDriver
        self.driver = webdriver.Chrome(options=self.driverOptions, executable_path=self.webdriver_executable)
        # Maximize Window
        self.driver.maximize_window()
        self.wait = WebDriverWait(self.driver, Config.WEBDRIVER_ELEMENT_DELAY)

        # Create a YouTube Video Helper
        self.YOUTUBE_DOWNLOADER = YouTubeVideoDownloader()

    def __del__(self):
        # Close Selenium Browser
        self.close_selenium_browser()

    def __enter__(self):
        # Counterpart of __exit__ so that the object can actually be used in a `with` statement
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Close Selenium Browser
        self.close_selenium_browser()

    def close_selenium_browser(self):
        """
        Method that closes the Selenium browser window.
        It is a no-op when the browser was never created or has already been closed,
        so it is safe to call from __exit__ and __del__ one after the other.
        :return: None
        """
        driver = getattr(self, 'driver', None)
        if driver is None:
            return
        # Forget the driver first so that a second call does not touch a closed browser
        self.driver = None
        driver.close()

    def get_user_proxy_server(self):
        """
        Method that finds the proxy server of the User Profile
        :return: the 'proxy' value of the entry whose 'nickname' equals the User Profile
        :raises SystemExit: when the User Profile is not listed in the user profiles info file
        """
        user_profiles_info = Utils.read_json_file(filename=Config.USER_PROFILES_INFO_FILENAME)
        for user_profile in user_profiles_info:
            if user_profile['nickname'] == self.USER_PROFILE:
                return user_profile['proxy']
        print('[{0}] Cannot find the HTTPS Proxy server of this User Profile'.format(self.USER_PROFILE))
        sys.exit(errno.ECANCELED)

    def get_user_profile_watch_videos_from_file(self):
        """
        Method that returns a list with all the YouTube Videos IDs that the current
        User Profile will watch to build a watch history
        :return: the contents of '<User Profile>_watch_history_videos.txt' as returned by Utils.read_file
        """
        return Utils.read_file(filename='{0}/{1}_watch_history_videos.txt'.format(Config.USER_PROFILES_WATCH_VIDEOS_BASE_DIR, self.USER_PROFILE))

    def get_video_duration(self, video_id):
        """
        Method that returns the duration of the given YouTube Video
        :param video_id: YouTube Video Id
        :return: the duration of the video in seconds
        """
        # Get Video Metadata
        video_metadata = self.YOUTUBE_DOWNLOADER.download_video_metadata(video_id=video_id, retrieve_recommended_videos=False)
        # Convert Video duration to seconds
        return Utils.convert_youtube_video_duration_to_seconds(video_duration=video_metadata['contentDetails']['duration'])

    def is_user_authenticated(self):
        """
        Method that verifies whether the user is authenticated or not.
        NOTE: it navigates the browser to the YouTube home page.
        :return: True when the avatar button of a logged-in user is present, False otherwise
        """
        self.driver.get('https://www.youtube.com')
        time.sleep(3)
        try:
            self.wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="avatar-btn"]')))
        except (TimeoutException, NoSuchElementException):
            # WebDriverWait raises selenium's TimeoutException (not the builtin TimeoutError)
            print('--- USER IS NOT AUTHENTICATED')
            return False
        print('--- USER IS AUTHENTICATED')
        return True

    def clear_user_watch_history(self):
        """
        Method that deletes the YouTube's Watch History of the logged-in User Profile
        :return: None
        """
        # Load YouTube Activity Control Management page
        self.driver.get('https://myactivity.google.com/activitycontrols/youtube?utm_source=my-activity&hl=en')
        time.sleep(3)

        # Click "Deleted Activity by" button
        try:
            self.wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/div[2]/header/div[4]/div[2]/div/c-wiz/div/div/nav/a[3]'))).click()
        except TimeoutException:
            # Click the other Delete button
            self.wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/c-wiz/div/div[2]/c-wiz[1]/div/div/div[2]/div[2]/div/button'))).click()

        # Select to DELETE ALL Activity
        self.wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/div[7]/div/div[2]/span/div[2]/div/c-wiz/div/div[3]/ul/li[3]'))).click()

        # Click the CONFIRM DELETE BUTTON (If it exists because if it does not then it means that there is no activity)
        time.sleep(3)
        try:
            self.driver.find_element_by_xpath('/html/body/div[7]/div/div[2]/span/div[2]/div[1]/c-wiz/div/div[4]/div/div[2]/button').click()
        except NoSuchElementException:
            # Let it pass since it means that there is no watch history to delete
            print('[{}] There is no Watch History to delete'.format(self.USER_PROFILE))

    def watch_youtube_video(self, video_id):
        """
        Method that receives a specific YouTube Video ID and watch the full video like a normal YouTube user
        :param video_id: YouTube Video Id
        :return: None
        """
        # Load YouTube Video Page
        self.driver.get('https://www.youtube.com/watch?v={}&autoplay=1'.format(video_id))
        time.sleep(3)

        # Watch the Video (the play button is looked up at two alternative positions)
        try:
            self.wait.until(EC.element_to_be_clickable((By.XPATH, '/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[4]/div[1]/div/div[1]/div/div/div/ytd-player/div/div/div[5]/button'))).click()
        except TimeoutException:
            try:
                self.wait.until(EC.element_to_be_clickable((By.XPATH, '/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[4]/div[1]/div/div[1]/div/div/div/ytd-player/div/div/div[4]/button'))).click()
            except TimeoutException:
                print('[VIDEO: {0}] ERROR: WATCH VIDEO button not found...'.format(video_id))

        # Like the Video to increase satisfaction score
        try:
            self.wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="top-level-buttons"]/ytd-toggle-button-renderer[1]/a'))).click()
        except TimeoutException:
            print('[VIDEO: {0}] ERROR: Video LIKE button not found...'.format(video_id))

        # Get Video duration in seconds and then sleep for that time so that we watch the whole video
        video_duration_seconds = self.get_video_duration(video_id=video_id)
        print('--- [VIDEO: {0}] Sleeping for {1} secs to watch the whole video...'.format(video_id, video_duration_seconds))
        time.sleep(video_duration_seconds)

    def build_watch_history(self, watch_videos_list=None, clear_watch_history=True):
        """
        Method that builds the watch history of the current selected YouTube User Profile
        :param watch_videos_list: YouTube Video Ids to watch; when None they are read from the User Profile's watch videos file
        :param clear_watch_history: whether the existing Watch History is deleted before watching the videos
        :return: None
        :raises SystemExit: when too few videos are provided or the user is not authenticated
        """
        # Find the videos that will be used to build the watch history of the user
        if watch_videos_list is None:
            watch_videos_list = self.get_user_profile_watch_videos_from_file()
        total_videos = len(watch_videos_list)

        # Check if the correct amount of videos to be watched is available
        if total_videos < Config.WATCH_HISTORY_VIDEOS_THRESHOLD:
            print('--- [{0}] Minimum amount of Videos to be watched: {1}. | TOTAL VIDEOS PROVIDED: {2}'.format(self.USER_PROFILE, Config.WATCH_HISTORY_VIDEOS_THRESHOLD, total_videos))
            sys.exit(errno.ECANCELED)

        # Ensure that User is Authenticated
        if not self.is_user_authenticated():
            # errno.EAUTH only exists on BSD/macOS; fall back to EACCES on other platforms
            sys.exit(getattr(errno, 'EAUTH', errno.EACCES))

        # Clear User Profile History
        if clear_watch_history:
            self.clear_user_watch_history()

        # Build User Profile Watch History
        print('[{0}] Started building Watch History. TOTAL VIDEOS TO WATCH: {1}'.format(self.USER_PROFILE, total_videos))
        for watched_videos_cntr, video_id in enumerate(watch_videos_list, start=1):
            print('--- {0}/{1}. Watching Video: {2}'.format(watched_videos_cntr, total_videos, video_id))
            self.watch_youtube_video(video_id=video_id)
            time.sleep(self.TIME_TO_SLEEP_BETWEEN_EACH_VIDEO)
        print('[{0}] Building Watch History has finished!'.format(self.USER_PROFILE))
def __init__(self, user_profile, search_term):
    """
    Constructor
    :param user_profile: the User Profile nickname to perform the experiment
                         ('YOUTUBE_DATA_API' means that no Selenium browser is started)
    :param search_term: the search term to search YouTube
    :raises SystemExit: when proxies are enabled but the profile's proxy is still the
                        'HOST:PORT' placeholder
    """
    # Initialize Variables
    self.USER_PROFILE = user_profile
    self.AUDIT_SEARCH_TERM = search_term

    #
    # Configure Selenium ChromeDriver
    #
    if self.USER_PROFILE != 'YOUTUBE_DATA_API':
        self.driverOptions = ChromeOptions()
        # Set User-Agent
        self.driverOptions.add_argument(Config.USER_AGENT)
        # Set whether headless or not
        if Config.HEADLESS:
            self.driverOptions.add_argument("--headless")
        self.driverOptions.headless = Config.HEADLESS

        # Set HTTPS Proxy Server
        if Config.USE_PROXY:
            user_proxy = self.get_user_proxy_server()
            if user_proxy == 'HOST:PORT':
                # 'HOST:PORT' is the untouched placeholder value, so no real proxy has been configured.
                # SystemExit is raised directly because the exit() helper is only injected by the
                # site module and is not guaranteed to exist.
                raise SystemExit('[ERROR] Please set correct HTTPS Proxies in: "youtubeauditframework/userprofiles/info/user_profiles_info.json"')
            self.driverOptions.add_argument('--proxy-server={}'.format(user_proxy))

        # Disable Automation Flags
        self.driverOptions.add_experimental_option("excludeSwitches", ["enable-automation"])
        self.driverOptions.add_experimental_option('useAutomationExtension', False)
        self.driverOptions.add_argument('--disable-web-security')
        self.driverOptions.add_argument('--allow-running-insecure-content')

        # Set User Profile unique Data directory
        self.driverOptions.add_argument("user-data-dir={0}/{1}-data".format(Config.USER_PROFILE_DATA_DIR, self.USER_PROFILE))

        # Find ChromeDriver (every User Profile gets its own copy of the base binary)
        base_webdriver_executable = '{0}/chromedriver'.format(Config.CHROMEDRIVER_BASE_DIR)
        self.webdriver_executable = '{0}/chromedriver_{1}'.format(Config.CHROMEDRIVER_BASE_DIR, self.USER_PROFILE)
        if not os.path.isfile(self.webdriver_executable):
            copyfile(src=base_webdriver_executable, dst=self.webdriver_executable)
            # copyfile() copies the contents only; carry over the permission bits (notably the
            # executable bit), otherwise the copy cannot be launched on POSIX systems
            os.chmod(self.webdriver_executable, os.stat(base_webdriver_executable).st_mode)

        # Create ChromeDriver
        self.driver = webdriver.Chrome(options=self.driverOptions, executable_path=self.webdriver_executable)
        # Maximize Window
        self.driver.maximize_window()
        self.wait = WebDriverWait(self.driver, Config.WEBDRIVER_ELEMENT_DELAY)

    #
    # MongoDB Configuration
    #
    # Host and Port
    self.client = MongoClient('localhost', 27017)
    # DB name
    self.db = self.client[Config.DB_NAME]
    # Collections name
    self.audit_framework_videos_col = self.db[Config.AUDIT_FRAMEWORK_VIDEOS_COL]
    self.audit_framework_youtube_video_recommendations = self.db[Config.AUDIT_FRAMEWORK_YOUTUBE_VIDEO_RECS_COL]

    # Load YouTube Recommendations Monitor latest statusDetails from the file
    self.YOUTUBE_RANDOM_WALKS_AUDIT_DETAILS = self.load_youtube_random_walks_details()

    #
    # YOUTUBE DATA API HELPER
    #
    # Create a YouTube Video Helper
    self.YOUTUBE_VIDEO_DOWNLOADER = YouTubeVideoDownloader()
class YouTubeVideoRecommendationsSectionAudit(object):
    """
    Class that provides all the methods to perform audit experiments on YouTube's
    Video Recommendations section with logged-in users, non-logged-in users, and the
    YouTube Data API while assessing the effects of personalization on YouTube's Video recommendations.

    More precisely, during this audit experiment, we perform Live Random Walks on
    YouTube's Recommendation Graph, thus simulating the behavior of users casually
    browsing YouTube while watching videos according to recommendations.

    Instances may own a Selenium browser; they can be used as a context manager so
    that the browser is closed deterministically.
    """

    def __init__(self, user_profile, search_term):
        """
        Constructor
        :param user_profile: the User Profile nickname to perform the experiment
                             ('YOUTUBE_DATA_API' means that no Selenium browser is started)
        :param search_term: the search term to search YouTube
        :raises SystemExit: when proxies are enabled and the profile's proxy is missing or is
                            still the 'HOST:PORT' placeholder
        """
        # Initialize Variables
        self.USER_PROFILE = user_profile
        self.AUDIT_SEARCH_TERM = search_term

        #
        # Configure Selenium ChromeDriver
        #
        if self.USER_PROFILE != 'YOUTUBE_DATA_API':
            self.driverOptions = ChromeOptions()
            # Set User-Agent
            self.driverOptions.add_argument(Config.USER_AGENT)
            # Set whether headless or not
            if Config.HEADLESS:
                self.driverOptions.add_argument("--headless")
            self.driverOptions.headless = Config.HEADLESS

            # Set HTTPS Proxy Server
            if Config.USE_PROXY:
                user_proxy = self.get_user_proxy_server()
                if user_proxy == 'HOST:PORT':
                    # 'HOST:PORT' is the untouched placeholder value, so no real proxy has been configured
                    sys.exit('[ERROR] Please set correct HTTPS Proxies in: "youtubeauditframework/userprofiles/info/user_profiles_info.json"')
                self.driverOptions.add_argument('--proxy-server={}'.format(user_proxy))

            # Disable Automation Flags
            self.driverOptions.add_experimental_option("excludeSwitches", ["enable-automation"])
            self.driverOptions.add_experimental_option('useAutomationExtension', False)
            self.driverOptions.add_argument('--disable-web-security')
            self.driverOptions.add_argument('--allow-running-insecure-content')

            # Set User Profile unique Data directory
            self.driverOptions.add_argument("user-data-dir={0}/{1}-data".format(Config.USER_PROFILE_DATA_DIR, self.USER_PROFILE))

            # Find ChromeDriver (every User Profile gets its own copy of the base binary)
            base_webdriver_executable = '{0}/chromedriver'.format(Config.CHROMEDRIVER_BASE_DIR)
            self.webdriver_executable = '{0}/chromedriver_{1}'.format(Config.CHROMEDRIVER_BASE_DIR, self.USER_PROFILE)
            if not os.path.isfile(self.webdriver_executable):
                copyfile(src=base_webdriver_executable, dst=self.webdriver_executable)
                # copyfile() copies the contents only; carry over the permission bits (notably the
                # executable bit), otherwise the copy cannot be launched on POSIX systems
                os.chmod(self.webdriver_executable, os.stat(base_webdriver_executable).st_mode)

            # Create ChromeDriver
            self.driver = webdriver.Chrome(options=self.driverOptions, executable_path=self.webdriver_executable)
            # Maximize Window
            self.driver.maximize_window()
            self.wait = WebDriverWait(self.driver, Config.WEBDRIVER_ELEMENT_DELAY)

        #
        # MongoDB Configuration
        #
        # Host and Port
        self.client = MongoClient('localhost', 27017)
        # DB name
        self.db = self.client[Config.DB_NAME]
        # Collections name
        self.audit_framework_videos_col = self.db[Config.AUDIT_FRAMEWORK_VIDEOS_COL]
        self.audit_framework_youtube_video_recommendations = self.db[Config.AUDIT_FRAMEWORK_YOUTUBE_VIDEO_RECS_COL]

        # Load YouTube Recommendations Monitor latest statusDetails from the file
        self.YOUTUBE_RANDOM_WALKS_AUDIT_DETAILS = self.load_youtube_random_walks_details()

        #
        # YOUTUBE DATA API HELPER
        #
        # Create a YouTube Video Helper
        self.YOUTUBE_VIDEO_DOWNLOADER = YouTubeVideoDownloader()

    def __del__(self):
        # Close Selenium Browser (no-op when no browser was started)
        self.close_selenium_browser()

    def __enter__(self):
        # Counterpart of __exit__ so that the object can actually be used in a `with` statement
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Close Selenium Browser (no-op when no browser was started)
        self.close_selenium_browser()

    def close_selenium_browser(self):
        """
        Method that closes the Selenium browser window.
        It is a no-op when no browser exists (e.g., the 'YOUTUBE_DATA_API' profile, or a
        constructor that failed early) or when the browser has already been closed, so it
        is safe to call from __exit__ and __del__ one after the other.
        :return: None
        """
        driver = getattr(self, 'driver', None)
        if driver is None:
            return
        # Forget the driver first so that a second call does not touch a closed browser
        self.driver = None
        driver.close()

    def get_user_proxy_server(self):
        """
        Method that finds the proxy server of the User Profile
        :return: the 'proxy' value of the entry whose 'nickname' equals the User Profile
        :raises SystemExit: when the User Profile is not listed in the user profiles info file
        """
        user_profiles_info = Utils.read_json_file(filename=Config.USER_PROFILES_INFO_FILENAME)
        for user_profile in user_profiles_info:
            if user_profile['nickname'] == self.USER_PROFILE:
                return user_profile['proxy']
        print('[{0}] Cannot find the HTTPS Proxy server of this User Profile'.format(self.USER_PROFILE))
        sys.exit(errno.ECANCELED)

    def _get_random_walks_logfile(self):
        """
        Private helper that builds the filename of the Random Walks details (latest status) file
        of the current SEARCH TERM (spaces replaced with dashes) and User Profile
        :return: the filename as a string
        """
        return Config.AUDIT_VIDEO_RECOMMENDATIONS_SECTION_LOGFILE.format(self.AUDIT_SEARCH_TERM.replace(' ', '-'), self.USER_PROFILE)

    def load_youtube_random_walks_details(self):
        """
        Method that reads the YouTube Live Random Walks Details (latest status) file
        :return: a JSON dict with the contents of the file, or a freshly initialized
                 dict (STATUS 'STOPPED', no random walks) when the file does not exist
        """
        # Read status details from file if it exists
        logfile = self._get_random_walks_logfile()
        if os.path.isfile(logfile):
            with open(file=logfile) as file:
                return dict(json.load(file))

        # Create a new JSON dict and return
        return {
            'STATUS': 'STOPPED',
            'USER_PROFILE_TYPE': self.USER_PROFILE,
            'SEED_SEARCH_TERM': self.AUDIT_SEARCH_TERM,
            'SEARCH_RESULTS_THRESHOLD': Config.AUDIT_RANDOM_WALKS_SEARCH_RESULTS_THRESHOLD,
            'RANDOM_WALK_MAX_HOPS': Config.AUDIT_RANDOM_WALKS_MAX_HOPS,
            'CURRENT_RANDOM_WALK': 0,
            'YOUTUBE_RANDOM_WALKS_AUDIT_DETAILS': [],
            'RANDOM_WALKS_HISTORY': []
        }

    def save_youtube_random_walks_details(self):
        """
        Method that writes the current YouTube Live Random Walks Audit experiment details in a file
        :return: None
        """
        # Use a context manager so that the file is flushed and closed right away
        with open(file=self._get_random_walks_logfile(), mode='w') as file:
            print(json.dumps(self.YOUTUBE_RANDOM_WALKS_AUDIT_DETAILS, sort_keys=False, ensure_ascii=False, indent=4), file=file)

    def clear_user_watch_history(self):
        """
        Method that deletes the YouTube's Watch History of the logged-in User Profile.
        When Config.DELETE_WATCH_HISTORY_AFTER_DATE is set, a custom range is selected in the
        calendar using the part of that value before the first '-' as the day of the month
        (presumably a 'DD-MM-YYYY' date; verify against Config). Otherwise ALL activity is deleted.
        :return: None
        """
        # Load YouTube Activity Control Management page
        self.driver.get('https://myactivity.google.com/activitycontrols/youtube?utm_source=my-activity&hl=en')
        time.sleep(2)

        # Click "Deleted Activity by" button
        try:
            self.wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/div[2]/header/div[4]/div[2]/div/c-wiz/div/div/nav/a[3]'))).click()
        except TimeoutException:
            # Click the other Delete button
            self.wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/c-wiz/div/div[2]/c-wiz[1]/div/div/div[2]/div[2]/div/button'))).click()
        time.sleep(2)

        if Config.DELETE_WATCH_HISTORY_AFTER_DATE is not None:
            # Select to DELETE CUSTOM RANGE Activity
            self.wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/div[7]/div/div[2]/span/div[2]/div/c-wiz/div/div[3]/ul/li[4]'))).click()
            time.sleep(3)

            # Select After Date from the calendar
            try:
                self.wait.until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[7]/div/div[2]/span/div[2]/div[1]/c-wiz/div/div[3]/div/div[2]/div[1]/div/div[1]/div[1]/div/span/span/div'))).click()
            except TimeoutException:
                self.wait.until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[7]/div/div[2]/span/div[2]/div/c-wiz/div/div[3]/ul/li[4]/div[2]'))).click()
            time.sleep(2)

            day_of_month_xpath = "//*[@data-day-of-month='{}']".format(Config.DELETE_WATCH_HISTORY_AFTER_DATE.split('-')[0])
            try:
                self.driver.find_element_by_xpath(day_of_month_xpath).click()
            except (NoSuchElementException, TimeoutException):
                # find_element_* raises NoSuchElementException when the day is not rendered yet,
                # so this is the exception that must trigger the explicit wait
                self.wait.until(EC.element_to_be_clickable((By.XPATH, day_of_month_xpath))).click()
            time.sleep(2)

            # Click the DELETE button to proceed
            self.wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/div[7]/div/div[2]/span/div[2]/div[1]/c-wiz/div/div[4]/div/div[2]/button'))).click()
        else:
            # Select to DELETE ALL Activity
            self.wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/div[7]/div/div[2]/span/div[2]/div/c-wiz/div/div[3]/ul/li[3]'))).click()

        # Click the CONFIRM DELETE BUTTON (If it exists because if it does not then it means that there is no activity)
        time.sleep(3)
        try:
            self.driver.find_element_by_xpath('/html/body/div[7]/div/div[2]/span/div[2]/div[1]/c-wiz/div/div[4]/div/div[2]/button').click()
        except NoSuchElementException:
            # Let it pass since it means that there is no watch history to delete
            print('[{}] There is no Watch History to delete'.format(self.USER_PROFILE))

    def is_user_authenticated(self):
        """
        Method that verifies whether the user is authenticated or not.
        NOTE: it navigates the browser to the YouTube home page.
        :return: True when the avatar button of a logged-in user is present, False otherwise
        """
        self.driver.get('https://www.youtube.com')
        time.sleep(3)
        try:
            self.wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="avatar-btn"]')))
        except (TimeoutException, NoSuchElementException):
            # WebDriverWait raises selenium's TimeoutException (not the builtin TimeoutError)
            print('--- USER IS NOT AUTHENTICATED')
            return False
        print('--- USER IS AUTHENTICATED')
        return True

    def get_video_metadata(self, video_id, retrieve_recommended_videos):
        """
        Method that downloads the metadata of the given YouTube Video using YouTube Data API
        :param video_id: YouTube Video Id
        :param retrieve_recommended_videos: forwarded to the downloader to request the recommended videos as well
        :return: the video metadata dict with 'retrievedAt' set and 'statistics' reset to an empty dict,
                 or None when the downloader returned no metadata
        """
        # Get Video Metadata
        video_metadata = self.YOUTUBE_VIDEO_DOWNLOADER.download_video_metadata(video_id=video_id, retrieve_recommended_videos=retrieve_recommended_videos)
        if video_metadata is None:
            # Callers explicitly handle a None result, so propagate it instead of failing below
            return None

        # Add additional information
        video_metadata['retrievedAt'] = str(dt.now())
        video_metadata['statistics'] = dict()
        return video_metadata

    def crawl_youtube_video_using_api(self, video_id):
        """
        Method that gets the metadata and the recommended videos of the given YouTube Video
        using the YouTube Data API and stores them in MongoDB
        :param video_id: YouTube Video Id
        :return: the video metadata dict, or None when the metadata could not be retrieved
        """
        # Check if Video already exists in MongoDB
        video_metadata = self.audit_framework_videos_col.find_one({'id': video_id})
        if not video_metadata:
            # Get Video Metadata using YouTube Data API
            video_metadata = self.get_video_metadata(video_id=video_id, retrieve_recommended_videos=True)
            if video_metadata is None:
                return None

            # Add Video Annotation information
            video_metadata['classification'] = dict()
            video_metadata['classification']['classification_category'] = None

            # Insert video to MongoDB
            self.audit_framework_videos_col.insert_one(video_metadata)
        else:
            # Retrieve Related Videos only
            video_metadata['relatedVideos'] = self.YOUTUBE_VIDEO_DOWNLOADER.get_recommended_videos(video_id=video_id)

            # Update Video RelatedVideos in MongoDB
            self.audit_framework_videos_col.update_one(
                {'id': video_id},
                {'$set': {'relatedVideos': video_metadata['relatedVideos']}},
                upsert=True
            )
        return video_metadata

    def crawl_watch_youtube_video(self, video_id, hop_number):
        """
        Method that downloads the metadata of the given YouTube Video, collects its top
        recommended videos from the video page and (optionally) watches the video
        :param video_id: YouTube Video Id
        :param hop_number: the hop of the random walk; videos are only watched when
                           Config.AUDIT_RANDOM_WALKS_WATCH_VIDEO is set and the hop number is below 5
        :return: the video metadata dict (including 'relatedVideos'), or None when the video
                 is a livestream or its metadata could not be retrieved
        :raises SystemExit: when a personalized User Profile is not authenticated
        """
        # Find whether we should watch the video or not
        watch_curr_video = bool(Config.AUDIT_RANDOM_WALKS_WATCH_VIDEO and hop_number < 5)

        # Check if user is Authenticated before proceeding. This has to happen BEFORE loading the
        # video page because the check navigates the browser to the YouTube home page.
        if self.USER_PROFILE != 'NO_PERSONALIZATION' and not self.is_user_authenticated():
            sys.exit(1)

        # Load YouTube Video Page
        self.driver.get('https://www.youtube.com/watch?v={}&autoplay=1'.format(video_id))
        time.sleep(3)

        # Check if LiveStream
        try:
            isLivestream = self.wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[4]/div[1]/div/div[5]/div[2]/ytd-video-primary-info-renderer/div/div/div[1]/div[2]/yt-formatted-string'))).text
            if 'started streaming' in isLivestream.lower():
                print('[VIDEO: {}] is a LIVESTREAM. Skipping and choosing another video...'.format(video_id))
                return None
        except TimeoutException:
            pass

        # Start by Watching the Video (the play button is looked up at two alternative positions)
        if watch_curr_video:
            try:
                self.wait.until(EC.element_to_be_clickable((By.XPATH, '/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[4]/div[1]/div/div[1]/div/div/div/ytd-player/div/div/div[4]/button'))).click()
            except TimeoutException:
                try:
                    self.wait.until(EC.element_to_be_clickable((By.XPATH, '/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[4]/div[1]/div/div[1]/div/div/div/ytd-player/div/div/div[5]/button'))).click()
                except TimeoutException:
                    pass

        # Keep the time needed to crawl the video details
        video_crawl_started = time.time()
        self.driver.execute_script("window.scrollTo(0, 800)")

        #
        # DOWNLOAD VIDEO METADATA
        #
        # Check if Video already exists in MongoDB
        video_metadata = self.audit_framework_videos_col.find_one({'id': video_id})
        video_exists = bool(video_metadata)
        if not video_exists:
            # Get Video Metadata using YouTube Data API
            video_metadata = self.get_video_metadata(video_id=video_id, retrieve_recommended_videos=False)
            if video_metadata is None:
                return None

            # Add Video Annotation information
            video_metadata['classification'] = dict()
            video_metadata['classification']['classification_category'] = None

        #
        # GET RELATED VIDEOS (no matter if the video exists or not)
        #
        print('--- [VIDEO: {}] GETTING TOP {} RECOMMENDED VIDEOS...'.format(video_id, Config.AUDIT_RANDOM_WALKS_RECOMMENDED_VIDEOS_THRESHOLD))
        related_videos_list = list()
        for related_video_item in self.driver.find_elements_by_xpath('//*[@id="thumbnail"]'):
            try:
                # Keep only the value of the "v" parameter of the thumbnail link
                related_video_id = related_video_item.get_attribute('href').split('v=')[1].split('&')[0]
            except (AttributeError, IndexError):
                # Thumbnail without a link or without a "v" parameter
                continue
            related_videos_list.append(related_video_id)
            if len(related_videos_list) == Config.AUDIT_RANDOM_WALKS_RECOMMENDED_VIDEOS_THRESHOLD:
                break
        print('--- [VIDEO: {0}] TOP {1} RECOMMENDED VIDEOS: {2}'.format(video_id, Config.AUDIT_RANDOM_WALKS_RECOMMENDED_VIDEOS_THRESHOLD, related_videos_list))
        video_metadata['relatedVideos'] = related_videos_list
        video_metadata['updatedAt'] = str(dt.now())

        # STORE VIDEO INFORMATION IN MongoDB
        if not video_exists:
            # Insert Video Details in MongoDB
            self.audit_framework_videos_col.insert_one(video_metadata)
        else:
            # Update Video Details in MongoDB
            self.audit_framework_videos_col.replace_one({'id': video_id}, video_metadata, upsert=True)

        # WATCH VIDEO
        if watch_curr_video:
            # Calculate Video Crawl Duration
            video_crawl_duration_sec = time.time() - video_crawl_started

            # Read Video Duration
            video_duration_sec = Utils.convert_youtube_video_duration_to_seconds(video_duration=video_metadata['contentDetails']['duration'])

            # Calculate the final watch time percentage to watch
            final_video_duration_sec = int((video_duration_sec * Config.AUDIT_RANDOM_WALKS_WATCH_VIDEO_PERCENTAGE) / 100)
            # The crawl may have taken longer than the watch time; time.sleep() rejects negative values
            final_video_duration_sec = max(0, final_video_duration_sec - video_crawl_duration_sec)
            print('[{0}] - Sleeping for {1} seconds to watch the full VIDEO: {2}'.format(dt.now().strftime("%d-%m-%Y %H:%M:%S"), final_video_duration_sec, video_id))
            time.sleep(final_video_duration_sec)
        return video_metadata

    def search_youtube(self):
        """
        Method that searches YouTube using a predefined SEARCH TERM and returns the
        Video IDs of the top X videos
        :return: a list with at most Config.AUDIT_RANDOM_WALKS_SEARCH_RESULTS_THRESHOLD Video IDs
        """
        from urllib.parse import quote_plus

        print('[{0}] Searching YouTube with SEARCH TERM: {1}'.format(self.USER_PROFILE, self.AUDIT_SEARCH_TERM))

        # Search YouTube using Selenium (the term is URL-encoded so that characters
        # such as '&', '#' or '+' do not corrupt the query string)
        self.driver.get('https://www.youtube.com/results?search_query={}'.format(quote_plus(self.AUDIT_SEARCH_TERM)))
        time.sleep(3)

        # Get the TOP search results
        self.driver.execute_script("window.scrollTo(0, 1500)")
        search_result_videos = list()
        for search_result in self.driver.find_elements_by_xpath('//*[@id="thumbnail"]'):
            try:
                result_url = search_result.get_attribute('href')
            except StaleElementReferenceException:
                continue
            if not result_url or 'v=' not in result_url:
                # Not a link to a video
                continue
            # Keep only the value of the "v" parameter, dropping any parameter that follows it
            search_result_videos.append(result_url.split('v=')[1].split('&')[0])
            if len(search_result_videos) == Config.AUDIT_RANDOM_WALKS_SEARCH_RESULTS_THRESHOLD:
                break
        return search_result_videos

    def search_youtube_using_api(self):
        """
        Method that searches YouTube using the YouTube Data API with a predefined SEARCH TERM
        and returns the Video IDs of the top X videos
        :return: the search results as returned by the YouTube Video Downloader
        """
        print('[{0}] Searching YouTube with SEARCH TERM: {1}'.format(self.USER_PROFILE, self.AUDIT_SEARCH_TERM))

        # Search YouTube (limit the results with the same threshold used by the browser-based search)
        return self.YOUTUBE_VIDEO_DOWNLOADER.search_youtube(
            search_term=self.AUDIT_SEARCH_TERM,
            max_search_results=Config.AUDIT_RANDOM_WALKS_SEARCH_RESULTS_THRESHOLD
        )

    def _crawl_selected_video(self, video_id, hop_number):
        """
        Private helper that crawls the given video with the crawler that matches the User Profile
        :param video_id: YouTube Video Id
        :param hop_number: the hop of the random walk (only used by the browser-based crawler)
        :return: the video metadata dict, or None when the video has to be skipped
        """
        if self.USER_PROFILE == 'YOUTUBE_DATA_API':
            return self.crawl_youtube_video_using_api(video_id=video_id)
        return self.crawl_watch_youtube_video(video_id=video_id, hop_number=hop_number)

    def perform_audit(self):
        """
        Method that performs the YouTube Video Recommendations section Audit Experiment
        for a specific SEARCH TERM for a specific User Profile. More precisely, it performs
        live Random Walks on YouTube's Recommendation Graph starting from videos returned using
        a specific SEARCH TERM. Progress is saved in the Random Walks details file after every
        unique random walk, and the final details are inserted in MongoDB.
        :return: None
        :raises ValueError: when searching YouTube returns no starting videos
        """
        print('--- [{}]-[{}] RANDOM WALKS FOR SEARCH_TERM: {} STARTED'.format(dt.now().strftime("%d-%m-%Y %H:%M:%S"), self.USER_PROFILE, self.AUDIT_SEARCH_TERM))

        # Initialize variables (resume from the latest saved status)
        uses_data_api = self.USER_PROFILE == 'YOUTUBE_DATA_API'
        random_walk_cntr = self.YOUTUBE_RANDOM_WALKS_AUDIT_DETAILS['CURRENT_RANDOM_WALK']
        random_walks_history = self.YOUTUBE_RANDOM_WALKS_AUDIT_DETAILS['RANDOM_WALKS_HISTORY']
        random_walks_details = self.YOUTUBE_RANDOM_WALKS_AUDIT_DETAILS['YOUTUBE_RANDOM_WALKS_AUDIT_DETAILS']

        # Perform All Experiment Repetitions
        while random_walk_cntr < Config.AUDIT_RANDOM_WALKS_TOTAL_REPETITIONS:
            # Init current random walks details
            random_walk_started = time.time()
            curr_random_walk_details = dict()
            curr_random_walk_history = ""

            # DELETE User WATCH HISTORY from the day after the User Profile training
            if self.USER_PROFILE != 'NO_PERSONALIZATION' and not uses_data_api:
                self.clear_user_watch_history()

            # Get current Random Walk Starting Videos (Search YouTube)
            starting_videos = self.search_youtube_using_api() if uses_data_api else self.search_youtube()
            if not starting_videos:
                # Fail with a meaningful message instead of randrange()'s "empty range" error
                raise ValueError('[{0}] No starting videos found for SEARCH TERM: {1}'.format(self.USER_PROFILE, self.AUDIT_SEARCH_TERM))

            # Choose Random Walk Starting Video randomly from the search results
            # (videos that cannot be crawled, e.g., livestreams, are skipped)
            curr_selected_video_id = None
            curr_selected_video_metadata = None
            while curr_selected_video_metadata is None:
                curr_selected_video_id = random.choice(starting_videos)

                # Crawl randomly selected Video Metadata
                print('\n[{}]-[RANDOM WALK: {}/{} | HOP: 0/{}] Getting Metadata of VIDEO: {}'.format(self.USER_PROFILE, random_walk_cntr + 1, Config.AUDIT_RANDOM_WALKS_TOTAL_REPETITIONS, Config.AUDIT_RANDOM_WALKS_MAX_HOPS, curr_selected_video_id))
                curr_selected_video_metadata = self._crawl_selected_video(video_id=curr_selected_video_id, hop_number=0)

            # Add HOP 0 selected Video Details
            curr_random_walk_details['hop_0'] = {
                'video_id': curr_selected_video_id,
                'label': None,
                'relatedVideos': curr_selected_video_metadata['relatedVideos']
            }
            # Add Video ID to the current Random Walk history
            # TODO: change underscore to another character not used in YouTube VideoIds
            curr_random_walk_history += '{}_'.format(curr_selected_video_id)

            # Perform Random Walk (HOP 1 up to Config.AUDIT_RANDOM_WALKS_MAX_HOPS)
            for hop_num in range(1, Config.AUDIT_RANDOM_WALKS_MAX_HOPS + 1):
                # [STEP 1] Get the Recommended videos of the visited Video of the previous Hop
                previous_hop_video_recommendations = curr_selected_video_metadata['relatedVideos']
                if not previous_hop_video_recommendations:
                    # Dead end: the previous video has no recommendations
                    break

                # [STEP 2] Select randomly the Video to visit in the current Hop from among the
                # recommended videos of the previous Hop's Video, and
                # [STEP 3] Crawl the randomly selected Video Details (skipped videos are re-drawn)
                curr_selected_video_metadata = None
                while curr_selected_video_metadata is None:
                    curr_selected_video_id = random.choice(previous_hop_video_recommendations)
                    print('[{}]-[RANDOM WALK: {} | HOP: {}/{}] Getting information of VideoID: {}'.format(self.USER_PROFILE, random_walk_cntr + 1, hop_num, Config.AUDIT_RANDOM_WALKS_MAX_HOPS, curr_selected_video_id))
                    curr_selected_video_metadata = self._crawl_selected_video(video_id=curr_selected_video_id, hop_number=hop_num)

                # Add Video ID to the current Random Walk history only once the video has really
                # been visited, so that skipped videos do not pollute the history
                curr_random_walk_history += '{}_'.format(curr_selected_video_id)

                # [STEP 4] Add current HOP selected Video Details
                curr_random_walk_details['hop_{}'.format(hop_num)] = {
                    'video_id': curr_selected_video_id,
                    'label': None,
                    'relatedVideos': curr_selected_video_metadata['relatedVideos']
                }

            # Ensure that this Random Walk is Unique; duplicates are discarded and repeated
            if curr_random_walk_history not in random_walks_history:
                # Update the list with all the unique random walks history
                random_walks_history.append(curr_random_walk_history)
                # Update Random Walks details with the current Random Walk
                random_walks_details.append(curr_random_walk_details)
                # Increase Random Walks Counter
                random_walk_cntr += 1

                # Update RANDOM WALKS DETAILS file
                self.YOUTUBE_RANDOM_WALKS_AUDIT_DETAILS['STATUS'] = 'RUNNING'
                self.YOUTUBE_RANDOM_WALKS_AUDIT_DETAILS['CURRENT_RANDOM_WALK'] = random_walk_cntr
                self.YOUTUBE_RANDOM_WALKS_AUDIT_DETAILS['RANDOM_WALKS_HISTORY'] = random_walks_history
                self.YOUTUBE_RANDOM_WALKS_AUDIT_DETAILS['YOUTUBE_RANDOM_WALKS_AUDIT_DETAILS'] = random_walks_details
                self.save_youtube_random_walks_details()

            random_walk_ended = time.time()
            print('--- RANDOM WALK took {:.2f} mins.'.format((random_walk_ended - random_walk_started) / 60))

        print('---[{}] RANDOM WALKS using SEARCH_TERM {} COMPLETED!'.format(self.USER_PROFILE, self.AUDIT_SEARCH_TERM))

        # Update RANDOM WALKS DETAILS file
        self.YOUTUBE_RANDOM_WALKS_AUDIT_DETAILS['STATUS'] = 'COMPLETED'
        self.YOUTUBE_RANDOM_WALKS_AUDIT_DETAILS['CURRENT_RANDOM_WALK'] = Config.AUDIT_RANDOM_WALKS_TOTAL_REPETITIONS
        self.save_youtube_random_walks_details()

        # INSERT Random Walks Details to the Database
        random_walks_details_db = {
            'user_profile_type': self.USER_PROFILE,
            'seed_search_term_topic': self.YOUTUBE_RANDOM_WALKS_AUDIT_DETAILS['SEED_SEARCH_TERM'],
            'search_results_threshold': self.YOUTUBE_RANDOM_WALKS_AUDIT_DETAILS['SEARCH_RESULTS_THRESHOLD'],
            'random_walks_max_hops': self.YOUTUBE_RANDOM_WALKS_AUDIT_DETAILS['RANDOM_WALK_MAX_HOPS'],
            'random_walks_history': self.YOUTUBE_RANDOM_WALKS_AUDIT_DETAILS['RANDOM_WALKS_HISTORY'],
            'random_walks_details': self.YOUTUBE_RANDOM_WALKS_AUDIT_DETAILS['YOUTUBE_RANDOM_WALKS_AUDIT_DETAILS']
        }
        self.audit_framework_youtube_video_recommendations.insert_one(random_walks_details_db)
class YouTubeSearchAudit(object):
    """
    Class that provides all the methods to perform audit experiments on YouTube Search Results
    with logged-in users, non-logged-in users, and the YouTube Data API, while also assessing
    the effects of personalization on YouTube's Video recommendations.

    The object can be used as a context manager; leaving the ``with`` block shuts
    down the Selenium browser (when one was started).
    """

    def __init__(self, user_profile, search_term):
        """
        Constructor
        :param user_profile: the User Profile nickname to perform the experiment
        :param search_term: the search term to search YouTube
        """
        # Initialize Variables
        self.USER_PROFILE = user_profile
        self.AUDIT_SEARCH_TERM = search_term

        #
        # Configure Selenium ChromeDriver
        # (skipped for the YOUTUBE_DATA_API profile, which never drives a browser)
        #
        if self.USER_PROFILE != 'YOUTUBE_DATA_API':
            self.driverOptions = ChromeOptions()
            # Set User-Agent
            self.driverOptions.add_argument(Config.USER_AGENT)
            # Set whether headless or not
            if Config.HEADLESS:
                self.driverOptions.add_argument("--headless")
                self.driverOptions.headless = Config.HEADLESS

            # Set HTTPS Proxy Server
            if Config.USE_PROXY:
                user_proxy = self.get_user_proxy_server()
                # 'HOST:PORT' is the untouched placeholder value of the info file
                if user_proxy == 'HOST:PORT':
                    sys.exit('[ERROR] Please set correct HTTPS Proxies in: "youtubeauditframework/userprofiles/info/user_profiles_info.json"')
                self.driverOptions.add_argument('--proxy-server={}'.format(user_proxy))

            # Disable Automation Flags
            self.driverOptions.add_experimental_option("excludeSwitches", ["enable-automation"])
            self.driverOptions.add_experimental_option('useAutomationExtension', False)
            self.driverOptions.add_argument('--disable-web-security')
            self.driverOptions.add_argument('--allow-running-insecure-content')

            # Set User Profile unique Data directory
            self.driverOptions.add_argument(
                "user-data-dir={0}/{1}-data".format(Config.USER_PROFILE_DATA_DIR, self.USER_PROFILE))

            # Find ChromeDriver (each User Profile gets its own copy of the executable)
            self.webdriver_executable = '{0}/chromedriver_{1}'.format(
                Config.CHROMEDRIVER_BASE_DIR, self.USER_PROFILE)
            if not os.path.isfile(self.webdriver_executable):
                copyfile(src='{0}/chromedriver'.format(Config.CHROMEDRIVER_BASE_DIR),
                         dst=self.webdriver_executable)

            # Create ChromeDriver
            self.driver = webdriver.Chrome(options=self.driverOptions,
                                           executable_path=self.webdriver_executable)

            # Maximize Window
            self.driver.maximize_window()
            self.wait = WebDriverWait(self.driver, Config.WEBDRIVER_ELEMENT_DELAY)

        #
        # MongoDB Configuration
        #
        # Host and Port
        self.client = MongoClient('localhost', 27017)
        # DB name
        self.db = self.client[Config.DB_NAME]
        # Collections name
        self.audit_framework_videos_col = self.db[Config.AUDIT_FRAMEWORK_VIDEOS_COL]
        self.audit_framework_youtube_search_col = self.db[Config.AUDIT_FRAMEWORK_YOUTUBE_SEARCH_COL]

        # Load YouTube Search Audit Experiment latest details from the logfile
        self.YOUTUBE_SEARCH_AUDIT_DETAILS = self.load_youtube_search_experiment_details()

        #
        # YOUTUBE DATA API HELPER
        #
        # Create a YouTube Video Helper
        self.YOUTUBE_VIDEO_DOWNLOADER = YouTubeVideoDownloader()
        return

    def __del__(self):
        # Close Selenium Browser
        self.close_selenium_browser()
        return

    def __enter__(self):
        # Allows `with YouTubeSearchAudit(...) as audit:`; cleanup happens in __exit__
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Close Selenium Browser
        self.close_selenium_browser()
        return

    def close_selenium_browser(self):
        """
        Method that shuts down the Selenium browser, if one was started.
        Safe to call more than once, and safe when the constructor never created a driver.
        :return: None
        """
        if getattr(self, 'USER_PROFILE', None) == 'YOUTUBE_DATA_API':
            return
        driver = getattr(self, 'driver', None)
        if driver is None:
            return
        # Forget the driver first so that a failing shutdown is never retried
        self.driver = None
        # quit() ends the whole WebDriver session; close() only closes the current window
        driver.quit()
        return

    @staticmethod
    def _extract_video_id(href):
        """
        Method that extracts the YouTube Video Id from a thumbnail link
        :param href: the value of the link's "href" attribute (may be None)
        :return: the Video Id without any trailing URL parameters, or None when
                 the link does not point to a video
        """
        if not href or 'v=' not in href:
            return None
        # Drop every parameter that follows the Video Id (e.g., "&list=...")
        return href.split('v=')[1].split('&')[0]

    def _get_logfile_path(self):
        """
        Method that builds the path of the logfile of this experiment
        :return: the logfile path for the current SEARCH TERM and User Profile
        """
        return Config.AUDIT_YOUTUBE_SEARCH_LOGFILE.format(
            self.AUDIT_SEARCH_TERM.replace(' ', '-'), self.USER_PROFILE)

    def get_user_proxy_server(self):
        """
        Method that finds the proxy server of the User Profile.
        Terminates the program when the User Profile is not listed in the info file.
        :return: the value of the "proxy" field of the matching User Profile
        """
        user_profiles_info = Utils.read_json_file(filename=Config.USER_PROFILES_INFO_FILENAME)
        for user_profile in user_profiles_info:
            if user_profile['nickname'] == self.USER_PROFILE:
                return user_profile['proxy']
        print('[{0}] Cannot find the HTTPS Proxy server of this User Profile'.format(self.USER_PROFILE))
        sys.exit(errno.ECANCELED)

    def load_youtube_search_experiment_details(self):
        """
        Method that reads the YouTube Search Audit experiment details (latest status) file
        :return: a dict with the contents of the file, or a fresh details dict
                 (STATUS 'STOPPED', repetition 0) when the file does not exist
        """
        # Read status details from file if it exists
        logfile_path = self._get_logfile_path()
        if os.path.isfile(logfile_path):
            with open(file=logfile_path) as file:
                return dict(json.load(file))

        # Create a new JSON dict and return
        search_experiment_details = {
            'STATUS': 'STOPPED',
            'USER_PROFILE_TYPE': self.USER_PROFILE,
            'SEARCH_TERM': self.AUDIT_SEARCH_TERM,
            'SEARCH_RESULTS_THRESHOLD': Config.AUDIT_SEARCH_RESULTS_THRESHOLD,
            'EXPERIMENT_TOTAL_REPETITIONS': Config.AUDIT_YOUTUBE_SEARCH_TOTAL_REPETITIONS,
            'CURRENT_EXPERIMENT_REPETITION': 0,
            'SEARCH_EXPERIMENT_DETAILS': []
        }
        return search_experiment_details

    def save_youtube_search_experiment_details(self):
        """
        Method that writes the current YouTube Search Audit experiment details in the logfile
        :return: None
        """
        # The context manager guarantees that the file is flushed and closed
        with open(file=self._get_logfile_path(), mode='w') as logfile:
            print(json.dumps(self.YOUTUBE_SEARCH_AUDIT_DETAILS, sort_keys=False, ensure_ascii=False, indent=4),
                  file=logfile)
        return

    def clear_user_watch_history(self):
        """
        Method that deletes the YouTube Watch History of the logged-in User Profile.
        When Config.DELETE_WATCH_HISTORY_AFTER_DATE is set, a custom range is selected in the
        deletion dialog using the day of that date; otherwise all the activity is deleted.
        NOTE: the absolute XPaths below depend on the markup of the My Activity page.
        :return: None
        """
        # Load YouTube Activity Control Management page
        self.driver.get('https://myactivity.google.com/activitycontrols/youtube?utm_source=my-activity&hl=en')
        time.sleep(2)

        # Click "Deleted Activity by" button
        try:
            self.wait.until(EC.presence_of_element_located(
                (By.XPATH, '/html/body/div[2]/header/div[4]/div[2]/div/c-wiz/div/div/nav/a[3]'))).click()
        except TimeoutException:
            # Click the other Delete button
            self.wait.until(EC.presence_of_element_located(
                (By.XPATH, '/html/body/c-wiz/div/div[2]/c-wiz[1]/div/div/div[2]/div[2]/div/button'))).click()
        time.sleep(2)

        if Config.DELETE_WATCH_HISTORY_AFTER_DATE is not None:
            # Select to DELETE CUSTOM RANGE Activity
            self.wait.until(EC.presence_of_element_located(
                (By.XPATH, '/html/body/div[7]/div/div[2]/span/div[2]/div/c-wiz/div/div[3]/ul/li[4]'))).click()
            time.sleep(3)

            # Select After Date from the calendar
            try:
                self.wait.until(EC.element_to_be_clickable(
                    (By.XPATH, '/html/body/div[7]/div/div[2]/span/div[2]/div[1]/c-wiz/div/div[3]/div/div[2]/div[1]/div/div[1]/div[1]/div/span/span/div'))).click()
            except TimeoutException:
                self.wait.until(EC.element_to_be_clickable(
                    (By.XPATH, '/html/body/div[7]/div/div[2]/span/div[2]/div/c-wiz/div/div[3]/ul/li[4]/div[2]'))).click()
            time.sleep(2)

            # The first '-'-separated field of the configured date selects the calendar day
            day_xpath = "//*[@data-day-of-month='{}']".format(
                Config.DELETE_WATCH_HISTORY_AFTER_DATE.split('-')[0])
            try:
                self.driver.find_element_by_xpath(day_xpath).click()
            except (NoSuchElementException, TimeoutException):
                # A direct lookup fails with NoSuchElementException when the calendar is
                # not rendered yet, so fall back to explicitly waiting for the day
                self.wait.until(EC.element_to_be_clickable((By.XPATH, day_xpath))).click()
            time.sleep(2)

            # Click the DELETE button to proceed
            self.wait.until(EC.presence_of_element_located(
                (By.XPATH, '/html/body/div[7]/div/div[2]/span/div[2]/div[1]/c-wiz/div/div[4]/div/div[2]/button'))).click()
        else:
            # Select to DELETE ALL Activity
            self.wait.until(EC.presence_of_element_located(
                (By.XPATH, '/html/body/div[7]/div/div[2]/span/div[2]/div/c-wiz/div/div[3]/ul/li[3]'))).click()

        # Click the CONFIRM DELETE BUTTON (If it exists because if it does not then it means that there is no activity)
        time.sleep(3)
        try:
            self.driver.find_element_by_xpath(
                '/html/body/div[7]/div/div[2]/span/div[2]/div[1]/c-wiz/div/div[4]/div/div[2]/button').click()
        except NoSuchElementException:
            # Let it pass since it means that there is no watch history to delete
            print('[{}] There is no Watch History to delete'.format(self.USER_PROFILE))
        return

    def is_user_authenticated(self):
        """
        Method that verifies whether the user is authenticated or not, by loading the
        YouTube homepage and waiting for the element with id "avatar-btn"
        :return: True when the element is found, False otherwise
        """
        self.driver.get('https://www.youtube.com')
        time.sleep(3)
        try:
            self.wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="avatar-btn"]')))
        except (TimeoutException, TimeoutError, NoSuchElementException):
            # WebDriverWait.until() signals an expired wait with Selenium's TimeoutException
            print('--- USER IS NOT AUTHENTICATED')
            return False
        print('--- USER IS AUTHENTICATED')
        return True

    def get_video_metadata(self, video_id):
        """
        Method that downloads the metadata of the given YouTube Video using YouTube Data API
        :param video_id: YouTube Video Id
        :return: the Video metadata extended with "retrievedAt" and an empty "statistics"
                 dict, or None when the downloader returned no metadata
        """
        # Get Video Metadata
        video_metadata = self.YOUTUBE_VIDEO_DOWNLOADER.download_video_metadata(
            video_id=video_id, retrieve_recommended_videos=False)
        if video_metadata is None:
            # Nothing to extend; callers such as crawl_youtube_video() check for None
            return None

        # Add additional information
        video_metadata['retrievedAt'] = str(dt.now())
        video_metadata['statistics'] = dict()
        return video_metadata

    def crawl_youtube_video(self, video_id):
        """
        Method that returns the information of a given YouTube Video Id. The Video is read
        from MongoDB when it is already stored; otherwise its metadata is downloaded,
        given an empty classification, and inserted in MongoDB.
        Terminates the program when a logged-in User Profile is not authenticated.
        :param video_id: YouTube Video Id
        :return: the information of the given video, or None when it cannot be downloaded
        """
        # Check if user is Authenticated before proceeding
        requires_login = self.USER_PROFILE not in ('NO_PERSONALIZATION', 'YOUTUBE_DATA_API')
        if requires_login and not self.is_user_authenticated():
            sys.exit(1)

        #
        # DOWNLOAD VIDEO METADATA
        #
        # Check if Video already exists in MongoDB
        video_metadata = self.audit_framework_videos_col.find_one({'id': video_id})
        if not video_metadata:
            # Get Video Metadata using YouTube Data API
            video_metadata = self.get_video_metadata(video_id=video_id)
            if video_metadata is None:
                return None

            # Add Video Annotation information
            video_metadata['classification'] = {'classification_category': None}

            # Insert video to MongoDB
            self.audit_framework_videos_col.insert_one(video_metadata)
        return video_metadata

    def search_youtube(self):
        """
        Method that searches YouTube using a predefined SEARCH TERM and returns the Video IDs
        of the top X videos, where X is Config.AUDIT_SEARCH_RESULTS_THRESHOLD
        :return: a list with at most X Video Ids, in the order they appear in the page
        """
        print('[{0}] Searching YouTube with SEARCH TERM: {1}'.format(self.USER_PROFILE, self.AUDIT_SEARCH_TERM))

        # Search YouTube using Selenium
        self.driver.get('https://www.youtube.com/results?search_query={}'.format(self.AUDIT_SEARCH_TERM))
        time.sleep(3)

        # Get the TOP search results
        self.driver.execute_script("window.scrollTo(0, 1500)")
        search_result_videos = list()
        search_result_items = self.driver.find_elements_by_xpath('//*[@id="thumbnail"]')
        for search_result in search_result_items:
            try:
                video_id = self._extract_video_id(search_result.get_attribute('href'))
            except StaleElementReferenceException:
                # The element was replaced while the page was still rendering
                continue
            if video_id is None:
                continue
            search_result_videos.append(video_id)
            if len(search_result_videos) == Config.AUDIT_SEARCH_RESULTS_THRESHOLD:
                break
        return search_result_videos

    def search_youtube_using_api(self):
        """
        Method that searches YouTube using the YouTube Data API with a predefined SEARCH TERM
        and returns the Video IDs of the top X videos
        :return: the search results returned by the YouTube Video Downloader helper
        """
        print('[{0}] Searching YouTube with SEARCH TERM: {1}'.format(self.USER_PROFILE, self.AUDIT_SEARCH_TERM))

        # Search YouTube; the number of results is bounded by the same threshold
        # that search_youtube() and perform_audit() use
        search_result_videos = self.YOUTUBE_VIDEO_DOWNLOADER.search_youtube(
            search_term=self.AUDIT_SEARCH_TERM,
            max_search_results=Config.AUDIT_SEARCH_RESULTS_THRESHOLD)
        return search_result_videos

    def perform_audit(self):
        """
        Method that performs the YouTube Search Audit Experiment for a specific SEARCH TERM
        for a specific User Profile. The experiment resumes from the repetition stored in
        the logfile, the logfile is updated after every repetition, and the complete
        experiment details are inserted in MongoDB at the end.
        :return: None
        """
        print('--- [{}] EXPERIMENT FOR SEARCH_TERM: {} STARTED at {}'.format(
            self.USER_PROFILE, self.AUDIT_SEARCH_TERM, dt.now().strftime("%d-%m-%Y %H:%M:%S")))

        # Initialize variables
        repetitions_cntr = self.YOUTUBE_SEARCH_AUDIT_DETAILS['CURRENT_EXPERIMENT_REPETITION']
        audit_experiment_details = self.YOUTUBE_SEARCH_AUDIT_DETAILS['SEARCH_EXPERIMENT_DETAILS']

        # Clear watch history
        # https://myactivity.google.com/activitycontrols/youtube?utm_source=my-activity&hl=en
        if self.USER_PROFILE != 'NO_PERSONALIZATION' and self.USER_PROFILE != 'YOUTUBE_DATA_API':
            self.clear_user_watch_history()

        # Perform All Experiment Repetitions
        while repetitions_cntr < Config.AUDIT_YOUTUBE_SEARCH_TOTAL_REPETITIONS:
            # Init current repetition details
            curr_repetition_details = dict()
            print('\n--- [{}]-[{}/{}] Experiment repetition for SEARCH TERM {} STARTED'.format(
                self.USER_PROFILE, repetitions_cntr + 1,
                Config.AUDIT_YOUTUBE_SEARCH_TOTAL_REPETITIONS, self.AUDIT_SEARCH_TERM))

            # Perform YouTube Search Audit experiment repetition
            # NOTE: the search is repeated until it yields at least the threshold of results
            search_results = list()
            while len(search_results) < Config.AUDIT_SEARCH_RESULTS_THRESHOLD:
                if self.USER_PROFILE == 'YOUTUBE_DATA_API':
                    search_results = self.search_youtube_using_api()
                else:
                    search_results = self.search_youtube()
            curr_repetition_details['SEARCH_RESULTS'] = search_results
            curr_repetition_details['CRAWLED_VIDEOS'] = list()
            curr_repetition_details['CRAWLED_VIDEOS_DETAILS'] = list()

            # Crawl current Experiment Repetition Video Details
            for crawled_videos_counter, video_id in enumerate(search_results, start=1):
                # Get Video Metadata
                print('--- [{}] SEARCH TERM: {} | [EXP_ID: {}] | Crawling video information {}/{} with ID: {}'.format(
                    self.USER_PROFILE, self.AUDIT_SEARCH_TERM, repetitions_cntr + 1,
                    crawled_videos_counter, len(search_results), video_id))
                if video_id not in curr_repetition_details['CRAWLED_VIDEOS']:
                    # The returned metadata is not used here
                    # NOTE(review): presumably the downloader persists what it downloads; confirm
                    self.get_video_metadata(video_id=video_id)
                    curr_repetition_details['CRAWLED_VIDEOS'].append(video_id)
                    curr_repetition_details['CRAWLED_VIDEOS_DETAILS'].append({
                        'video_id': video_id,
                        'label': None
                    })

            # Update Search Audit Experiment Logs
            audit_experiment_details.append(curr_repetition_details)

            # Increase Audit Experiment repetition counter
            repetitions_cntr += 1

            # Update Experiment's Details File
            self.YOUTUBE_SEARCH_AUDIT_DETAILS['STATUS'] = 'RUNNING'
            self.YOUTUBE_SEARCH_AUDIT_DETAILS['CURRENT_EXPERIMENT_REPETITION'] = repetitions_cntr
            self.YOUTUBE_SEARCH_AUDIT_DETAILS['SEARCH_EXPERIMENT_DETAILS'] = audit_experiment_details
            self.save_youtube_search_experiment_details()

            # Sleep for 10 minutes between each Experiment repetition to avoid the Carry Over Effect
            print('--- [{}] - [{}] Sleeping for 10 minutes before repetition {}'.format(
                dt.now().strftime("%d-%m-%Y %H:%M:%S"), self.USER_PROFILE, repetitions_cntr + 1))
            time.sleep(10 * 60)

        print('\n--- [{}] YOUTUBE SEARCH EXPERIMENT using SEARCH_TERM {} COMPLETED!'.format(
            self.USER_PROFILE, self.AUDIT_SEARCH_TERM))

        # Update YouTube Search Audit Experiment Logs
        self.YOUTUBE_SEARCH_AUDIT_DETAILS['STATUS'] = 'COMPLETED'
        self.YOUTUBE_SEARCH_AUDIT_DETAILS['CURRENT_EXPERIMENT_REPETITION'] = repetitions_cntr
        self.save_youtube_search_experiment_details()

        # INSERT Audit Experiment Details to the Database
        experiment_details_db = {
            'user_profile_type': self.USER_PROFILE,
            'search_term': self.AUDIT_SEARCH_TERM,
            'search_results_threshold': Config.AUDIT_SEARCH_RESULTS_THRESHOLD,
            'total_repetitions': self.YOUTUBE_SEARCH_AUDIT_DETAILS['CURRENT_EXPERIMENT_REPETITION'],
            'experiment_details': self.YOUTUBE_SEARCH_AUDIT_DETAILS['SEARCH_EXPERIMENT_DETAILS'],
        }
        self.audit_framework_youtube_search_col.insert_one(experiment_details_db)
        return
class YouTubeHomepageAudit(object):
    """
    Class that provides all the methods to perform audit experiments on YouTube's Homepage
    with logged-in users, non-logged-in users, and the YouTube Data API while assessing
    the effects of personalization on YouTube's Video recommendations.

    The object can be used as a context manager; leaving the ``with`` block shuts
    down the Selenium browser.
    """

    def __init__(self, user_profile):
        """
        Constructor
        :param user_profile: the User Profile nickname to perform the experiment
        """
        # Initialize Variables
        self.USER_PROFILE = user_profile

        #
        # Configure Selenium ChromeDriver
        #
        self.driverOptions = ChromeOptions()
        # Set User-Agent
        self.driverOptions.add_argument(Config.USER_AGENT)
        # Set whether headless or not
        if Config.HEADLESS:
            self.driverOptions.add_argument("--headless")
            self.driverOptions.headless = Config.HEADLESS

        # Set HTTPS Proxy Server
        if Config.USE_PROXY:
            user_proxy = self.get_user_proxy_server()
            # 'HOST:PORT' is the untouched placeholder value of the info file
            if user_proxy == 'HOST:PORT':
                sys.exit('[ERROR] Please set correct HTTPS Proxies in: "youtubeauditframework/userprofiles/info/user_profiles_info.json"')
            self.driverOptions.add_argument('--proxy-server={}'.format(user_proxy))

        # Disable Automation Flags
        self.driverOptions.add_experimental_option("excludeSwitches", ["enable-automation"])
        self.driverOptions.add_experimental_option('useAutomationExtension', False)
        self.driverOptions.add_argument('--disable-web-security')
        self.driverOptions.add_argument('--allow-running-insecure-content')

        # Set User Profile unique Data directory
        self.driverOptions.add_argument(
            "user-data-dir={0}/{1}-data".format(Config.USER_PROFILE_DATA_DIR, self.USER_PROFILE))

        # Find ChromeDriver (each User Profile gets its own copy of the executable)
        self.webdriver_executable = '{0}/chromedriver_{1}'.format(
            Config.CHROMEDRIVER_BASE_DIR, self.USER_PROFILE)
        if not os.path.isfile(self.webdriver_executable):
            copyfile(src='{0}/chromedriver'.format(Config.CHROMEDRIVER_BASE_DIR),
                     dst=self.webdriver_executable)

        # Create ChromeDriver
        self.driver = webdriver.Chrome(options=self.driverOptions,
                                       executable_path=self.webdriver_executable)

        # Maximize Window
        self.driver.maximize_window()
        self.wait = WebDriverWait(self.driver, Config.WEBDRIVER_ELEMENT_DELAY)

        #
        # MongoDB Configuration
        #
        # Host and Port
        self.client = MongoClient('localhost', 27017)
        # DB name
        self.db = self.client[Config.DB_NAME]
        # Collections name
        self.audit_framework_videos_col = self.db[Config.AUDIT_FRAMEWORK_VIDEOS_COL]
        self.audit_framework_youtube_homepage_col = self.db[Config.AUDIT_FRAMEWORK_YOUTUBE_HOMEPAGE_COL]

        # Load YouTube Homepage Audit Experiment latest details from the logfile
        self.HOMEPAGE_AUDIT_DETAILS = self.load_youtube_homepage_audit_details()

        #
        # YOUTUBE DATA API HELPER
        #
        # Create a YouTube Video Helper
        self.YOUTUBE_VIDEO_DOWNLOADER = YouTubeVideoDownloader()
        return

    def __del__(self):
        # Close Selenium Browser
        self.close_selenium_browser()
        return

    def __enter__(self):
        # Allows `with YouTubeHomepageAudit(...) as audit:`; cleanup happens in __exit__
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Close Selenium Browser
        self.close_selenium_browser()
        return

    def close_selenium_browser(self):
        """
        Method that shuts down the Selenium browser, if one was started.
        Safe to call more than once, and safe when the constructor failed before
        the driver was created.
        :return: None
        """
        driver = getattr(self, 'driver', None)
        if driver is None:
            return
        # Forget the driver first so that a failing shutdown is never retried
        self.driver = None
        # quit() ends the whole WebDriver session; close() only closes the current window
        driver.quit()
        return

    @staticmethod
    def _extract_video_id(href):
        """
        Method that extracts the YouTube Video Id from a thumbnail link
        :param href: the value of the link's "href" attribute (may be None)
        :return: the Video Id without any trailing URL parameters, or None when
                 the link does not point to a video
        """
        if not href or 'v=' not in href:
            return None
        # Drop every parameter that follows the Video Id (e.g., "&list=...")
        return href.split('v=')[1].split('&')[0]

    def get_user_proxy_server(self):
        """
        Method that finds the proxy server of the User Profile.
        Terminates the program when the User Profile is not listed in the info file.
        :return: the value of the "proxy" field of the matching User Profile
        """
        user_profiles_info = Utils.read_json_file(filename=Config.USER_PROFILES_INFO_FILENAME)
        for user_profile in user_profiles_info:
            if user_profile['nickname'] == self.USER_PROFILE:
                return user_profile['proxy']
        print('[{0}] Cannot find the HTTPS Proxy server of this User Profile'.format(self.USER_PROFILE))
        sys.exit(errno.ECANCELED)

    def load_youtube_homepage_audit_details(self):
        """
        Method that reads the YouTube Homepage Audit experiment details (latest status) file
        :return: a dict with the contents of the file, or a fresh details dict
                 (STATUS 'STOPPED', repetition 0) when the file does not exist
        """
        # Read status details from file if it exists
        logfile_path = Config.AUDIT_YOUTUBE_HOMEPAGE_LOGFILE.format(self.USER_PROFILE)
        if os.path.isfile(logfile_path):
            with open(file=logfile_path) as file:
                return dict(json.load(file))

        # Create a new JSON dict and return
        homepage_audit_details = {
            'STATUS': 'STOPPED',
            'USER_PROFILE_TYPE': self.USER_PROFILE,
            'HOMEPAGE_VIDEOS_THRESHOLD': Config.AUDIT_HOMEPAGE_VIDEOS_THRESHOLD,
            'EXPERIMENT_TOTAL_REPETITIONS': Config.AUDIT_HOMEPAGE_TOTAL_REPETITIONS,
            'CURRENT_EXPERIMENT_REPETITION': 0,
            'HOMEPAGE_EXPERIMENT_DETAILS': []
        }
        return homepage_audit_details

    def save_youtube_homepage_experiment_details(self):
        """
        Method that writes the current YouTube Homepage Audit experiment details in the logfile
        :return: None
        """
        logfile_path = Config.AUDIT_YOUTUBE_HOMEPAGE_LOGFILE.format(self.USER_PROFILE)
        # The context manager guarantees that the file is flushed and closed
        with open(file=logfile_path, mode='w') as logfile:
            print(json.dumps(self.HOMEPAGE_AUDIT_DETAILS, sort_keys=False, ensure_ascii=False, indent=4),
                  file=logfile)
        return

    def is_user_authenticated(self):
        """
        Method that verifies whether the user is authenticated or not, by loading the
        YouTube homepage and waiting for the element with id "avatar-btn"
        :return: True when the element is found, False otherwise
        """
        self.driver.get('https://www.youtube.com')
        time.sleep(3)
        try:
            self.wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="avatar-btn"]')))
        except (TimeoutException, TimeoutError, NoSuchElementException):
            # WebDriverWait.until() signals an expired wait with Selenium's TimeoutException
            print('--- USER IS NOT AUTHENTICATED')
            return False
        print('--- USER IS AUTHENTICATED')
        return True

    def get_video_metadata(self, video_id):
        """
        Method that downloads the metadata of the given YouTube Video using YouTube Data API
        :param video_id: YouTube Video Id
        :return: the Video metadata extended with "retrievedAt" and an empty "statistics"
                 dict, or None when the downloader returned no metadata
        """
        # Get Video Metadata
        video_metadata = self.YOUTUBE_VIDEO_DOWNLOADER.download_video_metadata(
            video_id=video_id, retrieve_recommended_videos=False)
        if video_metadata is None:
            # Nothing to extend; callers such as crawl_youtube_video() check for None
            return None

        # Add additional information
        video_metadata['retrievedAt'] = str(dt.now())
        video_metadata['statistics'] = dict()
        return video_metadata

    def crawl_youtube_video(self, video_id):
        """
        Method that returns the information of a given YouTube Video Id. The Video is read
        from MongoDB when it is already stored; otherwise its metadata is downloaded,
        given an empty classification, and inserted in MongoDB.
        Terminates the program when a logged-in User Profile is not authenticated.
        :param video_id: YouTube Video Id
        :return: the information of the given video, or None when it cannot be downloaded
        """
        # Check if user is Authenticated before proceeding
        if self.USER_PROFILE != 'NO_PERSONALIZATION' and not self.is_user_authenticated():
            sys.exit(1)

        #
        # DOWNLOAD VIDEO INFORMATION
        #
        # Check if Video already exists in MongoDB
        video_metadata = self.audit_framework_videos_col.find_one({'id': video_id})
        if not video_metadata:
            # Get Video Metadata using YouTube Data API
            video_metadata = self.get_video_metadata(video_id=video_id)
            if video_metadata is None:
                return None

            # Add Video Annotation information
            video_metadata['classification'] = {'classification_category': None}

            # Insert video to MongoDB
            self.audit_framework_videos_col.insert_one(video_metadata)
        return video_metadata

    def get_homepage_top_videos(self):
        """
        Method that loads the YouTube Homepage of a user and returns the Top X Video Ids,
        where X is Config.AUDIT_HOMEPAGE_VIDEOS_THRESHOLD
        :return: a list with at most X Video Ids, in the order they appear in the page
        """
        print('--- [{}] Getting User\'s YouTube Homepage Videos'.format(self.USER_PROFILE))

        # Load YouTube User Homepage
        self.driver.get('https://www.youtube.com/')
        time.sleep(5)

        # Get the Top X Videos from the User's YouTube Homepage
        self.driver.execute_script("window.scrollTo(0, 1700)")
        time.sleep(7)
        homepage_videos = list()
        homepage_video_items = self.driver.find_elements_by_xpath('//*[@id="thumbnail"]')
        for video_item in homepage_video_items:
            try:
                video_id = self._extract_video_id(video_item.get_attribute('href'))
            except StaleElementReferenceException:
                # The element was replaced while the page was still rendering
                continue
            if video_id is None:
                continue
            homepage_videos.append(video_id)
            if len(homepage_videos) == Config.AUDIT_HOMEPAGE_VIDEOS_THRESHOLD:
                break
        return homepage_videos

    def perform_audit(self):
        """
        Method that performs the audit of User's YouTube Homepage. The experiment resumes
        from the repetition stored in the logfile, the logfile is updated after every
        repetition, and the complete experiment details are inserted in MongoDB at the end.
        :return: None
        """
        print('--- [{}] YOUTUBE HOMEPAGE AUDIT STARTED'.format(self.USER_PROFILE))

        # Initialize variables
        repetitions_cntr = self.HOMEPAGE_AUDIT_DETAILS['CURRENT_EXPERIMENT_REPETITION']
        audit_experiment_details = self.HOMEPAGE_AUDIT_DETAILS['HOMEPAGE_EXPERIMENT_DETAILS']

        # Perform All Experiment Repetitions
        while repetitions_cntr < Config.AUDIT_HOMEPAGE_TOTAL_REPETITIONS:
            # Init current repetition details
            curr_repetition_details = dict()
            print('\n--- [{}]-[{}/{}] Experiment Repetition STARTED'.format(
                self.USER_PROFILE, repetitions_cntr + 1, Config.AUDIT_HOMEPAGE_TOTAL_REPETITIONS))

            # Perform YouTube's Homepage Audit experiment repetition
            # NOTE: the homepage is reloaded until it yields at least the threshold of videos
            user_homepage_top_videos = list()
            while len(user_homepage_top_videos) < Config.AUDIT_HOMEPAGE_VIDEOS_THRESHOLD:
                user_homepage_top_videos = self.get_homepage_top_videos()
            curr_repetition_details['USER_HOMEPAGE_VIDEOS'] = user_homepage_top_videos
            curr_repetition_details['CRAWLED_VIDEOS'] = list()
            curr_repetition_details['CRAWLED_VIDEOS_DETAILS'] = list()

            # Download the metadata of all the videos in the Homepage of the User Profile
            for crawled_videos_counter, video_id in enumerate(user_homepage_top_videos, start=1):
                print('--- [{}]-[EXP_ID: {}] Crawling Video {}/{} with ID: {}'.format(
                    self.USER_PROFILE, repetitions_cntr + 1, crawled_videos_counter,
                    len(user_homepage_top_videos), video_id))

                # Download Video Metadata
                if video_id not in curr_repetition_details['CRAWLED_VIDEOS']:
                    # The returned metadata is not used here
                    # NOTE(review): presumably the downloader persists what it downloads; confirm
                    self.get_video_metadata(video_id=video_id)
                    curr_repetition_details['CRAWLED_VIDEOS'].append(video_id)
                    curr_repetition_details['CRAWLED_VIDEOS_DETAILS'].append({
                        'video_id': video_id,
                        'label': None
                    })

            # Update Homepage Audit Experiment Logs
            audit_experiment_details.append(curr_repetition_details)

            # Increase Audit Experiment repetition counter
            repetitions_cntr += 1

            # Update Experiment's Details File
            self.HOMEPAGE_AUDIT_DETAILS['STATUS'] = 'RUNNING'
            self.HOMEPAGE_AUDIT_DETAILS['CURRENT_EXPERIMENT_REPETITION'] = repetitions_cntr
            self.HOMEPAGE_AUDIT_DETAILS['HOMEPAGE_EXPERIMENT_DETAILS'] = audit_experiment_details
            self.save_youtube_homepage_experiment_details()

            # Sleep for 10 minutes between each Experiment repetition to avoid the Carry Over Effect
            print('--- [{}] - [{}] Sleeping for 10 minutes before repetition {}'.format(
                dt.now().strftime("%d-%m-%Y %H:%M:%S"), self.USER_PROFILE, repetitions_cntr + 1))
            time.sleep(10 * 60)

        print('\n--- [{}] YOUTUBE HOMEPAGE AUDIT EXPERIMENT COMPLETED!'.format(self.USER_PROFILE))

        # Update Homepage Audit Experiment Logs
        self.HOMEPAGE_AUDIT_DETAILS['STATUS'] = 'COMPLETED'
        self.HOMEPAGE_AUDIT_DETAILS['CURRENT_EXPERIMENT_REPETITION'] = repetitions_cntr
        self.save_youtube_homepage_experiment_details()

        # INSERT Audit Experiment Details to the Database
        experiment_details_db = {
            'user_profile_type': self.USER_PROFILE,
            'homepage_videos_threshold': self.HOMEPAGE_AUDIT_DETAILS['HOMEPAGE_VIDEOS_THRESHOLD'],
            'total_repetitions': self.HOMEPAGE_AUDIT_DETAILS['CURRENT_EXPERIMENT_REPETITION'],
            'experiment_details': self.HOMEPAGE_AUDIT_DETAILS['HOMEPAGE_EXPERIMENT_DETAILS'],
        }
        self.audit_framework_youtube_homepage_col.insert_one(experiment_details_db)
        return
class DownloadAnnotateExperimentsVideos(object):
    """
    Class that downloads all the required information and annotates
    all the videos encountered during our experiments
    """

    def __init__(self):
        #
        # MongoDB Configuration
        #
        # Host and Port
        self.client = MongoClient('localhost', 27017)
        # DB name
        self.db = self.client[Config.DB_NAME]
        # Collections
        self.audit_framework_videos_col = self.db[Config.AUDIT_FRAMEWORK_VIDEOS_COL]

        # Create a YouTube Video Downloader Object
        self.VIDEO_DOWNLOADER = YouTubeVideoDownloader()

        # Create Video Classifier Object
        self.VIDEO_ANNOTATOR = PseudoscienceClassifier()
        return

    def get_all_notannotated_videos(self):
        """
        Method that returns a list with all the YouTube Video encountered
        during the experiments and have not been annotated
        :return: a list with the "id" of every video whose
                 "classification.classification_category" is None
        """
        all_notannotated_videos = self.audit_framework_videos_col.find(
            {'classification.classification_category': None})
        return [video_info['id'] for video_info in all_notannotated_videos]

    def delete_videos_labels(self):
        """
        Method that deletes the label of all the videos in the collection
        by unsetting their "classification" field
        :return: None
        """
        self.audit_framework_videos_col.update_many({}, {'$unset': {'classification': 1}})
        return

    def annotate_videos(self):
        """
        Method that annotates all the non-annotated videos. For each video it downloads
        the comments and the transcript, classifies the video, and stores the label
        in "classification.classification_category".
        :return: None
        """
        # Get all not annotated videos
        all_videos = self.get_all_notannotated_videos()

        # Download the information and annotate videos
        # (the context manager closes the progress bar even when a video fails)
        with tqdm(total=len(all_videos)) as progress_bar:
            for video_id in all_videos:
                print('\n--- [VIDEO: {}] DOWNLOADING INFORMATION AND ANNOTATING VIDEO'.format(video_id))

                # Get Video Details
                video_details = self.audit_framework_videos_col.find_one({'id': video_id})
                if video_details is None:
                    # The video is no longer in the collection; nothing to annotate
                    print('--- [VIDEO: {}] NOT FOUND IN THE DATABASE, SKIPPING'.format(video_id))
                    progress_bar.update(1)
                    continue

                # Download Video Comments
                self.VIDEO_DOWNLOADER.download_video_comments(video_id=video_id)

                # Download Video Transcript
                self.VIDEO_DOWNLOADER.download_video_transcript(video_id=video_id)

                # Annotate Video (the confidence score is not stored)
                video_label, _confidence_score = self.VIDEO_ANNOTATOR.classify(video_details=video_details)

                # Update Video Information
                self.audit_framework_videos_col.update_one(
                    {'id': video_id},
                    {'$set': {'classification.classification_category': video_label}})

                # Sleep to avoid IP address banning when downloading videos' transcript
                # IMPORTANT: Don't change this to less than 5 seconds
                time.sleep(5)
                progress_bar.update(1)
        return