Пример #1
0
 def get_user_profile_watch_videos_from_file(self):
     """
     Read the watch-history videos file of the current User Profile.

     The file is looked up at
     ``<Config.USER_PROFILES_WATCH_VIDEOS_BASE_DIR>/<USER_PROFILE>_watch_history_videos.txt``.

     :return: whatever ``Utils.read_file`` returns for that file (presumably
         the list of YouTube Video IDs to watch -- confirm against Utils)
     """
     watch_history_path = '{0}/{1}_watch_history_videos.txt'.format(
         Config.USER_PROFILES_WATCH_VIDEOS_BASE_DIR,
         self.USER_PROFILE
     )
     return Utils.read_file(filename=watch_history_path)
 def get_video_label(self, video_id):
     """
     Fetch the classification label of the given YouTube Video from MongoDB.

     Only the ``classification`` field of the video document is requested.
     When the document carries no ``classification`` a message is printed and
     the process is terminated through ``sys.exit(0)``.

     :param video_id: YouTube Video Id
     :return: the stored ``classification['classification_category']`` value
     """
     video_record = self.audit_framework_videos_col.find_one({'id': video_id}, {'classification': 1})

     # Guard clause: without a classification there is nothing to return
     if not Utils.key_exists(video_record, 'classification'):
         print('[VIDEO: {}] Video not classified yet. Exiting...'.format(video_id))
         sys.exit(0)

     return video_record['classification']['classification_category']
Пример #3
0
 def get_user_proxy_server(self):
     """
     Look up the proxy server configured for the current User Profile.

     The profiles are read from ``Config.USER_PROFILES_INFO_FILENAME`` and the
     first entry whose ``nickname`` equals ``self.USER_PROFILE`` wins. When no
     entry matches, a message is printed and the process exits with
     ``errno.ECANCELED``.

     :return: the ``proxy`` value of the matching profile entry
     """
     for profile_entry in Utils.read_json_file(filename=Config.USER_PROFILES_INFO_FILENAME):
         if profile_entry['nickname'] != self.USER_PROFILE:
             continue
         return profile_entry['proxy']

     # Reaching this point means no profile entry matched the nickname
     print('[{0}] Cannot find the HTTPS Proxy server of this User Profile'.format(self.USER_PROFILE))
     sys.exit(errno.ECANCELED)
Пример #4
0
 def get_video_duration(self, video_id):
     """
     Return the duration of the given YouTube Video in seconds.

     The metadata is downloaded through ``self.YOUTUBE_DOWNLOADER`` (without
     recommended videos) and its ``contentDetails.duration`` value is handed to
     ``Utils.convert_youtube_video_duration_to_seconds``.

     :param video_id: YouTube Video Id
     :return: the converted duration, as returned by the Utils helper
     """
     metadata = self.YOUTUBE_DOWNLOADER.download_video_metadata(video_id=video_id, retrieve_recommended_videos=False)
     raw_duration = metadata['contentDetails']['duration']
     return Utils.convert_youtube_video_duration_to_seconds(video_duration=raw_duration)
Пример #5
0
    def analyze_audit_experiments(self):
        """
        Analyze the repetitions of the YouTube Search Audit experiments.

        For every (User Profile, Search Term) pair the stored experiment is
        read from MongoDB and, for each N in
        1..Config.AUDIT_SEARCH_RESULTS_THRESHOLD, the top-N crawled videos of
        all repetitions are aggregated into total/unique counts and
        pseudoscience percentages. The resulting list (one entry per N) is
        written back to the same document under ``experiment_analysis``.

        Pairs that already carry an ``experiment_analysis`` are skipped. When
        the experiment document of a pair cannot be found, a message is
        printed and the whole analysis stops.

        :return: None
        """
        # video_id -> label. The same videos are revisited for every N, so each
        # label is fetched only once instead of once per (video, N).
        video_labels = dict()

        # Iterate each User Profile and calculate its plot values
        for USER_PROFILE in self.USER_PROFILES:
            print('\n--- Analyzing results for USER PROFILE: {}\n'.format(USER_PROFILE))

            # Iterate through the keywords for each User Profile
            for SEARCH_TERM in self.CONSIDERED_SEARCH_TERMS:
                print('\n--- [{}] Analyzing results for SEARCH TERM {}'.format(USER_PROFILE, SEARCH_TERM))

                # One filter selects the document for both the read and the write
                experiment_filter = {
                    '$and': [
                        {'user_profile_type': USER_PROFILE},
                        {'search_term': SEARCH_TERM}
                    ]
                }

                # Get YouTube Search Results for the current User Profile and Search Term
                curr_user_search_term_exp_details = self.audit_framework_youtube_search_col.find_one(
                    experiment_filter, {'experiment_details': 1, 'experiment_analysis': 1})

                if not curr_user_search_term_exp_details:
                    print('[{}] YouTube Search Experiment for SEARCH TERM {} has a problem'.format(USER_PROFILE, SEARCH_TERM))
                    return None
                if Utils.key_exists(curr_user_search_term_exp_details, 'experiment_analysis'):
                    print('[{}] Incremental analysis for the current YouTube Search Experiment for SEARCH TERM {} already performed'.format(USER_PROFILE, SEARCH_TERM))
                    continue

                experiment_repetitions = curr_user_search_term_exp_details['experiment_details']
                curr_search_term_experiment_analysis = list()
                progressBar = tqdm(total=Config.AUDIT_SEARCH_RESULTS_THRESHOLD)
                for n_top_search_results_videos in range(1, Config.AUDIT_SEARCH_RESULTS_THRESHOLD + 1):
                    pseudoscience_videos_found = list()
                    all_videos_seen = list()

                    # Collect the top-N videos of every repetition
                    for experiment_repetition in experiment_repetitions:
                        for video_id in experiment_repetition['CRAWLED_VIDEOS'][:n_top_search_results_videos]:
                            all_videos_seen.append(video_id)
                            if video_id not in video_labels:
                                video_labels[video_id] = self.get_video_label(video_id=video_id)
                            if video_labels[video_id] == 'pseudoscience':
                                pseudoscience_videos_found.append(video_id)

                    # Calculate analysis results for the current number of search results
                    total_videos_seen = len(all_videos_seen)
                    total_unique_videos_seen = len(set(all_videos_seen))
                    total_pseudoscience = len(pseudoscience_videos_found)
                    total_unique_pseudoscience = len(set(pseudoscience_videos_found))

                    search_term_experiment_analysis = dict()
                    search_term_experiment_analysis['total_videos_seen'] = total_videos_seen
                    search_term_experiment_analysis['total_unique_videos_seen'] = total_unique_videos_seen
                    search_term_experiment_analysis['pseudoscience_videos_found'] = pseudoscience_videos_found
                    search_term_experiment_analysis['total_pseudoscience_videos_found'] = total_pseudoscience
                    search_term_experiment_analysis['total_unique_pseudoscience_videos_found'] = total_unique_pseudoscience
                    # No videos seen (no repetitions or empty crawls) yields 0.0
                    # instead of a ZeroDivisionError
                    search_term_experiment_analysis['average_pseudoscience_videos_total'] = (
                        (total_pseudoscience / total_videos_seen) * 100 if total_videos_seen else 0.0)
                    search_term_experiment_analysis['average_pseudoscience_videos_unique'] = (
                        (total_unique_pseudoscience / total_unique_videos_seen) * 100 if total_unique_videos_seen else 0.0)

                    # Add to the list with all the results
                    curr_search_term_experiment_analysis.append(search_term_experiment_analysis)

                    progressBar.update(1)
                progressBar.close()

                # Insert YouTube Search Audit Analysis results into MongoDB
                self.audit_framework_youtube_search_col.update_one(
                    experiment_filter,
                    {'$set': {'experiment_analysis': curr_search_term_experiment_analysis}})
        return
    def analyze_audit_experiments(self):
        """
        Analyze the repetitions of the YouTube Homepage Audit experiments.

        For every User Profile the stored experiment (matching
        Config.AUDIT_HOMEPAGE_TOTAL_REPETITIONS) is read from MongoDB and, for
        each N in 1..Config.AUDIT_HOMEPAGE_VIDEOS_THRESHOLD, the top-N crawled
        homepage videos of all repetitions are aggregated into total/unique
        counts and pseudoscience percentages. The resulting list (one entry
        per N) is written back to that document under ``experiment_analysis``.

        Profiles that already carry an ``experiment_analysis`` are skipped.
        When the experiment document of a profile cannot be found, a message
        is printed and the whole analysis stops.

        :return: None
        """
        # video_id -> label. The same videos are revisited for every N, so each
        # label is fetched only once instead of once per (video, N).
        video_labels = dict()

        # Iterate each User Profile and calculate its plot values
        for USER_PROFILE in self.USER_PROFILES:
            print('\n--- Analyzing results for USER PROFILE: {}'.format(USER_PROFILE))

            # The write below must target the document that is read here, so
            # the same filter (including total_repetitions) is used for both
            experiment_filter = {
                'user_profile_type': USER_PROFILE,
                'total_repetitions': Config.AUDIT_HOMEPAGE_TOTAL_REPETITIONS
            }

            # Get the experiment details for the current USER PROFILE
            experiment_details = self.audit_framework_youtube_homepage_col.find_one(
                experiment_filter, {'experiment_details': 1, 'experiment_analysis': 1})

            # Ensure that the results exist and the analysis has not already been performed
            if not experiment_details:
                print('\n[{}] YouTube Homepage Audit Experiment results cannot be found in MongoDB'.format(USER_PROFILE))
                return None
            if Utils.key_exists(experiment_details, 'experiment_analysis'):
                print('\n[{}] Incremental analysis for the current Homepage Experiment already performed'.format(USER_PROFILE))
                continue

            experiment_repetitions = experiment_details['experiment_details']
            incremental_homepage_experiment_analysis = list()
            progressBar = tqdm(total=Config.AUDIT_HOMEPAGE_VIDEOS_THRESHOLD)
            for n_top_homepage_videos in range(1, Config.AUDIT_HOMEPAGE_VIDEOS_THRESHOLD + 1):
                pseudoscience_videos_found = list()
                all_videos_seen = list()

                # Collect the top-N videos of every repetition
                for experiment_repetition in experiment_repetitions:
                    for video_id in experiment_repetition['CRAWLED_VIDEOS'][:n_top_homepage_videos]:
                        all_videos_seen.append(video_id)
                        if video_id not in video_labels:
                            video_labels[video_id] = self.get_video_label(video_id=video_id)
                        if video_labels[video_id] == 'pseudoscience':
                            pseudoscience_videos_found.append(video_id)

                # Calculate analysis results for the current number of homepage videos
                total_videos_seen = len(all_videos_seen)
                total_unique_videos_seen = len(set(all_videos_seen))
                total_pseudoscience = len(pseudoscience_videos_found)
                total_unique_pseudoscience = len(set(pseudoscience_videos_found))

                homepage_experiment_analysis = dict()
                homepage_experiment_analysis['total_videos_seen'] = total_videos_seen
                homepage_experiment_analysis['total_unique_videos_seen'] = total_unique_videos_seen
                homepage_experiment_analysis['pseudoscience_videos_found'] = pseudoscience_videos_found
                homepage_experiment_analysis['total_pseudoscience_videos_found'] = total_pseudoscience
                homepage_experiment_analysis['total_unique_pseudoscience_videos_found'] = total_unique_pseudoscience
                # No videos seen (no repetitions or empty crawls) yields 0.0
                # instead of a ZeroDivisionError
                homepage_experiment_analysis['average_pseudoscience_videos_total'] = (
                    (total_pseudoscience / total_videos_seen) * 100 if total_videos_seen else 0.0)
                homepage_experiment_analysis['average_pseudoscience_videos_unique'] = (
                    (total_unique_pseudoscience / total_unique_videos_seen) * 100 if total_unique_videos_seen else 0.0)

                # Add to the list with all the results
                incremental_homepage_experiment_analysis.append(homepage_experiment_analysis)

                progressBar.update(1)
            progressBar.close()

            # Insert YouTube Homepage Audit Analysis results into MongoDB
            self.audit_framework_youtube_homepage_col.update_one(
                experiment_filter,
                {'$set': {'experiment_analysis': incremental_homepage_experiment_analysis}})
        return
    def crawl_watch_youtube_video(self, video_id, hop_number):
        """
        Load the page of a YouTube Video, store its metadata together with its
        recommended videos in MongoDB and, optionally, watch part of the video.

        The video is watched only when Config.AUDIT_RANDOM_WALKS_WATCH_VIDEO is
        set and ``hop_number`` is below 5. In that case the method sleeps for
        Config.AUDIT_RANDOM_WALKS_WATCH_VIDEO_PERCENTAGE percent of the video
        duration, minus the time already spent crawling the page.

        The process exits with status 1 when the User Profile is not
        'NO_PERSONALIZATION' and the user is not authenticated.

        :param video_id: YouTube Video Id
        :param hop_number: hop number of the video; hops >= 5 are never watched
        :return: the video metadata dict (with ``relatedVideos`` and
            ``updatedAt`` set), or None when the video is a livestream or its
            metadata cannot be retrieved
        """
        # Local import keeps this method self-contained for sys.exit below
        import sys

        # Find whether we should watch the video or not
        watch_curr_video = bool(Config.AUDIT_RANDOM_WALKS_WATCH_VIDEO and hop_number < 5)

        # Load YouTube Video Page
        self.driver.get('https://www.youtube.com/watch?v={}&autoplay=1'.format(video_id))
        time.sleep(3)

        # Check if user is Authenticated before proceeding
        if self.USER_PROFILE != 'NO_PERSONALIZATION' and not self.is_user_authenticated():
            # sys.exit instead of the interactive-only builtin exit()
            sys.exit(1)

        # Check if LiveStream
        try:
            isLivestream = self.wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[4]/div[1]/div/div[5]/div[2]/ytd-video-primary-info-renderer/div/div/div[1]/div[2]/yt-formatted-string'))).text
            if 'started streaming' in isLivestream.lower():
                print('[VIDEO: {}] is a LIVESTREAM. Skipping and choosing another video...'.format(video_id))
                return None
        except TimeoutException:
            # Element not present: treated as a regular (non-livestream) video
            pass

        # Start by Watching the Video
        if watch_curr_video:
            # The button is looked up at two alternative positions; if neither
            # becomes clickable the method carries on without clicking
            try:
                self.wait.until(EC.element_to_be_clickable((By.XPATH, '/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[4]/div[1]/div/div[1]/div/div/div/ytd-player/div/div/div[4]/button'))).click()
            except TimeoutException:
                try:
                    self.wait.until(EC.element_to_be_clickable((By.XPATH, '/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[4]/div[1]/div/div[1]/div/div/div/ytd-player/div/div/div[5]/button'))).click()
                except TimeoutException:
                    pass

        # Keep the time needed to crawl the video details
        video_crawl_started = time.time()
        self.driver.execute_script("window.scrollTo(0, 800)")

        # DOWNLOAD VIDEO METADATA
        # Check if Video already exists in MongoDB
        video_metadata = self.audit_framework_videos_col.find_one({'id': video_id})
        video_exists = bool(video_metadata)
        if not video_exists:
            # Get Video Metadata using YouTube Data API
            video_metadata = self.get_video_metadata(video_id=video_id, retrieve_recommended_videos=False)
            if video_metadata is None:
                return None

            # Add Video Annotation information (not classified yet)
            video_metadata['classification'] = {'classification_category': None}

        #
        # GET RELATED VIDEOS (no matter if the video exists or not)
        #
        print('--- [VIDEO: {}] GETTING TOP {} RECOMMENDED VIDEOS...'.format(video_id, Config.AUDIT_RANDOM_WALKS_RECOMMENDED_VIDEOS_THRESHOLD))
        related_videos_list = list()
        # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which
        # newer Selenium releases no longer provide
        related_videos_items = self.driver.find_elements(By.XPATH, '//*[@id="thumbnail"]')
        for related_video_item in related_videos_items:
            try:
                related_video_id = related_video_item.get_attribute('href').split('v=')[1]
            except (AttributeError, IndexError):
                # Thumbnail without a href or without a "v=" parameter
                continue
            # Drop any further query parameters after the video id
            related_videos_list.append(related_video_id.split('&')[0])

            if Config.AUDIT_RANDOM_WALKS_RECOMMENDED_VIDEOS_THRESHOLD == len(related_videos_list):
                break
        print('--- [VIDEO: {0}] TOP {1} RECOMMENDED VIDEOS: {2}'.format(video_id, Config.AUDIT_RANDOM_WALKS_RECOMMENDED_VIDEOS_THRESHOLD, related_videos_list))
        video_metadata['relatedVideos'] = related_videos_list
        video_metadata['updatedAt'] = str(dt.now())

        # STORE VIDEO INFORMATION IN MongoDB
        if not video_exists:
            # Insert Video Details in MongoDB
            self.audit_framework_videos_col.insert_one(video_metadata)
        else:
            # Update Video Details in MongoDB
            self.audit_framework_videos_col.replace_one({'id': video_id}, video_metadata, upsert=True)

        # WATCH VIDEO
        if watch_curr_video:
            # Calculate Video Crawl Duration
            video_crawl_duration_sec = time.time() - video_crawl_started

            # Read Video Duration
            video_duration_sec = Utils.convert_youtube_video_duration_to_seconds(video_duration=video_metadata['contentDetails']['duration'])
            # Calculate the final watch time percentage to watch
            final_video_duration_sec = int((video_duration_sec * Config.AUDIT_RANDOM_WALKS_WATCH_VIDEO_PERCENTAGE) / 100)
            # Crawling may already have taken longer than the watch time;
            # time.sleep() rejects negative values, so clamp at zero
            final_video_duration_sec = max(0, final_video_duration_sec - video_crawl_duration_sec)

            print('[{0}] - Sleeping for {1} seconds to watch the full VIDEO: {2}'.format(dt.now().strftime("%d-%m-%Y %H:%M:%S"), final_video_duration_sec, video_id))
            time.sleep(final_video_duration_sec)
        return video_metadata
    def analyze_audit_experiments(self, random_walks_starting_hop=0):
        """
        Analyze the random walks of the YouTube Video Recommendations audit experiments.

        For every (User Profile, Search Term) pair the stored random walks are
        read from MongoDB, the video of each hop (0..Config.AUDIT_RANDOM_WALKS_MAX_HOPS)
        of each walk is labelled, and for every hop starting at
        ``random_walks_starting_hop`` the percentage of unique pseudoscience
        videos over all unique videos seen from the starting hop up to that
        hop is calculated. The results are written back to the same document
        under ``random_walks_analysis``.

        Pairs that already carry a ``random_walks_analysis`` are skipped. When
        the random walks of a pair cannot be found, a message is printed and
        the whole analysis stops.

        :param random_walks_starting_hop: first hop considered in the
            percentages (expected to be within 0..MAX_HOPS); earlier hops keep
            a percentage of 0.0
        :return: None
        """
        total_hops = Config.AUDIT_RANDOM_WALKS_MAX_HOPS + 1

        # Iterate each User Profile and calculate its plot values
        for USER_PROFILE in self.USER_PROFILES:
            print('\n--- Analyzing Random Walks for USER PROFILE: {}'.format(USER_PROFILE))

            # Iterate through the keywords for each User Profile
            progressBar = tqdm(total=len(self.CONSIDERED_SEARCH_TERMS))
            for SEARCH_TERM in self.CONSIDERED_SEARCH_TERMS:
                total_pseudoscience_videos_found = 0

                # One filter selects the document for both the read and the write
                random_walk_filter = {
                    '$and': [
                        {'user_profile_type': USER_PROFILE},
                        {'seed_search_term_topic': SEARCH_TERM}
                    ]
                }

                # Get Random Walk details for the current User Profile and Search Term
                curr_random_walk_details = self.audit_framework_youtube_video_recommendations.find_one(
                    random_walk_filter, {'random_walks_details': 1, 'random_walks_analysis': 1})

                # Ensure that Random Walks exist and have not been analyzed yet
                if not curr_random_walk_details:
                    print('--- [{}] Personalized Random Walks for SEARCH TERM {} NOT PERFORMED'.format(USER_PROFILE, SEARCH_TERM))
                    progressBar.close()
                    return None
                if Utils.key_exists(curr_random_walk_details, 'random_walks_analysis'):
                    print('--- [{}] Analysis of Personalized Random Walks for SEARCH TERM {} ALREADY PERFORMED'.format(USER_PROFILE, SEARCH_TERM))
                    progressBar.update(1)
                    continue

                # Per-hop lists of all videos / pseudoscience videos over all walks
                hops_pseudoscience_videos_found = [list() for _ in range(total_hops)]
                hops_all_videos_found = [list() for _ in range(total_hops)]

                # Iterate Random Walks
                for random_walk in curr_random_walk_details['random_walks_details']:
                    for hop_cntr in range(total_hops):
                        hop_video_id = random_walk['hop_{}'.format(hop_cntr)]['video_id']
                        hops_all_videos_found[hop_cntr].append(hop_video_id)

                        # Get video label
                        if self.get_video_label(video_id=hop_video_id) == 'pseudoscience':
                            hops_pseudoscience_videos_found[hop_cntr].append(hop_video_id)
                            total_pseudoscience_videos_found += 1

                # Calculate, for each hop, the percentage of unique pseudoscience
                # videos over all unique videos found from the starting hop up
                # to and including that hop. range() accepts positional
                # arguments only, so start/stop are not passed by keyword.
                hops_pseudoscience_videos_found_perc = [0.0 for _ in range(total_hops)]
                unique_videos_so_far = set()
                unique_pseudoscience_videos_so_far = set()
                for hop_cntr in range(random_walks_starting_hop, total_hops):
                    unique_videos_so_far.update(hops_all_videos_found[hop_cntr])
                    unique_pseudoscience_videos_so_far.update(hops_pseudoscience_videos_found[hop_cntr])

                    # Without any video (e.g. no walks stored) the percentage stays 0.0
                    if unique_videos_so_far:
                        hops_pseudoscience_videos_found_perc[hop_cntr] = (
                            len(unique_pseudoscience_videos_so_far) / float(len(unique_videos_so_far))) * 100

                # Insert YouTube Video Recommendations Audit (Random Walks) Analysis results into MongoDB
                random_walks_analysis_results = dict()
                random_walks_analysis_results['total_pseudoscience_videos_found'] = total_pseudoscience_videos_found
                random_walks_analysis_results['hops_pseudoscience_videos_found'] = hops_pseudoscience_videos_found
                random_walks_analysis_results['hops_pseudoscience_videos_found_perc'] = hops_pseudoscience_videos_found_perc

                # Update Database Record for the Random Walks of the current USER PROFILE - SEARCH TERM
                self.audit_framework_youtube_video_recommendations.update_one(
                    random_walk_filter,
                    {'$set': {'random_walks_analysis': random_walks_analysis_results}}
                )

                progressBar.update(1)
            progressBar.close()
        return