def test_panda3d_screenshots_show_keep_shot_name(self):
    # We found that some of our experimental data includes slashes in the
    # name of the shot, so we replicate that here.
    standardized = standardize_url(
        "http://panda3d.org/showss.php?shot=path/to/photo&otherparam=1")
    self.assertEqual(standardized, "panda3d.org/showss.php?shot=path/to/photo")

def compute_navigation_ngrams(length, page_type_lookup):
    '''
    Compute n-grams of sequences of pages visited, of a certain length.
    A `page_type_lookup` dictionary must be provided that maps URLs to their page types.
    '''
    # Create a new index for this computation
    last_compute_index = NavigationNgram.select(
        fn.Max(NavigationNgram.compute_index)).scalar() or 0
    compute_index = last_compute_index + 1

    # Fetch the visits from the most recent computation of visits
    visit_compute_index = LocationVisit.select(
        fn.Max(LocationVisit.compute_index)).scalar()
    visits = LocationVisit.select().where(
        LocationVisit.compute_index == visit_compute_index)

    # Get the distinct participant IDs and concern indexes
    participant_ids = set([visit.user_id for visit in visits])
    concern_indexes = set([visit.concern_index for visit in visits])

    # Go through every concern for every participant. For each page they visit,
    # increment the visits to a vertex. For each transition from one page to the
    # next, increment the occurrence of a transition between two page types.
    for participant_id in participant_ids:
        for concern_index in concern_indexes:

            participant_concern_visits = visits.where(
                LocationVisit.user_id == participant_id,
                LocationVisit.concern_index == concern_index,
            ).order_by(LocationVisit.start.asc())

            # Create an ordered list of the URLs that this participant visited
            urls = [visit.url for visit in participant_concern_visits]
            standardized_urls = [standardize_url(url) for url in urls]

            # Create a list of all page types visited.
            # If a page is a redirect, skip it: for all intents and purposes,
            # the participant traveled between the page types before and after it.
            page_types = []
            for url in standardized_urls:
                if url in page_type_lookup:
                    url_info = page_type_lookup[url]
                    if not url_info['redirect']:
                        page_types.append(url_info['main_type'])
                else:
                    logger.warning(
                        "URL %s not in page type lookup. Giving it 'Unknown' type", url)
                    page_types.append("Unknown")

            # Compute n-grams using NLTK
            ngrams = nltk_compute_ngrams(page_types, length)

            # Save each n-gram to the database
            for ngram_tuple in ngrams:
                NavigationNgram.create(
                    compute_index=compute_index,
                    user_id=participant_id,
                    concern_index=concern_index,
                    length=length,
                    ngram=", ".join(ngram_tuple),
                )

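# For reference, a minimal sketch of the n-gram helper assumed above. This is
# an assumption: `nltk_compute_ngrams` is not defined in this module, and it is
# treated here as an alias for NLTK's `nltk.util.ngrams`, which yields tuples
# of `n` consecutive items from a sequence:
#
#     from nltk.util import ngrams as nltk_compute_ngrams
#     list(nltk_compute_ngrams(["Search", "Tutorial", "Q&A"], 2))
#     # -> [('Search', 'Tutorial'), ('Tutorial', 'Q&A')]
#
# Each tuple is then flattened with ", ".join(...) before being stored.
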
def test_experiment_site_is_reduced_to_its_domain(self):
    standardized = standardize_url(
        "http://searchlogger.tutorons.com/long/path#with-fragment")
    self.assertEqual(standardized, "searchlogger.tutorons.com")

def test_yahoo_redirect_link_reduced_to_standard_path(self):
    standardized = standardize_url("http://r.search.yahoo.com/_ylt=AwrTccsomething")
    self.assertEqual(standardized, "r.search.yahoo.com/_ylt=redirect")

def test_google_groups_forums_keep_fragment(self):
    standardized = standardize_url("http://groups.google.com/forum/#!forum/keras-users")
    self.assertEqual(standardized, "groups.google.com/forum/!forum/keras-users")

def test_tigsource_forums_keep_board_id(self):
    standardized = standardize_url("http://forums.tigsource.com/index.php?board=20.0")
    self.assertEqual(standardized, "forums.tigsource.com/index.php?board=20.0")

def test_panda3d_forums_keep_forum_id(self):
    standardized = standardize_url("http://panda3d.org/viewforum.php?f=1")
    self.assertEqual(standardized, "panda3d.org/viewforum.php?f=1")

def test_youtube_videos_preserve_v_parameter(self):
    standardized = standardize_url("http://youtube.com/watch?v=DRR9fOXkfRE")
    self.assertEqual(standardized, "youtube.com/watch?v=DRR9fOXkfRE")

def compute_unique_urls(page_type_lookup, exclude_users=None):

    exclude_users = [] if exclude_users is None else exclude_users

    # Create a new index for this computation
    last_compute_index = UniqueUrl.select(
        fn.Max(UniqueUrl.compute_index)).scalar() or 0
    compute_index = last_compute_index + 1

    # Fetch the visits from the most recent computation of visits
    visit_compute_index = LocationVisit.select(
        fn.Max(LocationVisit.compute_index)).scalar()
    visits = LocationVisit.select().where(
        LocationVisit.compute_index == visit_compute_index,
        LocationVisit.user_id.not_in(exclude_users),
    )

    # Get the distinct participant IDs
    participant_ids = set([visit.user_id for visit in visits])

    # Go through every participant and find the URLs they visited
    # that no one else visited.
    for participant_id in participant_ids:

        participant_visits = visits.where(LocationVisit.user_id == participant_id)
        others_visits = visits.where(LocationVisit.user_id != participant_id)

        # Create a list of the standardized URLs that this participant visited
        participant_urls = [visit.url for visit in participant_visits]
        participant_standardized_urls = [
            standardize_url(url) for url in participant_urls]

        # Create a list of the standardized URLs that all others visited
        others_urls = [visit.url for visit in others_visits]
        others_standardized_urls = [standardize_url(url) for url in others_urls]

        # Compute the URLs that this participant visited uniquely,
        # and the ones they share with others
        unique_participant_urls = (
            set(participant_standardized_urls) - set(others_standardized_urls))
        shared_participant_urls = (
            set(participant_standardized_urls) - unique_participant_urls)

        # Save all URLs that the participant visited to the database,
        # including whether they visited them uniquely.
        for url in unique_participant_urls:
            UniqueUrl.create(
                compute_index=compute_index,
                user_id=participant_id,
                url=url,
                unique=True,
            )
        for url in shared_participant_urls:
            UniqueUrl.create(
                compute_index=compute_index,
                user_id=participant_id,
                url=url,
                unique=False,
            )

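# A small, self-contained illustration of the set arithmetic above, using
# hypothetical URLs in place of real LocationVisit data. Note that
# "participant minus unique" is just the intersection with everyone else:
#
#     participant = {"site.com/a", "site.com/b"}
#     others = {"site.com/b", "site.com/c"}
#     participant - others  # unique: {'site.com/a'}
#     participant & others  # shared: {'site.com/b'}
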
def test_by_default_standard_url_is_domain_and_path(self):
    standardized = standardize_url("http://site.com/path#fragment?q=query")
    self.assertEqual(standardized, "site.com/path")

def test_panda3d_topics_keep_topic_id(self):
    standardized = standardize_url("http://panda3d.org/viewtopic.php?f=1&t=2")
    self.assertEqual(standardized, "panda3d.org/viewtopic.php?t=2")

def test_standardization_removes_www_prefix_and_schema(self):
    standardized = standardize_url("http://www.site.com")
    self.assertEqual(standardized, "site.com")

def test_pages_viewed_as_source_gets_view_source_prepended(self):
    standardized = standardize_url("view-source:http://site.com")
    self.assertEqual(standardized, "view-source:site.com")

def test_google_groups_topic_keep_fragment(self):
    standardized = standardize_url(
        "http://groups.google.com/forum/#!topic/keras-users/epFdzcxl8Gg")
    self.assertEqual(
        standardized, "groups.google.com/forum/!topic/keras-users/epFdzcxl8Gg")

def test_bluejeans_site_standardizes_to_bluejeans_domain(self):
    standardized = standardize_url("http://bluejeans.com/long-path#some-fragment")
    self.assertEqual(standardized, "bluejeans.com")

def test_google_groups_simplify_search_url(self):
    standardized = standardize_url(
        "http://groups.google.com/forum/#!searchin/keras-users/model")
    self.assertEqual(standardized, "groups.google.com/forum/!searchin")

def test_tigsource_forums_preserve_search_query_parameter(self):
    standardized = standardize_url(
        "http://forums.tigsource.com/index.php?action=search2;params=PARAMS")
    self.assertEqual(standardized, "forums.tigsource.com/index.php?action=search2")

def test_browser_pages_are_reduced_to_the_words_browser_page(self):
    standardized = standardize_url("about:preferences")
    self.assertEqual(standardized, "browser_page")

def test_stack_overflow_question_reduce_to_id_as_path(self):
    standardized = standardize_url("http://stackoverflow.com/questions/1000/long-name")
    self.assertEqual(standardized, "stackoverflow.com/questions/1000")

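# The tests above pin down the contract of `standardize_url`. For orientation,
# here is a minimal sketch of only the *default* rule (drop the scheme, the
# "www." prefix, the query, and the fragment; keep the domain and path). This
# is an illustration, not the project's implementation, which layers the
# site-specific rules exercised by the tests above on top of this behavior.

try:
    from urllib.parse import urlparse  # Python 3
except ImportError:
    from urlparse import urlparse  # Python 2


def default_standardize_url(url):
    # Hypothetical helper: reduce a URL to its domain and path
    parsed = urlparse(url)
    domain = parsed.netloc
    if domain.startswith("www."):
        domain = domain[len("www."):]
    return domain + parsed.path

# default_standardize_url("http://www.site.com")                   -> "site.com"
# default_standardize_url("http://site.com/path#fragment?q=query") -> "site.com/path"
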
def compute_navigation_graph(page_type_lookup, exclude_users=None,
                             show_progress=False, concern_index=None):

    exclude_users = [] if exclude_users is None else exclude_users

    # Create a new index for this computation
    last_compute_index = NavigationVertex.select(
        fn.Max(NavigationVertex.compute_index)).scalar() or 0
    compute_index = last_compute_index + 1

    # Fetch the visits from the most recent computation of visits
    visit_compute_index = LocationVisit.select(
        fn.Max(LocationVisit.compute_index)).scalar()
    visits = LocationVisit.select().where(
        LocationVisit.compute_index == visit_compute_index)

    # If the caller has provided a concern index to compute the graph for,
    # restrict the navigation data to only that concern
    if concern_index is not None:
        visits = visits.where(LocationVisit.concern_index == concern_index)

    # Get the distinct participant IDs and concern indexes,
    # excluding any users that were not requested as part of the analysis
    participant_ids = set([
        visit.user_id for visit in visits
        if visit.user_id not in exclude_users
    ])
    concern_indexes = set([visit.concern_index for visit in visits])

    # Set up the progress bar
    total_iterations_count = len(participant_ids) * len(concern_indexes)
    if show_progress:
        progress_bar = ProgressBar(maxval=total_iterations_count, widgets=[
            'Progress: ', Percentage(), ' ', Bar(marker=RotatingMarker()), ' ', ETA(),
            ' Read ', Counter(), ' / ' + str(total_iterations_count) + ' sessions.'])
        progress_bar.start()

    # The list of vertices needs to be populated with a start and end node.
    # All navigation behavior starts at the "Start" node and ends at the "End" node.
    vertices = {
        "Start": Vertex("Start", occurrences=1),
        "End": Vertex("End", occurrences=1),
    }
    edges = {}
    last_vertex = vertices["Start"]
    iterations_count = 0

    # Go through every concern for every participant. For each page they visit,
    # increment the visits to the corresponding vertex. For each transition from one
    # page to the next, increment the occurrence of a transition between two page types.
    for participant_id in participant_ids:
        for concern_index in concern_indexes:

            participant_concern_visits = visits.where(
                LocationVisit.user_id == participant_id,
                LocationVisit.concern_index == concern_index,
            ).order_by(LocationVisit.start.asc())

            for visit in participant_concern_visits:

                # Get the type of the page visited
                standardized_url = standardize_url(visit.url)
                if standardized_url in page_type_lookup:
                    url_info = page_type_lookup[standardized_url]
                    page_type = url_info['main_type']
                    # If this is a redirect, just skip it. It's more important
                    # to link the URL before it to the page the redirect points to.
                    if url_info['redirect']:
                        continue
                else:
                    logger.warning(
                        "URL %s not in page type lookup. Giving it 'Unknown' type",
                        standardized_url)
                    page_type = "Unknown"

                # Add a new vertex for this page type if it doesn't exist
                if page_type not in vertices:
                    vertices[page_type] = Vertex(page_type)

                # Record that we have seen this page type one more time
                vertex = vertices[page_type]
                vertex.occurrences += 1

                # Add the time spent to the total time spent for this page type
                time_passed = visit.end - visit.start
                seconds = time_passed.seconds + (time_passed.microseconds / float(1000000))
                vertex.total_time += seconds

                # Connect an edge between the last page visited and this one
                if (last_vertex.page_type, vertex.page_type) not in edges:
                    edges[(last_vertex.page_type, vertex.page_type)] = Edge(last_vertex, vertex)
                edge = edges[(last_vertex.page_type, vertex.page_type)]
                edge.occurrences += 1

                # Redefine the last vertex so the next iteration knows what was just visited
                last_vertex = vertex

            # After each participant-concern session, connect an edge from the
            # last page visited to the "End" vertex
            end_vertex = vertices['End']
            if (last_vertex.page_type, end_vertex.page_type) not in edges:
                edges[(last_vertex.page_type, end_vertex.page_type)] = Edge(last_vertex, end_vertex)
            edge = edges[(last_vertex.page_type, end_vertex.page_type)]
            edge.occurrences += 1

            # ... and reset the last vertex to "Start" for the next session
            last_vertex = vertices['Start']

            iterations_count += 1
            if show_progress:
                progress_bar.update(iterations_count)

    if show_progress:
        progress_bar.finish()

    # Compute the mean time spent on each vertex
    for vertex in vertices.values():
        vertex.mean_time = vertex.total_time / float(vertex.occurrences)

    # Compute the transition probability for each edge leaving a vertex.
    # First, group all edges by their source vertex.
    def get_source_page_type(edge_key):
        source_type, _ = edge_key
        return source_type

    sorted_edge_keys = sorted(edges.keys(), key=get_source_page_type)
    edge_groups = itertools.groupby(sorted_edge_keys, get_source_page_type)
    for _, edge_group in edge_groups:

        # Fetch the edges in the current group
        # (those in the current group share the same source)
        edge_keys = list(edge_group)
        group_edges = {key: edge for key, edge in edges.items() if key in edge_keys}

        # Compute the probability of each edge being taken
        total_occurrences = sum([edge.occurrences for edge in group_edges.values()])
        for edge in group_edges.values():
            edge.probability = float(edge.occurrences) / total_occurrences

    # Save all vertices to the database
    vertex_models = {}
    for vertex in vertices.values():
        vertex_model = NavigationVertex.create(
            compute_index=compute_index,
            page_type=vertex.page_type,
            occurrences=vertex.occurrences,
            total_time=vertex.total_time,
            mean_time=vertex.mean_time,
        )
        # Store a dictionary from page type to vertex model so we can
        # look up these models when saving the edges.
        vertex_models[vertex.page_type] = vertex_model

    # Save all edges to the database. We use a progress bar for this, as there
    # might be many edges and each is uploaded to the database separately.
    if show_progress:
        progress_bar = ProgressBar(maxval=len(edges), widgets=[
            'Progress: ', Percentage(), ' ', Bar(marker=RotatingMarker()), ' ', ETA(),
            ' Updated graph with ', Counter(), ' / ' + str(len(edges)) + ' edges.'])
        progress_bar.start()

    for edge_index, edge in enumerate(edges.values(), start=1):
        NavigationEdge.create(
            compute_index=compute_index,
            source_vertex=vertex_models[edge.source_vertex.page_type],
            target_vertex=vertex_models[edge.target_vertex.page_type],
            occurrences=edge.occurrences,
            probability=edge.probability,
        )
        if show_progress:
            progress_bar.update(edge_index)

    if show_progress:
        progress_bar.finish()

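# `Vertex` and `Edge` are not defined in this module. Below is a minimal
# sketch of the shape `compute_navigation_graph` assumes, with field names
# inferred from the usage above; the real definitions may differ.

class Vertex(object):

    def __init__(self, page_type, occurrences=0):
        self.page_type = page_type
        self.occurrences = occurrences
        self.total_time = 0.0  # accumulated seconds across all visits
        self.mean_time = 0.0   # computed once all visits have been counted


class Edge(object):

    def __init__(self, source_vertex, target_vertex):
        self.source_vertex = source_vertex
        self.target_vertex = target_vertex
        self.occurrences = 0
        self.probability = 0.0  # computed once per-source totals are known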