def test_graph_computation_uses_only_latest_computed_visits(self):
        """Only visits from the most recent compute_index should form the graph.

        The compute_index=0 visit is stale and must be ignored; the two
        compute_index=1 visits alone should produce the graph's edges.
        """

        # Stale visit from an earlier computation round -- should be excluded.
        create_location_visit(
            compute_index=0,
            url="page1",
            start=datetime.datetime(2000, 1, 1, 12, 0, 1, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 2, 0),
        )
        # Two visits from the latest round (compute_index=1).
        # NOTE(review): both share identical start/end times -- presumably
        # intended to yield a page_type_2 -> page_type_2 self-edge; confirm
        # the visit ordering is deterministic when timestamps tie.
        create_location_visit(
            compute_index=1,
            url="page2",
            start=datetime.datetime(2000, 1, 1, 12, 0, 3, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 4, 0),
        )
        create_location_visit(
            compute_index=1,
            url="page2",
            start=datetime.datetime(2000, 1, 1, 12, 0, 3, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 4, 0),
        )

        compute_navigation_graph(page_type_lookup=PAGE_TYPE_LOOKUP)
        # Expected edges: Start -> page_type_2 -> page_type_2 -> End (3 total).
        self.assertEqual(NavigationEdge.select().count(), 3)
        edges = NavigationEdge.select()
        transition_list = [(e.source_vertex.page_type, e.target_vertex.page_type) for e in edges]
        self.assertIn(("page_type_2", "page_type_2"), transition_list)
    def test_filter_to_only_one_concern_if_concern_index_provided(self):
        """Passing concern_index restricts the graph to that concern's visits."""

        # Visits for concern 0 -- these should be filtered out of the graph.
        for start_second in (1, 3):
            create_location_visit(
                concern_index=0,
                url="page1",
                start=datetime.datetime(2000, 1, 1, 12, 0, start_second, 0),
                end=datetime.datetime(2000, 1, 1, 12, 0, start_second + 1, 0),
            )
        # Visits for concern 1 -- these should be captured.
        for start_second in (5, 7):
            create_location_visit(
                concern_index=1,
                url="page2",
                start=datetime.datetime(2000, 1, 1, 12, 0, start_second, 0),
                end=datetime.datetime(2000, 1, 1, 12, 0, start_second + 1, 0),
            )

        compute_navigation_graph(concern_index=1, page_type_lookup=PAGE_TYPE_LOOKUP)

        # Only concern 1's two page2 visits remain, which produces exactly
        # three edges: Start -> page_type_2 -> page_type_2 -> End.
        self.assertEqual(NavigationEdge.select().count(), 3)
        page_type_pairs = [
            (edge.source_vertex.page_type, edge.target_vertex.page_type)
            for edge in NavigationEdge.select()
        ]
        self.assertIn(("page_type_2", "page_type_2"), page_type_pairs)
# --- Example 3 (scraped-snippet marker) ---
    def test_graph_computation_uses_only_latest_computed_visits(self):
        """Only visits from the most recent compute_index should form the graph.

        The compute_index=0 visit is stale and must be ignored; the two
        compute_index=1 visits alone should produce the graph's edges.
        """

        # Stale visit from an earlier computation round -- should be excluded.
        create_location_visit(
            compute_index=0,
            url="page1",
            start=datetime.datetime(2000, 1, 1, 12, 0, 1, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 2, 0),
        )
        # Two visits from the latest round (compute_index=1).
        # NOTE(review): both share identical start/end times -- presumably
        # intended to yield a page_type_2 -> page_type_2 self-edge; confirm
        # the visit ordering is deterministic when timestamps tie.
        create_location_visit(
            compute_index=1,
            url="page2",
            start=datetime.datetime(2000, 1, 1, 12, 0, 3, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 4, 0),
        )
        create_location_visit(
            compute_index=1,
            url="page2",
            start=datetime.datetime(2000, 1, 1, 12, 0, 3, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 4, 0),
        )

        compute_navigation_graph(page_type_lookup=PAGE_TYPE_LOOKUP)
        # Expected edges: Start -> page_type_2 -> page_type_2 -> End (3 total).
        self.assertEqual(NavigationEdge.select().count(), 3)
        edges = NavigationEdge.select()
        transition_list = [(e.source_vertex.page_type,
                            e.target_vertex.page_type) for e in edges]
        self.assertIn(("page_type_2", "page_type_2"), transition_list)
    def test_edge_added_between_all_consecutive_visits(self):
        """Each pair of consecutive visits contributes one edge to the graph."""

        # Three non-overlapping visits in chronological order.
        visit_specs = [("page1", 1), ("page1", 3), ("page2", 5)]
        for visit_url, start_second in visit_specs:
            create_location_visit(
                url=visit_url,
                start=datetime.datetime(2000, 1, 1, 12, 0, start_second, 0),
                end=datetime.datetime(2000, 1, 1, 12, 0, start_second + 1, 0),
            )

        compute_navigation_graph(page_type_lookup=PAGE_TYPE_LOOKUP)
        edges = NavigationEdge.select()

        # There will be 4 edges:
        # * 2 for the transitions between the 3 URLs above
        # * 1 for the transition from "Start" to the first URL
        # * 1 for the transition from the last URL to "End"
        self.assertEqual(edges.count(), 4)

        page_type_pairs = [
            (edge.source_vertex.page_type, edge.target_vertex.page_type)
            for edge in edges
        ]
        for expected_pair in [
                ("Start", "page_type_1"),
                ("page_type_1", "page_type_1"),
                ("page_type_1", "page_type_2"),
                ("page_type_2", "End"),
        ]:
            self.assertIn(expected_pair, page_type_pairs)
# --- Example 5 (scraped-snippet marker) ---
    def test_include_all_concerns_if_no_concern_index_provided(self):
        """Without a concern_index filter, visits for every concern count.

        Each concern forms its own session: concern 0 yields
        Start -> page_type_1 -> page_type_1 -> End and concern 1 yields
        Start -> page_type_2 -> page_type_2 -> End, for 6 edges in total.
        """

        # Both events should be captured
        create_location_visit(
            concern_index=0,
            url="page1",
            start=datetime.datetime(2000, 1, 1, 12, 0, 1, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 2, 0),
        )
        create_location_visit(
            concern_index=0,
            url="page1",
            start=datetime.datetime(2000, 1, 1, 12, 0, 3, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 4, 0),
        )
        create_location_visit(
            concern_index=1,
            url="page2",
            start=datetime.datetime(2000, 1, 1, 12, 0, 5, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 6, 0),
        )
        create_location_visit(
            concern_index=1,
            url="page2",
            start=datetime.datetime(2000, 1, 1, 12, 0, 7, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 8, 0),
        )

        compute_navigation_graph(page_type_lookup=PAGE_TYPE_LOOKUP)
        # 3 edges per concern session, 2 concerns = 6 edges.
        self.assertEqual(NavigationEdge.select().count(), 6)
    def test_include_all_concerns_if_no_concern_index_provided(self):
        """Without a concern_index filter, visits for every concern count.

        Each concern forms its own session: concern 0 yields
        Start -> page_type_1 -> page_type_1 -> End and concern 1 yields
        Start -> page_type_2 -> page_type_2 -> End, for 6 edges in total.
        """

        # Both events should be captured
        create_location_visit(
            concern_index=0,
            url="page1",
            start=datetime.datetime(2000, 1, 1, 12, 0, 1, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 2, 0),
        )
        create_location_visit(
            concern_index=0,
            url="page1",
            start=datetime.datetime(2000, 1, 1, 12, 0, 3, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 4, 0),
        )
        create_location_visit(
            concern_index=1,
            url="page2",
            start=datetime.datetime(2000, 1, 1, 12, 0, 5, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 6, 0),
        )
        create_location_visit(
            concern_index=1,
            url="page2",
            start=datetime.datetime(2000, 1, 1, 12, 0, 7, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 8, 0),
        )

        compute_navigation_graph(page_type_lookup=PAGE_TYPE_LOOKUP)
        # 3 edges per concern session, 2 concerns = 6 edges.
        self.assertEqual(NavigationEdge.select().count(), 6)
# --- Example 7 (scraped-snippet marker) ---
    def test_edge_occurrences_counts_number_of_transitions_between_page_types(
            self):
        """An edge's occurrences field counts repeated transitions.

        page1 -> page1 happens twice, so that edge's occurrences must be 2;
        every other transition happens exactly once.
        """

        create_location_visit(
            url="page1",
            start=datetime.datetime(2000, 1, 1, 12, 0, 1, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 2, 0),
        )
        create_location_visit(
            url="page1",
            start=datetime.datetime(2000, 1, 1, 12, 0, 3, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 4, 0),
        )
        create_location_visit(
            url="page1",
            start=datetime.datetime(2000, 1, 1, 12, 0, 5, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 6, 0),
        )
        create_location_visit(
            url="page2",
            start=datetime.datetime(2000, 1, 1, 12, 0, 7, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 8, 0),
        )

        compute_navigation_graph(page_type_lookup=PAGE_TYPE_LOOKUP)
        edges = NavigationEdge.select()
        # Index the edges by (source page type, target page type).
        edge_dict = {(edge.source_vertex.page_type,
                      edge.target_vertex.page_type): edge
                     for edge in edges}
        self.assertEqual(edge_dict[('Start', 'page_type_1')].occurrences, 1)
        self.assertEqual(edge_dict[('page_type_1', 'page_type_1')].occurrences,
                         2)
        self.assertEqual(edge_dict[('page_type_1', 'page_type_2')].occurrences,
                         1)
        self.assertEqual(edge_dict[('page_type_2', 'End')].occurrences, 1)
# --- Example 8 (scraped-snippet marker) ---
    def test_edge_added_between_all_consecutive_visits(self):
        """Each pair of consecutive visits contributes one edge to the graph."""

        # Three non-overlapping visits in chronological order.
        create_location_visit(
            url="page1",
            start=datetime.datetime(2000, 1, 1, 12, 0, 1, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 2, 0),
        )
        create_location_visit(
            url="page1",
            start=datetime.datetime(2000, 1, 1, 12, 0, 3, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 4, 0),
        )
        create_location_visit(
            url="page2",
            start=datetime.datetime(2000, 1, 1, 12, 0, 5, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 6, 0),
        )

        compute_navigation_graph(page_type_lookup=PAGE_TYPE_LOOKUP)
        edges = NavigationEdge.select()

        # There will be 4 edges:
        # * 2 for the transitions between the 3 URLs above
        # * 1 for the transition from "Start" to the first URL
        # * 1 for the transition from the last URL to "End"
        self.assertEqual(edges.count(), 4)

        edge_page_type_pairs = [(edge.source_vertex.page_type,
                                 edge.target_vertex.page_type)
                                for edge in edges]
        self.assertIn(("Start", "page_type_1"), edge_page_type_pairs)
        self.assertIn(("page_type_1", "page_type_1"), edge_page_type_pairs)
        self.assertIn(("page_type_1", "page_type_2"), edge_page_type_pairs)
        self.assertIn(("page_type_2", "End"), edge_page_type_pairs)
    def test_edge_occurrences_counts_number_of_transitions_between_page_types(self):
        """An edge's occurrences field counts repeated transitions.

        page1 -> page1 happens twice, so that edge's occurrences must be 2;
        every other transition happens exactly once.
        """

        create_location_visit(
            url="page1",
            start=datetime.datetime(2000, 1, 1, 12, 0, 1, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 2, 0),
        )
        create_location_visit(
            url="page1",
            start=datetime.datetime(2000, 1, 1, 12, 0, 3, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 4, 0),
        )
        create_location_visit(
            url="page1",
            start=datetime.datetime(2000, 1, 1, 12, 0, 5, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 6, 0),
        )
        create_location_visit(
            url="page2",
            start=datetime.datetime(2000, 1, 1, 12, 0, 7, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 8, 0),
        )

        compute_navigation_graph(page_type_lookup=PAGE_TYPE_LOOKUP)
        edges = NavigationEdge.select()
        # Index the edges by (source page type, target page type).
        edge_dict = {
            (edge.source_vertex.page_type, edge.target_vertex.page_type): edge
            for edge in edges
        }
        self.assertEqual(edge_dict[('Start', 'page_type_1')].occurrences, 1)
        self.assertEqual(edge_dict[('page_type_1', 'page_type_1')].occurrences, 2)
        self.assertEqual(edge_dict[('page_type_1', 'page_type_2')].occurrences, 1)
        self.assertEqual(edge_dict[('page_type_2', 'End')].occurrences, 1)
# --- Example 10 (scraped-snippet marker) ---
    def test_filter_to_only_one_concern_if_concern_index_provided(self):
        """Passing concern_index restricts the graph to that concern's visits.

        Only concern 1's two page2 visits should remain, producing exactly
        three edges: Start -> page_type_2 -> page_type_2 -> End.
        """

        # This event should be ignored
        create_location_visit(
            concern_index=0,
            url="page1",
            start=datetime.datetime(2000, 1, 1, 12, 0, 1, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 2, 0),
        )
        create_location_visit(
            concern_index=0,
            url="page1",
            start=datetime.datetime(2000, 1, 1, 12, 0, 3, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 4, 0),
        )
        # This event should be captured
        create_location_visit(
            concern_index=1,
            url="page2",
            start=datetime.datetime(2000, 1, 1, 12, 0, 5, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 6, 0),
        )
        create_location_visit(
            concern_index=1,
            url="page2",
            start=datetime.datetime(2000, 1, 1, 12, 0, 7, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 8, 0),
        )

        compute_navigation_graph(concern_index=1,
                                 page_type_lookup=PAGE_TYPE_LOOKUP)
        self.assertEqual(NavigationEdge.select().count(), 3)
        edges = NavigationEdge.select()
        transition_list = [(e.source_vertex.page_type,
                            e.target_vertex.page_type) for e in edges]
        self.assertIn(("page_type_2", "page_type_2"), transition_list)
# --- Example 11 (scraped-snippet marker) ---
    def test_graph_skips_redirects(self):
        """Redirect pages are left out of the navigation graph entirely.

        Because redirects typically don't show any content but are just a
        gateway to another page, they are excluded from the graph; it is more
        meaningful to connect the link before them to the link they point to.
        """
        create_location_visit(
            url="redirect",
            start=datetime.datetime(2000, 1, 1, 12, 0, 1, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 2, 0),
        )

        compute_navigation_graph(page_type_lookup=PAGE_TYPE_LOOKUP)

        # With the only visit skipped, the sole edge goes "Start" -> "End".
        edge_count = NavigationEdge.select().count()
        self.assertEqual(edge_count, 1)
    def test_graph_skips_redirects(self):
        """Redirect pages are left out of the navigation graph entirely."""

        # Because redirects typically don't show any content but are just a gateway to
        # another page, we will leave them out of the graph of navigation.  It's more
        # meaningful to connect the link before it, and the link that it points to.
        create_location_visit(
            url="redirect",
            start=datetime.datetime(2000, 1, 1, 12, 0, 1, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 2, 0),
        )

        compute_navigation_graph(page_type_lookup=PAGE_TYPE_LOOKUP)

        # There should only be one edge---from "Start" to "End"
        self.assertEqual(NavigationEdge.select().count(), 1)
# --- Example 13 (scraped-snippet marker) ---
    def test_edge_transition_probabilities_normalize_occurrences(self):
        """Edge probabilities are occurrences normalized per source vertex.

        page_type_1 is left 4 times in total (twice to itself, once to
        page_type_2, once to End), giving probabilities 1/2, 1/4 and 1/4.
        page_type_2 is left only once (back to page_type_1), so that edge's
        probability is 1.
        """

        create_location_visit(
            url="page1",
            start=datetime.datetime(2000, 1, 1, 12, 0, 1, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 2, 0),
        )
        create_location_visit(
            url="page1",
            start=datetime.datetime(2000, 1, 1, 12, 0, 3, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 4, 0),
        )
        create_location_visit(
            url="page1",
            start=datetime.datetime(2000, 1, 1, 12, 0, 5, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 6, 0),
        )
        create_location_visit(
            url="page2",
            start=datetime.datetime(2000, 1, 1, 12, 0, 7, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 8, 0),
        )
        create_location_visit(
            url="page1",
            start=datetime.datetime(2000, 1, 1, 12, 0, 9, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 10, 0),
        )

        compute_navigation_graph(page_type_lookup=PAGE_TYPE_LOOKUP)
        edges = NavigationEdge.select()
        # Index the edges by (source page type, target page type).
        edge_dict = {(edge.source_vertex.page_type,
                      edge.target_vertex.page_type): edge
                     for edge in edges}
        self.assertAlmostEqual(
            edge_dict[('page_type_1', 'page_type_1')].probability,
            float(1) / 2)
        self.assertAlmostEqual(
            edge_dict[('page_type_1', 'page_type_2')].probability,
            float(1) / 4)
        self.assertAlmostEqual(edge_dict[('page_type_1', 'End')].probability,
                               float(1) / 4)
        self.assertAlmostEqual(
            edge_dict[('page_type_2', 'page_type_1')].probability, 1)
# --- Example 14 (scraped-snippet marker) ---
    def test_edge_not_added_between_concerns_for_the_same_participant(self):
        """Each concern is an independent session: no cross-concern edges."""

        # One visit per concern for the same participant.
        concern_visit_specs = [(0, "page1", 1), (1, "page2", 3)]
        for visit_concern_index, visit_url, start_second in concern_visit_specs:
            create_location_visit(
                concern_index=visit_concern_index,
                url=visit_url,
                start=datetime.datetime(2000, 1, 1, 12, 0, start_second, 0),
                end=datetime.datetime(2000, 1, 1, 12, 0, start_second + 1, 0),
            )

        compute_navigation_graph(page_type_lookup=PAGE_TYPE_LOOKUP)

        # 4 edges should have been created---between the Start vertex, the one
        # URL, and the End vertex for each of the concerns.  A fifth edge
        # linking the two concerns' pages must NOT exist.
        self.assertEqual(NavigationEdge.select().count(), 4)
    def test_edge_not_added_between_concerns_for_the_same_participant(self):
        """Each concern is an independent session: no cross-concern edges."""

        create_location_visit(
            concern_index=0,
            url="page1",
            start=datetime.datetime(2000, 1, 1, 12, 0, 1, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 2, 0),
        )
        create_location_visit(
            concern_index=1,
            url="page2",
            start=datetime.datetime(2000, 1, 1, 12, 0, 3, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 4, 0),
        )

        compute_navigation_graph(page_type_lookup=PAGE_TYPE_LOOKUP)

        # 4 edges should have been created---between the Start vertex, the one URL, and
        # the End vertex for each of the concerns
        self.assertEqual(NavigationEdge.select().count(), 4)
    def test_edge_transition_probabilities_normalize_occurrences(self):
        """Edge probabilities are occurrences normalized per source vertex.

        page_type_1 is left 4 times in total (twice to itself, once to
        page_type_2, once to End), giving probabilities 1/2, 1/4 and 1/4.
        page_type_2 is left only once (back to page_type_1), so that edge's
        probability is 1.
        """

        create_location_visit(
            url="page1",
            start=datetime.datetime(2000, 1, 1, 12, 0, 1, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 2, 0),
        )
        create_location_visit(
            url="page1",
            start=datetime.datetime(2000, 1, 1, 12, 0, 3, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 4, 0),
        )
        create_location_visit(
            url="page1",
            start=datetime.datetime(2000, 1, 1, 12, 0, 5, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 6, 0),
        )
        create_location_visit(
            url="page2",
            start=datetime.datetime(2000, 1, 1, 12, 0, 7, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 8, 0),
        )
        create_location_visit(
            url="page1",
            start=datetime.datetime(2000, 1, 1, 12, 0, 9, 0),
            end=datetime.datetime(2000, 1, 1, 12, 0, 10, 0),
        )

        compute_navigation_graph(page_type_lookup=PAGE_TYPE_LOOKUP)
        edges = NavigationEdge.select()
        # Index the edges by (source page type, target page type).
        edge_dict = {
            (edge.source_vertex.page_type, edge.target_vertex.page_type): edge
            for edge in edges
        }
        self.assertAlmostEqual(edge_dict[('page_type_1', 'page_type_1')].probability, float(1) / 2)
        self.assertAlmostEqual(edge_dict[('page_type_1', 'page_type_2')].probability, float(1) / 4)
        self.assertAlmostEqual(edge_dict[('page_type_1', 'End')].probability, float(1) / 4)
        self.assertAlmostEqual(edge_dict[('page_type_2', 'page_type_1')].probability, 1)
def compute_navigation_graph(page_type_lookup,
                             exclude_users=None,
                             show_progress=False,
                             concern_index=None):
    """Build and persist a navigation graph from the latest computed visits.

    Vertices are page types (plus synthetic "Start" and "End" nodes) and
    edges are transitions between consecutively visited page types.  Each
    (participant, concern) pair is treated as an independent session that
    starts at "Start" and ends at "End".

    Parameters:
        page_type_lookup: dict mapping a standardized URL to a dict with
            keys 'main_type' (the page type name) and 'redirect' (bool).
        exclude_users: optional list of user IDs to leave out of the graph.
        show_progress: when True, render console progress bars while reading
            sessions and while uploading edges.
        concern_index: when provided, restrict the graph to visits for that
            single concern; otherwise all concerns are included.

    Side effects: creates NavigationVertex and NavigationEdge rows tagged
    with a fresh compute_index (one greater than the current maximum).
    """
    exclude_users = [] if exclude_users is None else exclude_users

    # Create a new index for this computation
    last_compute_index = NavigationVertex.select(
        fn.Max(NavigationVertex.compute_index)).scalar() or 0
    compute_index = last_compute_index + 1

    # Fetch the set of visits from the most recent round of visit computation
    visit_compute_index = LocationVisit.select(
        fn.Max(LocationVisit.compute_index)).scalar()
    visits = LocationVisit.select().where(
        LocationVisit.compute_index == visit_compute_index)

    # If the user has provided a concern index that they want to compute the
    # graph for, then restrict navigation data to only that concern
    if concern_index is not None:
        visits = visits.where(LocationVisit.concern_index == concern_index)

    # Get the distinct participant IDs and concern indexes.
    # Exclude any users that were not requested as part of the analysis.
    participant_ids = set(
        visit.user_id for visit in visits
        if visit.user_id not in exclude_users)
    concern_indexes = set(visit.concern_index for visit in visits)

    # Set up a progress bar for reading sessions.
    total_iterations_count = len(participant_ids) * len(concern_indexes)
    if show_progress:
        progress_bar = ProgressBar(
            maxval=total_iterations_count,
            widgets=[
                'Progress: ',
                Percentage(), ' ',
                Bar(marker=RotatingMarker()), ' ',
                ETA(), ' Read ',
                Counter(), ' / ' + str(total_iterations_count) + ' sessions.'
            ])
        progress_bar.start()

    # The list of vertices needs to be populated with a start and end node.
    # All navigation behavior starts at the "Start" node, and ends at the
    # "End" node.
    vertices = {
        "Start": Vertex("Start", occurrences=1),
        "End": Vertex("End", occurrences=1),
    }
    edges = {}
    last_vertex = vertices["Start"]
    iterations_count = 0

    # Go through every concern for every participant.  For each page they
    # visit, increment the visits to the corresponding vertex.  For each
    # transition from one page to the next, increment the occurrence of a
    # transition between the two page types.
    # NOTE: the loop variable deliberately gets a distinct name so it does
    # not shadow the `concern_index` parameter used for filtering above.
    for participant_id in participant_ids:
        for session_concern_index in concern_indexes:

            participant_concern_visits = visits.where(
                LocationVisit.user_id == participant_id,
                LocationVisit.concern_index == session_concern_index,
            ).order_by(LocationVisit.start.asc())

            for visit in participant_concern_visits:

                # Get the type of the page visited
                standardized_url = standardize_url(visit.url)
                if standardized_url in page_type_lookup:
                    url_info = page_type_lookup[standardized_url]
                    page_type = url_info['main_type']
                    # If this is a redirect, then just skip it.  It's more
                    # important to link the URL before it to the URL the
                    # redirect points to.
                    if url_info['redirect']:
                        continue
                else:
                    logger.warning(
                        "URL %s not in page type lookup.  Giving it 'Unknown' type",
                        standardized_url)
                    page_type = "Unknown"

                # Add a new vertex for this page type if it doesn't exist
                if page_type not in vertices:
                    vertices[page_type] = Vertex(page_type)

                # Save that we have seen this page type one more time
                vertex = vertices[page_type]
                vertex.occurrences += 1

                # Add the time spent to the total time for this page type
                time_passed = visit.end - visit.start
                seconds = time_passed.seconds + (
                    time_passed.microseconds / float(1000000))
                vertex.total_time += seconds

                # Connect an edge between the last page visited and this one
                edge_key = (last_vertex.page_type, vertex.page_type)
                if edge_key not in edges:
                    edges[edge_key] = Edge(last_vertex, vertex)
                edges[edge_key].occurrences += 1

                # Remember what was just visited for the next iteration
                last_vertex = vertex

            # After each session, connect the last URL to the end vertex
            end_vertex = vertices['End']
            end_key = (last_vertex.page_type, end_vertex.page_type)
            if end_key not in edges:
                edges[end_key] = Edge(last_vertex, end_vertex)
            edges[end_key].occurrences += 1

            # After each session, reset the last vertex to "Start"
            last_vertex = vertices['Start']

            if show_progress:
                iterations_count += 1
                progress_bar.update(iterations_count)

    # BUGFIX: finish the session-reading progress bar before the variable is
    # rebound to the edge-upload bar below (previously it was never finished).
    if show_progress:
        progress_bar.finish()

    # Compute the mean time spent on each vertex
    for vertex in vertices.values():
        vertex.mean_time = vertex.total_time / float(vertex.occurrences)

    # Compute the transition probability for each edge leaving a vertex:
    # group the edges by their source page type, then divide each edge's
    # occurrences by the total occurrences of all edges leaving that source.
    # (Rewritten without Python 2-only tuple-unpacking lambdas so the code
    # also parses under Python 3.)
    def _edge_source(edge_key):
        # Key function: the source page type of an (source, target) edge key.
        return edge_key[0]

    sorted_edge_keys = sorted(edges.keys(), key=_edge_source)
    for _, edge_key_group in itertools.groupby(sorted_edge_keys, _edge_source):
        # All edges in this group share the same source vertex.
        group_edges = [edges[edge_key] for edge_key in edge_key_group]
        total_occurrences = sum(edge.occurrences for edge in group_edges)
        for edge in group_edges:
            edge.probability = float(edge.occurrences) / total_occurrences

    # Save all vertices to the database
    vertex_models = {}
    for vertex in vertices.values():
        vertex_model = NavigationVertex.create(
            compute_index=compute_index,
            page_type=vertex.page_type,
            occurrences=vertex.occurrences,
            total_time=vertex.total_time,
            mean_time=vertex.mean_time,
        )
        # We store a dictionary from page type to vertex model so
        # we can look up these models when saving the edges.
        vertex_models[vertex.page_type] = vertex_model

    # Save all edges to the database.
    # We use a progress bar for this as there might be a lot of edges and
    # we upload each of them separately to the database.
    if show_progress:
        progress_bar = ProgressBar(maxval=len(edges),
                                   widgets=[
                                       'Progress: ',
                                       Percentage(), ' ',
                                       Bar(marker=RotatingMarker()), ' ',
                                       ETA(), ' Updated graph with ',
                                       Counter(),
                                       ' / ' + str(len(edges)) + ' edges.'
                                   ])
        progress_bar.start()

    for edge_index, edge in enumerate(edges.values(), start=1):
        NavigationEdge.create(
            compute_index=compute_index,
            source_vertex=vertex_models[edge.source_vertex.page_type],
            target_vertex=vertex_models[edge.target_vertex.page_type],
            occurrences=edge.occurrences,
            probability=edge.probability,
        )
        if show_progress:
            progress_bar.update(edge_index)

    # BUGFIX: finish() was previously called twice in a row; once is enough.
    if show_progress:
        progress_bar.finish()
# --- Example 18 (scraped-snippet marker) ---
def main(compute_index, output_format, *args, **kwargs):
    """Draw the navigation graph for one round of graph computation.

    Fetches the NavigationVertex and NavigationEdge records saved with
    ``compute_index`` (or, when ``compute_index`` is None, with the highest
    compute_index found in the database), assembles a graph-tool graph from
    them, hides infrequently visited page types and rarely taken
    transitions, and renders the result with Graphviz to a dump file in
    ``output_format`` (passed straight through to ``gt.graphviz_draw``).

    Extra ``*args`` / ``**kwargs`` are accepted and ignored so this can be
    called with a shared command-line argument set.
    """

    # Attempt to import graph_tool, and share a helpful debugging message if it's not found.
    try:
        import graph_tool.all as gt
    except ImportError as e:
        # Single-argument print(...) behaves identically under Python 2
        # (parenthesized expression) and Python 3 (function call).
        print(str(e))
        print('\n'.join([
            "",
            "ERROR: The \"graph_tool\" module could not be imported.",
            "Install the package and then point to it with PYTHONPATH.",
            "",
            "Details: graph-tool isn't required for most scripts in this repository.",
            "But it's needed to draw graphs in *this* script.  To download this",
            "package, see the download instructions on the graph-tool website:",
            "",
            "https://graph-tool.skewed.de/download",
            # BUG FIX: a trailing comma was missing after the "" below, so
            # implicit string concatenation fused it onto the "Note:" line and
            # the intended blank separator line was never printed.
            "",
            "Note: it's not enough to install \"graph_tool\" through pip.",
            "It relies on C++ libraries for accelerated graph routines.",
            "You'll have to use your system package manager or compile from scratch.",
            "",
        ]))
        raise SystemExit

    # This is the graph that we'll construct
    graph = gt.Graph()

    # These data structures hold links to the vertices, edges, and their properties
    vertices = {}
    vertex_page_types = []
    vertex_total_times = []
    vertex_mean_times = []
    vertex_occurrences = []
    edge_occurrences = []
    edge_probabilities = []

    # Fetch the set of graph data from the round of computation that the caller wants,
    # or from the most recent graph if a version hasn't been provided.
    # Note that the compute_index should be the same for the vertex and edge data, so we
    # look it up using the same index.
    if compute_index is None:
        compute_index = NavigationVertex.select(fn.Max(NavigationVertex.compute_index)).scalar()
    vertex_models = NavigationVertex.select().where(NavigationVertex.compute_index == compute_index)
    edge_models = NavigationEdge.select().where(NavigationEdge.compute_index == compute_index)

    # Add vertices to graph and save vertex properties
    for vertex_model in vertex_models:

        # Add a vertex to the graph and save its properties
        vertex = graph.add_vertex()
        vertices[vertex_model.id] = vertex
        vertex_page_types.append(vertex_model.page_type)
        vertex_total_times.append(vertex_model.total_time)
        vertex_mean_times.append(vertex_model.mean_time)
        vertex_occurrences.append(vertex_model.occurrences)

    # Add edges to the graph and save their properties
    for edge_model in edge_models:
        graph.add_edge(
            # We look up vertices using the '_vertex_id' properties because this is already
            # retrieved in the fetched rows.  Note that if we want to look it up by
            # page type, this will require two extra queries to the database (one for
            # each vertex) for each edge added, which is very costly.
            vertices[edge_model.source_vertex_id],
            vertices[edge_model.target_vertex_id],
        )
        edge_occurrences.append(edge_model.occurrences)
        edge_probabilities.append(edge_model.probability)

    # Fix the positions and colors of the first and final vertices
    vertex_positions = []
    vertex_pins = []
    vertex_colors = []
    for page_type in vertex_page_types:
        if page_type == 'Start':
            vertex_positions.append([0.5, 3])
            vertex_pins.append(True)
            vertex_colors.append("#b2f3ba")  # light green
        elif page_type == 'End':
            vertex_positions.append([9.5, 3])
            vertex_pins.append(True)
            vertex_colors.append("#f3a4a7")  # light red
        else:
            vertex_positions.append([5, 3])
            vertex_pins.append(False)
            vertex_colors.append("white")
    vertex_position_property =\
        graph.new_vertex_property(str("vector<double>"), vals=vertex_positions)
    vertex_pin_property = graph.new_vertex_property(str("boolean"), vals=vertex_pins)
    vertex_color_property = graph.new_vertex_property(str("string"), vals=vertex_colors)

    # Because we're using unicode literals, each of the "value types" need to be coerced
    # to a string explicitly before creating new properties.
    # When making labels, we take advantage of the fact that most page types only have one
    # space, and usually they should be split into two new lines if they have a space.
    split_page_type_names = [_.replace(' ', '\n') for _ in vertex_page_types]
    vertex_labels = graph.new_vertex_property(str("string"), vals=split_page_type_names)

    # Determine vertex size based on frequently they have occurred.
    # While larger size means more visits, the relationship isn't linear.
    # The "log" is necessary to make sure that the difference isn't too severe between vertices.
    # This was hand-tailored to just look good.
    # vertex_occurrences_array = np.array(vertex_occurrences)
    # vertex_size_array = np.log((vertex_occurrences_array * float(10)) / np.max(vertex_occurrences))  # noqa
    # small_vertex_indexes = vertex_size_array < MINIMUM_VERTEX_SIZE
    # vertex_size_array[small_vertex_indexes] = MINIMUM_VERTEX_SIZE
    # vertex_sizes = graph.new_vertex_property(str("float"), vals=vertex_size_array)

    # Compute the font sizes to scale with vertex size.
    # This was hand-tailored to just look good too.
    # font_size_array = vertex_size_array * 10
    # small_font_indexes = font_size_array < MINIMUM_FONT_SIZE
    # font_size_array[small_font_indexes] = MINIMUM_FONT_SIZE
    # vertex_font_sizes = graph.new_vertex_property(str("double"), vals=font_size_array)

    # Edge label is determined by the probability that it is taken
    edge_labels = graph.new_edge_property(str("float"), vals=np.round(edge_probabilities, 2))

    # Edge thickness is determined by how likely a participant was to follow that transition
    edge_widths = graph.new_edge_property(
        str("float"),
        vals=[p * EDGE_PEN_WIDTH_PER_PROBABILITY for p in edge_probabilities],
    )

    # Show only the top most frequently visited page types
    vertex_occurrences_array = np.array(vertex_occurrences)
    is_vertex_frequent = vertex_occurrences_array >=\
        np.percentile(vertex_occurrences_array, PAGE_TYPE_PERCENTILE)
    is_vertex_start_or_end = np.logical_or(
        np.array(vertex_page_types) == "Start",
        np.array(vertex_page_types) == "End"
    )
    show_vertex = np.logical_or(is_vertex_frequent, is_vertex_start_or_end)
    vertex_filter = graph.new_vertex_property(str("boolean"), vals=show_vertex)
    graph.set_vertex_filter(vertex_filter)

    # Show only the top most taken transitions
    # This uses two conditions:
    # First, the transition has to have been taken a large number of times---the
    # number of occurrences must be within a certain percentile of all occurrences taken
    # Second, the transition has to have a certain minimum probability of occurring
    # edge_occurrences_array = np.array(edge_occurrences)
    edge_probabilities_array = np.array(edge_probabilities)
    # does_edge_occur_often = edge_occurrences_array >=\
    #     np.percentile(edge_occurrences_array, TRANSITION_PERCENTILE)
    does_edge_have_high_probability = edge_probabilities_array >= TRANSITION_PERCENTAGE_THRESHOLD
    # is_edge_frequent = np.logical_and(does_edge_occur_often, does_edge_have_high_probability)
    edge_filter = graph.new_edge_property(str("boolean"), vals=does_edge_have_high_probability)
    graph.set_edge_filter(edge_filter)

    # Create a new filename for the output that includes the index of the version of
    # data that was used when drawing it.
    output_filename = make_dump_filename(
        __name__ + "_compute_index_" + str(compute_index),
        "." + output_format,
    )

    # Draw the graph
    gt.graphviz_draw(
        graph,
        size=(30, 15),                 # resulting image should be about 30cm by 15cm
        overlap=False,                 # nodes should not be drawn on top of each other
        elen=.5,                       # edges should be ~1/2 in. long
        penwidth=edge_widths,          # edge thickness
        # vsize=vertex_sizes,          # vertex sizes
        vsize=MINIMUM_VERTEX_SIZE,     # vertex sizes
        layout='fdp',                  # this layout engine lets us set positions of start and end
        pin=vertex_pin_property,       # pins the positions for some vertices
        pos=vertex_position_property,  # set the position of some vertices
        vcolor=vertex_color_property,
        # For reference about graphviz vertex and edge properties in the next
        # two dictionaries, see this page:
        # http://www.graphviz.org/doc/info/attrs.html
        gprops={
            'rankdir': "LR",       # layout the vertices from left to right
            'splines': 'curved',
        },
        vprops={
            # 'fontsize': vertex_font_sizes,# size of labels
            'fontsize': MINIMUM_FONT_SIZE,  # size of labels
            'label': vertex_labels,         # text of labels
            'shape': 'circle',
            'fixedsize': 'shape',           # don't scale vertices to fit text (looks weird)
        },
        eprops={
            'xlabel': edge_labels,  # xlabel (instead of label) distances labels from edges
            'fontsize': 6.0,
            # Surprisingly, we have to explicitly set these arrow properties
            # to make sure that edges appear with a direction
            'arrowhead': 'normal',
            'dir': 'forward',
        },
        output=output_filename,
        output_format=output_format,
    )