def test_panda3d_screenshots_show_keep_shot_name(self):
    # We found that some of our experimental data includes slashes in the
    # name of the shot, so we replicate that here.
    standardized = standardize_url(
        "http://panda3d.org/showss.php?shot=path/to/photo&otherparam=1")
    self.assertEqual(standardized, "panda3d.org/showss.php?shot=path/to/photo")

def compute_navigation_ngrams(length, page_type_lookup):
    '''
    Compute n-grams of sequences of pages visited, of a certain length.
    A `page_type_lookup` dictionary must be provided that maps URLs to their page types.
    '''
    # Create a new index for this computation
    last_compute_index = NavigationNgram.select(
        fn.Max(NavigationNgram.compute_index)).scalar() or 0
    compute_index = last_compute_index + 1

    # Fetch the visits from the most recent computation of visits
    visit_compute_index = LocationVisit.select(
        fn.Max(LocationVisit.compute_index)).scalar()
    visits = LocationVisit.select().where(
        LocationVisit.compute_index == visit_compute_index)

    # Get the distinct participant IDs and concern indexes
    participant_ids = set([visit.user_id for visit in visits])
    concern_indexes = set([visit.concern_index for visit in visits])

    # Go through every concern for every participant. For each page they visit,
    # increment the visits to a vertex. For each transition from one page to the
    # next, increment the occurrence of a transition between two page types.
    for participant_id in participant_ids:
        for concern_index in concern_indexes:

            participant_concern_visits = visits.where(
                LocationVisit.user_id == participant_id,
                LocationVisit.concern_index == concern_index,
            ).order_by(LocationVisit.start.asc())

            # Create an ordered list of the URLs that this participant visited
            urls = [visit.url for visit in participant_concern_visits]
            standardized_urls = [standardize_url(url) for url in urls]

            # Create a list of all page types visited.
            # If a page is a redirect, skip it: for all intents and purposes,
            # the participant traveled between the page types before and after it.
            page_types = []
            for url in standardized_urls:
                if url in page_type_lookup:
                    url_info = page_type_lookup[url]
                    if not url_info['redirect']:
                        page_types.append(url_info['main_type'])
                else:
                    logger.warning(
                        "URL %s not in page type lookup. Giving it 'Unknown' type", url)
                    page_types.append("Unknown")

            # Compute n-grams using NLTK
            ngrams = nltk_compute_ngrams(page_types, length)

            # Save each n-gram to the database
            for ngram_tuple in ngrams:
                NavigationNgram.create(
                    compute_index=compute_index,
                    user_id=participant_id,
                    concern_index=concern_index,
                    length=length,
                    ngram=", ".join(ngram_tuple),
                )

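# For reference, a minimal sketch of the n-gram helper assumed above. This is
# an assumption: `nltk_compute_ngrams` is not defined in this module, and it is
# treated here as an alias for NLTK's `nltk.util.ngrams`, which yields tuples
# of `n` consecutive items from a sequence:
#
#     from nltk.util import ngrams as nltk_compute_ngrams
#     list(nltk_compute_ngrams(["Search", "Tutorial", "Q&A"], 2))
#     # -> [('Search', 'Tutorial'), ('Tutorial', 'Q&A')]
#
# Each tuple is then flattened with ", ".join(...) before being stored.
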
def test_experiment_site_is_reduced_to_its_domain(self):
    standardized = standardize_url(
        "http://searchlogger.tutorons.com/long/path#with-fragment")
    self.assertEqual(standardized, "searchlogger.tutorons.com")

def test_yahoo_redirect_link_reduced_to_standard_path(self):
    standardized = standardize_url("http://r.search.yahoo.com/_ylt=AwrTccsomething")
    self.assertEqual(standardized, "r.search.yahoo.com/_ylt=redirect")

def test_google_groups_forums_keep_fragment(self):
    standardized = standardize_url("http://groups.google.com/forum/#!forum/keras-users")
    self.assertEqual(standardized, "groups.google.com/forum/!forum/keras-users")

def test_tigsource_forums_keep_board_id(self):
    standardized = standardize_url("http://forums.tigsource.com/index.php?board=20.0")
    self.assertEqual(standardized, "forums.tigsource.com/index.php?board=20.0")

def test_panda3d_forums_keep_forum_id(self):
    standardized = standardize_url("http://panda3d.org/viewforum.php?f=1")
    self.assertEqual(standardized, "panda3d.org/viewforum.php?f=1")

def test_youtube_videos_preserve_v_parameter(self):
    standardized = standardize_url("http://youtube.com/watch?v=DRR9fOXkfRE")
    self.assertEqual(standardized, "youtube.com/watch?v=DRR9fOXkfRE")

def compute_unique_urls(page_type_lookup, exclude_users=None):

    exclude_users = [] if exclude_users is None else exclude_users

    # Create a new index for this computation
    last_compute_index = UniqueUrl.select(
        fn.Max(UniqueUrl.compute_index)).scalar() or 0
    compute_index = last_compute_index + 1

    # Fetch the visits from the most recent computation of visits
    visit_compute_index = LocationVisit.select(
        fn.Max(LocationVisit.compute_index)).scalar()
    visits = LocationVisit.select().where(
        LocationVisit.compute_index == visit_compute_index,
        LocationVisit.user_id.not_in(exclude_users),
    )

    # Get the distinct participant IDs
    participant_ids = set([visit.user_id for visit in visits])

    # Go through every participant and find the URLs they visited
    # that no one else visited.
    for participant_id in participant_ids:

        participant_visits = visits.where(LocationVisit.user_id == participant_id)
        others_visits = visits.where(LocationVisit.user_id != participant_id)

        # Create a list of the standardized URLs that this participant visited
        participant_urls = [visit.url for visit in participant_visits]
        participant_standardized_urls = [
            standardize_url(url) for url in participant_urls]

        # Create a list of the standardized URLs that all others visited
        others_urls = [visit.url for visit in others_visits]
        others_standardized_urls = [standardize_url(url) for url in others_urls]

        # Compute the URLs that this participant visited uniquely,
        # and the ones they share with others
        unique_participant_urls = (
            set(participant_standardized_urls) - set(others_standardized_urls))
        shared_participant_urls = (
            set(participant_standardized_urls) - unique_participant_urls)

        # Save all URLs that the participant visited to the database,
        # including whether they visited them uniquely.
        for url in unique_participant_urls:
            UniqueUrl.create(
                compute_index=compute_index,
                user_id=participant_id,
                url=url,
                unique=True,
            )
        for url in shared_participant_urls:
            UniqueUrl.create(
                compute_index=compute_index,
                user_id=participant_id,
                url=url,
                unique=False,
            )

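# A small, self-contained illustration of the set arithmetic above, using
# hypothetical URLs in place of real LocationVisit data. Note that
# "participant minus unique" is just the intersection with everyone else:
#
#     participant = {"site.com/a", "site.com/b"}
#     others = {"site.com/b", "site.com/c"}
#     participant - others  # unique: {'site.com/a'}
#     participant & others  # shared: {'site.com/b'}
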
def test_by_default_standard_url_is_domain_and_path(self):
    standardized = standardize_url("http://site.com/path#fragment?q=query")
    self.assertEqual(standardized, "site.com/path")

def test_panda3d_topics_keep_topic_id(self):
    standardized = standardize_url("http://panda3d.org/viewtopic.php?f=1&t=2")
    self.assertEqual(standardized, "panda3d.org/viewtopic.php?t=2")

def test_standardization_removes_www_prefix_and_schema(self):
    standardized = standardize_url("http://www.site.com")
    self.assertEqual(standardized, "site.com")

def test_pages_viewed_as_source_gets_view_source_prepended(self):
    standardized = standardize_url("view-source:http://site.com")
    self.assertEqual(standardized, "view-source:site.com")

def test_google_groups_topic_keep_fragment(self):
    standardized = standardize_url(
        "http://groups.google.com/forum/#!topic/keras-users/epFdzcxl8Gg")
    self.assertEqual(
        standardized, "groups.google.com/forum/!topic/keras-users/epFdzcxl8Gg")

def test_bluejeans_site_standardizes_to_bluejeans_domain(self):
    standardized = standardize_url("http://bluejeans.com/long-path#some-fragment")
    self.assertEqual(standardized, "bluejeans.com")

def test_google_groups_simplify_search_url(self):
    standardized = standardize_url(
        "http://groups.google.com/forum/#!searchin/keras-users/model")
    self.assertEqual(standardized, "groups.google.com/forum/!searchin")

def test_tigsource_forums_preserve_search_query_parameter(self):
    standardized = standardize_url(
        "http://forums.tigsource.com/index.php?action=search2;params=PARAMS")
    self.assertEqual(standardized, "forums.tigsource.com/index.php?action=search2")

def test_browser_pages_are_reduced_to_the_words_browser_page(self):
    standardized = standardize_url("about:preferences")
    self.assertEqual(standardized, "browser_page")

def test_stack_overflow_question_reduce_to_id_as_path(self):
    standardized = standardize_url("http://stackoverflow.com/questions/1000/long-name")
    self.assertEqual(standardized, "stackoverflow.com/questions/1000")

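# The tests above pin down the contract of `standardize_url`. For orientation,
# here is a minimal sketch of only the *default* rule (drop the scheme, the
# "www." prefix, the query, and the fragment; keep the domain and path). This
# is an illustration, not the project's implementation, which layers the
# site-specific rules exercised by the tests above on top of this behavior.

try:
    from urllib.parse import urlparse  # Python 3
except ImportError:
    from urlparse import urlparse  # Python 2


def default_standardize_url(url):
    # Hypothetical helper: reduce a URL to its domain and path
    parsed = urlparse(url)
    domain = parsed.netloc
    if domain.startswith("www."):
        domain = domain[len("www."):]
    return domain + parsed.path

# default_standardize_url("http://www.site.com")                   -> "site.com"
# default_standardize_url("http://site.com/path#fragment?q=query") -> "site.com/path"
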
def compute_navigation_graph(page_type_lookup, exclude_users=None,
                             show_progress=False, concern_index=None):

    exclude_users = [] if exclude_users is None else exclude_users

    # Create a new index for this computation
    last_compute_index = NavigationVertex.select(
        fn.Max(NavigationVertex.compute_index)).scalar() or 0
    compute_index = last_compute_index + 1

    # Fetch the visits from the most recent computation of visits
    visit_compute_index = LocationVisit.select(
        fn.Max(LocationVisit.compute_index)).scalar()
    visits = LocationVisit.select().where(
        LocationVisit.compute_index == visit_compute_index)

    # If the caller has provided a concern index to compute the graph for,
    # restrict the navigation data to only that concern
    if concern_index is not None:
        visits = visits.where(LocationVisit.concern_index == concern_index)

    # Get the distinct participant IDs and concern indexes,
    # excluding any users that were not requested as part of the analysis
    participant_ids = set([
        visit.user_id for visit in visits
        if visit.user_id not in exclude_users
    ])
    concern_indexes = set([visit.concern_index for visit in visits])

    # Set up the progress bar
    total_iterations_count = len(participant_ids) * len(concern_indexes)
    if show_progress:
        progress_bar = ProgressBar(maxval=total_iterations_count, widgets=[
            'Progress: ', Percentage(), ' ', Bar(marker=RotatingMarker()), ' ', ETA(),
            ' Read ', Counter(), ' / ' + str(total_iterations_count) + ' sessions.'])
        progress_bar.start()

    # The list of vertices needs to be populated with a start and end node.
    # All navigation behavior starts at the "Start" node and ends at the "End" node.
    vertices = {
        "Start": Vertex("Start", occurrences=1),
        "End": Vertex("End", occurrences=1),
    }
    edges = {}
    last_vertex = vertices["Start"]
    iterations_count = 0

    # Go through every concern for every participant. For each page they visit,
    # increment the visits to the corresponding vertex. For each transition from one
    # page to the next, increment the occurrence of a transition between two page types.
    for participant_id in participant_ids:
        for concern_index in concern_indexes:

            participant_concern_visits = visits.where(
                LocationVisit.user_id == participant_id,
                LocationVisit.concern_index == concern_index,
            ).order_by(LocationVisit.start.asc())

            for visit in participant_concern_visits:

                # Get the type of the page visited
                standardized_url = standardize_url(visit.url)
                if standardized_url in page_type_lookup:
                    url_info = page_type_lookup[standardized_url]
                    page_type = url_info['main_type']
                    # If this is a redirect, just skip it. It's more important
                    # to link the URL before it to the page the redirect points to.
                    if url_info['redirect']:
                        continue
                else:
                    logger.warning(
                        "URL %s not in page type lookup. Giving it 'Unknown' type",
                        standardized_url)
                    page_type = "Unknown"

                # Add a new vertex for this page type if it doesn't exist
                if page_type not in vertices:
                    vertices[page_type] = Vertex(page_type)

                # Record that we have seen this page type one more time
                vertex = vertices[page_type]
                vertex.occurrences += 1

                # Add the time spent to the total time spent for this page type
                time_passed = visit.end - visit.start
                seconds = time_passed.seconds + (time_passed.microseconds / float(1000000))
                vertex.total_time += seconds

                # Connect an edge between the last page visited and this one
                if (last_vertex.page_type, vertex.page_type) not in edges:
                    edges[(last_vertex.page_type, vertex.page_type)] = Edge(last_vertex, vertex)
                edge = edges[(last_vertex.page_type, vertex.page_type)]
                edge.occurrences += 1

                # Redefine the last vertex so the next iteration knows what was just visited
                last_vertex = vertex

            # After each participant-concern session, connect an edge from the
            # last page visited to the "End" vertex
            end_vertex = vertices['End']
            if (last_vertex.page_type, end_vertex.page_type) not in edges:
                edges[(last_vertex.page_type, end_vertex.page_type)] = Edge(last_vertex, end_vertex)
            edge = edges[(last_vertex.page_type, end_vertex.page_type)]
            edge.occurrences += 1

            # ... and reset the last vertex to "Start" for the next session
            last_vertex = vertices['Start']

            iterations_count += 1
            if show_progress:
                progress_bar.update(iterations_count)

    if show_progress:
        progress_bar.finish()

    # Compute the mean time spent on each vertex
    for vertex in vertices.values():
        vertex.mean_time = vertex.total_time / float(vertex.occurrences)

    # Compute the transition probability for each edge leaving a vertex.
    # First, group all edges by their source vertex.
    def get_source_page_type(edge_key):
        source_type, _ = edge_key
        return source_type

    sorted_edge_keys = sorted(edges.keys(), key=get_source_page_type)
    edge_groups = itertools.groupby(sorted_edge_keys, get_source_page_type)
    for _, edge_group in edge_groups:

        # Fetch the edges in the current group
        # (those in the current group share the same source)
        edge_keys = list(edge_group)
        group_edges = {key: edge for key, edge in edges.items() if key in edge_keys}

        # Compute the probability of each edge being taken
        total_occurrences = sum([edge.occurrences for edge in group_edges.values()])
        for edge in group_edges.values():
            edge.probability = float(edge.occurrences) / total_occurrences

    # Save all vertices to the database
    vertex_models = {}
    for vertex in vertices.values():
        vertex_model = NavigationVertex.create(
            compute_index=compute_index,
            page_type=vertex.page_type,
            occurrences=vertex.occurrences,
            total_time=vertex.total_time,
            mean_time=vertex.mean_time,
        )
        # Store a dictionary from page type to vertex model so we can
        # look up these models when saving the edges.
        vertex_models[vertex.page_type] = vertex_model

    # Save all edges to the database. We use a progress bar for this, as there
    # might be many edges and each is uploaded to the database separately.
    if show_progress:
        progress_bar = ProgressBar(maxval=len(edges), widgets=[
            'Progress: ', Percentage(), ' ', Bar(marker=RotatingMarker()), ' ', ETA(),
            ' Updated graph with ', Counter(), ' / ' + str(len(edges)) + ' edges.'])
        progress_bar.start()

    for edge_index, edge in enumerate(edges.values(), start=1):
        NavigationEdge.create(
            compute_index=compute_index,
            source_vertex=vertex_models[edge.source_vertex.page_type],
            target_vertex=vertex_models[edge.target_vertex.page_type],
            occurrences=edge.occurrences,
            probability=edge.probability,
        )
        if show_progress:
            progress_bar.update(edge_index)

    if show_progress:
        progress_bar.finish()

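# `Vertex` and `Edge` are not defined in this module. Below is a minimal
# sketch of the shape `compute_navigation_graph` assumes, with field names
# inferred from the usage above; the real definitions may differ.

class Vertex(object):

    def __init__(self, page_type, occurrences=0):
        self.page_type = page_type
        self.occurrences = occurrences
        self.total_time = 0.0  # accumulated seconds across all visits
        self.mean_time = 0.0   # computed once all visits have been counted


class Edge(object):

    def __init__(self, source_vertex, target_vertex):
        self.source_vertex = source_vertex
        self.target_vertex = target_vertex
        self.occurrences = 0
        self.probability = 0.0  # computed once per-source totals are known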