Example #1
def get_all_degree_links(degree, start_degree_pages, end_link, path,
                         reverse_path_last_degree):
    current_degree_pages = set()
    #if no more degrees to go return all links of current degree
    if degree == 0:
        return start_degree_pages
    #for each page in the current degree, get all URLs
    for page in start_degree_pages:
        all_pages = web_scraper.getAllUrl(page)

        #remove all duplicate pages that we already checked
        for degrees in path:
            all_pages.difference_update(degrees)

        current_degree_pages = current_degree_pages.union(all_pages)
        #if end link found return
        if end_link in all_pages:
            print("Found the Page in get_all_degree_links")
            return current_degree_pages.union(all_pages)

        #check for overlap
        overlap = all_pages & reverse_path_last_degree
        if len(overlap) > 0:
            print("Found a Overlap")
            print(overlap)
            return current_degree_pages.union(all_pages)

    return get_all_degree_links(degree - 1, current_degree_pages, end_link,
                                path, reverse_path_last_degree)
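
A minimal usage sketch for this variant, assuming web_scraper.getAllUrl(page) returns a set of wiki-style links; the start page, target page, and the reverse_path_last_degree value below are hypothetical placeholders:

import web_scraper

start_pages = {"/wiki/Germany"}
end_link = "/wiki/Shanghai"
#forward layers visited so far (here only the start layer)
path = [start_pages]
#treated here as the last layer reached from the target side, used for the overlap check
reverse_path_last_degree = web_scraper.getAllUrl(end_link)

result = get_all_degree_links(2, start_pages, end_link, path,
                              reverse_path_last_degree)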
Example #2
def get_all_degree_links(degree, start_degree_pages):
    current_degree_pages = []
    if degree == 0:
        return start_degree_pages
    for page in start_degree_pages:
        current_degree_pages += web_scraper.getAllUrl(page)
    return get_all_degree_links(degree - 1, current_degree_pages)
Example #3
def get_all_degree_links(degree, start_degree_pages, end_link):
    current_degree_pages = []
    #if no more degrees to go return all links of current degree
    if degree == 0:
        return start_degree_pages
    #for each page in the current degree, get all URLs
    for page in start_degree_pages:
        all_pages = web_scraper.getAllUrl(page)
        #if end link found return
        if end_link in all_pages:
            return list(current_degree_pages) + list(all_pages)
        current_degree_pages += all_pages
    #do the same for the next degree
    return get_all_degree_links(degree - 1, current_degree_pages, end_link)
Example #4
def is_link_in_page(page_to_check, end_link):
    links_on_page = web_scraper.getAllUrl(page_to_check)
    if end_link in links_on_page:
        return True
    else:
        return False
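
A short usage sketch, assuming web_scraper.getAllUrl accepts wiki-style paths like the ones used elsewhere on this page; both arguments below are hypothetical:

#check whether the Germany article links directly to the Europe article
if is_link_in_page("/wiki/Germany", "/wiki/Europe"):
    print("Direct link found")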
Example #5
#Goes through the whole graph, finds all the paths, and then filters them down to the shortest one

import web_scraper

start_link = "/wiki/Germany"
end_link = "/wiki/Shanghai"

graph = {'A': ['B', 'C'],
         'B': ['C', 'D'],
         'C': ['D'],
         'D': ['C'],
         'E': ['F'],
         'F': ['C']}

web_scraper.getAllUrl(start_link)

def find_shortest_path(graph, start, end, path=[]):
    path = path + [start]
    if start == end:
        return path
    # if start not in graph:
    #     return None
    shortest = None
    for node in graph[start]:
        if node not in path:
            newpath = find_shortest_path(graph, node, end, path)
            if newpath:
                if not shortest or len(newpath) < len(shortest):
                    shortest = newpath
    return shortest
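
A quick check of find_shortest_path on the toy graph defined above; with those adjacency lists, the shortest route from 'A' to 'D' goes through 'B':

print(find_shortest_path(graph, 'A', 'D'))   #expected output: ['A', 'B', 'D']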
Example #6
import sys
import web_scraper

def is_link_in_page(page_to_check, end_link):
    links_on_page = web_scraper.getAllUrl(page_to_check)
    if end_link in links_on_page:
        return True
    else:
        return False


start_link = "/wiki/Flour"
link = start_link
end_link = "/wiki/Flour"
path = [[start_link]]

links_on_start_page = web_scraper.getAllUrl(start_link)


def get_all_degree_links(degree, start_degree_pages):
    current_degree_pages = []
    if degree == 0:
        return start_degree_pages
    for page in start_degree_pages:
        current_degree_pages += web_scraper.getAllUrl(page)
    return get_all_degree_links(degree - 1, current_degree_pages)


degree_count = 0
while True:
    #expand one more degree out from the start page
    path.append(get_all_degree_links(degree_count, [start_link]))
    #stop once the target link shows up in the newest degree
    if end_link in path[-1]:
        break
    degree_count += 1