from ssl import CertificateError
from urllib.parse import urlsplit
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup

# log, sanitize_url, and G are provided elsewhere in this package.


def parse_url(current_url, to_visit_set):
    # Create the request with a browser-like User-Agent.
    log.info('Opening ' + current_url)
    try:
        req = Request(current_url, headers={'User-Agent': 'Mozilla/5.0'})
        resp = urlopen(req)
    except (IOError, UnicodeError, CertificateError) as e:
        log.error(e)
        return

    # Parse only HTML responses for now.
    if 'text/html' not in resp.headers.get('Content-Type', ''):
        resp.close()
        return

    soup = BeautifulSoup(resp, 'html.parser')
    links = soup.find_all('a')
    current_split = urlsplit(current_url)
    for link in links:
        href = link.get('href')
        if href is None:
            continue
        next_split = urlsplit(href)

        # Fall back to the current URL's scheme when the link has none.
        scheme = next_split.scheme if next_split.scheme != '' else current_split.scheme
        if scheme not in ('http', 'https'):
            # Unsupported scheme, jump to next link.
            continue
        next_url = scheme + '://'

        netloc = next_split.netloc if next_split.netloc != '' else current_split.netloc
        if netloc == '':
            # No netloc, jump to next link.
            continue
        next_url += netloc

        path = next_split.path if next_split.path != '' else current_split.path
        if path != '':
            next_url += path

        query = next_split.query if next_split.query != '' else current_split.query
        if query != '':
            next_url += '?' + query

        sanitized = sanitize_url(next_url)
        if sanitized not in to_visit_set:
            to_visit_set.add(sanitized)
            next_split = urlsplit(next_url)
            # Record an edge between hosts when the link crosses domains.
            if current_split.netloc != next_split.netloc:
                G.add_connection(current_split.netloc, next_split.netloc)
    resp.close()
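A possible shape for the sanitize_url helper referenced above; this is a hypothetical sketch of its intent (normalize URLs so the visited-set treats equivalent forms as one key), not the repo's actual implementation.

def sanitize_url(url):
    # Hypothetical sketch: drop the fragment and any trailing slash so that
    # 'http://a.com/x/' and 'http://a.com/x#top' dedupe to the same entry.
    from urllib.parse import urldefrag
    url, _fragment = urldefrag(url)
    return url.rstrip('/')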
def make_graph2():
    u = V('u')
    v = V('v')
    w = V('w')
    z = V('z')
    y = V('y')
    x = V('x')
    g = G().li(u, v).li(u, x).li(x, v).li(v, y).li(y, x).li(w, y).li(w, z)\
        .li(z, z)
    print(g)
    return g
def G1():
    g = G()
    s = V('s')  # 1
    t = V('t')  # 2
    y = V('y')  # 3
    x = V('x')  # 4
    z = V('z')  # 5
    g.li(s, t, w=6).li(s, y, w=7).li(t, y, w=8)\
        .li(t, x, w=5).li(x, t, w=-2).li(t, z, w=-4)
    g.li(z, s, w=2).li(z, x, w=7).li(y, x, w=-3)\
        .li(y, z, w=9)
    return g
def make_graph():
    r = V('r')
    v = V('v')
    s = V('s')
    w = V('w')
    t = V('t')
    x = V('x')
    u = V('u')
    y = V('y')
    g = G()
    g.add(s, r).add(r, v).add(s, w).add(w, t).add(w, x).add(t, x)\
        .add(t, u).add(u, y).add(x, y).add(x, u)
    print(g)
    return g
def create_graph():
    g = G()
    g.add_new_edge('S', 'B', 10)
    g.add_new_edge('S', 'B', 12)
    g.add_new_edge('C', 'B', 15)
    g.add_new_edge('C', 'A', 42)
    g.add_new_edge('A', 'P', 27)
    g.add_new_edge('A', 'F', 44)
    g.add_new_edge('F', 'L', 26)
    g.add_new_edge('F', 'L', 44)
    g.add_new_edge('F', 'G', 26)
    g.add_new_edge('L', 'E', 1)
    g.add_new_edge('B', 'D', 18)
    g.add_new_edge('E', 'D', 9)
    g.add_new_edge('E', 'M', 49)
    g.add_new_edge('E', 'I', 2)
    g.add_new_edge('K', 'I', 32)
    return g
from graph import V, G


def relax(g, u, v):
    # Relax edge (u, v): take the shorter of the known path to v and
    # the path to v through u.
    w = g.weight(u, v)
    if v.d > u.d + w:
        v.d = u.d + w
        v.p = u


def init(g, source):
    # Initialize single-source shortest-path estimates: every vertex starts
    # unreachable (d = infinity, no predecessor) except the source itself.
    # Assumes g exposes its vertex set as g.vertices; the attribute name is
    # a guess at this repo's G class API.
    for v in g.vertices:
        v.d = float('inf')
        v.p = None
    source.d = 0


if __name__ == '__main__':
    g = G()
    g.li(V('s'), V('t'), w=6)
    g.li(V('s'), V('y'), w=7)
    g.li(V('t'), V('y'), w=8)
    g.li(V('t'), V('x'), w=5)
    g.li(V('x'), V('t'), w=-2)
    g.li(V('t'), V('z'), w=-4)
    g.li(V('z'), V('s'), w=2)
    g.li(V('z'), V('x'), w=7)
    g.li(V('y'), V('x'), w=-3)
    g.li(V('y'), V('z'), w=9)
    print("Graph: %s" % g)
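With relax and init in place, a full Bellman-Ford pass is a small loop. This is a minimal sketch under the same assumption as init above: g.vertices is a sized collection of V objects and g.edges iterates (u, v) pairs; both names are guesses at the G API, not confirmed by this repo.

def bellman_ford(g, source):
    # Minimal Bellman-Ford sketch built on init/relax above. Assumes the
    # hypothetical accessors g.vertices (sized collection of V) and
    # g.edges (iterable of (u, v) pairs); adjust to the real G API.
    init(g, source)
    # Relax every edge |V| - 1 times; that suffices for shortest paths
    # when no negative-weight cycle is reachable from the source.
    for _ in range(len(g.vertices) - 1):
        for u, v in g.edges:
            relax(g, u, v)
    # If any edge can still be relaxed, a reachable negative cycle exists.
    for u, v in g.edges:
        if v.d > u.d + g.weight(u, v):
            return False
    return True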
#!/usr/bin/env python3
from utils import logging, configuration
from crawl import crawler
from graph import G

log = logging.getLogger('root')

if __name__ == '__main__':
    G.start()
    crawler.start_crawl("http://www.fluierul.ro")
else:
    print(configuration.get_prop('prop1'))