Exemplo n.º 1
0
def parse_url(current_url, to_visit_set):
    """Fetch ``current_url`` and queue the HTTP(S) links found in its HTML.

    The page is requested with a browser-like ``User-Agent``. The ``href`` of
    every ``<a>`` element is resolved against ``current_url``, rebuilt as
    ``scheme://netloc/path[?query]`` (fragments are dropped), passed through
    ``sanitize_url`` and, if not already present, added to ``to_visit_set``.
    When a newly queued link points at a different host, the pair is also
    recorded with ``G.add_connection(current_host, linked_host)``.

    Args:
        current_url: Absolute URL of the page to download and scan.
        to_visit_set: Collection of sanitized URLs supporting ``in`` and
            ``add``; it is mutated in place.

    Returns:
        None. Fetch errors are logged and end the call early; responses that
        are not ``text/html`` are skipped silently.
    """
    # Imported locally so the block does not depend on how the enclosing
    # module imports urllib.parse.
    from urllib.parse import urljoin

    log.info('Opening ' + current_url)

    try:
        req = Request(current_url, headers={'User-Agent': 'Mozilla/5.0'})
        resp = urlopen(req)
    except (IOError, UnicodeError, CertificateError) as e:
        log.error(e)
        return

    # From here on the response is open; the finally clause guarantees it is
    # closed on every exit path (non-HTML early return, parser errors, ...).
    try:
        # parse only html's for now; a missing header counts as "not html"
        content_type = resp.headers.get('Content-Type') or ''
        if 'text/html' not in content_type:
            return

        soup = BeautifulSoup(resp, 'html.parser')
        current_netloc = urlsplit(current_url).netloc

        for link in soup.find_all('a'):
            href = link.get('href')
            if href is None:
                # anchor without a target (e.g. <a name="...">)
                continue

            try:
                # urljoin applies standard relative-reference resolution, so
                # "page.html", "../x", "?q=1" and "//host/x" all become
                # absolute URLs instead of being glued onto the host name.
                next_split = urlsplit(urljoin(current_url, href))
            except ValueError as e:
                # one malformed href must not abort the rest of the page
                log.error(e)
                continue

            if next_split.scheme not in ('http', 'https'):
                # mailto:, javascript:, ... -> jump to next url
                continue
            if not next_split.netloc:
                # no netloc, jump to next url
                continue

            next_url = next_split.scheme + '://' + next_split.netloc + next_split.path
            if next_split.query:
                next_url += '?' + next_split.query

            sanitized = sanitize_url(next_url)
            if sanitized not in to_visit_set:
                to_visit_set.add(sanitized)
                if current_netloc != next_split.netloc:
                    G.add_connection(current_netloc, next_split.netloc)
    finally:
        resp.close()
Exemplo n.º 2
0
def make_graph2():
    """Build, print and return a sample graph on the vertices u, v, w, x, y, z.

    Vertices are created with ``V`` and connected by chaining ``li`` calls,
    starting from a fresh ``G()``. The value returned is whatever the last
    ``li`` call in the chain returns.
    """
    # Same construction order as the labels: u, v, w, z, y, x.
    u, v, w, z, y, x = [V(label) for label in 'uvwzyx']

    links = (
        (u, v), (u, x), (x, v), (v, y),
        (y, x), (w, y), (w, z),
        (z, z),  # self-link on z
    )

    g = G()
    for first, second in links:
        # Rebind to the return value so the calls stay chained.
        g = g.li(first, second)

    print(g)
    return g
Exemplo n.º 3
0
def G1():
    """Build and return a weighted sample graph on the vertices s, t, y, x, z.

    The links are added in two separate ``li`` chains, each starting from the
    same ``G`` instance; that instance is what gets returned.
    """
    g = G()
    # Construction order: s (#1), t (#2), y (#3), x (#4), z (#5).
    s, t, y, x, z = [V(label) for label in 'styxz']

    # NOTE(review): s -> t is linked twice (w=6, then w=7) - confirm that the
    # second entry is intentional and not a leftover duplicate.
    first_chain = (
        (s, t, 6), (s, t, 7), (s, y, 7), (t, y, 8),
        (t, x, 5), (x, t, -2), (t, z, -4),
    )
    second_chain = (
        (z, s, 2), (z, x, 7), (y, x, -3),
        (y, z, 9),
    )

    for chain in (first_chain, second_chain):
        # Each chain restarts from g and continues on li's return value.
        target = g
        for tail, head, weight in chain:
            target = target.li(tail, head, w=weight)

    return g
Exemplo n.º 4
0
def make_graph():
    """Build, print and return a sample graph on the vertices r..y.

    Vertices are created with ``V`` and connected through one chain of
    ``add`` calls that starts on a fresh ``G()``; that ``G`` instance is
    printed and returned.
    """
    # Same construction order as the labels: r, v, s, w, t, x, u, y.
    r, v, s, w, t, x, u, y = [V(label) for label in 'rvswtxuy']

    pairs = (
        (s, r), (r, v), (s, w), (w, t), (w, x), (t, x),
        (t, u), (u, y), (x, y), (x, u),
    )

    g = G()
    target = g
    for first, second in pairs:
        # Continue on add's return value so the calls stay chained.
        target = target.add(first, second)

    print(g)
    return g
Exemplo n.º 5
0
def create_graph():
    """Return a new ``G`` populated with a fixed list of weighted edges.

    Each entry is passed to ``G.add_new_edge(first, second, weight)`` in the
    order listed. The pairs S-B and F-L each appear twice with different
    weights.
    """
    edge_table = (
        ('S', 'B', 10),
        ('S', 'B', 12),
        ('C', 'B', 15),
        ('C', 'A', 42),
        ('A', 'P', 27),
        ('A', 'F', 44),
        ('F', 'L', 26),
        ('F', 'L', 44),
        ('F', 'G', 26),
        ('L', 'E', 1),
        ('B', 'D', 18),
        ('E', 'D', 9),
        ('E', 'M', 49),
        ('E', 'I', 2),
        ('K', 'I', 32),
    )

    g = G()
    for first, second, weight in edge_table:
        g.add_new_edge(first, second, weight)
    return g
Exemplo n.º 6
0
import sys
from graph import V, G


def relax(g, u, v):
    """Relax the edge (u, v): lower ``v.d`` if going through ``u`` is cheaper.

    Reads the edge weight from ``g.weight(u, v)``. When ``u.d`` plus that
    weight is strictly less than ``v.d``, ``v.d`` is set to the new value and
    ``v.p`` is set to ``u``; otherwise ``v`` is left untouched.
    """
    edge_weight = g.weight(u, v)
    if not v.d > u.d + edge_weight:
        # Going through u is no improvement.
        return
    v.d = u.d + edge_weight
    v.p = u

def init(g, source):
    """Placeholder for the initialisation step that precedes ``relax``.

    The definition had a header only (no colon, no body), which made the
    module fail to parse. It is now a valid definition that fails loudly
    when called.

    NOTE(review): presumably this should set every vertex's ``d`` to
    "infinity" and ``p`` to ``None``, then set ``source.d`` to 0 - confirm,
    and implement against G's vertex-iteration API, which is not visible here.

    Raises:
        NotImplementedError: always, until the body is written.
    """
    raise NotImplementedError("init(g, source) has not been implemented yet")


if __name__ == '__main__':
    # Demo: build a small weighted graph and print it.
    # Entries are (first label, second label, weight) in insertion order.
    edges = (
        ('s', 't', 6),
        ('s', 'y', 7),
        ('t', 'y', 8),
        ('t', 'x', 5),
        ('x', 't', -2),
        ('t', 'z', -4),
        ('z', 's', 2),
        ('z', 'x', 7),
        ('y', 'x', -3),
        ('y', 'z', 9),
    )

    g = G()
    for tail, head, weight in edges:
        # A new V object is created for every endpoint of every edge; whether
        # G merges vertices that share a label is not visible from here.
        g.li(V(tail), V(head), w=weight)

    print("Graph: %s" % g)
Exemplo n.º 7
0
#!/usr/bin/env python3

from utils import logging, configuration
from crawl import crawler
from graph import G

# Logger registered under the name 'root', obtained through the project's
# utils.logging module.
log = logging.getLogger('root')

if __name__ != '__main__':
    # Imported as a module: print the 'prop1' configuration property.
    # NOTE(review): this runs as an import-time side effect - confirm it is
    # intended and not leftover debugging output.
    print(configuration.get_prop('prop1'))
else:
    # Run as a script: call G.start(), then start crawling from the seed URL.
    G.start()
    crawler.start_crawl("http://www.fluierul.ro")