예제 #1
0
def getFirstUrlFromPageSource(start_url):
    """Fetch ``start_url`` and derive the first follow-up URL from its links.

    The page is downloaded, fed to a ``URLister`` parser, and the collected
    links are scanned in order.  For the first link containing the substring
    ``"nothing"``, the text after its last ``"="`` is appended to the
    module-level ``PREFIX_URL`` and returned.

    Args:
        start_url: Address of the page to download.

    Returns:
        The constructed URL, or ``""`` when the page could not be opened or
        no link contains ``"nothing"``.

    Download and parse failures are reported on stdout rather than raised.
    """
    from urlister import URLister
    import urllib
    from sgmllib import SGMLParseError

    first_url = ""
    # Pre-bind both resources so the cleanup below is safe even when
    # urlopen() itself fails and neither object was ever created.
    usock = None
    parser = None
    try:
        usock = urllib.urlopen(start_url)
        parser = URLister()
        parser.feed(usock.read())
    except IOError:
        print("open url error")
    except SGMLParseError:
        print("parser feed error")
    finally:
        if usock is not None:
            usock.close()
        if parser is not None:
            parser.close()

    if parser is None:
        # The page was never opened, so there are no links to inspect.
        return first_url

    # Links are read after parser.close(), matching the original ordering.
    for url in parser.getUrl():
        if "nothing" in url:
            first_url = PREFIX_URL + url.split("=")[-1]
            break

    return first_url
예제 #2
0
def getFirstUrlFromPageSource(start_url):
    """Fetch ``start_url`` and derive the first follow-up URL from its links.

    The page is downloaded, fed to a ``URLister`` parser, and the collected
    links are scanned in order.  For the first link containing the substring
    ``"nothing"``, the text after its last ``"="`` is appended to the
    module-level ``PREFIX_URL`` and returned.

    Args:
        start_url: Address of the page to download.

    Returns:
        The constructed URL, or ``""`` when the page could not be opened or
        no link contains ``"nothing"``.

    Download and parse failures are reported on stdout rather than raised.
    """
    from urlister import URLister
    import urllib
    from sgmllib import SGMLParseError

    first_url = ""
    # Pre-bind both resources so the cleanup below is safe even when
    # urlopen() itself fails and neither object was ever created.
    usock = None
    parser = None
    try:
        usock = urllib.urlopen(start_url)
        parser = URLister()
        parser.feed(usock.read())
    except IOError:
        print("open url error")
    except SGMLParseError:
        print("parser feed error")
    finally:
        if usock is not None:
            usock.close()
        if parser is not None:
            parser.close()

    if parser is None:
        # The page was never opened, so there are no links to inspect.
        return first_url

    # Links are read after parser.close(), matching the original ordering.
    for url in parser.getUrl():
        if "nothing" in url:
            first_url = PREFIX_URL + url.split("=")[-1]
            break

    return first_url
예제 #3
0
def getPickleSrcFromURL(url):
    """Download the resource referenced by the first link on the page at ``url``.

    The page at ``url`` is fetched and fed to a ``URLister`` parser.  The
    first collected link is appended to the module-level ``PREFIX`` and the
    resulting address is downloaded; its raw content is returned.

    Args:
        url: Address of the page whose first link should be followed.

    Returns:
        The content read from the linked resource, or ``""`` when a download
        or parse error was reported.

    Raises:
        IndexError: If the parser collected no links from the page (cleanup
            still runs before the exception propagates).

    Download and parse failures are reported on stdout rather than raised.

    NOTE(review): the function name suggests the result is later unpickled;
    if so, remember that unpickling data fetched from a remote server is
    unsafe unless the source is trusted.
    """
    from urlister import URLister
    import urllib
    from sgmllib import SGMLParseError

    # Defaults keep both the cleanup and the final return well-defined when
    # an error is handled before these names would otherwise be assigned.
    unpickle_src = ""
    usock = None
    parser = None
    try:
        usock = urllib.urlopen(url)
        parser = URLister()
        parser.feed(usock.read())

        unpickle_url = PREFIX + parser.getUrl()[0]

        # Release the first connection before reusing the name; otherwise
        # the page socket would be leaked when usock is rebound.
        usock.close()
        usock = None

        usock = urllib.urlopen(unpickle_url)
        unpickle_src = usock.read()
    except IOError:
        print("open url error")
    except SGMLParseError:
        print("Parser Error")
    finally:
        if usock is not None:
            usock.close()
        if parser is not None:
            parser.close()

    return unpickle_src