def getFirstUrlFromPageSource(start_url):
    """Build the follow-up URL from the first "nothing" link on a page.

    The page at ``start_url`` is downloaded and fed to a ``URLister``.
    The collected URLs are scanned in order; for the first one that
    contains the substring ``"nothing"``, the text after its last ``"="``
    is appended to the module-level ``PREFIX_URL``.

    Args:
        start_url: Address of the page to download and scan.

    Returns:
        The constructed URL, or ``""`` when the page could not be opened
        or no collected URL contains ``"nothing"``.
    """
    from urlister import URLister
    import urllib
    from sgmllib import SGMLParseError

    first_url = ""
    # Sentinels so cleanup only touches what was actually created; the
    # original referenced unbound names in ``finally`` when urlopen failed.
    usock = None
    parser = None
    try:
        usock = urllib.urlopen(start_url)
        parser = URLister()
        parser.feed(usock.read())
    except IOError:
        # Parenthesised single-argument print emits the same text under
        # the Python 2 print statement.
        print("open url error")
    except SGMLParseError:
        print("parser feed error")
    finally:
        if usock is not None:
            usock.close()

    if parser is None:
        # Nothing was parsed (the URL could not be opened).
        return first_url

    try:
        # NOTE(review): assumes URLister.close() follows sgmllib.SGMLParser
        # semantics and may itself raise SGMLParseError -- confirm.
        parser.close()
    except SGMLParseError:
        print("parser feed error")

    # Best effort: URLs collected before a parse error are still scanned.
    for url in parser.getUrl():
        if "nothing" in url:
            first_url = PREFIX_URL + url.split("=")[-1]
            break
    return first_url
def getPickleSrcFromURL(url):
    """Download the resource linked first on the page at ``url``.

    The page at ``url`` is downloaded and fed to a ``URLister``. The first
    collected URL is appended to the module-level ``PREFIX`` and that
    address is downloaded in turn; its raw content is returned.

    NOTE(review): the sibling ``getFirstUrlFromPageSource`` uses
    ``PREFIX_URL`` rather than ``PREFIX`` -- confirm both constants exist
    and that ``PREFIX`` is the intended one here.

    Args:
        url: Address of the page that links to the wanted resource.

    Returns:
        The content read from the linked resource, or ``""`` when opening
        a URL failed or the page could not be parsed.

    Raises:
        IndexError: If the parser collected no URLs from the page.
    """
    from urlister import URLister
    import urllib
    from sgmllib import SGMLParseError

    # Default result so the handled-error paths return a value; the
    # original left this name unbound and crashed on ``return``.
    unpickle_src = ""
    # Track both sockets separately: the original rebound a single name
    # and leaked the first connection.
    page_sock = None
    pickle_sock = None
    parser = None
    try:
        page_sock = urllib.urlopen(url)
        parser = URLister()
        parser.feed(page_sock.read())
        unpickle_url = PREFIX + parser.getUrl()[0]
        pickle_sock = urllib.urlopen(unpickle_url)
        unpickle_src = pickle_sock.read()
    except IOError:
        # Parenthesised single-argument print emits the same text under
        # the Python 2 print statement.
        print("open url error")
    except SGMLParseError:
        print("Parser Error")
    finally:
        # Close only what was actually opened or created.
        for sock in (page_sock, pickle_sock):
            if sock is not None:
                sock.close()
        if parser is not None:
            try:
                # NOTE(review): assumes URLister.close() follows
                # sgmllib.SGMLParser semantics and may itself raise
                # SGMLParseError -- confirm.
                parser.close()
            except SGMLParseError:
                print("Parser Error")
    return unpickle_src