def main(first_url, max):
    """Crawl starting at ``first_url`` and dump every stored URL to a file.

    An ``Engine`` is created from the two arguments, seeded through
    ``set_first()`` and, if the seed made it into ``engine.store.URLs``,
    started.  Afterwards every URL in the store is printed and written,
    one per line, to ``URL_Names.txt`` inside the ``URLs`` directory
    (the file is written even when the store is empty).

    Args:
        first_url: URL handed to ``Engine`` as the starting point.
        max: Passed straight through to ``Engine``; presumably an upper
            bound on the crawl -- confirm against ``Engine``.
    """
    # Engine is the class that fetches a URL from the global store of URLs
    # and calls the downloader to find the URLs in the page of that URL.
    engine = Engine(first_url, max)

    # Put the first URL into the global store after checking it.
    engine.set_first()

    # Start the retrieval of URLs only if the first URL was accepted.
    if len(engine.store.URLs) > 0:
        engine.start()
    else:
        print("The first URL you entered was not correct")

    # NOTE(review): Change_Dir presumably switches the working directory on
    # construction and switches back in __del__ -- confirm.  The explicit
    # __del__() call now lives in ``finally`` so it also runs when writing
    # the file fails.
    change_Dir = Change_Dir('URLs')
    try:
        with open('URL_Names.txt', 'w') as file_w:
            for url in engine.store.URLs:
                print(url)
                file_w.write(url)
                file_w.write('\n')
    finally:
        change_Dir.__del__()
def start(self):
    """Fetch ``self.url`` and save its text content to a file in ``Text``.

    The file name is the URL with every character other than ASCII
    letters, digits and spaces removed, truncated to 40 characters, plus
    a ``.txt`` suffix.  The page is downloaded, stripped of markup with
    ``nltk.clean_html``, runs of spaces are collapsed, and every line
    longer than one character is written out.

    Download errors (``IOError``) and ``HTMLParser.HTMLParseError`` are
    deliberately ignored: the crawl is best-effort, and the output file is
    simply left with whatever was written before the failure.
    """
    # NOTE(review): Change_Dir presumably switches the working directory on
    # construction and switches back in __del__ -- confirm.  The restore is
    # in ``finally`` so a failure below cannot skip it.
    change_Dir = Change_Dir('Text')
    try:
        filename = re.sub(r'[^a-zA-Z0-9 ]', '', str(self.url))[:40]
        # Same output as the Python 2 statement
        # ``print 'Writing in file : ', filename, '.txt'``.
        print(' '.join(['Writing in file : ', filename, '.txt']))

        # ``with`` closes the file even if an unexpected exception escapes.
        with open(filename + '.txt', 'w') as out:
            try:
                html = urlopen(self.url).read()
                raw = nltk.clean_html(html)
                raw = re.sub(r' +', ' ', raw)
                for line in raw.splitlines():
                    # Skip empty and single-character lines.
                    if len(line) > 1:
                        out.write(line)
                        out.write('\n')
            except IOError:
                # Best-effort: an unreachable page is skipped.
                pass
            except HTMLParser.HTMLParseError:
                # Best-effort: an unparsable page is skipped.
                pass
    finally:
        change_Dir.__del__()
def start(self):
    """Download every not-yet-seen image referenced by ``self.url``.

    The page is parsed with ``BeautifulSoup``; for each ``<img>`` tag the
    ``src`` is resolved against ``self.url``.  Images whose absolute URL is
    not yet in ``Img_Downloader.store`` are saved into the ``image``
    directory under the last path segment of their ``src`` and then
    recorded in the store.

    ``<img>`` tags without a ``src`` are skipped, and an ``IOError`` while
    fetching or saving one image only skips that image.  An ``IOError`` or
    ``HTMLParser.HTMLParseError`` while loading the page itself is
    deliberately ignored (best-effort crawl).
    """
    # NOTE(review): Change_Dir presumably switches the working directory on
    # construction and switches back in __del__ -- confirm.  The restore is
    # in ``finally`` so no failure below can skip it.
    change_Dir = Change_Dir('image')
    try:
        page = urllib.urlopen(self.url)
        try:
            soup = BeautifulSoup(page)
        finally:
            page.close()

        for img in soup.findAll('img'):
            src = img.get('src')
            if not src:
                # No usable source: indexing img['src'] would raise KeyError.
                continue

            img_url = urlparse.urljoin(self.url, src)
            if img_url in Img_Downloader.store:
                continue

            print("Image found : %(src)s" % img)
            filename = src.split("/")[-1]
            try:
                response = urllib.urlopen(img_url)
                try:
                    with open(filename, 'wb') as f:
                        f.write(response.read())
                finally:
                    response.close()
            except IOError:
                # One broken image must not abort the remaining ones.
                continue

            # Record the URL only after the image was saved successfully.
            Img_Downloader.store.append(img_url)
    except IOError:
        # Best-effort: the page itself could not be fetched.
        pass
    except HTMLParser.HTMLParseError:
        # Best-effort: the page could not be parsed.
        pass
    finally:
        change_Dir.__del__()