示例#1
0
文件: main.py 项目: jk983294/Store
def main(first_url,max):
	"""Crawl starting at ``first_url`` and save every collected URL to disk.

	An ``Engine`` is built from the two arguments, seeded with the first URL
	and, if the seed was accepted into ``engine.store.URLs``, started. All
	URLs found in the store afterwards are printed and written, one per
	line, to ``URL_Names.txt``.

	Args:
		first_url: URL the crawl starts from.
		max: Passed straight through to ``Engine``; presumably a cap on the
			number of URLs to collect -- confirm against ``Engine``. The name
			shadows the builtin but is kept because it is part of the
			public signature.
	"""
	# Per the original author's note: Engine fetches a URL from the global
	# store of URLs and calls a downloader to find the URLs on that page.
	engine = Engine(first_url,max)
	# Put the first URL into the global store after checking it.
	engine.set_first()
	# Start the retrieval process only if the first URL was accepted.
	if engine.store.URLs:
		engine.start()
	else:
		print("The first URL you entered was not correct")

	# NOTE(review): Change_Dir presumably switches the working directory to
	# 'URLs' and __del__ switches back -- confirm against its definition.
	change_Dir = Change_Dir('URLs')
	try:
		# The with-block closes the file even if a write fails part-way.
		with open('URL_Names.txt','w') as file_w:
			for url in engine.store.URLs:
				print(url)
				file_w.write(url)
				file_w.write('\n')
	finally:
		# Always run the directory cleanup, including on error.
		change_Dir.__del__()
示例#2
0
	def start(self):
		"""Download ``self.url`` and write its text content to a ``.txt`` file.

		The file name is the URL with every character other than ASCII
		letters, digits and spaces removed, truncated to 40 characters. The
		page is fetched, stripped of markup with ``nltk.clean_html``, runs of
		spaces are collapsed, and every line longer than one character is
		written out. The output file is created before the download, so it
		remains (possibly empty) when the fetch or parse fails.

		``IOError`` and ``HTMLParser.HTMLParseError`` are deliberately
		swallowed: a page that cannot be fetched or parsed is simply skipped.
		"""
		# NOTE(review): Change_Dir presumably switches the working directory
		# to 'Text' and __del__ switches back -- confirm against its definition.
		change_Dir = Change_Dir('Text')
		try:
			filename = re.sub(r'[^a-zA-Z0-9 ]', '', str(self.url))[:40]
			# Joined with spaces to reproduce the original comma-separated
			# print statement's output exactly.
			print(' '.join(['Writing in file : ', filename, '.txt']))
			# The with-block closes the file on every exit path.
			with open(filename + '.txt','w') as f:
				try:
					html = urlopen(self.url).read()
					raw = nltk.clean_html(html)
					raw = re.sub(r' +',' ',raw)
					for line in raw.splitlines():
						if len(line)>1:
							f.write(line)
							f.write('\n')
				except IOError:
					# Best effort: unreachable page or failed write.
					pass
				except HTMLParser.HTMLParseError:
					# Best effort: page markup could not be parsed.
					pass
		finally:
			# Always run the directory cleanup, even on unexpected errors.
			change_Dir.__del__()
示例#3
0
	def start(self):
		"""Download every not-yet-seen image referenced by the page at ``self.url``.

		Each ``<img>`` tag's ``src`` is resolved against ``self.url``. Images
		whose resolved URL is not already in ``Img_Downloader.store`` are
		fetched and written to a file named after the last path segment of
		``src``; the URL is then appended to the store.

		``<img>`` tags with no ``src``, or whose ``src`` yields an empty file
		name (e.g. it ends in ``/``), are skipped instead of aborting the
		whole page. ``IOError`` and ``HTMLParser.HTMLParseError`` are
		deliberately swallowed and end processing of this page.
		"""
		# NOTE(review): Change_Dir presumably switches the working directory
		# to 'image' and __del__ switches back -- confirm against its definition.
		change_Dir = Change_Dir('image')
		try:
			page = urllib.urlopen(self.url)
			soup = BeautifulSoup(page)

			for img in soup.findAll('img'):
				# img['src'] would raise KeyError on a tag without src.
				src = img.get('src')
				if not src:
					continue

				img_url = urlparse.urljoin(self.url, src)
				if img_url in Img_Downloader.store:
					continue

				print("Image found : %(src)s" % img)
				filename = src.split("/")[-1]
				if not filename:
					# open('') raises IOError, which would silently abandon
					# all remaining images on the page.
					continue

				# Fetch before opening so a failed request leaves no empty file.
				response = urllib.urlopen(img_url)
				with open(filename,'wb') as f:
					f.write(response.read())
				Img_Downloader.store.append(img_url)

		except IOError:
			# Best effort: page or image could not be fetched or written.
			pass
		except HTMLParser.HTMLParseError:
			# Best effort: page markup could not be parsed.
			pass
		finally:
			# Always run the directory cleanup, even on unexpected errors.
			change_Dir.__del__()