Example #1
	def storeNewLinkInMERGEandHTML(self, file_id, rssResource, page, title, firstPage_link, link, page_num, date):
		try:
			# derive a stable file name from the article title
			file_name = hashlib.md5(title.encode('utf-8')).hexdigest()
			html_name = str(file_id) + '.' + file_name + '.html'

			# store the whole webpage in its own HTML file
			with open(self.sub_ppath + html_name, 'w') as myFile:
				myFile.write(page)

			language = 'English'
			sourcename = self.RSSName
			if self.RSSName in self.config['multisource']:
				sourcename, language = SpecialSites.getNameAndLanguageFromResource(rssResource, sourcename, language)

			data = {'source': rssResource, 'language': language, 'sourcename': sourcename, 'firstPage_link': firstPage_link, 'page': page_num, 'title': title, 'timestamp-sec': date}
			content = json.dumps(data)

			# append one tab-separated record for this link to the storage file (e.g. MERGE.TXT)
			new_line_webpage = '\t'.join([html_name, 'html', str(file_id), link, link, content, '0', '-1'])
			with open(self.sub_ppath + self.config['storagefile'], 'a') as myFile2:
				myFile2.write(new_line_webpage)
				myFile2.write('\n')
		except UnicodeEncodeError:
			logging.warning("there is a UnicodeEncodeError")
			return 'unicode error'
		return None
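Each record appended to the storage file is a single tab-separated line: the HTML file name, the literal type 'html', the file id, the link written twice, a JSON metadata blob, and the two constant columns '0' and '-1'. A minimal sketch of reading such a record back, assuming that exact field order; the helper name and field labels are mine, not part of the original class:

	import json

	def parseMergeRecord(line):
		# hypothetical helper: split the 8 tab-separated fields written by storeNewLinkInMERGEandHTML
		# (the link column appears twice in each record)
		(html_name, doc_type, file_id, link, link_again,
			content, flag1, flag2) = line.rstrip('\n').split('\t')
		# the JSON blob carries source, language, sourcename, firstPage_link, page, title and timestamp-sec
		meta = json.loads(content)
		return html_name, file_id, link, meta

Splitting on tabs is safe here because json.dumps escapes control characters inside string values, so the metadata blob itself never contains a raw tab.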
Example #2
	def fetchNews(self):
		for rssResource in self.sources:
			new_links = []
			fileName = self.replaceAll4FileName(rssResource)

			self.createAllFetchedLinks(fileName)

			d = self.fetchXML(rssResource)
			if d == "wrong url":
				continue

			# fetch all items from an RSS source
			for dd in d.entries:
				ex = NewsBlogExtractor()
				page_num = 1
				page, link, file_id = self.fetchWebpage(dd.link)
				firstPage_link = link
				while link is not None:
					time.sleep(0.1)
					# special processing for some RSS sources
					if self.RSSName in self.config['specialsites']:
						if self.RSSName == 'newyorktimes':
							page, link = SpecialSites.newyorktimes(page, link)
						elif self.RSSName == 'straitstimes':
							page, link = SpecialSites.straitstimes(page, link)

					if (page is None) or (len(page) == 0):
						break # next item

					if self.determineDuplication(fileName, link) == 'False':
						# insert the new link into the database
						self.updateNewLinks(fileName, link)

						# store the new link in the storage file (e.g. MERGE.TXT) and its whole webpage in an HTML file
						self.storeNewLinkInMERGEandHTML(file_id, rssResource, page, dd.title, firstPage_link, link, page_num, str(time.mktime(dd.published_parsed) + self.config['timezonedifference'] * 3600))

						# resolve source name and language for multi-source feeds
						language = 'English'
						sourcename = self.RSSName
						if self.RSSName in self.config['multisource']:
							sourcename,language = SpecialSites.getNameAndLanguageFromResource(rssResource,sourcename,language)

						# store all images in the webpage
						if self.config['imagestorage'] == "True":
							o = urlparse(link)
							images = ex.findAllImages(page, o.netloc, sourcename)
							self.fetchAllImages(images, link)

						# calculate word frequency in title
						if self.config['wordsFrequency'] == "True":
							self.calWordsFrequency(dd.title, str(dd.published_parsed[2]))

						new_links.append(link)

						# fetch the next page of a multi-page article, if any
						link = ex.findNextPage(page, sourcename)
						if link is not None:
							page_num += 1
							page, link, file_id = self.fetchWebpage(link)
					else:
						break

			if len(new_links) != 0:
				# update words frequency record file
				if self.config['wordsFrequency'] == "True":
					self.updateWordsFrequency()

			time.sleep(1)
		return None
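fetchXML is not shown in these examples, but the loop above consumes its result like a feedparser object (d.entries, dd.link, dd.title, dd.published_parsed). A minimal sketch of how it could be implemented with the feedparser library; the "wrong url" sentinel matches the check at the top of fetchNews, and the bozo test is an assumption about how a bad URL is detected, not the original logic:

	import feedparser

	def fetchXML(self, rssResource):
		# parse the feed; each entry exposes .link, .title and .published_parsed
		d = feedparser.parse(rssResource)
		if d.bozo and not d.entries:
			# nothing usable came back, so return the sentinel that fetchNews checks for
			return "wrong url"
		return d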