Exemplo n.º 1
0

# Scrape the RFI Afrique landing page and build an Article for every teaser
# entry that carries a titled link.
#
# Python 2 script. The print calls below use the single-argument
# parenthesised form, which the Python 2 print statement renders exactly like
# the bare form.
url = 'http://rfi.fr/afrique'
page = urllib2.urlopen(url)
soup = BeautifulSoup(page.read(), 'html.parser')

# One <li data-bo-type="article"> is selected per teaser.
articles = soup.findAll('li', {'data-bo-type': 'article'})
print('Number of article: %d' % len(articles))
for article in articles:
	a = article.find('a')
	# find() returns None when the <li> holds no anchor; skip the entry
	# instead of failing on a.get().
	if a is None:
		continue

	raw_title = a.get('title')
	href = a.get('href')
	# An entry without a title or without a target cannot become an Article.
	if not raw_title or not href:
		continue

	# Encode once and reuse for both the log line and the model field.
	title = raw_title.encode('utf-8')

	post = Article()
	print(title)
	post.title = title

	print('Link: %s' % href)
	# The site prefix is prepended to href as-is.
	# NOTE(review): assumes href is site-relative ('/...') -- confirm.
	post.link = 'http://rfi.fr' + href

	print(a.get('data-height'))
	image = a.get('data-image')
	if image:
		print('Image: %s' % image)
		post.thumbnail = image

	# Only links not stored yet receive the remaining fields.
	posts = Article.objects.filter(link=post.link)
	if posts.count() == 0:
		post.source = 'RFI Afrique'
		post.view_count = 0
		# NOTE(review): the visible excerpt ends here without saving `post`;
		# confirm where the record is persisted.
Exemplo n.º 2
0
# Python 2 only: site.py removes sys.setdefaultencoding during interpreter
# start-up, so `sys` is reloaded to get the function back. The call then
# switches the interpreter-wide implicit str/unicode codec to UTF-8.
# NOTE(review): this is a process-wide setting, not local to this script.
reload(sys)
sys.setdefaultencoding('utf8')

from news_app.models import Article


# Scrape the cameroon-info.net front page: download it, parse it with the
# html.parser backend and fill one Article per selected table cell.
# NOTE(review): nothing in the visible lines saves `post`; persistence may
# happen beyond this excerpt.
url= 'http://cameroon-info.net'
page = urllib2.urlopen(url)
soup = BeautifulSoup(page.read(),  'html.parser')
# Earlier parsing attempts, left disabled:
#soup = BeautifulSoup(page.read(), convertEntities=BeautifulSoup.HTML_ENTITIES)
#soup = soup.prettify(formatter="html")
# Candidate cells are every <td width="475"> on the page.
articles = soup.find_all('td', {'width': '475'})
# The first seven matches are skipped.
# NOTE(review): presumably those are non-article layout cells -- confirm
# against the live page.
for a in articles[7:]:
	# Visual separator between entries in the console output.
	print '-'*60
	if a:
		post = Article()
		# Headline anchor; its href is appended to the site root `url`.
		# NOTE(review): assumes href is site-relative and starts with '/'
		# -- confirm.
		link = a.find('a', {'class': 'morehltitle2012'})
		if link :
			print 'Link ', link.get('href')
			post.link = url + link.get('href')

		# The text of the description <div> is stored as the title, encoded
		# to UTF-8 bytes.
		desc = a.find('div', {'class': 'morehldesc'})
		if desc :
			print 'Title:\n', desc.get_text().encode('utf-8')
			post.title = desc.get_text().encode('utf-8')
			# First <img> inside the description supplies the thumbnail;
			# its src is appended to `url` the same way as the link.
			if desc.img:
				print '\n\nImage', url + desc.img.get('src')
				post.thumbnail = url + desc.img.get('src')
		# The source <div> is only printed here, not stored on `post`.
		# bs4's Tag.encode serialises the whole element, markup included.
		source = a.find('div', {'class': 'morehlsource'})
		if source:
			print source.encode('utf-8')