Пример #1
0
def test():
	"""Create zeroed hourly counters for every story/source pair and count each feed.

	For every Story an HourCount stamped with the current time is saved; for
	every Source under it an HourCountSource with count=0 is saved and passed
	to count_rss() together with that source's raw RSS text.
	"""
	t = datetime.datetime.now()
	# Maps source.pk -> raw RSS text so each feed is downloaded only once,
	# no matter how many stories there are.
	rss_data = {}
	for story in Story.objects.all():
		hc = HourCount(story=story, date=t)
		hc.save()
		for source in Source.objects.all():
			if source.pk not in rss_data:
				# Fetch from the source being iterated; the previous code read
				# the URL through an undefined name and raised NameError.
				rss_data[source.pk] = str(urllib2.urlopen(source.rss_url).read())
			hcs = HourCountSource(hour_count=hc, source=source, count=0)
			hcs.save()
			count_rss(hcs, rss_data[source.pk])
Пример #2
0
	def handle(self, *args, **options):
		"""Create this run's hourly counters, then credit keyword hits from each RSS feed.

		Step 1: for every Story an HourCount stamped with the current time is
		saved, and under it one HourCountSource (count=0) per Source.

		Step 2: every Source's RSS feed is downloaded and parsed. For each
		<item> accepted by in_last_hour() whose link and title are not already
		stored as a StoryLink, the linked page is fetched; if a non-empty page
		remains after trimming, each story's keywords are looked up in the
		item's title and description. The first matching keyword of a story
		increments that story/source counter and stores a StoryLink.

		*args and **options are accepted but not used.
		"""
		t = datetime.datetime.now()
		for story in Story.objects.all():
			hc = HourCount(story=story, date=t)
			hc.save()
			for source in Source.objects.all():
				hcs = HourCountSource(hour_count=hc, source=source, count=0)
				hcs.save()

		# Article pages are fetched through a cookie-aware opener; the feeds
		# themselves are fetched with plain urlopen.
		cj = CookieJar()
		opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
		for source in Source.objects.all():
			rss = str(urllib2.urlopen(source.rss_url).read())

			soup = BeautifulSoup(rss)
			for item in soup.find_all("item"):
				print(item.find("title"))

				# If the timestamp does not end in "GMT", its last three
				# characters are replaced with " GMT" before parsing with the
				# %Z-terminated format below.
				date_text = item.find("pubdate").text
				if date_text[-3:] != "GMT":
					date_text = date_text[:-3] + " GMT"

				the_time = datetime.datetime.strptime(date_text, "%a, %d %b %Y %H:%M:%S %Z")
				# Shift by the per-source offset stored on the Source row.
				the_time = the_time + datetime.timedelta(hours=source.hour_offset)

				# in_last_hour() is defined outside this block; presumably it
				# tests whether the item was published within the last hour.
				if not in_last_hour(the_time):
					continue

				link = item.find("link").text
				title = item.find("title").text

				# A link that is already stored ends processing of this feed.
				# (A date-restricted lookup on the same url would be a subset
				# of this one, so a single query is enough.)
				# NOTE(review): the comment that used to sit on this break said
				# "if not on last hour then go to next source", yet as written
				# the break belongs to the duplicate-link check. Control flow
				# is kept as written -- confirm which was intended.
				if StoryLink.objects.filter(url=link).exists():
					break

				# Skip items whose title was already stored in the last 24 hours.
				last24 = datetime.datetime.now() + datetime.timedelta(hours=-24)
				if StoryLink.objects.filter(date__gte=last24, title=title).exists():
					continue

				try:
					html = str(opener.open(link).read()).lower().decode('utf-8')
				except (BadStatusLine, UnicodeDecodeError):
					# Treat an unreadable page as empty so the item is skipped.
					html = ""
				# Keep only the text between the source's start/end markers.
				if source.start_text in html:
					html = html.split(source.start_text, 1)[1]
				if source.end_text in html:
					html = html.split(source.end_text, 1)[0]
				# NOTE(review): the trimmed page only gates the item (empty
				# means skip); keywords are matched against the RSS title and
				# description, not against this text -- confirm intent.
				if html == "":
					continue

				description = item.find("description").text
				for story in Story.objects.all():
					for kw in Keyword.objects.filter(story=story):
						# Require a space on at least one side of the keyword.
						kw_found = (
							" " + kw.keyword in description
							or kw.keyword + " " in description
							or " " + kw.keyword in title
							or kw.keyword + " " in title
						)
						if not kw_found:
							continue

						# The counter row for this story/source pair was
						# created at the top of this run with date == t.
						hcs = HourCountSource.objects.filter(
							source=source,
							hour_count__story=story,
							hour_count__date=t)[0]
						hcs.count += 1
						hcs.save()
						# add as StoryLink
						sl = StoryLink(
							url=link,
							date=the_time,
							title=title,
							hour_count_source=hcs)
						sl.save()
						# One hit per story per item: stop checking this
						# story's remaining keywords.
						break