def count_rss(hour_count_source, rss): cj = CookieJar() opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) soup = BeautifulSoup(rss) count = 0 for item in soup.find_all("item"): #check date is in last hour date_text = item.find("pubdate").text if date_text[-3:] != "GMT": date_text = date_text[:-3] + " GMT" the_time = datetime.datetime.strptime(date_text, "%a, %d %b %Y %H:%M:%S %Z") the_time = the_time + datetime.timedelta(hours=hour_count_source.source.hour_offset) if in_last_hour(the_time): link = item.find("link").text if link not in hour_count_source.hour_count.story.todays_links_urls(): try: html = str(opener.open(link).read()).lower().decode('utf-8') except BadStatusLine: html = "" except UnicodeDecodeError: html = "" if hour_count_source.source.start_text in html: html = html.split(hour_count_source.source.start_text, 1)[1] if hour_count_source.source.end_text in html: html = html.split(hour_count_source.source.end_text, 1)[0] print link if html != "": for kw in hour_count_source.hour_count.story.keywords(): if " " + kw.keyword in html: count += 1 print kw.keyword # add as StoryLink sl = StoryLink(url=link, date=the_time, title=item.find("title").text, hour_count_source=hour_count_source) sl.save() break hour_count_source.count = count hour_count_source.save()
def handle(self, *args, **options): t = datetime.datetime.now() hour_counts = {} for story in Story.objects.all(): hc = HourCount(story=story, date=t) hc.save() hour_counts for source in Source.objects.all(): hcs = HourCountSource(hour_count=hc, source=source, count=0) hcs.save() cj = CookieJar() opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) for source in Source.objects.all(): rss = str(urllib2.urlopen(source.rss_url).read()) soup = BeautifulSoup(rss) for item in soup.find_all("item"): print item.find("title") #check date is in last hour date_text = item.find("pubdate").text if date_text[-3:] != "GMT": date_text = date_text[:-3] + " GMT" the_time = datetime.datetime.strptime(date_text, "%a, %d %b %Y %H:%M:%S %Z") the_time = the_time + datetime.timedelta(hours=source.hour_offset) last24 = start_time = datetime.datetime.now() + datetime.timedelta(hours=-24) if in_last_hour(the_time): link = item.find("link").text title = item.find("title").text if len(StoryLink.objects.filter(date__gte=last24, url=link)) == 0 and len(StoryLink.objects.filter(url=link)) == 0: if len(StoryLink.objects.filter(date__gte=last24, title=title)) == 0: try: html = str(opener.open(link).read()).lower().decode('utf-8') except BadStatusLine: html = "" except UnicodeDecodeError: html = "" if source.start_text in html: html = html.split(source.start_text, 1)[1] if source.end_text in html: html = html.split(source.end_text, 1)[0] if html != "": for story in Story.objects.all(): for kw in Keyword.objects.filter(story=story): kwFound = False if " " + kw.keyword in item.find("description").text or kw.keyword + " " in item.find("description").text kwFound = True if " " + kw.keyword in item.find("title").text or kw.keyword + " " in item.find("title").text kwFound = True if kwFound: hcs = HourCountSource.objects.filter(source=source, hour_count__story=kw.story, hour_count__date=t)[0] hcs.count += 1 hcs.save() # add as StoryLink sl = StoryLink(url=link, date=the_time, title=item.find("title").text, hour_count_source=hcs) sl.save() break else: #if not on last hour then go to next source break