import urllib
from datetime import datetime

import dateutil.parser
import feedparser
from django.contrib.gis.geos import Point

# The models (RSSFeed, RSSItem, Item, Entity, EntityType), the helpers
# (struct_to_datetime, parse_date, sanitise_html, daily_info_ids) and ES
# (an ElementTree-compatible HTML parser) are imported from elsewhere in
# the project.


def update(self, feed):
    feed_data = feedparser.parse(feed.rss_url)
    try:
        feed.last_modified = struct_to_datetime(feed_data.feed.updated_parsed)
    except (AttributeError, TypeError):
        # The feed carried no usable updated date; fall back to the HTTP
        # Last-Modified header, defaulting to the current time.
        feed.last_modified = parse_date(feed_data.headers.get(
            'last-modified',
            datetime.now().strftime("%a, %d %b %Y %H:%M:%S +0000")))
    feed.save()

    items = set()
    for x_item in feed_data.entries:
        # date_parsed is a time.struct_time; its first six fields are
        # year, month, day, hour, minute and second.
        guid, last_modified = x_item.id, datetime(*x_item.date_parsed[:6])

        # Reuse an item already created in this run, if any.
        for i in items:
            if i.guid == guid:
                item = i
                break
        else:
            item = Item(guid=guid, last_modified=datetime(1900, 1, 1), feed=feed)

        # Only touch the item if the entry is newer than what we have.
        if item.last_modified < last_modified:
            item.title = x_item.title
            item.description = sanitise_html(x_item.description)
            item.link = x_item.link
            item.last_modified = last_modified
            item.save()

        items.add(item)

    return items
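# --- A hedged sketch of the two date helpers used above. They are defined
# elsewhere in the project; only their names come from the calls in update,
# so these bodies are illustrative assumptions: struct_to_datetime converts
# a time.struct_time from feedparser, and parse_date parses an RFC 2822
# date string such as an HTTP Last-Modified header.

from email.utils import parsedate


def struct_to_datetime(st):
    # struct_time's first six fields are year, month, day, hour, minute
    # and second, which is exactly what the datetime constructor wants.
    return datetime(*st[:6])


def parse_date(s):
    # parsedate understands RFC 2822 dates ('Mon, 07 Dec 2009 18:00:00 +0000').
    return datetime(*parsedate(s)[:6])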
def handle_noargs(self, **options):
    # Cache venue details by Daily Info venue id so each venue page is
    # scraped at most once per run.
    location_data = {}
    for feed in RSSFeed.events.all():
        if not feed.rss_url.startswith('http://www.dailyinfo.co.uk/'):
            continue
        feed_data = feedparser.parse(feed.rss_url)
        items = list(feed.rssitem_set.all())
        guids = set()
        for x_item in feed_data.entries:
            guid, last_modified = x_item.id, datetime(*x_item.date_parsed[:6])

            # Reuse the existing database item for this guid, if any.
            for i in items:
                if i.guid == guid:
                    item = i
                    break
            else:
                item = RSSItem(guid=guid, last_modified=datetime(1900, 1, 1), feed=feed)

            # Only update items that are newer than our stored copy.
            if item.last_modified < last_modified:
                # Daily Info titles carry a 'Category: ' prefix; drop it.
                item.title = x_item.title.split(': ', 1)[1]
                match = Command.SUMMARY_RE.match(x_item.summary)
                if match:
                    item.description = sanitise_html(match.group(1))
                else:
                    item.description = sanitise_html(x_item.summary)
                item.link = x_item.link
                item.last_modified = last_modified
                item.dt_start = dateutil.parser.parse(x_item.xcal_dtstart)
                item.dt_end = dateutil.parser.parse(x_item.xcal_dtend)
                item.location_url = x_item.xcal_url

                venue_id = int(Command.DAILY_INFO_VENUE_ID_RE.match(x_item.xcal_url).group(1))
                try:
                    # Venue already resolved earlier in this run.
                    item.location_name, item.location_address, item.location_point = \
                        location_data[venue_id]
                except KeyError:
                    try:
                        # First, try to match the venue to an Entity we
                        # already know about.
                        source, id = daily_info_ids[venue_id]
                        entity_type = iter(EntityType.objects.filter(source=source)).next()
                        entity = Entity.objects.get(**{str(entity_type.id_field): id})
                        item.location_entity = entity
                        item.location_point = entity.location
                        item.location_name = entity.title
                    except (KeyError, Entity.DoesNotExist):
                        # Otherwise, scrape the Daily Info venue page.
                        venue_et = ES.parse(urllib.urlopen(item.location_url))
                        item.location_name = [
                            e for e in venue_et.findall('.//div')
                            if e.attrib.get('class') == 'heading'
                        ][0].text.strip()

                        try:
                            # Prefer geo coordinates given in the feed itself.
                            item.location_point = Point(float(x_item.geo_long),
                                                        float(x_item.geo_lat))
                        except (AttributeError, ValueError):
                            # No usable geo data; look for a Google Maps link
                            # carrying a postcode we can geocode.
                            for link in venue_et.findall('.//a'):
                                match = Command.GOOGLE_MAPS_LINK_RE.match(link.attrib.get('href', ''))
                                if match:
                                    item.location_point = self.postcode_to_point(match.group(1))
                                    break
                            else:
                                item.location_point = None

                        # The first paragraph after the heading holds the
                        # address; failing all else, geocode a postcode
                        # found within it.
                        for para in venue_et.findall('.//p')[1:]:
                            item.location_address = (para.text or '').strip()
                            item.location_address = Command.WHITESPACE_RE.sub(' ', item.location_address)
                            if item.location_point:
                                break
                            match = Command.POSTCODE_RE.search(item.location_address)
                            if not match:
                                break
                            item.location_point = self.postcode_to_point(match.group(1))
                            break

                    location_data[venue_id] = (item.location_name,
                                               item.location_address,
                                               item.location_point)

                item.save()
            guids.add(guid)

        # Delete database items that have dropped out of the feed.
        for item in items:
            if item.guid not in guids:
                item.delete()
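# --- A hedged sketch of the pieces handle_noargs relies on but which are
# defined elsewhere in this module: the Command class's regular-expression
# constants and its postcode_to_point helper. Only the names come from the
# calls above; the class skeleton and every pattern below are illustrative
# guesses reconstructed from how each one is used, not the project's actual
# definitions.

import re

from django.core.management.base import NoArgsCommand


class Command(NoArgsCommand):
    # Group 1: summary text, assumed to sit ahead of trailing venue details.
    SUMMARY_RE = re.compile(r'(.+?)\s*(?:Venue:.*)?$', re.DOTALL)
    # Group 1: numeric venue id embedded in a Daily Info xCal URL (guessed).
    DAILY_INFO_VENUE_ID_RE = re.compile(r'.*venue=(\d+)')
    # Group 1: query string of a Google Maps link on a venue page (guessed).
    GOOGLE_MAPS_LINK_RE = re.compile(r'https?://maps\.google\.[a-z.]+/\S*?[?&]q=([^&]+)')
    WHITESPACE_RE = re.compile(r'\s+')
    # Group 1: a rough UK postcode.
    POSTCODE_RE = re.compile(r'([A-Z]{1,2}\d{1,2}[A-Z]? ?\d[A-Z]{2})')

    def postcode_to_point(self, postcode):
        # Resolve a postcode to a django.contrib.gis Point. A real
        # implementation would query a geocoder or a local postcode
        # database; this stub only marks the expected interface.
        raise NotImplementedError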