def load_feed(self, store_data=True):
    """Fetch the Facebook feed as JSON and store each post as a RawData row."""
    working_url = self._get_working_url('feed')
    # fetch the feed and decode the JSON payload
    f = urllib2.urlopen(working_url)
    data = f.read()
    json_obj = json.loads(data)

    if store_data:
        # store the data in the RawData model
        new_tag = DataTag.objects.get(name='new')
        new_datas = []
        # pick up any previously stored items that are still tagged as new
        unprocessed_datas = RawData.objects.filter(source=self.source_node, tags__in=[new_tag])
        if unprocessed_datas:
            new_datas.extend(unprocessed_datas)

        for data in json_obj['data']:
            fblinktype = data.get('type', None)
            new_data = RawData()
            title = data.get('name', data['id'])
            created_at = data.get('created_time', str(datetime.datetime.now()))
            # prefer the link description or the post message over the bare name/id
            if fblinktype:
                if fblinktype == "link" and data.get('description', None):
                    title = data.get('description')
                elif data.get('message', None):
                    title = data.get('message')
            if len(title) > 100:
                title = title[:100] + "..."

            new_data.title = title
            new_data.data_id = data['id']
            new_data.source = self.data_src
            new_data.data = json.dumps(data)
            new_data.link = data.get('link', None)
            # try to parse the post's creation date, falling back to now
            try:
                dt = parser.parse(created_at)
            except ValueError:
                dt = datetime.datetime.now()
            new_data.occurred_at = dt

            # only save the raw data if it does not already exist
            if not new_data.exists():
                new_data.save()
                new_data.tags.add(new_tag)
                new_data.save()
                new_datas.append(new_data)

        if new_datas:
            fba = FacebookAgent()
            fba.search(raw_data_set=new_datas)
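
# A minimal, standalone sketch of the title/date handling used in load_feed()
# above, run against a hand-written sample post. The sample dict is illustrative
# only and not taken from a real Graph API response; it assumes dateutil for the
# date parsing, as the parser.parse() call above suggests.
import datetime
import json
from dateutil import parser

sample_post = {
    'id': '12345_67890',
    'type': 'link',
    'description': 'A very long link description ' + 'x' * 100,
    'created_time': '2012-06-01T12:30:00+0000',
    'link': 'http://example.com/article',
}

title = sample_post.get('name', sample_post['id'])
if sample_post.get('type') == 'link' and sample_post.get('description'):
    title = sample_post['description']
elif sample_post.get('message'):
    title = sample_post['message']
if len(title) > 100:
    title = title[:100] + "..."  # truncate overly long titles, as load_feed() does

try:
    occurred_at = parser.parse(sample_post.get('created_time', str(datetime.datetime.now())))
except ValueError:
    occurred_at = datetime.datetime.now()

print "title: %s" % title
print "occurred_at: %s" % occurred_at
print "stored payload: %s" % json.dumps(sample_post)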
def fetch_data(self):
    """Override fetch_data of the parent.

    Uses the feedparser library to fetch the contents of an RSS feed from a
    given URL. Each entry in the result is then used to create a new RawData
    object.
    """
    data = feedparser.parse(self.url)
    # add each entry to the RawData table
    for entry in data.entries:
        rd = RawData()
        rd.title = entry.title
        rd.data = entry
        rd.source = self.source_node
        rd.save()
    return data
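
# A small self-contained illustration of what fetch_data() above gets back from
# feedparser; the inline RSS document is made up for the example. feedparser
# accepts a URL or a raw XML string, and each entry exposes .title, .link, etc.
import feedparser

sample_rss = """<?xml version="1.0"?>
<rss version="2.0">
  <channel>
    <title>Example feed</title>
    <item><title>First post</title><link>http://example.com/1</link></item>
    <item><title>Second post</title><link>http://example.com/2</link></item>
  </channel>
</rss>"""

parsed = feedparser.parse(sample_rss)
for entry in parsed.entries:
    # fetch_data() copies entry.title into RawData.title and stores the entry itself
    print "%s -> %s" % (entry.title, entry.link)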
def load(self, store_data=True):
    """Scrape article links from the site and store each article as RawData."""
    if not self.article_css_selector or not self.article_link_selector:
        # no CSS selector information supplied, cannot load
        return None

    article_links = SiteLinkLoader.get_elements_by_css(self.url, self.article_link_selector)
    new_tags = DataTag.objects.filter(name='new')
    new_tag = DataTag.objects.get(name='new')

    # get all the data objects that are still tagged as new
    new_datas = []
    unprocessed_datas = RawData.objects.filter(source=self.source_node, tags__in=new_tags)
    if unprocessed_datas:
        new_datas.extend(unprocessed_datas)

    if article_links:
        for article_link in article_links:
            article_title = strip_tags(article_link.html())
            article_url = article_link.href
            if article_url.startswith('/'):
                article_url = article_url[1:]  # get rid of the leading slash
            # turn relative links into absolute ones on this host
            if not article_url.lower().startswith('http://') and not article_url.lower().startswith(self.hostname):
                article_url = 'http://%s/%s' % (self.hostname, article_url)

            # pull the article body out of the page using the configured selector
            article_content = ""
            article_pieces = SiteLinkLoader.get_elements_by_css(article_url, self.article_css_selector)
            for article_piece in article_pieces:
                article_content += strip_tags(article_piece.html())

            # create a new raw data entry if none exists with this title
            similar_raw_datas = RawData.objects.filter(title=article_title)
            if not similar_raw_datas:
                new_data = RawData()
                new_data.title = article_title
                new_data.data = article_content
                new_data.link = article_url
                new_data.source = self.source_node
                new_data.data_id = article_url
                if store_data:
                    # save it and tag it as new
                    new_data.save()
                    new_data.tags.add(new_tag)
                    new_data.save()
                new_datas.append(new_data)

    if new_datas:
        ba = BasicAgent()
        ba.search(raw_data_set=new_datas)
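
# A standalone sketch of the link normalisation performed in load() above:
# relative hrefs are rebuilt against the loader's hostname before the article
# page is fetched. The hostname and hrefs here are illustrative only; note that
# an href already beginning with the hostname is left untouched, mirroring the
# check in the method.
def absolutise(href, hostname):
    if href.startswith('/'):
        href = href[1:]  # drop the leading slash
    if not href.lower().startswith('http://') and not href.lower().startswith(hostname):
        href = 'http://%s/%s' % (hostname, href)
    return href

print absolutise('/news/story-1', 'example.com')             # http://example.com/news/story-1
print absolutise('http://example.com/about', 'example.com')  # already absolute, unchanged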
def load(self, store_data=True, date_limit=None, run_agent=False):
    for data_src in self.data_sources:
        print "Loading data from: %s" % data_src

        # init variables from the data source
        url = data_src.src_id
        source_node = data_src
        parameters = data_src.get_parameters()
        username = parameters.get('username', '*****@*****.**')
        psw = parameters.get('password', 'choirpassword')
        article_css_selector = parameters.get('article-css-selector', '')
        fetch_limit = parameters.get('fetch-limit', None)

        auth = ClientAuthMethod(username, psw)
        reader = GoogleReader(auth)
        if reader.buildSubscriptionList():
            feeds = reader.getSubscriptionList()
            new_tag = DataTag.objects.get(name='new')
            new_datas = []
            fetch_count = 0

            # loop through the feeds and store RawData for items we have not seen yet
            for feed in feeds:
                if not fetch_limit:
                    fetch_limit = feed.unread
                read_items = []
                print "Reading " + feed.title + " (%s unread)" % feed.unread
                print "===================================================="
                print
                print "Loading items"
                print
                feed.loadItems()
                print "Loaded %s items" % (len(feed.items),)
                print

                index = 0
                for item in feed.items:
                    title = item.title
                    url = item.url
                    index += 1
                    # load another page of items when we reach the end of the current batch
                    if index + 1 >= len(feed.items) and fetch_count < fetch_limit:
                        print "Loading more items...."
                        print
                        feed.loadMoreItems()

                    # fetch the article page and pull out the body with the CSS selector
                    f = urllib.urlopen(url)
                    html = f.read()
                    doc = leaf.parse(html)
                    elements = doc(article_css_selector)
                    for element in elements:
                        article_html = element.html()
                        new_data = RawData()
                        new_data.title = title
                        new_data.source = source_node
                        new_data.data = strip_tags(article_html)
                        new_data.data_id = item.id
                        new_data.link = item.url
                        try:
                            new_data.occurred_at = datetime.datetime.fromtimestamp(feed.lastUpdated)
                        except ValueError:
                            # could not parse the feed's lastUpdated timestamp
                            new_data.occurred_at = datetime.datetime.now()

                        # patching in the date-limit thing Parris wanted ------------------------
                        # if date_limit is None:
                        #     date_limit = datetime.date.today() - datetime.timedelta(weeks=1)
                        # if new_data.occurred_at < date_limit:
                        #     # the item is too old, skip it
                        #     continue
                        # end patch -------------------------------------------------------------
                        # Abandoning this idea for now; it's better to patch the map view than
                        # to mess with this here.

                        # only save items we have not seen before
                        if not new_data.exists():
                            print " + Saving article: %s" % new_data.title
                            new_data.save()
                            new_data.tags.add(new_tag)
                            new_datas.append(new_data)
                            fetch_count += 1
                    read_items.append(item)

            if new_datas and run_agent:
                gra = GoogleReaderAgent()
                gra.search(raw_data_set=new_datas)
            return new_datas
    return None
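
# A minimal sketch of the leaf-based article extraction step used in load()
# above, factored into a helper. It reuses only the calls the method already
# makes (leaf.parse, doc(selector), element.html()); the strip_tags import is an
# assumption (Django's helper), and the URL/selector shown are illustrative only.
import urllib
import leaf
from django.utils.html import strip_tags

def extract_article(url, css_selector):
    html = urllib.urlopen(url).read()
    doc = leaf.parse(html)
    # join the tag-stripped HTML of every element matching the selector
    return ''.join(strip_tags(element.html()) for element in doc(css_selector))

# article_text = extract_article('http://example.com/post/1', 'div.entry-content')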