def processFeeds(self, feeds):
    stats.feeds_total = len(feeds)
    for feed in feeds:
        # Next time this feed is due: last retrieval (as epoch seconds) plus
        # the feed's interval. A missing or unparseable timestamp makes the
        # feed due immediately.
        try:
            next_run = dateutil.parser.parse(
                feed['retrieved']).timestamp() + int(feed['interval'])
        except (KeyError, TypeError, ValueError):
            next_run = 0
        if feed['active'] == 1 and next_run < datetime.datetime.utcnow().timestamp():
            ##
            ## The time update needs to happen at the beginning of the run to
            ## minimize the possibility of a race condition when multiple
            ## collector jobs are scheduled.
            ##
            self.setFeedRetrieved(feed['id'], str(datetime.datetime.utcnow()))
            log.info("processing feed %s" % (feed['title']))
            if feed['type'] == 0:
                self.processRSS(feed['id'], feed['source'])
            elif feed['type'] == 1:
                self.processReddit(feed['id'], feed['source'])
            else:
                self.processGeneric(feed['id'], feed['source'])
            stats.feeds_processed += 1

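# Worked example of the scheduling check above (the values are made up): with
# retrieved = "2017-01-01 12:00:00" and interval = 3600, next_run is the epoch
# second for 13:00:00 that day, so the feed is picked up again on the first
# collector run after that time; inactive feeds are skipped regardless.
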
def processRSS(self, feedid, url):
    d = feedparser.parse(url)
    for item in d['items']:
        log.info('retrieving %s' % item.title)
        self.articles.append(
            ArticleData(feedid, item.title, item.link, item.description,
                        item.published))

def requestToken(self):
    headers = {
        'Content-Type': 'application/json',
        'user-agent': collector_config.USER_AGENT
    }
    url = "%s/auth" % (collector_config.API_BASE_URL)
    data = {
        'username': collector_config.API_USERNAME,
        'password': collector_config.API_PASSWORD
    }
    log.info("Requesting JWT token from %s for %s" % (url, data['username']))
    try:
        response = requests.post(url, headers=headers, data=json.dumps(data))
        response.raise_for_status()
        newtoken = response.json()['access_token']
        log.info("Retrieved JWT token from %s for %s" % (url, data['username']))
        return newtoken
    except requests.exceptions.HTTPError:
        log.error("Error requesting token from %s: %s %s (%s)" %
                  (url, response.json()['status_code'],
                   response.json()['error'],
                   response.json()['description']))
        raise
    except Exception as e:
        log.exception(e)
        raise

def setFeedRetrieved(self, feedid, retrieved):
    log.debug("Updating feed %s with last retrieved time of %s." %
              (feedid, retrieved))
    filters = [dict(name='id', op='equals', val=feedid)]
    data = dict(retrieved=retrieved)
    try:
        response = rest.doPut("feed", filters, data)
    except Exception as e:
        log.error("Failed to update feed %s with last retrieved time of %s." %
                  (feedid, retrieved))
        log.exception(e)
        raise
    if response.json()['num_modified'] != 0:
        log.info("Updated feed %s with last retrieved time of %s." %
                 (feedid, retrieved))
        return True
    else:
        log.error("Failed to update feed %s with last retrieved time of %s." %
                  (feedid, retrieved))
        return False

def processGeneric(self, feedid, url):
    p = newspaper.build(url)
    for a in p.articles:
        a.download()
        log.info('retrieving %s' % a.title)
        self.articles.append(
            ArticleData(feedid, a.title, a.url, "",
                        str(datetime.datetime.utcnow())))

def getFeeds(self):
    log.info("Retrieving list of feeds from API server.")
    try:
        response = rest.doGet("feed")
        return response.json()['objects']
    except Exception as e:
        log.error("Error retrieving list of feeds from API server.")
        log.exception(e)
        raise

def addArticle(self, article):
    if not self.articleExists(article):
        log.info("adding article %s" % (article.title))
        data = {
            "title": article.title,
            "link": article.link,
            "keywords": article.keywords,
            "description": article.description,
            "content": article.content,
            "contenthash": article.contenthash,
            "retrieved": article.retrieved,
            "published": article.published
        }
        try:
            response = rest.doPost("article", data)
            # Register each keyword and link it to the newly created article.
            for word in article.keywords:
                keywordid = self.addKeyword(word)
                self.addArticleKeyword(response.json()['id'], keywordid)
        except Exception as e:
            log.exception(e)
            return None
    else:
        return False

def getFeedRetrieved(self, feedid):
    log.debug("Checking when feed %s was last retrieved." % (feedid))
    filters = [dict(name='id', op='equals', val=feedid)]
    try:
        # The "feed" resource name is assumed here, matching the other rest
        # calls in this module (e.g. rest.doGet("feed"), rest.doPut("feed", ...)).
        response = rest.doGet("feed", filters)
        if response.json()['num_results'] != 0:
            r = response.json()['objects'].pop()
            log.info("Feed %s last retrieved %s" % (feedid, r['retrieved']))
            return r['retrieved']
        else:
            log.info("No retrieved timestamp for %s" % (feedid))
            return False
    except Exception as e:
        log.exception(e)
        raise

def processReddit(self, feedid, subreddit):
    r = praw.Reddit(collector_config.USER_AGENT)
    sr = r.get_subreddit(subreddit)
    # Collect submissions from the new, hot, and top listings of the subreddit.
    for listing in (sr.get_new(), sr.get_hot(), sr.get_top()):
        for item in listing:
            log.info('retrieving %s' % item.title)
            self.articles.append(
                ArticleData(
                    feedid, item.title, item.url, item.selftext,
                    str(datetime.datetime.utcfromtimestamp(item.created))))

def initFeeds(self):
    log.info('Reading feeds file %s' % (collector_config.FEEDS_FILE))
    with open(collector_config.FEEDS_FILE) as feeds_file:
        feeds = json.load(feeds_file)
        for feed in feeds:
            self.addFeed(feed['title'], feed['source'], "", feed['type'],
                         feed['interval'], feed['active'])

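# Minimal driver sketch showing how the methods above might be wired together.
# The Collector class name and its construction are assumptions; only the
# method names, their call order, and the self.articles list come from the
# code above. How the JWT returned by requestToken() is attached to later
# rest.* calls is not shown here and is left to the rest module.
if __name__ == '__main__':
    collector = Collector()            # hypothetical class holding the methods above
    token = collector.requestToken()   # authenticate against the API first
    feeds = collector.getFeeds()       # fetch feed definitions from the API
    collector.processFeeds(feeds)      # fills collector.articles
    for article in collector.articles:
        collector.addArticle(article)  # push each collected article to the API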