if sourceQ == False: log.plog("error finding story source", 4) sys.exit() source = sourceQ.fetch_row(1, 1) if source == (): log.plog("error finding story source", 4) sys.exit() infoModule.info.source = source[0] for i in infoModule.info.source.keys(): ## this is sort of hack-y, but stupid python returns None for null if infoModule.info.source[i] == None: infoModule.info.source[i] = '' url = source[0]['feed_url'] log.plog('fetching feed ' + url, 1) obj = feedfetcher.getFeed(url) if obj == False or obj['type'] == None: log.plog("unreadable feed ", 5) sys._exit(0) else: if obj['type'][0:3] == 'rss': log.plog('feed is rss', 1) elif obj['type'][0:4] == 'atom': log.plog('feed is atom', 1) else: log.plog("can't tell feed type! " + obj['type'], 5) sys._exit(0) ctr = 0 for entry in obj['data']['entries']: if ctr > 0:
source = sourceQ.fetch_row(1,1) if source == (): log.plog("error finding story source", 4) sys.exit() infoModule.info.source = source[0] for i in infoModule.info.source.keys(): ## this is sort of hack-y, but stupid python returns None for null if infoModule.info.source[i] == None: infoModule.info.source[i] = '' url = source[0]['feed_url'] log.plog('fetching feed ' + url, 1) obj = feedfetcher.getFeed(url) if obj == False or obj['type'] == None: log.plog("unreadable feed ", 5) sys._exit(0) else: if obj['type'][0:3] == 'rss': log.plog('feed is rss', 1) elif obj['type'][0:4] == 'atom': log.plog('feed is atom', 1) else: log.plog("can't tell feed type! " + obj['type'], 5) sys._exit(0) ctr = 0 for entry in obj['data']['entries']:
def feedReader():
    """Pull entries from one randomly-chosen RSS/Atom source and push new
    stories onto sourceReaderQueue.

    Loops until at least one story is added (storiesAdded > 0), unless a
    source_id was given on the command line (sys.argv[3]), in which case a
    single source is read once (test mode).  Source selection is a weighted
    lottery: higher `priority` values in the sources table get a larger slice
    of the random range.  Relies on module globals: siteDB, link,
    infoModule, mysql_tools, feedfetcher, log, random, sys, time.

    NOTE(review): nearly every query here builds SQL by string concatenation
    from feed-supplied values; only some values are escaped (see escLink /
    escId below) -- injection risk, should move to parameterized queries.
    """
    storiesAdded = 0
    while storiesAdded == 0:
        #keep trying till something is added
        if len(sys.argv) > 3:
            # Test mode: explicit source_id from the command line.
            nextFeedQ = mysql_tools.mysqlQuery("select * from " + siteDB + ".sources where source_id=" + sys.argv[3] + " limit 1", infoModule.info.site['dblink'])
            #test mode, so don't loop
            storiesAdded = 1
        else:
            # Weighted random pick of a priority level, then the stalest
            # (oldest last_update) feed at that priority.
            maxPriorityQ = mysql_tools.mysqlQuery("select max(priority) as mp from " + siteDB + ".sources", link)
            maxPriority = maxPriorityQ.fetch_row(1,1)
            mp = int(maxPriority[0]['mp'])
            maxFeed = mp
            randRange = 10
            randRangeOffset = 10
            randStep = 2
            # Total lottery range: each priority level i adds a block of
            # size randRangeOffset + randStep*i, so higher priorities own
            # larger slices of the range.
            for i in range(1, maxFeed+1):
                randRange = randRange + randRangeOffset + (randStep * i)
            foundFeed = False
            while foundFeed == False:
                picker = random.randint(1,randRange)
                # Walk blocks from highest priority down; whichever block
                # the picker lands in decides the priority to query.
                # NOTE(review): block size here uses randRangeOffset +
                # randStep + randStep*i, which differs slightly from the
                # accumulation above -- presumably intentional bias toward
                # high priorities, but worth confirming.
                for i in range(maxFeed,0,-1):
                    hitBlock = randRangeOffset + randStep + (randStep * i)
                    if picker < hitBlock:
                        priority = i
                        break
                    else:
                        picker = picker - hitBlock
                log.plog('feed priority set to ' + str(priority), 1)
                #loop until I get a feed
                log.plog("select * from " + siteDB + ".sources where (source_format = 'rss' or source_format='atom') and priority = " + str(priority) + " order by last_update limit 1", 2)
                nextFeedQ = mysql_tools.mysqlQuery("select * from " + siteDB + ".sources where (source_format = 'rss' or source_format='atom') and priority = " + str(priority) + " order by last_update limit 1", infoModule.info.site['dblink'])
                if nextFeedQ.num_rows() > 0:
                    foundFeed = True
        if nextFeedQ == False:
            #print "rss selection query failed"
            sys.exit()
        # Process each selected source row (normally just one).
        while(1):
            row = nextFeedQ.fetch_row(1,1)
            if row==():
                break
            # Touch last_update first so a crashing feed doesn't get
            # re-picked immediately by the "order by last_update" query.
            sql = "update " + siteDB + ".sources set last_update=now() where source_id=" + row[0]['source_id']
            mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])
            log.plog(sql, 2)
            infoModule.info.source = row[0]
            url = row[0]['feed_url']
            log.plog('fetching feed ' + url, 1)
            obj = feedfetcher.getFeed(url)
            if obj == False or obj['type'] == None:
                # Fetch/parse failed: record (or refresh) a feedFailures row
                # and move on to the next source.
                log.plog("unreadable feed ", 5)
                failureQ = mysql_tools.mysqlQuery("select * from " + siteDB + ".feedFailures where source_id=" + row[0]['source_id'], infoModule.info.site['dblink'])
                if failureQ.num_rows() > 0:
                    mysql_tools.mysqlQuery("update " + siteDB + ".feedFailures set failedOn = now(), message='unreadable feed' where source_id=" + row[0]['source_id'], infoModule.info.site['dblink'])
                else:
                    mysql_tools.mysqlQuery("insert into " + siteDB + ".feedFailures set source_id=" + row[0]['source_id'] + ", failedOn = now(), message='unreadable feed'", infoModule.info.site['dblink'])
                continue
            else:
                # Only rss/atom are understood; anything else is logged as a
                # failure and skipped.
                if obj['type'][0:3] == 'rss':
                    log.plog('feed is rss', 1)
                elif obj['type'][0:4] == 'atom':
                    log.plog('feed is atom', 1)
                else:
                    log.plog("can't tell feed type! " + obj['type'], 5)
                    failureQ = mysql_tools.mysqlQuery("select * from " + siteDB + ".feedFailures where source_id=" + row[0]['source_id'], infoModule.info.site['dblink'])
                    if failureQ.num_rows() > 0:
                        mysql_tools.mysqlQuery("update " + siteDB + ".feedFailures set failedOn = now(), message='can\\'t tell feed type' where source_id=" + row[0]['source_id'], infoModule.info.site['dblink'])
                    else:
                        mysql_tools.mysqlQuery("insert into " + siteDB + ".feedFailures set source_id=" + row[0]['source_id'] + ", failedOn = now(), message='can\\'t tell feed type'", infoModule.info.site['dblink'])
                    continue
            #test header for last published
            # Prefer the first entry's 'updated' stamp; fall back to the
            # HTTP Last-Modified header; else empty.
            pubDateHeader = ''
            if 'last-modified' in obj['data']['headers']:
                pubDateHeader = obj['data']['headers']['last-modified']
                log.plog('pubDate from header: ' + pubDateHeader, 1)
            #pprint.pprint(obj['data'])
            pubDateEntry = ''
            if len(obj['data']['entries']) > 0 and 'updated' in obj['data']['entries'][0]:
                pubDateEntry = obj['data']['entries'][0]['updated']
                log.plog('pubDate from first story ' + pubDateEntry, 1)
            if pubDateEntry != '':
                pubDate = pubDateEntry
            elif pubDateHeader != '':
                pubDate = pubDateHeader
            else:
                pubDate = ''
                log.plog('could not determine pubdate for feed', 4)
            # Unchanged publish date => nothing new in this feed.
            # NOTE(review): nothing visible here writes publish_date back to
            # the sources table -- presumably done elsewhere; verify.
            if row[0]['publish_date'] == pubDate:
                log.plog('publish date unchanged. not reading further', 1)
                continue
            for entry in obj['data']['entries']:
                #no stories older than 7 days
                if 'updated_parsed' in entry and entry['updated_parsed'] != None:
                    now = time.localtime()
                    nowSecs = time.mktime(now)
                    updatedSecs = time.mktime(entry['updated_parsed'])
                    diff = nowSecs - updatedSecs
                    if diff > 7 * 24 * 3600:
                        # Entries are assumed newest-first, so the first old
                        # entry ends the scan of this feed.
                        log.plog('story is too old, not reading further', 3)
                        break
                #check against sourceREaderQueue
                escLink = entry['link'].replace("'", "\\'")
                sql = "select * from " + siteDB + ".sourceReaderQueue where url='" + escLink + "'"
                queueCheckQ = mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])
                if queueCheckQ.num_rows() > 0:
                    log.plog("story in queue. Not reading further from this feed", 2)
                    break
                #check guid against db
                if 'id' not in entry:
                    entry['id'] = entry['link']
                escId = entry['id'].replace("'", "\\'")
                sql = "select * from " + siteDB + ".feedGuids where guid='" + escId + "'"
                guidCheckQ = mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])
                pushToQueue = True
                if guidCheckQ.num_rows() > 0:
                    pushToQueue = False
                    log.plog('story GUID in database, not reading further from this feed', 2)
                    break
                # Also dedup on the raw link (guid may differ from url).
                # NOTE(review): these later queries use entry['link']
                # unescaped, unlike escLink above -- inconsistent escaping,
                # and an injection vector for hostile feed links.
                sql = "select imported from " + siteDB + ".feedGuids where guid='" + entry['link'] + "'"
                guidCheckQ = mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])
                if guidCheckQ.num_rows() > 0:
                    pushToQueue = False
                    log.plog('feed story url (%s) in guid db, skipping' % entry['link'], 2)
                sql = "select sub_id from " + siteDB + ".newsroom where url='" + entry['link'] + "'"
                newsroomCheckQ = mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])
                if newsroomCheckQ.num_rows() > 0:
                    pushToQueue = False
                    log.plog('story url in newsroom, skipping', 2)
                sql = "select sub_id from " + siteDB + ".subs where url='" + entry['link'] + "'"
                subsCheckQ = mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])
                if subsCheckQ.num_rows() > 0:
                    pushToQueue = False
                    log.plog('story url in subs, skipping', 2)
                if pushToQueue == True:
                    log.plog('no matching guid found. continue with import', 1)
                    # Strip non-ASCII and escape quotes before embedding in
                    # SQL (Python 2: encode() here returns a str).
                    entry['title'] = entry['title'].encode('ascii', 'ignore')
                    entry['title'] = entry['title'].replace("'", "\\'")
                    if 'summary' not in entry:
                        entry['summary'] = ''
                    entry['summary'] = entry['summary'].encode('ascii', 'ignore')
                    entry['summary'] = entry['summary'].replace("'", "\\'")
                    #since two hits are required to bring in a story, all feed stories are given the go_live flag
                    sql = "insert into " + siteDB + ".sourceReaderQueue set added=now(), url='" + entry['link'] + "', source_id=" + row[0]['source_id'] + ",publishDate=now(), potentialTitle='" + entry['title'] + "', potentialOutline='" + entry['summary'] + "', promoter=0, submittingURL='', go_live = 1"
                    log.plog("&& dispatch says: " + sql, 2)
                    mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])
                    storiesAdded += 1
                    #add url to feedGuids and then test against it later
                    sql = "insert into " + siteDB + ".feedGuids set guid='" + entry['link'] + "', imported=now()"
                    guidSet = mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])