# Example #1
# Abort if the source-lookup query failed outright (the query helper
# returns False on error) or matched no rows.
if sourceQ is False:
    log.plog("error finding story source", 4)
    sys.exit()
source = sourceQ.fetch_row(1, 1)
if not source:
    log.plog("error finding story source", 4)
    sys.exit()
infoModule.info.source = source[0]
# MySQL NULL columns come back as None; downstream code expects empty
# strings, so normalize them in place.
for key, value in infoModule.info.source.items():
    if value is None:
        infoModule.info.source[key] = ''

url = source[0]['feed_url']
log.plog('fetching feed ' + url, 1)
obj = feedfetcher.getFeed(url)
if obj is False or obj['type'] is None:
    log.plog("unreadable feed ", 5)
    # BUG FIX: original called sys._exit(0), which does not exist
    # (AttributeError -- os._exit was presumably meant).  sys.exit lets
    # cleanup handlers run, matching the other exit paths above.
    sys.exit(0)
else:
    if obj['type'].startswith('rss'):
        log.plog('feed is rss', 1)
    elif obj['type'].startswith('atom'):
        log.plog('feed is atom', 1)
    else:
        log.plog("can't tell feed type! " + obj['type'], 5)
        # BUG FIX: was sys._exit(0) (nonexistent attribute); see above.
        sys.exit(0)

# Walk the parsed feed entries; ctr presumably limits/throttles how many
# are processed per feed -- TODO confirm against the full example.
# NOTE(review): this snippet is truncated here -- the body of the "if"
# below is not visible in this excerpt.
ctr = 0
for entry in obj['data']['entries']:
    if ctr > 0:
# Example #2
# Fetch the source row; abort if the lookup matched nothing.
source = sourceQ.fetch_row(1, 1)
if not source:
    log.plog("error finding story source", 4)
    sys.exit()
infoModule.info.source = source[0]
# MySQL NULL columns come back as None; downstream code expects empty
# strings, so normalize them in place.
for key, value in infoModule.info.source.items():
    if value is None:
        infoModule.info.source[key] = ''

url = source[0]['feed_url']
log.plog('fetching feed ' + url, 1)
obj = feedfetcher.getFeed(url)
if obj is False or obj['type'] is None:
    log.plog("unreadable feed ", 5)
    # BUG FIX: original called sys._exit(0), which does not exist
    # (AttributeError -- os._exit was presumably meant).  sys.exit lets
    # cleanup handlers run, matching the other exit paths above.
    sys.exit(0)
else:
    if obj['type'].startswith('rss'):
        log.plog('feed is rss', 1)
    elif obj['type'].startswith('atom'):
        log.plog('feed is atom', 1)
    else:
        log.plog("can't tell feed type! " + obj['type'], 5)
        # BUG FIX: was sys._exit(0) (nonexistent attribute); see above.
        sys.exit(0)


# Walk the parsed feed entries.
# NOTE(review): this snippet is truncated here -- the loop body is not
# visible in this excerpt.
ctr = 0
for entry in obj['data']['entries']:
# Example #3
def feedReader():
    """Pick a weighted-random RSS/Atom source, fetch its feed, and queue
    new stories into sourceReaderQueue.

    Loops until at least one story has been queued (storiesAdded > 0).
    If a source_id is passed as sys.argv[3], runs once against that
    source only ("test mode") and does not loop.

    NOTE(review): every query in this function builds SQL by string
    concatenation from feed-supplied values -- a SQL-injection /
    syntax-error risk; should be parameterized.
    NOTE(review): this function appears to continue past the end of the
    visible excerpt.
    """
    storiesAdded = 0
    while storiesAdded == 0:
        #keep trying till something is added
        if len(sys.argv) > 3:
            # Test mode: read exactly the source named on the command line.
            nextFeedQ = mysql_tools.mysqlQuery("select * from " + siteDB + ".sources where source_id=" + sys.argv[3] + " limit 1", infoModule.info.site['dblink'])
            #test mode, so don't loop
            storiesAdded = 1
        else:
            # NOTE(review): 'link' is not defined in this function --
            # presumably a module-level DB handle; the other queries use
            # infoModule.info.site['dblink'].  Confirm.
            maxPriorityQ = mysql_tools.mysqlQuery("select max(priority) as mp from " + siteDB + ".sources", link)
            maxPriority = maxPriorityQ.fetch_row(1,1)

            mp = int(maxPriority[0]['mp'])

            maxFeed = mp
            randRange = 10
            randRangeOffset = 10
            randStep = 2

            # Build the total roulette-wheel range: each priority level i
            # contributes a block of size randRangeOffset + randStep*i,
            # so higher priorities get proportionally bigger blocks.
            for i in range(1, maxFeed+1):
                randRange = randRange + randRangeOffset + (randStep * i)
            
            foundFeed = False
            while foundFeed == False:
                # Roulette-wheel pick of a priority level, walking from the
                # highest priority down and subtracting each block we miss.
                picker = random.randint(1,randRange)
                for i in range(maxFeed,0,-1):
                    # NOTE(review): block size here (randRangeOffset +
                    # randStep + randStep*i) does not match the size used to
                    # build randRange above (randRangeOffset + randStep*i) --
                    # possible off-by-one in the weighting.  Also 'priority'
                    # stays unbound (NameError below) if no block matches.
                    # Confirm the intended distribution.
                    hitBlock = randRangeOffset + randStep + (randStep * i)
                    if picker < hitBlock:
                        priority = i
                        break
                    else:
                        picker = picker - hitBlock


                log.plog('feed priority set to ' + str(priority), 1)
                #loop until I get a feed
                log.plog("select * from " + siteDB + ".sources where (source_format = 'rss' or source_format='atom') and priority = " + str(priority) + " order by last_update limit 1", 2)
                # Least-recently-updated feed at the chosen priority.
                nextFeedQ = mysql_tools.mysqlQuery("select * from " + siteDB + ".sources where (source_format = 'rss' or source_format='atom') and priority = " + str(priority) + " order by last_update limit 1", infoModule.info.site['dblink'])
                if nextFeedQ.num_rows() > 0:
                    foundFeed = True
        
        if nextFeedQ == False:
            #print "rss selection query failed"
            sys.exit()
    
        # The query above is "limit 1", so this loop effectively processes a
        # single source row, then breaks on the empty fetch.
        while(1):
            row = nextFeedQ.fetch_row(1,1)
            if row==():
                break
            
            # Touch last_update first so a crashing feed is not retried
            # immediately by the least-recently-updated ordering.
            sql = "update " + siteDB + ".sources set last_update=now() where source_id=" + row[0]['source_id']
            mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])            
            log.plog(sql, 2)

            infoModule.info.source = row[0]
            url = row[0]['feed_url']
            log.plog('fetching feed ' + url, 1)
            obj = feedfetcher.getFeed(url)
            if obj == False or obj['type'] == None:
                log.plog("unreadable feed ", 5)
                # Record (upsert by hand) the failure in feedFailures, then
                # move on to the next row.
                failureQ = mysql_tools.mysqlQuery("select * from " + siteDB + ".feedFailures where source_id=" + row[0]['source_id'],  infoModule.info.site['dblink']) 
                if failureQ.num_rows() > 0:
                    mysql_tools.mysqlQuery("update " + siteDB + ".feedFailures set failedOn = now(), message='unreadable feed' where source_id=" + row[0]['source_id'], infoModule.info.site['dblink']) 
                else:
                    mysql_tools.mysqlQuery("insert into " + siteDB + ".feedFailures set source_id=" + row[0]['source_id'] + ", failedOn = now(), message='unreadable feed'", infoModule.info.site['dblink']) 
                continue
            else:
                if obj['type'][0:3] == 'rss':
                    log.plog('feed is rss', 1)
                elif obj['type'][0:4] == 'atom':
                    log.plog('feed is atom', 1)
                else:
                    # Unknown feed type: log the failure the same way as an
                    # unreadable feed and skip this source.
                    log.plog("can't tell feed type! " + obj['type'], 5)
                    failureQ = mysql_tools.mysqlQuery("select * from " + siteDB + ".feedFailures where source_id=" + row[0]['source_id'],  infoModule.info.site['dblink']) 
                    if failureQ.num_rows() > 0:
                        mysql_tools.mysqlQuery("update " + siteDB + ".feedFailures set failedOn = now(), message='can\\'t tell feed type' where source_id=" + row[0]['source_id'], infoModule.info.site['dblink']) 
                    else:
                        mysql_tools.mysqlQuery("insert into " + siteDB + ".feedFailures set source_id=" + row[0]['source_id'] + ", failedOn = now(), message='can\\'t tell feed type'", infoModule.info.site['dblink']) 
                    continue
    
                #test header for last published
                pubDateHeader = ''
                if 'last-modified' in obj['data']['headers']:
                    pubDateHeader = obj['data']['headers']['last-modified']
                    log.plog('pubDate from header: ' + pubDateHeader, 1)
    
                #pprint.pprint(obj['data'])
    
                # Prefer the first entry's 'updated' stamp over the HTTP
                # Last-Modified header when both are present.
                pubDateEntry = ''
                if len(obj['data']['entries']) > 0 and 'updated' in obj['data']['entries'][0]:
                    pubDateEntry = obj['data']['entries'][0]['updated']
                    log.plog('pubDate from first story ' + pubDateEntry, 1)
                
                
                if pubDateEntry != '':
                    pubDate = pubDateEntry
                elif pubDateHeader != '':
                    pubDate = pubDateHeader
                else:
                    pubDate = ''
                    log.plog('could not determine pubdate for feed', 4)
                
                # Skip the whole feed if its publish date hasn't moved since
                # the last read.
                if row[0]['publish_date'] == pubDate:
                    log.plog('publish date unchanged.  not reading further', 1)
                    continue
                
                for entry in obj['data']['entries']:
                    #no stories older than 7 days
                    # Entries are assumed newest-first, so hitting an old or
                    # already-seen entry breaks out of the whole feed.
                    if 'updated_parsed' in entry and entry['updated_parsed'] != None:
                        now = time.localtime()
                        nowSecs = time.mktime(now)
                        updatedSecs = time.mktime(entry['updated_parsed'])
                        diff = nowSecs - updatedSecs
                        if diff > 7 * 24 * 3600:
                            log.plog('story is too old, not reading further', 3)
                            break

                    #check against sourceREaderQueue
                    escLink = entry['link'].replace("'", "\\'")
                    sql = "select * from " + siteDB + ".sourceReaderQueue where url='" + escLink + "'"
                    queueCheckQ = mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])
                    if queueCheckQ.num_rows() > 0:
                        log.plog("story in queue.  Not reading further from this feed", 2)
                        break
                
          
                    #check guid against db
                    # Fall back to the link when the feed provides no GUID.
                    if 'id' not in entry:
                        entry['id'] = entry['link']
                    escId = entry['id'].replace("'", "\\'")
                    sql = "select * from " + siteDB + ".feedGuids where guid='" + escId + "'"
                    guidCheckQ = mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])
                    pushToQueue = True
                    if guidCheckQ.num_rows() > 0:
                        pushToQueue = False
                        log.plog('story GUID in database, not reading further from this feed', 2)
                        break
                    
                    # The next three checks only skip this entry (pushToQueue
                    # = False) rather than abandoning the feed.
                    # NOTE(review): entry['link'] is NOT escaped in these
                    # three queries (unlike escLink above) -- confirm.
                    sql = "select imported from " + siteDB + ".feedGuids where guid='" + entry['link'] + "'"
                    guidCheckQ = mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])
                    if guidCheckQ.num_rows() > 0:
                        pushToQueue = False
                        log.plog('feed story url (%s) in guid db, skipping' % entry['link'], 2)

                    sql = "select sub_id from " + siteDB + ".newsroom where url='" + entry['link'] + "'"
                    newsroomCheckQ = mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])
                    if newsroomCheckQ.num_rows() > 0:
                        pushToQueue = False
                        log.plog('story url in newsroom, skipping', 2)

                    sql = "select sub_id from " + siteDB + ".subs where url='" + entry['link'] + "'"
                    subsCheckQ = mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])
                    if subsCheckQ.num_rows() > 0:
                        pushToQueue = False
                        log.plog('story url in subs, skipping', 2)


                    if pushToQueue == True:
                        log.plog('no matching guid found. continue with import', 1)
                        # Strip non-ASCII and escape quotes before embedding
                        # title/summary into the INSERT below.
                        entry['title'] = entry['title'].encode('ascii', 'ignore')
                        entry['title'] = entry['title'].replace("'", "\\'")
                        if 'summary' not in entry:
                            entry['summary'] = ''
                        entry['summary'] = entry['summary'].encode('ascii', 'ignore')
                        entry['summary'] = entry['summary'].replace("'", "\\'")
                        #since two hits are required to bring in a story, all feed stories are given the go_live flag
                        sql = "insert into " + siteDB + ".sourceReaderQueue set added=now(), url='" + entry['link'] + "', source_id=" + row[0]['source_id'] + ",publishDate=now(), potentialTitle='" + entry['title'] + "', potentialOutline='" + entry['summary'] + "', promoter=0, submittingURL='', go_live = 1"
                        log.plog("&& dispatch says: " + sql, 2)
                        mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])
                        storiesAdded += 1
                        #add url to feedGuids and then test against it later
                        sql = "insert into " + siteDB + ".feedGuids set guid='" + entry['link'] + "', imported=now()"
                        guidSet = mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])