예제 #1
0
def update_time_series(stopTime, updateFrequency, timeSeries, updateQueue):
    """
    Keep refreshing the scores of tracked posts until ``stopTime`` is reached.

    Every pass of the loop does exactly one of the following:

    * If ``updateQueue`` is empty, refill it with every post in
      ``timeSeries``, keyed by the 'utc' of that post's latest sample.
    * If the oldest queued post was sampled less than ``updateFrequency``
      seconds ago, sleep for a short while (nothing is due yet).
    * Otherwise pop up to 25 due posts, pass them to
      ``s_f.get_multiple_post_info`` in a single call, and append one new
      sample per returned post to its list in ``timeSeries``.

    When ``s_f.get_multiple_post_info`` returns None the fetch is treated as
    failed and a placeholder sample (current time, "N/A" for score/ups/downs)
    is appended for every post of the batch, so the gap is visible in the
    series.

    Args:
        stopTime: epoch seconds (compared against ``time.time()``) after
            which the loop stops.
        updateFrequency: minimum number of seconds between two samples of
            the same post.
        timeSeries: dict mapping post ID -> list of sample dicts; mutated
            in place.
        updateQueue: list used as a ``heapq`` min-heap of
            ``(last_sample_utc, post_id)`` tuples; mutated in place.

    Returns:
        None.
    """
    batch_limit = 25          # maximum number of posts fetched per call
    max_idle = 1.0            # cap on a single sleep, so queue changes are noticed
    attributes = ['score', 'ups', 'downs']

    while time.time() <= stopTime:
        if not updateQueue:
            # NOTE(review): this reads sample['utc'], so samples produced by
            # s_f.parse_post_data are presumably expected to carry a 'utc'
            # key -- confirm against that helper.
            for post_id, samples in timeSeries.items():
                heapq.heappush(updateQueue, (samples[-1]['utc'], post_id))
            if not updateQueue:
                # Nothing is tracked at all: idle instead of spinning.
                idle = min(stopTime - time.time(), max_idle)
                if idle > 0:
                    time.sleep(idle)
            continue

        # Posts whose last sample is older than this cutoff are due.
        cutoff = time.time() - updateFrequency
        due_ids = []
        while len(due_ids) < batch_limit and updateQueue:
            entry = updateQueue[0]  # peek: the heap root is the oldest sample
            if cutoff < entry[0]:
                break               # everything still queued is too recent
            heapq.heappop(updateQueue)
            due_ids.append(entry[1])

        if not due_ids:
            # The queue is non-empty here (nothing was popped), so wait until
            # its oldest entry becomes due, the deadline passes, or the idle
            # cap elapses -- whichever comes first -- rather than busy-looping.
            now = time.time()
            idle = min(updateQueue[0][0] + updateFrequency - now,
                       stopTime - now,
                       max_idle)
            if idle > 0:
                time.sleep(idle)
            continue

        raw_data = s_f.get_multiple_post_info(due_ids)
        if raw_data is None:
            # Failed fetch: record that data was missed for this time period.
            missed_time = time.time()
            parsed_data = {}
            for post_id in due_ids:
                parsed_data[post_id] = {'utc': missed_time, 'score': "N/A",
                                        'ups': "N/A", 'downs': "N/A"}
        else:
            parsed_data = s_f.parse_post_data(raw_data, attributes)

        # Append the new sample to each post's series.
        for post_id, sample in parsed_data.items():
            timeSeries[post_id].append(sample)
예제 #2
0
def get_new_posts(timeSeries, updateQueue):
    """
    Register posts that are not tracked yet.

    Fetches a list of post IDs via ``s_f.get_post_id_list`` (presumably the
    "new" listing -- confirm against that helper), keeps the IDs that are not
    already keys of ``timeSeries``, looks up their 'created_utc' and then, for
    each new post:

    * stores an initial sample ``{"utc": created_utc, "ups": 1, "downs": 0,
      "score": 1}`` in ``timeSeries``;
    * pushes ``(0, post_id)`` onto ``updateQueue`` so the post sorts ahead of
      any entry keyed by a real timestamp.

    If either helper call returns None the function returns without changing
    anything. A post whose 'created_utc' is missing from the parsed data is
    skipped instead of aborting the whole batch half-way.

    Args:
        timeSeries: dict mapping post ID -> list of sample dicts; mutated
            in place.
        updateQueue: list used as a ``heapq`` min-heap of
            ``(last_sample_utc, post_id)`` tuples; mutated in place.

    Returns:
        None.
    """
    listed_ids = s_f.get_post_id_list(1, "", "new", "sort", "new")
    if listed_ids is None:
        return None

    new_ids = [post_id for post_id in listed_ids if post_id not in timeSeries]
    if not new_ids:
        return None

    new_data = s_f.get_multiple_post_info(new_ids)
    if new_data is None:
        return None

    parsed_data = s_f.parse_post_data(new_data, ['created_utc'])
    for post_id in new_ids:
        try:
            created_utc = parsed_data[post_id]['created_utc']
        except KeyError:
            # No creation time came back for this post; leave it untracked so
            # timeSeries and updateQueue stay consistent with each other.
            continue
        timeSeries[post_id] = [{"utc": created_utc, "ups": 1, "downs": 0, "score": 1}]
        heapq.heappush(updateQueue, (0, post_id))
    return None