def update_time_series(stopTime, updateFrequency, timeSeries, updateQueue):
    '''Poll reddit for updated score data until stopTime.

    If the update queue is empty, refill it with every tracked post keyed by
    the utc of its most recent score sample. Otherwise pop up to 25 posts
    that are due for a refresh (last checked more than updateFrequency
    seconds ago), fetch their current JSON data in one request, and append a
    new {utc, score, ups, downs} sample to each post's timeSeries entry.
    If the fetch fails, record an "N/A" placeholder sample instead so the
    gap in coverage is visible.

    Parameters:
        stopTime        -- epoch seconds; loop exits once time.time() passes it
        updateFrequency -- minimum seconds between score refreshes per post
        timeSeries      -- dict mapping post ID -> list of sample dicts
                           (each with at least a 'utc' key); mutated in place
        updateQueue     -- heapq list of (last_update_utc, post_id) tuples;
                           mutated in place
    '''
    while time.time() <= stopTime:
        if len(updateQueue) <= 0:
            # Queue exhausted: re-seed it with every post, ordered by the
            # time its score was last sampled (val[-1] is the newest sample).
            for post_id, samples in timeSeries.items():
                heapq.heappush(updateQueue, (samples[-1]['utc'], post_id))
            continue

        # A post is due if it was last updated before this cutoff.
        due_before = time.time() - updateFrequency
        id_list = []
        while len(id_list) < 25 and len(updateQueue) > 0:
            entry = heapq.heappop(updateQueue)  # (last_update_utc, post_id)
            if due_before < entry[0]:
                # Not due yet; put it back and stop collecting — the heap
                # guarantees everything remaining is even newer.
                heapq.heappush(updateQueue, entry)
                break
            id_list.append(entry[1])

        if not id_list:
            # Nothing due yet — sleep briefly instead of busy-spinning, as
            # the original docstring intends ("wait").
            time.sleep(1)
            continue

        raw_data = s_f.get_multiple_post_info(id_list)
        if raw_data is None:
            # Reddit/network failure: mark this period as missing data so
            # the series shows the gap rather than silently skipping it.
            missed_time = time.time()
            parsed_data = {
                post_id: {'utc': missed_time, 'score': "N/A",
                          'ups': "N/A", 'downs': "N/A"}
                for post_id in id_list
            }
        else:
            parsed_data = s_f.parse_post_data(raw_data,
                                              ['score', 'ups', 'downs'])

        # Append the new sample to each post's history.
        for post_id, sample in parsed_data.items():
            timeSeries[post_id].append(sample)
def get_new_posts(timeSeries, updateQueue):
    '''Scrape the "new" listing and register posts not yet being tracked.

    Fetches the current list of new post IDs, filters out those already in
    timeSeries, then fetches creation times for the remainder. Each new post
    gets an initial sample (score/ups start at 1, downs at 0 — reddit's
    state for a freshly submitted post) and is pushed onto updateQueue with
    priority 0 so it is refreshed as soon as possible.

    Parameters:
        timeSeries  -- dict mapping post ID -> list of sample dicts;
                       mutated in place
        updateQueue -- heapq list of (last_update_utc, post_id) tuples;
                       mutated in place

    Returns:
        None on scrape/fetch failure (callers treat this as "try again
        later"); otherwise falls through returning None after updating.
    '''
    id_list = s_f.get_post_id_list(1, "", "new", "sort", "new")
    if id_list is None:
        return None

    # Keep only IDs we are not already tracking.
    new_ids = [post_id for post_id in id_list if post_id not in timeSeries]
    if not new_ids:
        return

    new_data = s_f.get_multiple_post_info(new_ids)
    if new_data is None:
        return None
    parsed_data = s_f.parse_post_data(new_data, ['created_utc'])

    for post_id in new_ids:
        utc = parsed_data[post_id]['created_utc']
        # Seed the series with the post's creation-time sample.
        timeSeries[post_id] = [{"utc": utc, "ups": 1,
                                "downs": 0, "score": 1}]
        # Priority 0 sorts before any real timestamp, so new posts are
        # refreshed first.
        heapq.heappush(updateQueue, (0, post_id))