# Example 1
# 0
def update_time_series(stopTime, updateFrequency, timeSeries, updateQueue):
    '''
    Poll reddit for updated score data on tracked posts until stopTime.

    If the update queue is empty, refill it with every tracked post, keyed by
    the utc of its last recorded sample.  Otherwise pop up to 25 posts whose
    last sample is older than updateFrequency seconds, fetch their current
    JSON data in one batched request, and append a new sample to each post's
    series.

    Parameters:
        stopTime        -- epoch seconds at which polling stops
        updateFrequency -- minimum seconds between samples of the same post
        timeSeries      -- dict mapping post ID -> list of sample dicts
                           ({'utc', 'score', 'ups', 'downs'}); mutated in place
        updateQueue     -- heapq list of (last_update_utc, post_id); mutated
                           in place
    '''
    BATCH_SIZE = 25  # reddit's by_id endpoint is queried 25 posts at a time

    while time.time() <= stopTime:
        if not updateQueue:
            # Seed the queue with every tracked post, ordered by the utc of
            # its most recent sample, then loop back to start draining it.
            for post_id, samples in timeSeries.items():
                heapq.heappush(updateQueue, (samples[-1]['utc'], post_id))
            continue

        # Collect up to BATCH_SIZE posts that are due for a refresh.
        threshold = time.time() - updateFrequency
        id_batch = []
        while len(id_batch) < BATCH_SIZE and updateQueue:
            entry = heapq.heappop(updateQueue)  # renamed: 'next' shadowed the builtin
            if threshold < entry[0]:
                # Too soon to refresh this post (and, since the heap is
                # ordered, every later one); push it back and stop collecting.
                heapq.heappush(updateQueue, entry)
                break
            id_batch.append(entry[1])

        if not id_batch:
            continue

        # Fetch raw JSON data for the whole batch in one request.
        raw_data = s_f.get_multiple_post_info(id_batch)
        if raw_data is None:
            # Reddit or network failure: record placeholder samples so the
            # gap in coverage is visible in each post's series.
            missed_time = time.time()
            parsed_data = {
                post_id: {'utc': missed_time, 'score': "N/A",
                          'ups': "N/A", 'downs': "N/A"}
                for post_id in id_batch
            }
        else:
            parsed_data = s_f.parse_post_data(raw_data, ['score', 'ups', 'downs'])

        # Append the new sample to each post's time series.
        for post_id, sample in parsed_data.items():
            timeSeries[post_id].append(sample)
# Example 2
# 0
def get_new_posts(timeSeries, updateQueue):
    '''
    Scrape the "new" posts listing and register any posts not yet tracked.

    Each new post is added to timeSeries with a single seed sample (score 1,
    one upvote — a brand-new reddit post starts with the author's self-upvote)
    and pushed into updateQueue with priority 0 so it is refreshed on the next
    update pass.

    Parameters:
        timeSeries  -- dict mapping post ID -> list of sample dicts; mutated
                       in place
        updateQueue -- heapq list of (priority_utc, post_id); mutated in place

    Returns:
        None.  An early None return signals a scrape/fetch failure.
    '''
    id_list = s_f.get_post_id_list(1, "", "new", "sort", "new")
    if id_list is None:
        return None

    # Membership test directly on the dict; '.keys()' was redundant.
    new_ids = [post_id for post_id in id_list if post_id not in timeSeries]
    if not new_ids:
        return None

    new_data = s_f.get_multiple_post_info(new_ids)
    if new_data is None:
        return None

    parsed_data = s_f.parse_post_data(new_data, ['created_utc'])
    for post_id in new_ids:
        utc = parsed_data[post_id]['created_utc']
        timeSeries[post_id] = [{"utc": utc, "ups": 1, "downs": 0, "score": 1}]
        # Priority 0 sorts before any real utc, forcing an immediate refresh.
        heapq.heappush(updateQueue, (0, post_id))