def fromid(self, item_id):
    """
    Builds a Story from the item page of the given item_id.

    The page is fetched through get_item_soup and every field is read
    from that single page. The item is assumed to exist and to be a
    regular story; no HTTP or layout errors are handled here.

    item_id is an int. A falsy item_id raises Exception.
    Returns the constructed Story, whose rank is always -1 because an
    item page carries no front-page position.
    """
    if not item_id:
        raise Exception('Need an item_id for a story')

    page = get_item_soup(item_id)

    # The third table on the page carries the story header:
    # row 0 -> title and domain, row 1 -> points, user, time, comments.
    header_rows = page.findChildren('table')[2].findChildren('tr')

    heading_cell = header_rows[0].findChildren('td')[1]
    anchor = heading_cell.find('a')
    title = anchor.text
    try:
        # The span text is sliced by two characters on each side to
        # leave only the domain name.
        domain = heading_cell.find('span').string[2:-2]
    except AttributeError:
        # No domain span: this is a self post living on the site itself.
        is_self = True
        domain = BASE_URL
        link = '%s/item?id=%s' % (BASE_URL, item_id)
    else:
        is_self = False
        link = anchor.get('href')

    # Expected layout of the second cell's children:
    # [0] score span, [1] ' by ', [2] user link,
    # [3] ' N units ago | ', [4] comments link
    details = header_rows[1].findChildren('td')[1].contents

    score_match = re.match(r'^(\d+)\spoint.*', details[0].text)
    points = int(score_match.groups()[0])

    author_tag = details[2]
    submitter = author_tag.text
    submitter_profile = '%s/%s' % (BASE_URL, author_tag.get('href'))

    age_words = details[3].strip().split()
    published_time = ' '.join(age_words[:3])

    comments_link = '%s/item?id=%s' % (BASE_URL, item_id)

    try:
        count_match = re.match(r'(\d+)\s.*', details[4].text)
        num_comments = int(count_match.groups()[0])
    except AttributeError:
        # Link text does not start with a number, so count it as zero.
        num_comments = 0

    # This page was not reached via a listing, so there is no rank.
    return Story(-1, item_id, title, link, domain, points, submitter,
                 published_time, submitter_profile, num_comments,
                 comments_link, is_self)
def _build_story(self, all_rows):
    """
    Builds and returns a list of Story objects from the passed source.

    all_rows is an iterable of (info, detail) pairs of parsed table
    rows. The info row supplies rank, title and link; the detail row
    supplies points, submitter, age and the comments link.

    Rows whose detail cell does not start with "<n> point..." are
    treated as job posts: points is 0, submitter, submitter_profile and
    comments_link are empty strings, num_comments is -1, and story_id is
    -1 when the link carries no "=<digits>" part.
    """
    all_stories = []  # list to hold all stories

    for (info, detail) in all_rows:

        #-- Get the info about a story --#
        info_cells = info.findAll('td')  # split in 3 cells

        # Drop the last character of the rank cell (presumably a
        # trailing '.') before converting it to an int.
        rank = int(info_cells[0].string[:-1])

        title_anchor = info_cells[2].find('a')
        title = '%s' % title_anchor.string
        link = title_anchor.get('href')

        # Membership test instead of `find(...) is -1`: comparing ints
        # by identity only works by accident of small-int caching.
        if 'item?id=' not in link:
            # external link: the domain is shown in a span
            is_self = False
            domain = info_cells[2].find('span').string[2:-2]  # slice " (abc.com) "
        else:
            # internal link: make it absolute
            link = '%s/%s' % (BASE_URL, link)
            domain = BASE_URL
            is_self = True
        #-- Get the info about a story --#

        #-- Get the detail about a story --#
        # split in 2 cells, we need only the second
        detail_cell = detail.findAll('td')[1]
        detail_concern = detail_cell.contents  # list of details we need, 5 count

        num_comments = -1

        # Evaluate the points pattern once and reuse the match object.
        points_match = re.match(r'^(\d+)\spoint.*', detail_concern[0].string)
        if points_match is not None:
            # can be a link or self post
            points = int(points_match.groups()[0])
            submitter = '%s' % detail_concern[2].string
            submitter_profile = '%s/%s' % (BASE_URL,
                                           detail_concern[2].get('href'))
            published_time = ' '.join(detail_concern[3].strip().split()[:3])

            comment_tag = detail_concern[4]
            story_id = int(
                re.match(r'.*=(\d+)', comment_tag.get('href')).groups()[0])
            comments_link = '%s/item?id=%d' % (BASE_URL, story_id)

            comment_count = re.match(r'(\d+)\s.*', comment_tag.string)
            try:
                # regex matched, cast to int
                num_comments = int(comment_count.groups()[0])
            except AttributeError:
                # did not match, assign 0
                num_comments = 0
        else:
            # this is a job post
            points = 0
            submitter = ''
            submitter_profile = ''
            published_time = '%s' % detail_concern[0]
            try:
                story_id = int(re.match(r'.*=(\d+)', link).groups()[0])
            except AttributeError:
                # job listing that points to an external link
                story_id = -1
            comments_link = ''
        #-- Get the detail about a story --#

        story = Story(rank, story_id, title, link, domain, points,
                      submitter, published_time, submitter_profile,
                      num_comments, comments_link, is_self)
        all_stories.append(story)

    return all_stories