Пример #1
0
    def fromid(self, item_id):
        """
        Builds and returns a Story for the item identified by item_id.

        item_id is an int. The item page is obtained via get_item_soup and
        is assumed to describe a valid story that loads without HTTP errors.
        Because the story is looked up directly rather than scraped from a
        listing, its rank is set to -1.

        Raises Exception when item_id is falsy.
        """
        if not item_id:
            raise Exception('Need an item_id for a story')

        page = get_item_soup(item_id)

        # The third table of the page holds the post's meta information:
        # row 0 -> title and domain, row 1 -> points, user, time, comments.
        rows = page.findChildren('table')[2].findChildren('tr')

        headline = rows[0].findChildren('td')[1]
        anchor = headline.find('a')
        title = anchor.text
        try:
            # a missing <span> means there is no domain to show
            domain = headline.find('span').string[2:-2]
        except AttributeError:
            # self post: it lives on the site itself
            domain = BASE_URL
            is_self = True
            link = '%s/item?id=%s' % (BASE_URL, item_id)
        else:
            is_self = False
            link = anchor.get('href')

        # Expected layout of the second cell's contents, e.g.:
        # [<span id="score_7024626">789 points</span>, u' by ',
        #  <a href="user?id=endianswap">endianswap</a>,
        #  u' 8 hours ago  | ', <a href="item?id=7024626">238 comments</a>]
        meta = rows[1].findChildren('td')[1].contents

        points = int(re.match(r'^(\d+)\spoint.*', meta[0].text).group(1))

        user_tag = meta[2]
        submitter = user_tag.text
        submitter_profile = '%s/%s' % (BASE_URL, user_tag.get('href'))

        # keep only the first three words of the age text
        age_words = meta[3].strip().split()
        published_time = ' '.join(age_words[:3])

        comments_link = '%s/item?id=%s' % (BASE_URL, item_id)
        try:
            comment_text = meta[4].text
            num_comments = int(re.match(r'(\d+)\s.*', comment_text).group(1))
        except AttributeError:
            # the text did not start with a number: count it as no comments
            num_comments = 0

        return Story(-1, item_id, title, link, domain, points, submitter,
                     published_time, submitter_profile, num_comments,
                     comments_link, is_self)
Пример #2
0
    def _build_story(self, all_rows):
        """
        Builds and returns a list of stories (dicts) from the passed source.
        """
        all_stories = []  # list to hold all stories

        for (info, detail) in all_rows:

            #-- Get the into about a story --#
            info_cells = info.findAll('td')  # split in 3 cells

            rank = int(info_cells[0].string[:-1])
            title = '%s' % info_cells[2].find('a').string
            link = info_cells[2].find('a').get('href')

            is_self = False  # by default all stories are linking posts

            if link.find(
                    'item?id='
            ) is -1:  # the link doesn't contains "http" meaning an internal link
                domain = info_cells[2].find('span').string[
                    2:-2]  # slice " (abc.com) "
            else:
                link = '%s/%s' % (BASE_URL, link)
                domain = BASE_URL
                is_self = True
            #-- Get the into about a story --#

            #-- Get the detail about a story --#
            detail_cell = detail.findAll('td')[
                1]  # split in 2 cells, we need only second
            detail_concern = detail_cell.contents  # list of details we need, 5 count

            num_comments = -1

            if re.match(r'^(\d+)\spoint.*',
                        detail_concern[0].string) is not None:
                # can be a link or self post
                points = int(
                    re.match(r'^(\d+)\spoint.*',
                             detail_concern[0].string).groups()[0])
                submitter = '%s' % detail_concern[2].string
                submitter_profile = '%s/%s' % (BASE_URL,
                                               detail_concern[2].get('href'))
                published_time = ' '.join(
                    detail_concern[3].strip().split()[:3])
                comment_tag = detail_concern[4]
                story_id = int(
                    re.match(r'.*=(\d+)', comment_tag.get('href')).groups()[0])
                comments_link = '%s/item?id=%d' % (BASE_URL, story_id)
                comment_count = re.match(r'(\d+)\s.*', comment_tag.string)
                try:
                    # regex matched, cast to int
                    num_comments = int(comment_count.groups()[0])
                except AttributeError:
                    # did not match, assign 0
                    num_comments = 0
            else:  # this is a job post
                points = 0
                submitter = ''
                submitter_profile = ''
                published_time = '%s' % detail_concern[0]
                comment_tag = ''
                try:
                    story_id = int(re.match(r'.*=(\d+)', link).groups()[0])
                except AttributeError:
                    story_id = -1  # job listing that points to external link
                comments_link = ''
                comment_count = -1
            #-- Get the detail about a story --#

            story = Story(rank, story_id, title, link, domain, points,
                          submitter, published_time, submitter_profile,
                          num_comments, comments_link, is_self)

            all_stories.append(story)

        return all_stories