def getStoryNumber(self, source):
    """
    Extract a story's rank number from an HTML fragment.

    Looks up the first ``<span class="rank">`` element in ``source``,
    removes every period from its text (ranks are written like "31.")
    and returns the remainder converted to ``int``.

    Raises AttributeError if no such span (or no plain string inside it)
    is found, and ValueError if the remaining text is not an integer.
    """
    rank_span = BeautifulSoup(source).find('span', attrs={'class': 'rank'})
    rank_text = rank_span.string
    return int(rank_text.replace('.', ''))
def getStoryTitle(self, source):
    """
    Return the title text found in an HTML fragment.

    The title is the text content of the first ``<td class="title">``
    element in ``source``, with leading and trailing whitespace removed.

    Raises AttributeError if the fragment contains no such cell.
    """
    title_cell = BeautifulSoup(source).find('td', attrs={'class': 'title'})
    return title_cell.text.strip()
def getStoryDomain(self, source):
    """
    Return the target of the first link in an HTML fragment as a full URL.

    NOTE: despite the method name, the value returned is the whole URL,
    not only its host part. An ``href`` that already names a host is
    returned unchanged; a host-less (relative) ``href`` is resolved
    against ``https://news.ycombinator.com``.

    Raises AttributeError if ``source`` contains no ``<a>`` element.
    """
    href = BeautifulSoup(source).find('a').get('href')
    if not urlparse(href).netloc:
        # Relative link: make it absolute against the site root.
        return urljoin('https://news.ycombinator.com', href)
    return href
def getMoreLink(self, source):
    """
    Return the absolute URL of the page's "More" link, or None.

    Searches ``source`` for ``<a rel="nofollow">`` elements whose text is
    "More". The ``href`` of the first match is resolved against
    ``https://news.ycombinator.com/``; None is returned when nothing
    matches.
    """
    candidates = BeautifulSoup(source).findAll(
        "a", {"rel": "nofollow"}, text="More")
    if not candidates:
        return None
    first_href = candidates[0]['href']
    return urljoin('https://news.ycombinator.com/', first_href)
def getStories(self, source):
    """
    Parse a listing page and return its stories.

    ``source`` is the page HTML as a string. The number of stories is
    taken from the count of ``span class="rank"`` occurrences in the raw
    text and is stored on ``self.numberOfStoriesOnFrontPage``. One
    ``HackerNewsStory`` is built per story, populated from the
    ``<td class="title">`` and ``<td class="subtext">`` cells via the
    per-field helper methods on this object.

    Returns a list of ``HackerNewsStory`` objects in page order.

    Raises IndexError if the page holds fewer parsed title or subtext
    cells than rank markers were counted.
    """
    # Rank cell markup as recorded by the original author:
    #   <td align=right valign=top class="title">31.</td>
    self.numberOfStoriesOnFrontPage = source.count('span class="rank"')

    soup = BeautifulSoup(source)
    # Title cells give ranks, URLs, domains and titles.
    story_details = soup.findAll("td", {"class": "title"})
    # Subtext cells give score, submitter, comment count and comment URL.
    story_other_details = soup.findAll("td", {"class": "subtext"})

    # Title cells alternate: rank cell, story cell, rank cell, ...
    # The helpers take HTML text, so every cell is serialised with str()
    # before being handed over. The final cell is never read as a rank
    # cell (presumably it is the trailing "More" link -- TODO confirm).
    storyNumbers = [
        self.getStoryNumber(str(cell)) for cell in story_details[:-1:2]
    ]

    # Every second title cell (odd positions) contains a story.
    titleFields = []
    for cell in story_details[1::2]:
        html = str(cell)
        titleFields.append((
            self.getStoryURL(html),
            self.getStoryDomain(html),
            self.getStoryTitle(html),
        ))

    subtextFields = []
    for cell in story_other_details:
        html = str(cell)
        subtextFields.append((
            self.getStoryScore(html),
            self.getSubmitter(html),
            self.getCommentCount(html),
            self.getCommentsURL(html),
            self.getPublishedTime(html),
            self.getHNID(html),
        ))

    # Assemble one story object per counted rank marker.
    newsStories = []
    for i in range(self.numberOfStoriesOnFrontPage):
        url, domain, title = titleFields[i]
        (score, submitter, commentCount,
         commentsURL, publishedTime, storyID) = subtextFields[i]

        story = HackerNewsStory()
        story.number = storyNumbers[i]
        story.URL = url
        story.domain = domain
        story.title = title
        story.score = score
        story.submitter = submitter
        story.submitterURL = \
            "https://news.ycombinator.com/user?id=" + submitter
        story.commentCount = commentCount
        story.commentsURL = commentsURL
        story.publishedTime = publishedTime
        story.id = storyID

        if story.id < 0:
            # A negative id marks an entry without a usable item id
            # (presumably e.g. job postings -- verify against getHNID);
            # such entries get no comments link and no submitter.
            # The discarded ``URL.find('item?id=') + 8`` expression that
            # used to sit here had no effect and has been removed.
            story.commentsURL = ''
            story.submitter = -1
            story.submitterURL = -1

        newsStories.append(story)

    return newsStories