def new_post():
    """Create a Post pre-filled with the current forum and topic details.

    Reads forum_url, forum_id, forum_name, topic_name and topic_id from the
    surrounding scope. Their definitions are not visible in this block --
    presumably they are locals of an enclosing function (TODO confirm).

    Returns the newly created Post.
    """
    fresh = Post()

    # Site identifier.
    fresh.site_name = 'shavemyface'

    # Forum-level fields.
    fresh.forum_url = forum_url
    fresh.forum_id = forum_id
    fresh.forum_name = forum_name

    # Topic-level fields.
    fresh.topic_name = topic_name
    fresh.topic_id = topic_id

    return fresh
def extract_posts(self, soup, filename):
    """Extract the posts of one topic page.

    Arguments:
        soup: parsed page; must support find()/findAll() in the
            BeautifulSoup style (assumed to be a BeautifulSoup tree --
            TODO confirm against the caller).
        filename: not used by this method.

    Returns a list of Post objects, one per <div class="postbody"> element,
    in document order. Each post carries the forum/topic details read from
    the page navigation, the post date, the author (when an author link is
    present), the quotes it contains, its text with the quotes removed, and
    the hrefs of the links left in the body.

    Raises AttributeError when an expected element or regex match is
    missing, KeyError when an expected href attribute is absent, and
    ValueError when the date text does not match the expected format.

    Side effect: writes a one-line post count to sys.stderr.
    """
    # Raw strings keep the backslashes for the regex engine on both
    # Python 2 and Python 3 (no invalid-escape warnings).
    forum_link_re = re.compile(r'^viewforum.php\?.*')
    first_number_re = re.compile(r'(\d+)')
    topic_id_re = re.compile(r'viewtopic.php.*\Wt=(\d+)')
    user_id_re = re.compile(r'u=(\d+)')
    wrote_suffix_re = re.compile(r'\s*wrote:$')

    # Forum details come from the forum link in the navigation bar.
    forum = soup.find('td', 'navbar-links').find('a', {'href': forum_link_re})
    forum_url = forum['href']
    forum_id = first_number_re.search(forum_url).group(1)
    forum_name = self.get_text(forum)

    # Topic details come from the topic link in the content navbar.
    topic = soup.find('td', 'content content-navbar').table.find('span', 'gen').a
    topic_name = topic.b.string
    topic_url = topic['href']
    topic_id = topic_id_re.search(topic_url).group(1)

    posts = []
    for msg in soup.findAll('div', 'postbody'):
        post = Post()
        # Appended before it is filled in, exactly as before.
        posts.append(post)

        post.site_name = 'menessentials'
        post.forum_url = forum_url
        post.forum_id = forum_id
        post.forum_name = forum_name
        post.topic_name = topic_name
        post.topic_id = topic_id

        date = msg.parent.find('span', 'postdate')
        post.date = datetime.strptime(self.get_text(date),
                                      "Posted: %a %b %d, %Y %I:%M %p")

        # The author link may be missing; the user_* attributes are then
        # left unset on the post.
        user = msg.parent.parent.find('span', 'name').a
        if user:
            post.user_name = user.string
            post.user_url = user["href"]
            post.user_id = user_id_re.search(post.user_url).group(1)

        post.quote = []
        # Reverse order so nested (inner) quotes are detached before the
        # quote that contains them is read.
        for quote in reversed(msg.findAll('table', 'quote')):
            quote.extract()
            q = Quote()
            q.text = self.get_text(quote.find('td', 'quote'))
            quote_user = self.get_text(quote.find('td', 'quote_user'))
            q.user_name = wrote_suffix_re.sub('', quote_user)
            post.quote.append(q)

        # Quotes have been removed from msg, so this is the post's own text.
        post.text = self.get_text(msg)

        # Links are collected after quote removal, so links that appeared
        # inside quotes are not included.
        post.link = [anchor['href'] for anchor in msg.findAll('a', href=True)]

    # Same bytes as the former `print >>sys.stderr, ' ', len(posts), 'posts'`,
    # but valid on both Python 2 and Python 3.
    sys.stderr.write('  %d posts\n' % len(posts))
    return posts