def test_get_post_52319(self, mock_get):
    """Check that parsing a mocked post page yields non-empty content.

    The saved fixture HTML is registered on ``mock_get`` as the response
    body for ``test_url``; the parser is then run against that URL and the
    ``'content'`` entry of its result must be truthy.

    Assumes ``mock_get`` is a requests-mock style mocker injected by a
    decorator outside this block -- confirm.
    """
    # NOTE(review): the fixture/test name says 52319 but the URL uses
    # wr_id=2329 -- confirm which one is intended.
    test_url = 'http://berlinreport.com/bbs/board.php?bo_table=lifeqna&wr_id=2329'

    # Read through a context manager so the fixture file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open('news/fixtures/flohmarkt_52319.html', 'r') as fixture:
        html = fixture.read()
    mock_get.get(test_url, content=html)

    bp = BerlinParser(test_url)
    rst = bp.parse_post()
    self.assertTrue(rst['content'])
def get_posts(self, publisher_name=None, url=None, sleep_time=0, howmany=100, single=True): category, latest_id = get_latest(url) post_id = latest_id print 'Start at url [%s]' % url while post_id > 1: print post_id posts = Post.objects.filter(post_id=post_id, table_category=category) post_url = "http://berlinreport.com/bbs/board.php?bo_table=%s&wr_id=%d"\ % (category, post_id) post_id = post_id - 1 if posts.exists(): # print 'duplicated %s %s' % (post_url, posts[0].subject) print 'd', continue bp = BerlinParser(url=post_url) item = bp.parse_post(publisher_name) if item['subject']: item['table_category'] = category links = item.pop('links') emails = item.pop('emails') images = item.pop('images') post = Post.objects.create(**item) post.update_relates(links, emails, images) print "%s %s %s" % (post.subject, post.member, post.created_at) time.sleep(sleep_time) else: links = item.pop('links') emails = item.pop('emails') images = item.pop('images') item['subject'] = item['post_id'] post = Post.objects.create(**item) print 'skip %s' % post_url if single or (howmany < 0): break howmany = howmany - 1