예제 #1
0
 def setUp(self):
     """Write the Reddit comment fixture to a scratch dir and open a reader on it.

     The scratch directory is created next to this test module and stored
     on ``self.tempdir``; the reader is stored on ``self.redditreader``.
     """
     module_dir = os.path.dirname(os.path.abspath(__file__))
     self.tempdir = tempfile.mkdtemp(prefix='test_corpora', dir=module_dir)
     reddit_fname = os.path.join(self.tempdir, 'RC_test.bz2')
     # Text mode unless the ``is_python2`` flag says otherwise; bytes mode then.
     write_mode = 'wt' if is_python2 is False else 'wb'
     write_json_lines(REDDIT_COMMENTS, reddit_fname, mode=write_mode,
                      auto_make_dirs=True)
     self.redditreader = RedditReader(reddit_fname)
class RedditReaderTestCase(unittest.TestCase):
    """Exercise ``RedditReader.texts()`` and ``.records()`` on a fixture file."""

    def setUp(self):
        """Write ``REDDIT_COMMENTS`` to a scratch file and open a reader on it.

        The scratch directory is created next to this test module. Its
        removal is registered with ``addCleanup`` so that it is deleted after
        every test, and also when a later step of ``setUp`` itself raises
        (``tearDown`` would be skipped in that case).
        """
        self.tempdir = tempfile.mkdtemp(
            prefix='test_corpora',
            dir=os.path.dirname(os.path.abspath(__file__)))
        self.addCleanup(shutil.rmtree, self.tempdir)
        reddit_fname = os.path.join(self.tempdir, 'RC_test.bz2')
        # Text mode unless the PY2 flag says otherwise; bytes mode then.
        mode = 'wt' if PY2 is False else 'wb'
        write_json_lines(REDDIT_COMMENTS,
                         reddit_fname,
                         mode=mode,
                         auto_make_dirs=True)
        self.redditreader = RedditReader(reddit_fname)

    def test_texts(self):
        """Every item yielded by ``texts()`` is a ``unicode_type`` instance."""
        for text in self.redditreader.texts():
            self.assertIsInstance(text, unicode_type)

    def test_texts_limit(self):
        """``texts(limit=1)`` yields exactly one item."""
        texts = list(self.redditreader.texts(limit=1))
        self.assertEqual(len(texts), 1)

    def test_texts_min_len(self):
        """Every text yielded with ``min_len=100`` has length >= 100."""
        for text in self.redditreader.texts(min_len=100):
            self.assertGreaterEqual(len(text), 100)

    def test_records(self):
        """Every item yielded by ``records()`` is a ``dict``."""
        for record in self.redditreader.records():
            self.assertIsInstance(record, dict)

    def test_records_limit(self):
        """``records(limit=1)`` yields exactly one item."""
        records = list(self.redditreader.records(limit=1))
        self.assertEqual(len(records), 1)

    def test_records_score_range(self):
        """Each score range selects one record whose score is within bounds."""
        score_ranges = [(-2, 2), (5, None), (None, 2)]
        for score_range in score_ranges:
            records = list(self.redditreader.records(score_range=score_range))
            self.assertEqual(len(records), 1)
            min_score, max_score = score_range
            for record in records:
                # ``None`` marks an open bound. Compare against None rather
                # than relying on truthiness so a bound of 0 is still checked.
                if min_score is not None:
                    self.assertGreaterEqual(record['score'], min_score)
                if max_score is not None:
                    self.assertLessEqual(record['score'], max_score)

    def test_records_subreddit(self):
        """Filtering by subreddit returns only records from those subreddits."""
        subreddits = [('exmormon', ), {'CanadaPolitics', 'AdviceAnimals'}]
        expected_lens = (1, 2)
        for subreddit, expected_len in zip(subreddits, expected_lens):
            records = list(self.redditreader.records(subreddit=subreddit))
            self.assertEqual(len(records), expected_len)
            for record in records:
                self.assertIn(record['subreddit'], subreddit)
 def setUp(self):
     """Build the on-disk Reddit fixture and attach a reader to the test.

     A fresh ``test_corpora*`` directory is made beside this module and
     kept on ``self.tempdir``; ``self.redditreader`` reads the file written
     into it.
     """
     this_dir = os.path.dirname(os.path.abspath(__file__))
     self.tempdir = tempfile.mkdtemp(prefix='test_corpora', dir=this_dir)
     reddit_fname = os.path.join(self.tempdir, 'RC_test.bz2')
     # Only the open mode differs between the two interpreter branches.
     if PY2 is False:
         file_mode = 'wt'
     else:
         file_mode = 'wb'
     write_json_lines(REDDIT_COMMENTS, reddit_fname, mode=file_mode,
                      auto_make_dirs=True)
     self.redditreader = RedditReader(reddit_fname)
class RedditReaderTestCase(unittest.TestCase):
    """Tests for ``RedditReader`` text and record iteration with filters."""

    def setUp(self):
        """Create the fixture file in a scratch directory and open a reader.

        The directory lives beside this test module. Cleanup is registered
        immediately after creation via ``addCleanup``: unlike ``tearDown``,
        registered cleanups also run when ``setUp`` fails part-way, so the
        directory is never left behind.
        """
        self.tempdir = tempfile.mkdtemp(
            prefix='test_corpora', dir=os.path.dirname(os.path.abspath(__file__)))
        self.addCleanup(shutil.rmtree, self.tempdir)
        reddit_fname = os.path.join(self.tempdir, 'RC_test.bz2')
        # PY2 selects bytes mode; everything else writes in text mode.
        mode = 'wt' if PY2 is False else 'wb'
        write_json_lines(REDDIT_COMMENTS, reddit_fname, mode=mode,
                         auto_make_dirs=True)
        self.redditreader = RedditReader(reddit_fname)

    def test_texts(self):
        """``texts()`` yields only ``unicode_type`` values."""
        for text in self.redditreader.texts():
            self.assertIsInstance(text, unicode_type)

    def test_texts_limit(self):
        """``limit=1`` caps ``texts()`` at a single item."""
        texts = list(self.redditreader.texts(limit=1))
        self.assertEqual(len(texts), 1)

    def test_texts_min_len(self):
        """``min_len=100`` yields no text shorter than 100 characters."""
        for text in self.redditreader.texts(min_len=100):
            self.assertGreaterEqual(len(text), 100)

    def test_records(self):
        """``records()`` yields only ``dict`` values."""
        for record in self.redditreader.records():
            self.assertIsInstance(record, dict)

    def test_records_limit(self):
        """``limit=1`` caps ``records()`` at a single item."""
        records = list(self.redditreader.records(limit=1))
        self.assertEqual(len(records), 1)

    def test_records_score_range(self):
        """Each ``score_range`` matches one record, with its score in range."""
        score_ranges = [(-2, 2), (5, None), (None, 2)]
        for score_range in score_ranges:
            low, high = score_range
            records = list(self.redditreader.records(score_range=score_range))
            self.assertEqual(len(records), 1)
            for record in records:
                # An open bound is spelled ``None``; test for that explicitly
                # so that a numeric bound of 0 would not be skipped as falsy.
                if low is not None:
                    self.assertGreaterEqual(record['score'], low)
                if high is not None:
                    self.assertLessEqual(record['score'], high)

    def test_records_subreddit(self):
        """``subreddit=...`` returns the expected count, all from that set."""
        subreddits = [('exmormon',), {'CanadaPolitics', 'AdviceAnimals'}]
        expected_lens = (1, 2)
        for subreddit, expected_len in zip(subreddits, expected_lens):
            records = list(self.redditreader.records(subreddit=subreddit))
            self.assertEqual(len(records), expected_len)
            for record in records:
                self.assertIn(record['subreddit'], subreddit)
 def setUp(self):
     """Dump ``REDDIT_COMMENTS`` as JSON lines to a scratch file; open a reader.

     The file is first written in text mode with non-ASCII output allowed;
     if that raises ``ValueError`` it is rewritten in bytes mode with
     ASCII-escaped JSON.
     """
     def dump_comments(path, mode, ensure_ascii):
         # One JSON document per line, each terminated by a newline.
         with bzip_open(path, mode=mode) as out:
             for comment in REDDIT_COMMENTS:
                 out.write(json.dumps(comment, ensure_ascii=ensure_ascii) + '\n')

     parent_dir = os.path.dirname(os.path.abspath(__file__))
     self.tempdir = tempfile.mkdtemp(prefix='test_corpora', dir=parent_dir)
     reddit_fname = os.path.join(self.tempdir, 'RC_test.bz2')
     try:
         dump_comments(reddit_fname, 'wt', False)
     except ValueError:  # Python 2 fail
         dump_comments(reddit_fname, 'wb', True)
     self.redditreader = RedditReader(reddit_fname)
class RedditReaderTestCase(unittest.TestCase):
    """Tests for ``RedditReader.texts()`` and ``RedditReader.comments()``."""

    def setUp(self):
        """Write the comment fixture to a scratch file and open a reader on it.

        The scratch directory is created beside this test module. Its removal
        is registered with ``addCleanup`` right after creation, so it is
        deleted after every test and also when the rest of ``setUp`` raises
        (``tearDown`` would not run in that case).
        """
        self.tempdir = tempfile.mkdtemp(
            prefix='test_corpora', dir=os.path.dirname(os.path.abspath(__file__)))
        self.addCleanup(self._remove_tempdir)
        reddit_fname = os.path.join(self.tempdir, 'RC_test.bz2')
        try:
            self._write_comments(reddit_fname, mode='wt', ensure_ascii=False)
        except ValueError:  # Python 2 fail
            self._write_comments(reddit_fname, mode='wb', ensure_ascii=True)
        self.redditreader = RedditReader(reddit_fname)

    @staticmethod
    def _write_comments(fname, mode, ensure_ascii):
        """Write each fixture comment to ``fname`` as one JSON document per line."""
        with bzip_open(fname, mode=mode) as f:
            for comment in REDDIT_COMMENTS:
                f.write(json.dumps(comment, ensure_ascii=ensure_ascii) + '\n')

    def _remove_tempdir(self):
        """Delete every file in the scratch directory, then the directory."""
        for fname in os.listdir(self.tempdir):
            os.remove(os.path.join(self.tempdir, fname))
        os.rmdir(self.tempdir)

    def test_texts(self):
        """Every item yielded by ``texts()`` is a ``str``."""
        texts = list(self.redditreader.texts())
        for text in texts:
            self.assertIsInstance(text, str)

    def test_texts_min_len(self):
        """``texts(min_len=100)`` yields exactly one item for the fixture."""
        texts = list(self.redditreader.texts(min_len=100))
        self.assertEqual(len(texts), 1)

    def test_texts_limit(self):
        """``texts(limit=1)`` yields exactly one item."""
        texts = list(self.redditreader.texts(limit=1))
        self.assertEqual(len(texts), 1)

    def test_comments(self):
        """Every item yielded by ``comments()`` is a ``dict``."""
        comments = list(self.redditreader.comments())
        for comment in comments:
            self.assertIsInstance(comment, dict)

    def test_pages_min_len(self):
        """``comments(min_len=100)`` yields exactly one item.

        NOTE: despite the ``pages`` in its name, this exercises ``comments()``.
        """
        comments = list(self.redditreader.comments(min_len=100))
        self.assertEqual(len(comments), 1)

    def test_pages_limit(self):
        """``comments(limit=1)`` yields exactly one item.

        NOTE: despite the ``pages`` in its name, this exercises ``comments()``.
        """
        comments = list(self.redditreader.comments(limit=1))
        self.assertEqual(len(comments), 1)
class RedditReaderTestCase(unittest.TestCase):
    """Tests for ``RedditReader`` text and comment iteration with filters."""

    def setUp(self):
        """Create the fixture file in a scratch directory and open a reader.

        The directory lives beside this test module. Cleanup is registered
        through ``addCleanup`` immediately after creation; registered
        cleanups run even when ``setUp`` fails part-way (where ``tearDown``
        would be skipped), so the directory is not left behind.
        """
        self.tempdir = tempfile.mkdtemp(
            prefix='test_corpora', dir=os.path.dirname(os.path.abspath(__file__)))
        self.addCleanup(self._remove_tempdir)
        reddit_fname = os.path.join(self.tempdir, 'RC_test.bz2')
        # PY2 selects bytes mode; everything else writes in text mode.
        mode = 'wt' if PY2 is False else 'wb'
        write_json_lines(REDDIT_COMMENTS, reddit_fname, mode=mode,
                         auto_make_dirs=True)
        self.redditreader = RedditReader(reddit_fname)

    def _remove_tempdir(self):
        """Delete every file in the scratch directory, then the directory."""
        for fname in os.listdir(self.tempdir):
            os.remove(os.path.join(self.tempdir, fname))
        os.rmdir(self.tempdir)

    def test_texts(self):
        """``texts()`` yields only ``unicode_type`` values."""
        texts = list(self.redditreader.texts())
        for text in texts:
            self.assertIsInstance(text, unicode_type)

    def test_texts_min_len(self):
        """``texts(min_len=100)`` yields exactly one item for the fixture."""
        texts = list(self.redditreader.texts(min_len=100))
        self.assertEqual(len(texts), 1)

    def test_texts_limit(self):
        """``limit=1`` caps ``texts()`` at a single item."""
        texts = list(self.redditreader.texts(limit=1))
        self.assertEqual(len(texts), 1)

    def test_comments(self):
        """``comments()`` yields only ``dict`` values."""
        comments = list(self.redditreader.comments())
        for comment in comments:
            self.assertIsInstance(comment, dict)

    def test_comments_min_len(self):
        """``comments(min_len=100)`` yields exactly one item for the fixture."""
        comments = list(self.redditreader.comments(min_len=100))
        self.assertEqual(len(comments), 1)

    def test_comments_limit(self):
        """``limit=1`` caps ``comments()`` at a single item."""
        comments = list(self.redditreader.comments(limit=1))
        self.assertEqual(len(comments), 1)