Пример #1
0
    def word_tokens(self):
        """Return the POS-tagged form of ``self.words``, cached on disk.

        When ``self.word_tokens_pickle_path`` is ``None`` no caching is
        attempted and the words are tagged on every call.  Otherwise the
        result is loaded from whatever file ``get_loose_filepath`` resolves
        for that path; if it resolves nothing, the words are tagged and the
        result is pickled to ``word_tokens_pickle_path`` for next time.

        Writing the cache is best effort: a failure is logged and the
        freshly computed tokens are still returned.

        Returns:
            The value produced by ``self.pos_tag(self.words)``, or the
            previously stored content of the cache file.
        """
        # TODO: Get wordtokens from alltokens when care enough

        if self.word_tokens_pickle_path is None:
            return self.pos_tag(self.words)

        # Resolve the cache location once and reuse it, so the existence
        # check and the read always refer to the same file.
        cached_path = get_loose_filepath(self.word_tokens_pickle_path)
        if cached_path:
            return get_file_content(cached_path)

        data = self.pos_tag(self.words)
        try:
            # ``with`` closes the handle even when pickle.dump raises.
            with open(self.word_tokens_pickle_path, 'wb') as f_pickle:
                pickle.dump(data, f_pickle)
        except Exception as ex:
            # Deliberately broad: a cache write failure must never prevent
            # the caller from getting the computed tokens.
            logger.error(
                'Could not open file %s: %s',
                self.word_tokens_pickle_path,
                ex,
            )

        return data
Пример #2
0
    def test_compress_file(self):
        """compress_file creates a '.bz' twin whose content equals the pickle.

        Also checks that compress_file raises NotImplementedError when given
        ``self.text_path_1``.
        """
        book = Book(
            title='t',
            author='a',
            year_published=0,
            content_path=self.clean_epub_1,
        )
        # Access both token attributes before compressing; presumably this
        # is what writes the pickle files -- confirm against Book.
        book.all_tokens
        book.word_tokens

        pickle_path = book.all_tokens_pickle_path
        bz_path = pickle_path + '.bz'

        compress_file(pickle_path)
        self.assertTrue(os.path.exists(bz_path))

        # Both files must yield the same content when read back.
        self.assertEqual(
            get_file_content(pickle_path),
            get_file_content(bz_path),
        )

        self.assertRaises(
            NotImplementedError, compress_file, self.text_path_1)
Пример #3
0
    def test_to_text_content(self):
        """process_files writes the expected text for the sample book.

        Runs ``process_files`` with a single process and compares the
        content of the resulting text file under ``TEXT_PATH`` with the
        expected string.
        """
        process_files(num_processes=1)

        content = get_file_content(
            os.path.join(
                TEXT_PATH,
                '9783161484102___Author A. Author___Another Great Title.txt'))

        # The expected text has a trailing space on every line; it is
        # spelled with explicit escapes so that editors which strip
        # trailing whitespace cannot silently change the expectation.
        expected = 'Intro heading \n blah blah blah \n '

        # assertEqual, not the deprecated assertEquals alias (removed in
        # Python 3.12).
        self.assertEqual(content, expected)
Пример #4
0
    def test_compress_files(self):
        """compress() replaces the word-token pickle with an equal '.bz' file."""
        book = Book(
            title='t',
            author='a',
            year_published=0,
            content_path=self.clean_epub_1,
        )
        # Access both token attributes before compressing; presumably this
        # is what writes the pickle files -- confirm against Book.
        book.all_tokens
        book.word_tokens

        pickle_path = book.word_tokens_pickle_path
        bz_path = pickle_path + '.bz'

        # Capture the content first: the plain pickle is expected to be
        # gone once compress() has run.
        before = get_file_content(pickle_path)

        compress(num_processes=1)

        # The compressed file must exist and the original must be deleted.
        self.assertTrue(os.path.exists(bz_path))
        self.assertFalse(os.path.exists(pickle_path))

        after = get_file_content(bz_path)
        self.assertEqual(before, after)
Пример #5
0
    def all_tokens(self):
        """Return the POS-tagged tokens of ``self.content``, cached on disk.

        When ``self.all_tokens_pickle_path`` is ``None`` no caching is
        attempted: the content is tokenized with ``nltk.word_tokenize`` and
        tagged on every call.  Otherwise the result is loaded from whatever
        file ``get_loose_filepath`` resolves for that path; if it resolves
        nothing, the tokens are computed and pickled to
        ``all_tokens_pickle_path`` for next time.

        Writing the cache is best effort: a failure is logged and the
        freshly computed tokens are still returned.

        Returns:
            The value produced by ``self.pos_tag`` for the tokenized
            content, or the previously stored content of the cache file.
        """
        if self.all_tokens_pickle_path is None:
            return self.pos_tag(nltk.word_tokenize(self.content))

        # Read from the path that get_loose_filepath actually resolved
        # rather than the raw pickle path, matching word_tokens.
        # NOTE(review): assumes a truthy return value is a readable path,
        # as word_tokens already relies on -- confirm.
        cached_path = get_loose_filepath(self.all_tokens_pickle_path)
        if cached_path:
            return get_file_content(cached_path)

        data = self.pos_tag(nltk.word_tokenize(self.content))
        try:
            # ``with`` closes the handle even when pickle.dump raises.
            with open(self.all_tokens_pickle_path, 'wb') as f_pickle:
                pickle.dump(data, f_pickle)
        except Exception as ex:
            # Deliberately broad: a cache write failure must never prevent
            # the caller from getting the computed tokens.
            logger.error(
                'Could not open file %s: %s',
                self.all_tokens_pickle_path,
                ex,
            )

        return data