@property
def word_tokens(self):
    """Return POS-tagged word tokens, cached as a pickle when a path is set."""
    # TODO: derive word tokens from all_tokens once it is worth the effort
    if self.word_tokens_pickle_path is None:
        return self.pos_tag(self.words)

    loose_path = get_loose_filepath(self.word_tokens_pickle_path)
    if loose_path:
        # Load the cached tokens from whichever variant exists on disk.
        data = get_file_content(loose_path)
    else:
        data = self.pos_tag(self.words)
        try:
            with open(self.word_tokens_pickle_path, 'wb') as f_pickle:
                pickle.dump(data, f_pickle)
        except Exception as ex:
            logger.error(
                'Could not open file %s: %s',
                self.word_tokens_pickle_path, ex)
    return data
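# A minimal, self-contained sketch of the caching pattern word_tokens uses
# (illustration only, not the repo's API): compute once, pickle to disk,
# and reload on later calls. compute() and cache_path are assumptions.
import os
import pickle

def cached_tokens(cache_path, compute):
    if cache_path is None:
        return compute()                   # no caching configured
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:  # reuse the cached result
            return pickle.load(f)
    data = compute()                       # first call: compute and cache
    with open(cache_path, 'wb') as f:
        pickle.dump(data, f)
    return data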
def test_compress_file(self):
    book = Book(title='t', author='a', year_published=0,
                content_path=self.clean_epub_1)
    # Accessing the token properties writes the pickle files.
    book.all_tokens
    book.word_tokens

    compress_file(book.all_tokens_pickle_path)
    self.assertTrue(os.path.exists(book.all_tokens_pickle_path + '.bz'))

    # get_file_content should return the same data for the raw pickle
    # and its compressed twin.
    content_pickle = get_file_content(book.all_tokens_pickle_path)
    content_bz = get_file_content(book.all_tokens_pickle_path + '.bz')
    self.assertEqual(content_pickle, content_bz)

    # Unsupported file types are rejected.
    with self.assertRaises(NotImplementedError):
        compress_file(self.text_path_1)
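# Hedged sketch of what compress_file is assumed to do, inferred from the
# test above (the real helper lives elsewhere in the repo): write a
# bz2-compressed '.bz' twin next to the original, leaving the original in
# place, and reject file types it does not handle. The '.pickle' extension
# check is an assumption for illustration.
import bz2

def compress_file_sketch(path):
    if not path.endswith('.pickle'):
        raise NotImplementedError('Can only compress pickle files: %s' % path)
    with open(path, 'rb') as f_in, bz2.open(path + '.bz', 'wb') as f_out:
        f_out.write(f_in.read())  # original file is left in place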
def test_to_text_content(self):
    process_files(num_processes=1)
    content = get_file_content(
        os.path.join(
            TEXT_PATH,
            '9783161484102___Author A. Author___Another Great Title.txt'))
    self.assertEqual(content, '''Intro heading
blah blah blah
''')
def test_compress_files(self):
    book = Book(title='t', author='a', year_published=0,
                content_path=self.clean_epub_1)
    # Accessing the token properties writes the pickle files.
    book.all_tokens
    book.word_tokens
    content_pickle = get_file_content(book.word_tokens_pickle_path)

    compress(num_processes=1)

    # Make sure the original pickles were deleted and .bz files created.
    self.assertTrue(os.path.exists(book.word_tokens_pickle_path + '.bz'))
    self.assertFalse(os.path.exists(book.word_tokens_pickle_path))
    content_bz = get_file_content(book.word_tokens_pickle_path + '.bz')
    self.assertEqual(content_pickle, content_bz)
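# Hedged sketch of the batch compress() pass this test drives: compress
# every pickle under a directory, then delete the originals so only the
# '.bz' files remain (matching the assertions above). The directory
# argument and glob pattern are assumptions; the real implementation also
# fans the work out across num_processes workers.
import bz2
import glob
import os

def compress_sketch(pickle_dir):
    for path in glob.glob(os.path.join(pickle_dir, '*.pickle')):
        with open(path, 'rb') as f_in, bz2.open(path + '.bz', 'wb') as f_out:
            f_out.write(f_in.read())
        os.remove(path)  # drop the uncompressed pickle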
@property
def all_tokens(self):
    """Return POS-tagged tokens for the full text, cached as a pickle."""
    if self.all_tokens_pickle_path is None:
        return self.pos_tag(nltk.word_tokenize(self.content))

    loose_path = get_loose_filepath(self.all_tokens_pickle_path)
    if loose_path:
        # Load from whichever variant exists on disk (raw pickle or .bz).
        data = get_file_content(loose_path)
    else:
        data = self.pos_tag(nltk.word_tokenize(self.content))
        try:
            with open(self.all_tokens_pickle_path, 'wb') as f_pickle:
                pickle.dump(data, f_pickle)
        except Exception as ex:
            logger.error(
                'Could not open file %s: %s',
                self.all_tokens_pickle_path, ex)
    return data
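# Hedged sketch of the get_loose_filepath contract both token properties
# rely on: return whichever on-disk variant of a path exists (the raw file
# or its compressed '.bz' twin), else None. This is an assumption drawn
# from how the properties and the compression tests use the helper.
import os

def get_loose_filepath_sketch(path):
    for candidate in (path, path + '.bz'):
        if os.path.exists(candidate):
            return candidate
    return None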