def test_create_training_data_from_txt(self):
    """Check that training data can be built from pre-generated TXT files.

    For every data row of ``wpg_data.csv`` (the first row is skipped as a
    header) the EPUB looked up from column 1 is converted to text,
    preprocessed, and written to ``<column 1>.txt`` in the TXT directory.
    ``TrainingDataFactory.create`` is then run with ``source=sfsf_config.TXT``
    and the shapes of its ``'x'``, ``'y'`` and ``'isbns'`` results are
    asserted.

    The generated TXT files are removed again whether or not generation or
    ``create`` succeeds, so a failing run does not leave files behind.
    """
    training_data_fact = training_data_factory.TrainingDataFactory()
    parser = epub_to_txt_parser.EPubToTxtParser()
    text_preprocessor = txt_pre_processor.TxtPreProcessor()
    csv_path = os.path.join(sfsf_config.get_data_dir(), 'wpg_data.csv')
    tmp_txt_files = []
    try:
        with open(csv_path, 'r', encoding="utf-8") as csv_infile:
            csv_reader = csv.reader(csv_infile, delimiter=',', quotechar='"')
            next(csv_reader)  # skip the header row; its contents are not needed
            for row in csv_reader:
                text = parser.narrative_from_epub_to_txt(
                    training_data_fact.lookup_epub_filename(row[1]))
                text = text_preprocessor.transform(text)
                tmp_txt_file_name = os.path.join(
                    sfsf_config.get_txt_dir(), '{i}.txt'.format(i=row[1]))
                # Register the path before writing so that a partially
                # written file is still cleaned up in the ``finally`` below.
                tmp_txt_files.append(tmp_txt_file_name)
                with open(tmp_txt_file_name, 'w', encoding='utf-8') as txt_file:
                    txt_file.write(text)
                print(row[1])
        training_result = training_data_fact.create(
            'wpg_data.csv', 2, source=sfsf_config.TXT)
    finally:
        for file_name in tmp_txt_files:
            try:
                os.remove(file_name)
            except FileNotFoundError:
                # The file was registered but never created (open() failed).
                pass
    self.assertEqual((253, 21627), training_result['x'].shape)
    self.assertEqual((253, ), training_result['y'].shape)
    self.assertEqual(253, len(training_result['isbns']))
def test_extract_paragraph_text(self):
    """Check that ``extract_paragraph_text`` returns one line per ``<p>``.

    A single markup item holding two paragraphs is passed in; the result is
    expected to be the paragraph texts, each followed by a newline.
    """
    markup_items = ['<docu><p>hello</p><p>world</p></docu>']
    extracted = epub_to_txt_parser.EPubToTxtParser().extract_paragraph_text(
        markup_items)
    self.assertEqual(extracted, 'hello\nworld\n')
def test_get_linear_items_data(self):
    """Check that ``get_linear_items_data`` returns five items for a sample EPUB.

    The EPUB is read from ``epub/20150602093137_9789460422515.epub`` under
    the configured data directory.
    """
    parser = epub_to_txt_parser.EPubToTxtParser()
    epub_path = os.path.join(
        sfsf_config.get_data_dir(),
        'epub/20150602093137_9789460422515.epub')
    items = parser.get_linear_items_data(epub_path)
    self.assertEqual(len(items), 5)
import csv import os import traceback from sfsf import sfsf_config from sfsf import epub_to_txt_parser from sfsf import txt_pre_processor from sfsf import training_data_factory # sfsf_config.set_env( sfsf_config.DEVELOPMENT ) with open(os.path.join(sfsf_config.get_data_dir(), 'wpg_data.csv'), 'r', encoding="utf-8") as csv_infile: training_data_fact = training_data_factory.TrainingDataFactory() parser = epub_to_txt_parser.EPubToTxtParser() text_preprocessor = txt_pre_processor.TxtPreProcessor() csv_reader = csv.reader(csv_infile, delimiter=',', quotechar='"') tmp_txt_files = [] headers = next(csv_reader) for row in csv_reader: try: text = parser.narrative_from_epub_to_txt( training_data_fact.lookup_epub_filename(row[1])) text = text_preprocessor.transform(text) tmp_txt_file_name = os.path.join(sfsf_config.get_txt_dir(), '{i}.txt'.format(i=row[1])) tmp_txt_files.append(tmp_txt_file_name) txt_file = open(tmp_txt_file_name, 'w', encoding='utf8') txt_file.write(text) txt_file.close() print(row[1], end=' ')
def __init__(self):
    """Set up the helpers used by this object.

    Stores a listing of the configured EPUB directory, taken once at
    construction time, together with a fresh EPUB-to-text parser and a
    fresh text preprocessor.
    """
    epub_dir = sfsf_config.get_epub_dir()
    self.epub_dir_filenames = os.listdir(epub_dir)
    self.epub_to_txt_parser = epub_to_txt_parser.EPubToTxtParser()
    self.txt_pre_processor = txt_pre_processor.TxtPreProcessor()