示例#1
0
 def test_create_training_data_from_txt(self):
     """Build training data from generated text files and check its shape.

     For every data row of ``wpg_data.csv`` the text returned by
     ``narrative_from_epub_to_txt`` is pre-processed and written to
     ``<txt_dir>/<row[1]>.txt``. The factory is then asked to create the
     training data with ``source=sfsf_config.TXT``. The generated text
     files are removed afterwards, also when a conversion step or the
     factory call raises, so a failing run leaves no files behind.
     """
     training_data_fact = training_data_factory.TrainingDataFactory()
     parser = epub_to_txt_parser.EPubToTxtParser()
     text_preprocessor = txt_pre_processor.TxtPreProcessor()
     csv_path = os.path.join(sfsf_config.get_data_dir(), 'wpg_data.csv')
     tmp_txt_files = []
     try:
         with open(csv_path, 'r', encoding="utf-8") as csv_infile:
             csv_reader = csv.reader(csv_infile, delimiter=',',
                                     quotechar='"')
             next(csv_reader)  # skip the header row
             for row in csv_reader:
                 book_id = row[1]
                 text = parser.narrative_from_epub_to_txt(
                     training_data_fact.lookup_epub_filename(book_id))
                 text = text_preprocessor.transform(text)
                 tmp_txt_file_name = os.path.join(
                     sfsf_config.get_txt_dir(),
                     '{i}.txt'.format(i=book_id))
                 with open(tmp_txt_file_name, 'w',
                           encoding='utf-8') as txt_file:
                     # Register the file only once it exists on disk, so
                     # the cleanup below never tries to remove a file
                     # that was never created.
                     tmp_txt_files.append(tmp_txt_file_name)
                     txt_file.write(text)
                 print(book_id)
         training_result = training_data_fact.create('wpg_data.csv',
                                                     2,
                                                     source=sfsf_config.TXT)
     finally:
         # Always clean up the generated files, even on failure.
         for file_name in tmp_txt_files:
             os.remove(file_name)
     self.assertEqual((253, 21627), training_result['x'].shape)
     self.assertEqual((253, ), training_result['y'].shape)
     self.assertEqual(253, len(training_result['isbns']))
示例#2
0
 def test_extract_paragraph_text(self):
     """Expect the text of each <p> element, each followed by a newline."""
     markup_items = ['<docu><p>hello</p><p>world</p></docu>']
     extractor = epub_to_txt_parser.EPubToTxtParser()
     extracted = extractor.extract_paragraph_text(markup_items)
     self.assertEqual(extracted, 'hello\nworld\n')
示例#3
0
 def test_get_linear_items_data(self):
     """Check that ``get_linear_items_data`` returns five items.

     The epub under test is read from the ``epub`` sub-directory of the
     configured data directory.
     """
     parser = epub_to_txt_parser.EPubToTxtParser()
     path_to_nantas = os.path.join(
         sfsf_config.get_data_dir(),
         'epub/20150602093137_9789460422515.epub')
     items = parser.get_linear_items_data(path_to_nantas)
     self.assertEqual(len(items), 5)
示例#4
0
import csv
import os
import traceback
from sfsf import sfsf_config
from sfsf import epub_to_txt_parser
from sfsf import txt_pre_processor
from sfsf import training_data_factory

# sfsf_config.set_env( sfsf_config.DEVELOPMENT )

# Convert the books listed in wpg_data.csv to pre-processed text files:
# for every data row, the text is extracted, transformed and written to
# <txt_dir>/<row[1]>.txt.
with open(os.path.join(sfsf_config.get_data_dir(), 'wpg_data.csv'),
          'r',
          encoding="utf-8") as csv_infile:
    training_data_fact = training_data_factory.TrainingDataFactory()
    parser = epub_to_txt_parser.EPubToTxtParser()
    text_preprocessor = txt_pre_processor.TxtPreProcessor()
    csv_reader = csv.reader(csv_infile, delimiter=',', quotechar='"')
    # Paths of every text file written by the loop below.
    tmp_txt_files = []
    # Consume the first CSV row so the loop only sees data rows.
    headers = next(csv_reader)
    for row in csv_reader:
        # Each row is processed inside its own try block.
        # NOTE(review): the matching except/finally clause is not shown
        # here - confirm how failures for a single row are handled.
        try:
            # row[1] identifies the book (presumably its ISBN - confirm
            # against the CSV header); it is used both to look up the
            # epub file name and to name the output file.
            text = parser.narrative_from_epub_to_txt(
                training_data_fact.lookup_epub_filename(row[1]))
            text = text_preprocessor.transform(text)
            tmp_txt_file_name = os.path.join(sfsf_config.get_txt_dir(),
                                             '{i}.txt'.format(i=row[1]))
            tmp_txt_files.append(tmp_txt_file_name)
            # NOTE(review): the file is closed explicitly rather than via
            # a with-statement, so it stays open if write() raises.
            txt_file = open(tmp_txt_file_name, 'w', encoding='utf8')
            txt_file.write(text)
            txt_file.close()
            # Progress output: identifiers are printed on one line,
            # separated by spaces.
            print(row[1], end=' ')
示例#5
0
 def __init__(self):
     """Store the epub directory listing and create the helper objects.

     The listing of the configured epub directory is read once, at
     construction time; the epub-to-text parser and the text
     pre-processor are instantiated and kept on the instance.
     """
     epub_dir = sfsf_config.get_epub_dir()
     self.epub_dir_filenames = os.listdir(epub_dir)
     parser = epub_to_txt_parser.EPubToTxtParser()
     self.epub_to_txt_parser = parser
     pre_processor = txt_pre_processor.TxtPreProcessor()
     self.txt_pre_processor = pre_processor