def test_build_deep_learning_model(self):
    factory = training_data_factory.TrainingDataFactory()
    training_data = factory.create('wpg_data.csv', 2)
    model = deep_learning_model.DeepLearningModel()
    # training_data, batch_size, epochs
    accuracy = model.build((training_data['x'], training_data['y']), 10, 5)
    self.assertEqual(100.00, accuracy)
Example #2
def test_create_training_data_from_txt(self):
    training_data_fact = training_data_factory.TrainingDataFactory()
    parser = epub_to_txt_parser.EPubToTxtParser()
    text_preprocessor = txt_pre_processor.TxtPreProcessor()
    with open(os.path.join(sfsf_config.get_data_dir(), 'wpg_data.csv'),
              'r',
              encoding='utf-8') as csv_infile:
        csv_reader = csv.reader(csv_infile, delimiter=',', quotechar='"')
        tmp_txt_files = []
        headers = next(csv_reader)  # skip the header row
        for row in csv_reader:
            text = parser.narrative_from_epub_to_txt(
                training_data_fact.lookup_epub_filename(row[1]))
            text = text_preprocessor.transform(text)
            tmp_txt_file_name = os.path.join(sfsf_config.get_txt_dir(),
                                             '{i}.txt'.format(i=row[1]))
            tmp_txt_files.append(tmp_txt_file_name)
            with open(tmp_txt_file_name, 'w', encoding='utf-8') as txt_file:
                txt_file.write(text)
            print(row[1])
    training_result = training_data_fact.create('wpg_data.csv',
                                                2,
                                                source=sfsf_config.TXT)
    for file_name in tmp_txt_files:
        os.remove(file_name)
    self.assertEqual((253, 21627), training_result['x'].shape)
    self.assertEqual((253,), training_result['y'].shape)
    self.assertEqual(253, len(training_result['isbns']))
Example #3
def do_train(train_sample):
    training_factory = training_data_factory.TrainingDataFactory()
    training_data = training_factory.create_by_sample('wpg_data.csv',
                                                      train_sample["top"],
                                                      train_sample["bottom"],
                                                      sample_size=5000,
                                                      source=sfsf_config.TXT)
    with open('sfsf_training_data.pickle', 'wb') as pickle_file:
        pickle.dump(training_data, pickle_file)
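For reference, do_train consumes the dict produced by do_sample in Example #10 below; a minimal sketch of the expected shape follows, with hypothetical rows (the field order NUR, ISBN, title, author, total sold is inferred from the report headers in Example #11):

# Hypothetical ISBN rows; real ones come from
# TrainingDataFactory.get_isbn_data.
train_sample = {
    'top': [[301, 9789023449416, 'Some Title', 'Some Author', 120000]],
    'bottom': [[301, 9789044964264, 'Other Title', 'Other Author', 150]],
}
do_train(train_sample)  # writes sfsf_training_data.pickle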
Example #4
def test_prediction_wrong_test_dimension(self):
    factory = training_data_factory.TrainingDataFactory()
    training_data = factory.create('wpg_data.csv', 2)
    model = deep_learning_model.DeepLearningModel()
    # training_data, batch_size, epochs
    accuracy = model.build((training_data['x'], training_data['y']), 10, 5)
    with self.assertRaises(deep_learning_model.TestingDimensionError):
        model.predict(numpy.array([[1, 2, 3], [1, 2, 3]]))
Example #5
def test_get_top_bottom(self):
    training_data = training_data_factory.TrainingDataFactory()
    samples_tuple = training_data.get_top_bottom('wpg_data.csv', cull=2)
    wpg_data = pandas.read_csv(
        os.path.join(sfsf_config.get_data_dir(), 'wpg_data.csv'))
    smallest_sale = min(wpg_data['totaal afzet'])
    self.assertEqual(smallest_sale, int(samples_tuple[1][1][4]))
    highest_sale = max(wpg_data['totaal afzet'])
    self.assertEqual(highest_sale, int(samples_tuple[0][0][4]))
Example #6
def test_sampling(self):
    isbn_info = [['', 9789023449416, '']]
    training_data = training_data_factory.TrainingDataFactory()
    samples = training_data.sample_epubs(isbn_info, 1000)
    # expect a list of tuples: (isbn, list of strings)
    self.assertEqual(1, len(samples))
    self.assertEqual(9789023449416, samples[0][0])
    self.assertEqual(72, len(samples[0][1]))
    for sample in samples[0][1]:
        self.assertIsInstance(sample, str)
Example #7
def test_file_name_lookup(self):
    training_data = training_data_factory.TrainingDataFactory()
    assert training_data.lookup_epub_filename('9789044964264').endswith(
        '20160113113032_9789044964264.epub')

    # If multiple files match, one of them should be returned; we don't
    # care which one.
    def any_which(file_name):
        return (file_name.endswith('20150602093137_9789023449416.epub')
                or file_name.endswith('20160113113032_9789023449416.epub'))

    result = training_data.lookup_epub_filename('9789023449416')
    assert any_which(result)
Example #8
def do_sample(top_chunk_scores, bottom_chunk_scores, wpg_data_file):
    training_factory = training_data_factory.TrainingDataFactory()
    isbn_data = training_factory.get_isbn_data(
        wpg_data_file)  # returns data sorted by sales
    top_isbn_data = [
        isbn_row for isbn_row in isbn_data if isbn_row[1] in top_chunk_scores
    ]
    bottom_isbn_data = [
        isbn_row for isbn_row in isbn_data
        if isbn_row[1] in bottom_chunk_scores
    ]
    return top_isbn_data, bottom_isbn_data
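This variant filters by membership, so top_chunk_scores and bottom_chunk_scores only need to support the in operator; sets of ISBNs work. A sketch with made-up values:

# Hypothetical ISBN sets; any container supporting `in` will do.
top_isbn_data, bottom_isbn_data = do_sample(
    {9789023449416}, {9789044964264}, 'wpg_data.csv')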
Example #9
def test_predictions(self):
    factory = training_data_factory.TrainingDataFactory()
    training_data = factory.create('wpg_data.csv', 2)
    model = deep_learning_model.DeepLearningModel()
    # training_data, batch_size, epochs
    accuracy = model.build((training_data['x'], training_data['y']), 10, 5)
    vect = training_data['vectorizer']
    isbn_info = [['', 9789023449416, '']]
    test_tuples = factory.sample_epubs(isbn_info, 1000)[-4:]
    test_samples = [
        test_sample for test_tuple in test_tuples
        for test_sample in test_tuple[1]
    ]
    test_tdm = vect.transform(test_samples)
    predictions = model.predict(numpy.array(test_tdm.toarray()))
    for prediction in predictions:
        assert prediction[0] > 0.9
Example #10
def do_sample(wpg_data_file, train_size, test_size, total_size):
    training_factory = training_data_factory.TrainingDataFactory()
    isbn_data = training_factory.get_isbn_data(
        wpg_data_file)  # returns data sorted by sales
    top_data = isbn_data[:total_size]
    bottom_data = isbn_data[-total_size:]
    shuffle(top_data)
    shuffle(bottom_data)
    train_sample = {
        "top": top_data[:train_size["top"]],
        "bottom": bottom_data[:train_size["bottom"]],
    }
    test_sample = {
        "top": top_data[-test_size["top"]:],
        "bottom": bottom_data[-test_size["bottom"]:]
    }
    return train_sample, test_sample
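Note on the slicing: training rows come off the front of each shuffled list and test rows off the back, so the two samples stay disjoint as long as train_size plus test_size does not exceed total_size for each half. An illustrative call with assumed sizes:

# Assumed sizes for illustration: 100 titles per half, 80/20 split;
# 80 + 20 <= 100, so train and test cannot overlap.
train_sample, test_sample = do_sample('wpg_data.csv',
                                      train_size={'top': 80, 'bottom': 80},
                                      test_size={'top': 20, 'bottom': 20},
                                      total_size=100)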
Example #11
def do_test(test_sample, train_size, test_size, total_size, iteration):
    # Test training data and predict
    training_factory = training_data_factory.TrainingDataFactory()
    training_data = pickle.load(open('sfsf_training_data.pickle', 'rb'))
    vectorizer = training_data['vectorizer']
    testing_data_top = training_factory.sample_txts(test_sample["top"], 5000)
    testing_data_bottom = training_factory.sample_txts(test_sample["bottom"],
                                                       5000)
    # Repeat each ISBN once per text chunk so the lists line up with the
    # per-chunk predictions below.
    isbns_top = [pair[0] for pair in testing_data_top for sample in pair[1]]
    isbns_bottom = [
        pair[0] for pair in testing_data_bottom for sample in pair[1]
    ]
    testing_tdm_top = vectorizer.transform(
        [sample for pair in testing_data_top for sample in pair[1]])
    testing_tdm_bottom = vectorizer.transform(
        [sample for pair in testing_data_bottom for sample in pair[1]])
    model = deep_learning_model.DeepLearningModel()
    # training_data, batch_size, epochs
    accuracy = model.build((training_data['x'], training_data['y']), 10, 5)
    model.save('sfsf_deeplearning_model_{d}'.format(
        d=datetime.now().strftime('%Y%m%d_%H%M')))
    predictions = model.predict(numpy.array(testing_tdm_top.toarray()))
    predictions_top = []
    predictions_bottom = []
    for idx, prediction in enumerate(predictions):
        predictions_top.append((isbns_top[idx], prediction[0]))
    predictions = model.predict(numpy.array(testing_tdm_bottom.toarray()))
    for idx, prediction in enumerate(predictions):
        predictions_bottom.append((isbns_bottom[idx], prediction[0]))

    # reporting
    report_file_name = 'report-total-{total}-train-{train}-test-{test}-iteration-{i}-date-{d}.csv'.format(
        total=total_size,
        train=train_size,
        test=test_size,
        i=iteration,
        d=datetime.now().strftime('%Y%m%d_%H%M'))
    report_file = open(report_file_name, 'w', encoding='utf8')
    csv_writer = csv.writer(report_file, delimiter=',')
    isbn_data = training_factory.get_isbn_data('wpg_data.csv')
    headers = [
        'deep_learning_data_type', 'NUR', 'ISBN', 'title', 'author',
        'total sold', 'prediction'
    ]
    csv_writer.writerow(headers)
    print('\t'.join(headers))
    training_isbns_reported = []
    # combine training data y and isbns
    for idx, item in enumerate(training_data['y']):
        if training_data['isbns'][idx] not in training_isbns_reported:
            report_row = []
            # 1 is top, 0 is flop
            if item == 1:
                report_row.append('training_top')
            else:
                report_row.append('training_bottom')
            # ISBN: look up its metadata row
            isbn_info = next(isbn_inf for isbn_inf in isbn_data
                             if isbn_inf[1] == training_data['isbns'][idx])
            report_row.extend(isbn_info)
            report_row.append('NA')
            report(report_row, csv_writer)
            training_isbns_reported.append(training_data['isbns'][idx])
    for prediction in predictions_top:
        report_row = []
        report_row.append('testing_top')
        # ISBN: look up its metadata row
        isbn_info = next(isbn_inf for isbn_inf in isbn_data
                         if isbn_inf[1] == prediction[0])
        report_row.extend(isbn_info)
        report_row.append(str(prediction[1]))
        report(report_row, csv_writer)
    for prediction in predictions_bottom:
        report_row = []
        report_row.append('testing_bottom')
        # ISBN: look up its metadata row
        isbn_info = next(isbn_inf for isbn_inf in isbn_data
                         if isbn_inf[1] == prediction[0])
        report_row.extend(isbn_info)
        report_row.append(str(prediction[1]))
        report(report_row, csv_writer)
    report_file.close()
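This example calls a report helper that is not part of the snippet. Judging from how the header row is both written to the CSV and echoed tab-separated, a minimal stand-in could look like this:

def report(row, csv_writer):
    # Assumed helper: mirror the treatment of the header row above,
    # writing to the CSV report and echoing to the console.
    csv_writer.writerow(row)
    print('\t'.join(str(cell) for cell in row))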
Example #12
def get_text_chunks(sample_data):
    training_factory = training_data_factory.TrainingDataFactory()
    return training_factory.sample_txts(sample_data, sample_size=5000)
Example #13
def test_create_training_data(self):
    training_data = training_data_factory.TrainingDataFactory()
    training_result = training_data.create('wpg_data.csv', 2)
    self.assertEqual((253, 21627), training_result['x'].shape)
    self.assertEqual((253,), training_result['y'].shape)
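Taken together, these examples show the shape of the dict returned by TrainingDataFactory.create; a sketch of the keys observed so far (the shapes are the ones asserted above):

factory = training_data_factory.TrainingDataFactory()
result = factory.create('wpg_data.csv', 2)
result['x']           # document-term matrix, e.g. shape (253, 21627)
result['y']           # labels; per Example #11, 1 is top, 0 is flop
result['isbns']       # one ISBN per sample row (see Example #2)
result['vectorizer']  # fitted vectorizer, reused on test texts (Example #9)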
Example #14
import csv
import os
import traceback
from sfsf import sfsf_config
from sfsf import epub_to_txt_parser
from sfsf import txt_pre_processor
from sfsf import training_data_factory

# sfsf_config.set_env( sfsf_config.DEVELOPMENT )

with open(os.path.join(sfsf_config.get_data_dir(), 'wpg_data.csv'),
          'r',
          encoding="utf-8") as csv_infile:
    training_data_fact = training_data_factory.TrainingDataFactory()
    parser = epub_to_txt_parser.EPubToTxtParser()
    text_preprocessor = txt_pre_processor.TxtPreProcessor()
    csv_reader = csv.reader(csv_infile, delimiter=',', quotechar='"')
    tmp_txt_files = []
    headers = next(csv_reader)
    for row in csv_reader:
        try:
            text = parser.narrative_from_epub_to_txt(
                training_data_fact.lookup_epub_filename(row[1]))
            text = text_preprocessor.transform(text)
            tmp_txt_file_name = os.path.join(sfsf_config.get_txt_dir(),
                                             '{i}.txt'.format(i=row[1]))
            tmp_txt_files.append(tmp_txt_file_name)
            with open(tmp_txt_file_name, 'w', encoding='utf-8') as txt_file:
                txt_file.write(text)
            print(row[1], end=' ')
        except Exception:
            # The original snippet is cut off here; given the traceback
            # import above, presumably failures are logged and the loop
            # moves on to the next row.
            traceback.print_exc()