/
spam.py
130 lines (92 loc) · 3.64 KB
/
spam.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import os
import json
import timeit
from collections import OrderedDict
import numpy as np
from spam.common import utils
from spam.dataset import EnronDataset
from spam.preprocess import Preprocess
from spam.deeplearning import StackedDenoisingAutoEncoder
# --- Setup: wall-clock timing, reproducibility, and experiment config ---
start_time = timeit.default_timer()

# Fix the RNG seed so any NumPy-based shuffling/initialization is reproducible.
np.random.seed(1337)

CONFIG_FILENAME = 'config.json'
with open(CONFIG_FILENAME, 'r') as f:
    # object_pairs_hook=OrderedDict preserves the key order of config.json
    # so it round-trips unchanged when rewritten at the end of the run.
    CONFIG = json.load(f, object_pairs_hook=OrderedDict)

if not CONFIG:
    # Fix: the original did print(...) followed by a bare sys.exit(), which
    # exits with status 0 (success) despite the failure. sys.exit(message)
    # prints the message to stderr and exits with status 1.
    sys.exit('Can\'t read config file.')
# --- Dataset pipeline: generate/load, clean, split, and vectorize ---
print('\n{}\n'.format('-' * 50))

# Optionally (re)build the dataset from the raw Enron corpus on disk.
if CONFIG['dataset']['generate']:
    print('Reading the dataset..')
    dataset = EnronDataset(path=CONFIG['dataset']['path'])
    enron_dataset = dataset.get_dataset()

    # Optionally export the freshly generated dataset to CSV.
    if CONFIG['dataset']['output']:
        print('Exporting the dataset..')
        dataset.to_csv(filepath=CONFIG['dataset']['filepath'])

# Build the preprocessor either from a previously exported CSV
# (read_csv=True) or from the in-memory dataset generated above.
# NOTE(review): if both CONFIG['dataset']['generate'] and
# CONFIG['preprocess']['params']['read_csv'] are false, `enron_dataset`
# is never assigned and the else-branch raises NameError — presumably the
# config is expected to always enable one of the two; verify.
if CONFIG['preprocess']['params']['read_csv']:
    print('Reading the dataset..')
    preprocessor = Preprocess(**CONFIG['preprocess']['params'])
else:
    preprocessor = Preprocess(dataset=enron_dataset,
                              **CONFIG['preprocess']['params'])

# Optionally clean the dataset in place via the preprocessor.
if CONFIG['preprocess']['clean_dataset']:
    print('Cleaning the dataset..')
    preprocessor.clean_data()

# Optionally export the cleaned dataset to CSV.
if CONFIG['preprocess']['output_csv']:
    print('Exporting clean dataset..')
    preprocessor.dataset.to_csv(CONFIG['preprocess']['output_csv_filepath'])

print('Spliting the dataset..')
# Split body/label columns into train/test partitions.
# assumes preprocessor.dataset is a DataFrame with 'body' and 'label'
# columns — TODO confirm against Preprocess.
enron_dataset = preprocessor.dataset
enron_dataset = utils.split_dataset(x=enron_dataset['body'].values,
                                    y=enron_dataset['label'].values)

print('Transforming dataset into vectors and matrices..')
# Turn raw text into numeric features; the fitted vocabulary is kept so it
# can be saved alongside the model artifacts later.
enron_dataset = preprocessor.transform(dataset=enron_dataset)
vocabulary = preprocessor.vocabulary
# --- Model: build the SDA, train it, and report evaluation metrics ---
print('\n' + '-' * 50 + '\n')

print('Building model..')
# Construct the stacked denoising autoencoder from config hyper-parameters.
sda = StackedDenoisingAutoEncoder(**CONFIG['model']['params'])

print('Pretraining model..')
# Training returns one loss history per pretrained layer plus the
# finetuning history; both are plotted further down in the script.
pretraining_history, finetune_history = sda.train(enron_dataset)

print('\n' + '-' * 50 + '\n')
print('Evaluating model..')
metrics = sda.evaluate(dataset=enron_dataset)

# Echo every evaluation metric as "name: value".
for metric_name, metric_value in metrics.items():
    print('{}: {}'.format(metric_name, metric_value))

print('\n' + '-' * 50 + '\n')
# --- Persist all experiment artifacts under experiments/exp_<id> ---
exp_dir = 'experiments/exp_{}'.format(CONFIG['id'])
print('Saving config results inside {}'.format(exp_dir))
os.makedirs(exp_dir, exist_ok=True)

# Fix: the original used a bare open(...).write(...), which leaks the file
# handle and may leave the write unflushed; a context manager guarantees
# the file is closed.
with open('{}/model_structure.json'.format(exp_dir), 'w') as f:
    f.write(sda.model.to_json())

# Weights are stored separately from the architecture JSON.
sda.model.save_weights('{}/model_weights.hdf5'.format(exp_dir),
                       overwrite=True)

data_meta = utils.get_dataset_meta(dataset=enron_dataset)

with open('{}/metrics.json'.format(exp_dir), 'w') as f:
    json.dump(metrics, f, indent=4)

with open('{}/data_meta.json'.format(exp_dir), 'w') as f:
    json.dump(data_meta, f, indent=4)

with open('{}/vocabulary.json'.format(exp_dir), 'w') as f:
    json.dump(vocabulary, f)

# Snapshot the exact config used for this run next to its results.
# Fix: 'w' replaces the original 'w+' — read access is never used here.
with open('{}/{}'.format(exp_dir, CONFIG_FILENAME), 'w') as f:
    json.dump(CONFIG, f, indent=4)
# Plot one pretraining loss curve per hidden layer (1-based numbering so
# the titles/filenames read "hidden layer #1", "L1_pretraining_loss", ...).
for layer_idx, layer_history in enumerate(pretraining_history, start=1):
    utils.plot_loss_history(
        data=layer_history,
        title='Pretraining loss history of hidden layer #{}'.format(layer_idx),
        name='L{}_pretraining_loss'.format(layer_idx),
        path=exp_dir,
    )

# Finally, plot the supervised finetuning loss curve.
utils.plot_loss_history(
    data=finetune_history,
    title='Finetune loss history',
    name='finetune_loss',
    path=exp_dir,
)
# --- Bump the experiment id so the next run writes a fresh exp_<id> dir ---
print('Updating config id..')
CONFIG['id'] += 1

# Fix: 'w' replaces the original 'w+' — the file is only truncated and
# written here, never read back.
with open(CONFIG_FILENAME, 'w') as f:
    json.dump(CONFIG, f, indent=4)

# Report total wall-clock runtime in minutes.
end_time = timeit.default_timer()
print('Done!')
print('Run for %.2fm' % ((end_time - start_time) / 60.0))