/
preprocess-thyme.py
executable file
·45 lines (28 loc) · 1.37 KB
/
preprocess-thyme.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
"""
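Preprocessing script for the THYME colon-cancer data.

Runs the train, dev and test preprocessing helpers from ``utils``, then the
vocabulary and word-vector helpers, over the directories next to this file.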
"""
import os
import glob
from utils import preprocess_data, preprocess_test_data_phase2, make_dirs, build_vocab, build_word2Vector
if __name__ == '__main__':
    # Settings passed unchanged to every preprocessing call below.
    window_size = 4
    num_feats = 3

    # Input and output locations are anchored at the directory holding this script.
    base_dir = os.path.dirname(os.path.realpath(__file__))
    data_dir = os.path.join(base_dir, 'data')
    ann_dir = os.path.join(base_dir, 'annotation/coloncancer')
    plain_dir = os.path.join(base_dir, 'original')

    # One output directory per split under data/; make_dirs presumably
    # creates them (its implementation lives in utils).
    train_dir, dev_dir, test_dir = (
        os.path.join(data_dir, split) for split in ('train', 'dev', 'test')
    )
    make_dirs([train_dir, dev_dir, test_dir])

    # Train and dev share the same helper and the same layout:
    # annotation sub-directory first, plain-text sub-directory second.
    for ann_name, plain_name, out_dir in (
        ("Train", "train", train_dir),
        ("Dev", "dev", dev_dir),
    ):
        preprocess_data(
            os.path.join(ann_dir, ann_name),
            os.path.join(plain_dir, plain_name),
            out_dir,
            window_size,
            num_feats,
        )

    # The test split reads its annotations from a separate release directory.
    # Unlike preprocess_data, this helper is given the plain-text directory
    # first and the annotation directory second.
    ann_dir_2 = os.path.join(base_dir, 'thymedata-1.2.0-coloncancer-test-event-time/coloncancer')
    test_plain_dir = os.path.join(plain_dir, "test")
    test_ann_dir = os.path.join(ann_dir_2, "Test")
    preprocess_test_data_phase2(test_plain_dir, test_ann_dir, test_dir, window_size, num_feats)

    # The vocabulary is built from every *.toks file one level below data/.
    vocab_name = 'vocab-cased.txt'
    tok_files = glob.glob(os.path.join(data_dir, '*/*.toks'))
    build_vocab(tok_files, os.path.join(data_dir, vocab_name), lowercase=False)

    # NOTE(review): this path is relative to the current working directory,
    # not to base_dir like every other path here - confirm that is intended.
    glove_path = os.path.join('../NLP-Tools', 'glove.840B.300d.txt')
    build_word2Vector(glove_path, data_dir, vocab_name)