forked from xlhdh/classycn
/
runhmm-hu.py
81 lines (68 loc) · 2.18 KB
/
runhmm-hu.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# -*- coding: utf8 -*-
import sys
import glob
import datetime
import random
import nltk.tag as nt
import util
" PARAMETERS "
# Default run configuration; the command-line handling further down may
# override material, size and charstop.
random.seed(101)            # fixed seed for the random module
material = 'data/sjw/*'     # glob of input files (e.g. "data/sjw/A05*" for a subset)
size = 80                   # number of shuffled lines to keep
trainportion = 0.9          # fraction of `size` used to compute the training cut
charstop = False            # True means label attributes to previous char
# END OF PARAMETERS
#
# Example invocations:
#   python runhmm-hu.py 'data/sjw/*' 1000000 1 qualitative/allover-sjw-gold.txt
#   python runhmm-hu.py 'data/24s/*' 10000 1 qualitative/allover-24s-gold.txt
# Command-line arguments: MATERIAL_GLOB SIZE CHARSTOP GOLD_GLOB.
#
# The gold-file glob (`hu`) has no built-in default, so all four arguments
# are required.  Checking this up front fails fast with a usage message
# instead of raising IndexError on a short argument list, or NameError on
# `hu` only after the (potentially long) training step has finished.
# Extra trailing arguments are ignored.
args = sys.argv
if len(args) < 5:
    sys.exit(
        "usage: python runhmm-hu.py MATERIAL_GLOB SIZE CHARSTOP GOLD_GLOB\n"
        "  e.g. python runhmm-hu.py 'data/sjw/*' 1000000 1 "
        "qualitative/allover-sjw-gold.txt"
    )
material = args[1]
size = int(args[2])
charstop = int(args[3])   # 0/1 flag, passed on to the util helpers
hu = args[4]              # glob of the gold-standard file(s) to tag and score

# Number of sampled lines that go into training.
cut = int(size * trainportion)

print("Material:", material)
print("Size:", size, "entries,", trainportion, "as training")
print("Starting Time:", datetime.datetime.now())
# Prepare li: a random sample of at most `size` lines from the material.
print("Reading from files...")
# glob.glob() returns paths in arbitrary, filesystem-dependent order.  Sort
# them so that, together with the fixed random seed, the shuffle below picks
# the same sample on every machine.
li = list(util.file_to_lines(sorted(glob.glob(material))))
random.shuffle(li)
li = li[:size]
# Prepare data: list of x(char), y(label) sequences
print("Prepare list of sequences...")
closetestdata = li[:cut]   # the first `cut` sampled lines feed the trainer
traindata = []
for line in closetestdata:
    x, y = util.line_toseq(line, charstop)
    # Materialise the pairs: on Python 3 zip() is a one-shot iterator, so a
    # trainer that walks each sentence more than once would find it empty
    # after the first pass, and printing it would show only "<zip object>".
    traindata.append(list(zip(x, y)))
# traindata shape: [[(x,y),(x,y), ...],[],[],...]
# testdata (built further down) is a plain list of raw lines; each one is
# split into (x, yref) with util.line_toseq just before tagging.
# Train the HMM tagger on the labelled sequences and report how long it took.
stt = datetime.datetime.now()
print(traindata[0])   # show the first training sequence as a sanity check
print("Start training...", stt)
hmmtagger = nt.hmm.HiddenMarkovModelTagger.train(traindata)
training_time = datetime.datetime.now() - stt
print("################# Training took:", training_time)
# Tag every line read from the files matched by `hu`.  For each line keep
# the util.eval() result for label "S" and the line re-rendered with the
# predicted labels.
testdata = list(util.file_to_lines(glob.glob(hu)))
results, lines = [], []
for gold_line in testdata:
    x, yref = util.line_toseq(gold_line, charstop)
    tagged = hmmtagger.tag(x)
    # Each tagged item unpacks into two parts; only the second (the
    # predicted label) is needed.
    _, predicted = zip(*tagged)
    yout = list(predicted)
    results.append(util.eval(yref, yout, "S"))
    lines.append(util.seq_to_line(x, yout, charstop))
# Sum the per-line (tp, fp, fn, tn) counts and report precision/recall/F1.
if results:
    tp, fp, fn, tn = (sum(counts) for counts in zip(*results))
else:
    # Nothing was evaluated (e.g. the gold glob matched no lines);
    # zip(*[]) would yield nothing to unpack, so report all-zero counts.
    tp = fp = fn = tn = 0

# Guard every ratio: an empty denominator means the metric is undefined,
# which is reported as 0.0 rather than crashing with ZeroDivisionError
# after all the tagging work has been done.
p = tp / (tp + fp) if (tp + fp) else 0.0
r = tp / (tp + fn) if (tp + fn) else 0.0
f1 = 2 * p * r / (p + r) if (p + r) else 0.0

print("Total tokens in Test Set:", tp + fp + fn + tn)
print("Total S in REF:", tp + fn)
print("Total S in OUT:", tp + fp)
# Label spelling is kept exactly as before so existing log readers still match.
print("Presicion:", p)
print("Recall:", r)
print("F1-score:", f1)
# Print the tagged lines from last to first, emptying the list as we go.
# NOTE(review): if seq_to_line returns str, print() of the encoded bytes
# shows a b'...' repr under Python 3 -- confirm this is intended.
while lines:
    tagged_line = lines.pop()
    print(tagged_line.encode('utf8'))