/
trainandtest.py
68 lines (57 loc) · 1.96 KB
/
trainandtest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import pycrfsuite
import json
import numpy as np
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite.metrics import flat_accuracy_score
from sklearn.metrics import classification_report
trainer = pycrfsuite.Trainer(verbose=False)
train1= open('x_train.txt','r')
X_train= json.load(train1)
test1= open('x_test.txt','r')
X_test = json.load(test1)
test3 = open('x_test1.txt', 'r')
X_test1 = json.load(test3)
train2=open('y_train.txt','r')
y_train = json.load(train2)
test2=open('y_test.txt','r')
Y_test = json.load(test2)
# Submit training data to the trainer
for xseq, yseq in zip(X_train, y_train):
trainer.append(xseq, yseq)
# Set the parameters of the model
trainer.set_params({
# coefficient for L1 penalty
'c1': 0.1,
# coefficient for L2 penalty
'c2': 0.01,
# maximum number of iterations
'max_iterations': 200,
# whether to include transitions that
# are possible, but not observed
'feature.possible_transitions': True
})
# Provide a file name as a parameter to the train function, such that
# the model will be saved to the file when training is finished
trainer.train('crf.model')
tagger = pycrfsuite.Tagger()
tagger.open('crf.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]
y_pred1 = [tagger.tag(xseq) for xseq in X_test1]
#print(flat_accuracy_score(Y_test,y_pred))
'''
# Let's take a look at a random sample in the testing set
for i in range(len(X_test)):
for x, y in zip(y_pred[i], [x[1].split("=")[1] for x in X_test[i]]):
print("%s (%s)" % (y, x))
'''
# Create a mapping of labels to indices
labels = {"NonDrug": 1, "Drug": 0}
# Convert the sequences of tags into a 1-dimensional array
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in Y_test for tag in row])
print(flat_accuracy_score(Y_test,y_pred))
# Print out the classification report
print(classification_report(
truths, predictions,
target_names=["NonDrug", "Drug"]))