-
Notifications
You must be signed in to change notification settings - Fork 0
/
MatchmakerPredictor.py
266 lines (182 loc) · 6.29 KB
/
MatchmakerPredictor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
import pandas as pd
import numpy as np
from pandas import DataFrame, Series
import os, re
from tabulate import tabulate
import common
from sklearn.base import BaseEstimator, ClassifierMixin
# local modules
import loinc
from loinc import LoincMTRT as lmt
# from utils_plot import saveFig # contains "matplotlib.use('Agg')" which needs to be called before pyplot
# from matplotlib import pyplot as plt
"""
Rule-based LOINC predictor.
Reference
---------
1. Leela: see classification rules that lead to medivo_test_result_type
https://github.com/medivo/leela
Memo
----
* Cleaning texts
https://machinelearningmastery.com/clean-text-machine-learning-python/
* String matching and similarity:
a. Levenshtein distance
pip install python-Levenshtein
  b. tf-idf vectorizer
https://towardsdatascience.com/natural-language-processing-feature-engineering-using-tf-idf-e8b9d00e7e76
Update
------
"""
# Pattern for organic-compound-style names: a numeric/quote/comma prefix
# followed by a hyphen and a word suffix (e.g. matches "1,2-dichloroethane").
# NOTE(review): presumably used to keep chemical names intact during text
# cleaning — not referenced in this file; confirm against callers.
p_oc = p_organic_compound = re.compile(r'(?P<prefix>[0-9\',]+)-(?P<suffix>\w+)')
class ReadTxtFiles(object):
    """Stream tokenized lines from every file in a directory.

    Yields one token list per line (via gensim's ``simple_preprocess``),
    so an instance can be iterated repeatedly, e.g. when building a
    ``gensim.corpora.Dictionary``.
    """
    def __init__(self, dirname):
        # dirname: path to a directory containing plain-text files
        self.dirname = dirname

    def __iter__(self):
        # BUG FIX: simple_preprocess was referenced but never imported at
        # module level, so iteration raised NameError; import it locally.
        from gensim.utils import simple_preprocess
        for fname in os.listdir(self.dirname):
            # 'latin' decoding never fails on arbitrary bytes.
            # BUG FIX: use a context manager — the original left every
            # file handle open.
            with open(os.path.join(self.dirname, fname), encoding='latin') as fp:
                for line in fp:
                    yield simple_preprocess(line)
class MTRTClassifier(BaseEstimator, ClassifierMixin):
    """
    Predict LOINC codes by MTRT (Medivo Test Result Type) texts.

    Thresholds a 1-D sequence of scores at ``threshold_`` (set by ``fit``).

    Ref
    ---
    1. customized sklearn estimator:
           http://danielhnyk.cz/creating-your-own-estimator-scikit-learn/
    """
    def __init__(self, source_table=''):
        """
        Called when initializing the classifier.

        Params
        ------
        source_table: name/path of the LOINC-to-MTRT mapping table;
                      defaults to ``lmt.table`` when empty.
        """
        self.source_table = source_table if source_table else lmt.table
        # input_file/LoincMTRT.table
        self.table = lmt.load_table(dehyphenate=True, dequote=True)

    def fit(self, X, y=None):
        """
        Fit the classifier; all the "work" should be done here.

        Currently only fixes the decision threshold. Attributes learned in
        ``fit`` carry a trailing underscore per sklearn convention.
        """
        self.threshold_ = 0.5
        return self

    def to_label(self, x):
        # Map a single score to a binary label using the fitted threshold.
        return 1 if x >= self.threshold_ else 0

    def predict(self, X, y=None):
        """Binarize each score in X; requires ``fit`` to have been called."""
        # hasattr is the idiomatic form of the original getattr/except probe.
        if not hasattr(self, "threshold_"):
            # BUG FIX: corrected "classifer" typo in the error message.
            raise RuntimeError("You must train classifier before predicting data!")
        return [self.to_label(x) for x in X]

    def score(self, X, y=None):
        # Count of scores at or above the fitted threshold
        # (NOT "bigger than mean" as the original comment claimed).
        return sum(self.predict(X))
class FeatureSet(loinc.FeatureSet):
    # Columns used to match incoming records against LOINC entries.
    # NOTE(review): presumably consumed by feature-engineering code outside
    # this file — confirm against callers; not referenced here.
    matching_cols = [
        "meta_sender_name",
        "receiving_organization_id",
        "test_order_code",
        "test_order_name",
        "test_result_code",
        "test_result_name",
        # "test_result_loinc_code",
        "test_result_units_of_measure",
        "panel_order_name",
    ]
    # ... assuming that all the matching cols are categorical variables
    # Columns whose text is pooled to build the text corpus.
    corpus_src_cols = [
        'test_order_name',
        'test_result_name',
        'test_specimen_type',
        'panel_order_name',
        'test_result_units_of_measure',
        # additionally, we'll usually incorporate LOINC's long common name and MTRT
    ]
### class FeatureSet
def evaluate_vars_given_candidate(row, code):
    """Compute feature values for a (training row, candidate LOINC code) pair.

    Placeholder: not implemented yet; currently returns None.
    """
    # TODO: implement the feature computation
def predict_by_mtrt(mtrt_str='', target_code=None, df=None, **kargs):
    """
    Predict LOINC code(s) from an MTRT string, or score a given code.

    Params
    ------
    mtrt_str: the MTRT (Medivo Test Result Type) text to match
    target_code: if None, then predict a LOINC code;
                 if a LOINC code is given, then predict its correctness by
                 outputting a probability score
    df: pre-loaded LOINC-to-MTRT mapping table; loaded on demand when None

    Output
    ------
    a dictionary mapping from codes to probabilities

    Memo
    ----
    1. Case 1: MTRT is missing => infer from test_result_name,
       test_result_value and other attributes
       Case 2: MTRT is available =>
       compute (weighted) string distance, weighted by importance of
       tokens (e.g. tf-idf scores)
    """
    from loinc import LoincMTRT

    col_key = kargs.get('col_key', lmt.col_key)  # 'Test Result LOINC Code'
    if df is None:
        # input_file/LoincMTRT.table
        df = LoincMTRT.load_table(dehyphenate=True, dequote=True)

    # verify the table was loaded in normalized (dehyphenated, dequoted) form
    for code in df['Test Result LOINC Code'].values:
        assert code.find('-') < 0
    for v in df['Medivo Test Result Type'].values:
        assert v.find('"') < 0
    # BUG FIX: the original tested `code`, which at this point is the
    # leftover loop variable from the verification loop above (always
    # non-None for a non-empty table), making the guard vacuous. The
    # intended check is on the `target_code` parameter.
    assert len(mtrt_str) > 0 or target_code is not None

    print("(predict_by_mtrt) df.columns: {}".format(df.columns.values))
    # BUG FIX: the original format string had one placeholder for two
    # arguments, so the code count was silently dropped from the output.
    print("(predict_by_mtrt) dim(df): {} | n_codes: {}".format(
        df.shape, len(df[col_key].unique())))

    # string matching algorithm
    df = process_loinc_to_mtrt(df)
    LoincMTRT.save_derived_loinc_to_mtrt(df)

    o = process_each('Albumin [Mass/volume] in Urine', code=17541)
    print(o)
    return
def encode_mtrt_tfidf(docs):
    """Encode MTRT documents ("Medivo Test Result Type") as tf-idf vectors.

    Placeholder: not implemented yet; currently returns None.
    """
    return None
def demo_read(**kargs):
    """Demo: build a gensim Dictionary by streaming text files from a directory."""
    from gensim import corpora

    doc_dir = "lsa_sports_food_docs"
    vocab = corpora.Dictionary(ReadTxtFiles(doc_dir))

    # Token-to-id map, e.g.:
    # {'across': 0, 'activity': 1, 'although': 2, 'and': 3, 'are': 4, ...}
    vocab.token2id
    return
def demo_tfidf(**kargs):
    """Demo: build a gensim dictionary and bag-of-words corpus from toy documents."""
    from gensim.utils import simple_preprocess
    from smart_open import smart_open
    from gensim import models
    from gensim import corpora
    from pprint import pprint

    documents = ["This is the first line",
                 "This is the second sentence",
                 "This third document"]

    # Create the Dictionary and Corpus (tokenize once, reuse for both)
    tokenized = [simple_preprocess(doc) for doc in documents]
    mydict = corpora.Dictionary(tokenized)
    corpus = [mydict.doc2bow(tokens) for tokens in tokenized]
def demo_experta(**kargs):
    """Demo placeholder for an experta-based (rule-engine) predictor."""
    from random import choice
    # from experta import *
    return
def demo_predict(**kargs):
    """Run the prediction-by-MTRT demo."""
    predict_by_mtrt()
    return
def test(**kargs):
    """Module test driver."""
    ### prediction by loinc-to-mtrt mapping
    demo_predict()
    return
if __name__ == "__main__":
    # Script entry point: run the demo test driver.
    test()