def __init__(self, top_k=3):
    """
    Set up the sentence-level and document-level bias classifiers.

    `top_k` is the 'top-k recall' setting: with top-1 the single most
    relevant sentence in the document is returned, with top-3 the three
    most relevant. The validation study assessed both top-1 and top-3,
    and top-3 is the suggested default.
    """
    sent_weights = robotreviewer.get_data('bias/bias_sent_level.npz')
    self.sent_clf = MiniClassifier(sent_weights)

    doc_weights = robotreviewer.get_data('bias/bias_doc_level.npz')
    self.doc_clf = MiniClassifier(doc_weights)

    # binary, un-normalised features over unigrams and bigrams
    self.vec = ModularVectorizer(
        norm=None,
        non_negative=True,
        binary=True,
        ngram_range=(1, 2),
        n_features=2**26,
    )

    self.top_k = top_k

    # one document-level judgement is produced per domain listed here
    self.bias_domains = [
        'Random sequence generation',
        'Allocation concealment',
        'Blinding of participants and personnel',
        'Blinding of outcome assessment',
        'Incomplete outcome data',
        'Selective reporting',
    ]
def __init__(self):
    """
    Load the SVM classifier, every CNN weight file found in the `rct`
    data directory, the two text vectorizers, and the hard-coded scaling
    constants and decision thresholds.
    """

    def data_path(relative):
        # every model artefact is addressed relative to DATA_ROOT
        return os.path.join(robotreviewer.DATA_ROOT, relative)

    self.svm_clf = MiniClassifier(data_path('rct/rct_svm_weights.npz'))

    # one CNN is built per .h5 weight file, all sharing one architecture
    weight_paths = glob.glob(data_path('rct/*.h5'))
    structure_path = data_path('rct/rct_cnn_structure.json')
    self.cnn_clfs = [
        get_model(structure_path, weight_path)
        for weight_path in weight_paths
    ]

    self.svm_vectorizer = HashingVectorizer(
        binary=False, ngram_range=(1, 1), stop_words='english')
    self.cnn_vectorizer = KerasVectorizer(
        vocab_map_file=data_path('rct/rct_cnn_vocab_map.pck'))

    # weighted in mean since we use only 1 SVM model (since produces near
    # identical results to binning 10) and 6 CNN models (since runs
    # faster, and no further reduction in variance for further models)
    self.scale_constants = dict(
        cnn=dict(mean=0.15592811611054261,
                 std=0.22405916984696986,
                 weight=1.6666666666666667),
        ptyp=dict(mean=0.055155532891381948,
                  std=0.22828359573751594),
        svm=dict(mean=-0.75481403525485891,
                 std=0.7812955939364481,
                 weight=10.0),
    )

    # All precise models have been calibrated to 97.6% sensitivity
    self.thresholds = dict(
        cnn=dict(precise=2.1340457758193034,
                 sensitive=-0.076709540491855063),
        cnn_ptyp=dict(precise=3.529609848417909,
                      sensitive=0.083502632442633312),
        svm=dict(precise=1.9185522606237164,
                 sensitive=0.093273630980694439),
        svm_cnn=dict(precise=1.8749128673557529,
                     sensitive=0.064481902000491614),
        svm_cnn_ptyp=dict(precise=3.7674045603568755,
                          sensitive=0.1952449060483534),
        svm_ptyp=dict(precise=3.7358855328111837,
                      sensitive=0.42992224964656178),
    )
def __init__(self):
    """
    Lazily import keras, then load the SVM and CNN RCT classifiers,
    their vectorizers and the calibration constants.

    The keras names are published as module globals so that other code in
    this module can use them without importing keras at module import
    time.
    """
    # `global` must come before the imports that bind these names;
    # declaring a name global after it has been assigned in the same scope
    # is a SyntaxError in Python 3.
    global sequence, load_model, Sequential, Dense, Dropout, Activation, Lambda, Input, merge, Flatten
    global Embedding, Convolution1D, MaxPooling1D, K, Model, l2

    from keras.preprocessing import sequence
    from keras.models import load_model
    from keras.models import Sequential
    from keras.layers import Dense, Dropout, Activation, Lambda, Input, merge, Flatten
    from keras.layers import Embedding
    from keras.layers import Convolution1D, MaxPooling1D
    from keras import backend as K
    from keras.models import Model
    from keras.regularizers import l2

    self.svm_clf = MiniClassifier(
        os.path.join(robotreviewer.DATA_ROOT, 'rct/rct_svm_weights.npz'))

    # one CNN is loaded per .h5 file found in the rct data directory
    cnn_weight_files = glob.glob(
        os.path.join(robotreviewer.DATA_ROOT, 'rct/*.h5'))
    self.cnn_clfs = [
        load_model(cnn_weight_file) for cnn_weight_file in cnn_weight_files
    ]

    self.svm_vectorizer = HashingVectorizer(binary=False,
                                            ngram_range=(1, 1),
                                            stop_words='english')
    self.cnn_vectorizer = KerasVectorizer(
        vocab_map_file=os.path.join(robotreviewer.DATA_ROOT,
                                    'rct/cnn_vocab_map.pck'),
        stop_words='english')

    # scaling constants and thresholds are read from JSON
    with open(
            os.path.join(robotreviewer.DATA_ROOT,
                         'rct/rct_model_calibration.json'), 'r') as f:
        self.constants = json.load(f)
def __init__(self, top_k=3):
    """
    Load the per-domain rationale CNN models and the joint linear models.

    `top_k` refers to 'top-k recall'.

    top-1 recall will return the single most relevant sentence in the
    document, and top-3 recall the 3 most relevant.

    The validation study assessed the accuracy of top-3 and top-1 and we
    suggest top-3 as default
    """
    self.top_k = top_k

    # short code -> display name for each bias domain
    self.bias_domains = {
        'RSG': 'Random sequence generation',
        'AC': 'Allocation concealment',
        'BPP': 'Blinding of participants and personnel',
        'BOA': 'Blinding of outcome assessment',
        'IOD': 'Incomplete outcome data',
        'SR': 'Selective reporting'
    }

    ###
    # Here we take a simple ensembling approach in which we combine the
    # predictions made by our rationaleCNN model and the JAMIA (linear)
    # multi task variant.
    ###
    self.all_domains = ['RSG', 'AC', 'BPP', 'BOA']

    # CNN domains
    vectorizer_str = 'robotreviewer/data/keras/vectorizers/{}.pickle'
    arch_str = 'robotreviewer/data/keras/models/{}.json'
    weight_str = 'robotreviewer/data/keras/models/{}.hdf5'

    self.CNN_models = OrderedDict()
    for bias_domain in self.all_domains:
        # Load vectorizer and keras model
        vectorizer_loc = vectorizer_str.format(bias_domain)
        arch_loc = arch_str.format(bias_domain)
        weight_loc = weight_str.format(bias_domain)

        # NOTE: unpickling executes arbitrary code; these files must only
        # ever come from the trusted model distribution.
        # The handle is closed deterministically instead of being leaked.
        with open(vectorizer_loc, 'rb') as vectorizer_file:
            preprocessor = pickle.load(vectorizer_file)

        self.CNN_models[bias_domain] = RationaleCNN(
            preprocessor,
            document_model_architecture_path=arch_loc,
            document_model_weights_path=weight_loc)

    # Linear domains (these are joint models!)
    self.linear_sent_clf = MiniClassifier(
        robotreviewer.get_data('bias/bias_sent_level.npz'))
    self.linear_doc_clf = MiniClassifier(
        robotreviewer.get_data('bias/bias_doc_level.npz'))
    self.linear_vec = ModularVectorizer(norm=None,
                                        non_negative=True,
                                        binary=True,
                                        ngram_range=(1, 2),
                                        n_features=2**26)
def __init__(self, top_k=2, min_k=1):
    """
    Load the Population / Intervention / Outcomes classifiers and their
    IDF weights.

    In most cases, a fixed number of sentences (top_k) will be returned
    for each document, *except* when the decision scores are below a
    threshold (i.e. the implication being that none of the sentences are
    relevant).

    top_k = the default number of sentences to retrieve per document
    min_k = ensure that at least min_k sentences are always returned
    """

    def load_idf(relative_path):
        # the file holds a pickled object whose dense form is flattened
        # and placed on the main diagonal of a sparse matrix
        with open(robotreviewer.get_data(relative_path), 'rb') as f:
            stored = np.load(f, allow_pickle=True, encoding='latin1')
            return diags(stored.item().todense().A1, 0)

    logging.debug("Loading PICO classifiers")
    self.P_clf, self.I_clf, self.O_clf = [
        MiniClassifier(robotreviewer.get_data(model_path))
        for model_path in ("pico/P_model.npz",
                           "pico/I_model.npz",
                           "pico/O_model.npz")
    ]
    logging.debug("PICO classifiers loaded")

    logging.debug("Loading IDF weights")
    self.P_idf = load_idf("pico/P_idf.npz")
    self.I_idf = load_idf("pico/I_idf.npz")
    self.O_idf = load_idf("pico/O_idf.npz")
    logging.debug("IDF weights loaded")

    self.vec = PICO_vectorizer()

    # the three lists below are index-aligned
    self.PICO_domains = ["Population", "Intervention", "Outcomes"]
    self.models = [self.P_clf, self.I_clf, self.O_clf]
    self.idfs = [self.P_idf, self.I_idf, self.O_idf]

    # MetaMap integration is currently disabled:
    # if config.USE_METAMAP:
    #     self.metamap = MetaMap.get_instance()

    self.min_k = min_k
    self.top_k = top_k
def __init__(self):
    """
    Load the SVM classifier, every saved CNN model in the `rct` data
    directory, both text vectorizers and the calibration constants.
    """

    def data_path(relative):
        # every artefact is addressed relative to DATA_ROOT
        return os.path.join(robotreviewer.DATA_ROOT, relative)

    self.svm_clf = MiniClassifier(data_path('rct/rct_svm_weights.npz'))

    # one CNN per .h5 file
    self.cnn_clfs = []
    for weight_path in glob.glob(data_path('rct/*.h5')):
        self.cnn_clfs.append(load_model(weight_path))

    self.svm_vectorizer = HashingVectorizer(
        binary=False, ngram_range=(1, 1), stop_words='english')

    vocab_path = data_path('rct/cnn_vocab_map.pck')
    self.cnn_vectorizer = KerasVectorizer(vocab_map_file=vocab_path,
                                          stop_words='english')

    # scaling constants and thresholds are stored as JSON
    calibration_path = data_path('rct/rct_model_calibration.json')
    with open(calibration_path, 'r') as f:
        self.constants = json.load(f)
def __init__(self, top_k=None):
    """
    Load the per-domain rationale CNN models and the joint linear models.

    `top_k` is stored on the instance unchanged (default None).
    """
    self.top_k = top_k

    # short code -> display name for each bias domain
    self.bias_domains = {
        'RSG': 'Random sequence generation',
        'AC': 'Allocation concealment',
        'BPP': 'Blinding of participants and personnel',
        'BOA': 'Blinding of outcome assessment'
    }

    ###
    # Here we take a simple ensembling approach in which we combine the
    # predictions made by our rationaleCNN model and the JAMIA (linear)
    # multi task variant.
    ###
    self.all_domains = ['RSG', 'AC', 'BPP', 'BOA']

    # CNN domains
    vectorizer_str = 'robotreviewer/data/keras/vectorizers/{}.pickle'
    arch_str = 'robotreviewer/data/keras/models/{}.json'
    weight_str = 'robotreviewer/data/keras/models/{}.hdf5'

    self.CNN_models = OrderedDict()
    for bias_domain in self.all_domains:
        # Load vectorizer and keras model
        vectorizer_loc = vectorizer_str.format(bias_domain)
        arch_loc = arch_str.format(bias_domain)
        weight_loc = weight_str.format(bias_domain)

        # NOTE: unpickling executes arbitrary code; these files must only
        # ever come from the trusted model distribution.
        # The handle is closed deterministically instead of being leaked.
        with open(vectorizer_loc, 'rb') as vectorizer_file:
            preprocessor = pickle.load(vectorizer_file)

        # clear the out-of-vocabulary token on the unpickled tokenizer
        preprocessor.tokenizer.oov_token = None

        self.CNN_models[bias_domain] = RationaleCNN(
            preprocessor,
            document_model_architecture_path=arch_loc,
            document_model_weights_path=weight_loc)

    # Linear domains (these are joint models!)
    self.linear_sent_clf = MiniClassifier(
        robotreviewer.get_data('bias/bias_sent_level.npz'))
    self.linear_doc_clf = MiniClassifier(
        robotreviewer.get_data('bias/bias_doc_level.npz'))
    self.linear_vec = ModularVectorizer(norm=None,
                                        non_negative=True,
                                        binary=True,
                                        ngram_range=(1, 2),
                                        n_features=2**26)
def __init__(self, top_k=3):
    """
    Set up the document-level abstract bias classifier.

    `top_k` is the 'top-k recall' setting: with top-1 the single most
    relevant sentence in the document is returned, with top-3 the three
    most relevant. The validation study assessed both top-1 and top-3,
    and top-3 is the suggested default.
    """
    weights_file = os.path.join('bias_ab', 'bias_ab.npz')
    self.doc_clf = MiniClassifier(robotreviewer.get_data(weights_file))

    # binary, un-normalised features over unigrams and bigrams
    self.vec = ModularVectorizer(
        norm=None,
        non_negative=True,
        binary=True,
        ngram_range=(1, 2),
    )

    self.top_k = top_k

    # one judgement is produced per domain listed here
    self.bias_domains = [
        'random_sequence_generation',
        'allocation_concealment',
        'blinding_participants_personnel',
    ]
class TestMiniClassifier(unittest.TestCase):
    """Tests for MiniClassifier, run against the document-level bias model."""

    # Both fixtures are created once, at class-definition time, and shared
    # by every test method.
    doc_clf = MiniClassifier(robotreviewer.get_data('bias/bias_doc_level.npz'))
    util = Utilities()

    def test_init(self):
        ''' test for MiniClassifier.__init__() '''
        self.assertIsInstance(self.doc_clf.coef, np.ndarray)
        self.assertIsInstance(self.doc_clf.intercept, float)

    def test_decision_function(self):
        ''' test for MiniClassifier.decision_function(X) '''
        X = self.util.load_sparse_csr("X_data.npz")
        dec = self.doc_clf.decision_function(X)  # [ 1.50563252]
        decTest = np.float64([1.50563252])
        # can't do:
        #   print(np.array_equal(dec, y))
        #   print(np.array_equiv(dec, y))
        # since as decimals these will not pass
        self.assertTrue(np.allclose(dec, decTest))

    def test_predict(self):
        ''' test for MiniClassifier.predict(X) '''
        X = self.util.load_sparse_csr("X_data.npz")
        pred = self.doc_clf.predict(X)  # [1]
        # `np.int` was only an alias of the builtin `int` and has been
        # removed from NumPy; compare against a plain int instead.
        self.assertEqual(pred, 1)

    def test_predict_proba(self):
        ''' tests for MiniClassifier.predict_proba(X) '''
        with open(ex_path + "rationale_robot_data.json", "r",
                  encoding="utf-8") as data_file:
            data = json.load(data_file)
        bpl = data["bias_prob_linear"]
        X = self.util.load_sparse_csr("X_data.npz")
        bpl_test = self.doc_clf.predict_proba(X)[0]
        # the stored probability only has to match to within 0.01
        self.assertTrue(abs(bpl - bpl_test) < 0.01)
class RCTRobot:
    """Classifies whether a document reports a randomized controlled trial
    using an SVM, an ensemble of CNNs and a publication-type feature."""

    def __init__(self):
        """Load the models, vectorizers and hard-coded calibration values."""
        self.svm_clf = MiniClassifier(
            os.path.join(robotreviewer.DATA_ROOT, 'rct/rct_svm_weights.npz'))

        # one CNN is built per .h5 weight file, all sharing one architecture
        cnn_weight_files = glob.glob(
            os.path.join(robotreviewer.DATA_ROOT, 'rct/*.h5'))
        json_filename = os.path.join(robotreviewer.DATA_ROOT,
                                     'rct/rct_cnn_structure.json')
        self.cnn_clfs = [
            get_model(json_filename, cnn_weight_file)
            for cnn_weight_file in cnn_weight_files
        ]

        self.svm_vectorizer = HashingVectorizer(binary=False,
                                                ngram_range=(1, 1),
                                                stop_words='english')
        self.cnn_vectorizer = KerasVectorizer(vocab_map_file=os.path.join(
            robotreviewer.DATA_ROOT, 'rct/rct_cnn_vocab_map.pck'))

        # weighted in mean since we use only 1 SVM model (since produces
        # near identical results to binning 10) and 6 CNN models (since
        # runs faster, and no further reduction in variance for further
        # models)
        self.scale_constants = {
            'cnn': {
                'mean': 0.15592811611054261,
                'std': 0.22405916984696986,
                'weight': 1.6666666666666667
            },
            'ptyp': {
                'mean': 0.055155532891381948,
                'std': 0.22828359573751594
            },
            'svm': {
                'mean': -0.75481403525485891,
                'std': 0.7812955939364481,
                'weight': 10.0
            }
        }

        # All precise models have been calibrated to 97.6% sensitivity
        # All sensitive models have been calibrated to 99.1% sensitivity
        self.thresholds = {
            'cnn': {
                'precise': 2.1340457758193034,
                'sensitive': -0.076709540491855063
            },
            'cnn_ptyp': {
                'precise': 3.529609848417909,
                'sensitive': 0.083502632442633312
            },
            'svm': {
                'precise': 1.9185522606237164,
                'sensitive': 0.093273630980694439
            },
            'svm_cnn': {
                'precise': 1.8749128673557529,
                'sensitive': 0.064481902000491614
            },
            'svm_cnn_ptyp': {
                'precise': 3.7674045603568755,
                'sensitive': 0.1952449060483534
            },
            'svm_ptyp': {
                'precise': 3.7358855328111837,
                'sensitive': 0.42992224964656178
            }
        }

    def annotate(self, data):
        """
        Score `data` and store the result under `data.ml["rct"]`.

        Uses the title and abstract when both are present, otherwise the
        start of `parsed_text`. When neither is available `data` is
        returned untouched.

        Returns `data`.
        """
        # use the best performing models from the validation paper (in draft...)
        filter_class = "svm_cnn_ptyp"
        threshold_class = "precise"

        if data.get("abstract") is not None and data.get("title") is not None:
            ti = data["title"]
            ab = data["abstract"]
        elif data.get("parsed_text") is not None:
            # then just use the start of the document
            TI_LEN = 30
            AB_LEN = 500
            # best guesses based on sample of RCT abstracts + aiming for 95% centile
            ti = data['parsed_text'][:TI_LEN].text
            ab = data['parsed_text'][:AB_LEN].text
        else:
            # else can't proceed
            return data

        # publication-type feature: 1.0 when the record has pubmed data
        if "pubmed" in data.data:
            ptyp = 1.0
        else:
            ptyp = 0.0

        X_ti_str = [ti]
        X_ab_str = ['{}\n\n{}'.format(ti, ab)]

        if "svm" in filter_class:
            X_ti = lil_matrix(self.svm_vectorizer.transform(X_ti_str))
            X_ab = lil_matrix(self.svm_vectorizer.transform(X_ab_str))
            svm_preds = self.svm_clf.decision_function(hstack([X_ti, X_ab]))
            svm_scale = (svm_preds - self.scale_constants['svm']['mean']
                         ) / self.scale_constants['svm']['std']

        if "ptyp" in filter_class:
            ptyp = np.array([ptyp])
            ptyp_scale = (ptyp - self.scale_constants['ptyp']['mean']
                          ) / self.scale_constants['ptyp']['std']

        if "cnn" in filter_class:
            X_cnn = self.cnn_vectorizer.transform(X_ab_str)
            # one row of predictions per CNN
            cnn_preds = [clf.predict(X_cnn).T[0] for clf in self.cnn_clfs]
            cnn_preds = np.vstack(cnn_preds)
            cnn_scale = (cnn_preds - self.scale_constants['cnn']['mean']
                         ) / self.scale_constants['cnn']['std']

        if filter_class == "svm":
            y_preds = svm_scale
        elif filter_class == "svm_ptyp":
            y_preds = svm_scale + ptyp_scale
        elif filter_class == "ptyp":
            y_preds = ptyp_scale
        elif filter_class == "svm_cnn_ptyp":
            weights = [self.scale_constants['svm']['weight']] + (
                [self.scale_constants['cnn']['weight']] * len(self.cnn_clfs))
            # The stacked rows must follow the order of `weights`
            # (SVM first, then one row per CNN); stacking the CNN rows first
            # would apply the SVM weight to a CNN and vice versa.
            y_preds = np.average(np.vstack([svm_scale, cnn_scale]),
                                 axis=0,
                                 weights=weights) + ptyp_scale

        structured_data = {
            "is_rct":
            bool(y_preds[0] > self.thresholds[filter_class][threshold_class]),
            "decision_score":
            y_preds[0],
            "model_class":
            filter_class
        }
        data.ml["rct"] = structured_data
        return data

    @staticmethod
    def get_marginalia(data):
        """
        Get marginalia formatted for Spa from structured data
        """
        # All three fields are numbered explicitly: str.format() refuses to
        # mix numbered fields ({0}, {1}) with an automatic one ({}).
        marginalia = [{
            "type":
            "Trial Design",
            "title":
            "Is an RCT?",
            "annotations": [],
            "description":
            "{0} (Decision score={1:0.2f} using {2} model)".format(
                data["rct"]["is_rct"], data["rct"]["decision_score"],
                data["rct"]["model_class"])
        }]
        return marginalia
class RCTRobot:
    """Classifies whether articles report a randomized controlled trial
    using an SVM, an ensemble of CNNs and optional publication-type data."""

    def __init__(self):
        """
        Lazily import keras, then load the classifiers, vectorizers,
        calibration constants and the pickled probability calibrators.
        """
        # `global` must come before the imports that bind these names;
        # declaring a name global after it has been assigned in the same
        # scope is a SyntaxError in Python 3.
        global sequence, load_model, Sequential, Dense, Dropout, Activation, Lambda, Input, merge, Flatten
        global Embedding, Convolution1D, MaxPooling1D, K, Model, l2

        from keras.preprocessing import sequence
        from keras.models import load_model
        from keras.models import Sequential
        from keras.layers import Dense, Dropout, Activation, Lambda, Input, merge, Flatten
        from keras.layers import Embedding
        from keras.layers import Convolution1D, MaxPooling1D
        from keras import backend as K
        from keras.models import Model
        from keras.regularizers import l2

        self.svm_clf = MiniClassifier(
            os.path.join(robotreviewer.DATA_ROOT, 'rct/rct_svm_weights.npz'))

        # one CNN is loaded per .h5 file found in the rct data directory
        cnn_weight_files = glob.glob(
            os.path.join(robotreviewer.DATA_ROOT, 'rct/*.h5'))
        self.cnn_clfs = [
            load_model(cnn_weight_file) for cnn_weight_file in cnn_weight_files
        ]

        self.svm_vectorizer = HashingVectorizer(binary=False,
                                                ngram_range=(1, 1),
                                                stop_words='english')
        self.cnn_vectorizer = KerasVectorizer(
            vocab_map_file=os.path.join(robotreviewer.DATA_ROOT,
                                        'rct/cnn_vocab_map.pck'),
            stop_words='english')

        with open(
                os.path.join(robotreviewer.DATA_ROOT,
                             'rct/rct_model_calibration.json'), 'r') as f:
            self.constants = json.load(f)

        # NOTE: unpickling executes arbitrary code; these files must only
        # ever come from the trusted model distribution.
        self.calibration_lr = {}
        with open(
                os.path.join(robotreviewer.DATA_ROOT,
                             'rct/svm_cnn_ptyp_calibration.pck'), 'rb') as f:
            self.calibration_lr['svm_cnn_ptyp'] = pickle.load(f)
        with open(
                os.path.join(robotreviewer.DATA_ROOT,
                             'rct/svm_cnn_calibration.pck'), 'rb') as f:
            self.calibration_lr['svm_cnn'] = pickle.load(f)

    def _process_ptyp(self, data_row, strict=True):
        """
        Takes in a data row which might include rct_ptyp or ptyp fields.
        `strict` is accepted but not currently used.

        Returns:
            1 = ptyp is RCT
            0 = ptyp is NOT RCT
            -1 = no ptyp information present

        Raises Exception when `use_ptyp` is neither True nor False.
        """
        if data_row['use_ptyp'] == False:
            return -1
        elif data_row['use_ptyp'] == True:
            return 1 if any(
                (tag in data_row['ptyp']
                 for tag in ["Randomized Controlled Trial", "D016449"])) else 0
        else:
            raise Exception("unexpected value for 'use_ptyp'")

    def api_annotate(self, articles):
        """
        Predict RCT status for a list of article dicts.

        Every article needs `ti` and `ab`; `ptyp` is optional and is used
        whenever it is present. Raises Exception if any article lacks a
        title or abstract. Returns the list produced by `predict`.
        """
        # use the best performing models from the validation paper (in draft...)
        ensemble_type = "svm_cnn"
        threshold_class = "balanced"
        auto_use_ptyp = True

        # require a title and abstract. ptyp optional
        if not (all(
            ("ti" in article and "ab" in article for article in articles))):
            raise Exception("RCT robot requires a full title and abstract")

        prepared_data = [{
            "title": r['ti'],
            "abstract": r['ab'],
            "ptyp": r.get('ptyp'),
            "use_ptyp": "ptyp" in r
        } for r in articles]

        preds = self.predict(prepared_data,
                             ensemble_type=ensemble_type,
                             threshold_type=threshold_class,
                             auto_use_ptyp=auto_use_ptyp)
        return preds

    def pdf_annotate(self, data):
        """
        Score a parsed PDF and store the result under `data.ml["rct"]`.

        Uses the title and abstract when both are present, otherwise the
        start of `parsed_text`. When neither is available `data` is
        returned untouched. Returns `data`.
        """
        # NOTE(review): the original set ensemble_type="svm_cnn",
        # threshold_class="balanced" and auto_use_ptyp=True here but never
        # passed them on, so predict() runs with its own defaults
        # (threshold_type="sensitive"). Behaviour is kept; confirm intent.
        if data.get("abstract") is not None and data.get("title") is not None:
            ti = data["title"]
            ab = data["abstract"]
        elif data.get("parsed_text") is not None:
            # then just use the start of the document
            TI_LEN = 30
            AB_LEN = 500
            # best guesses based on sample of RCT abstracts + aiming for 95% centile
            ti = data['parsed_text'][:TI_LEN].text
            ab = data['parsed_text'][:AB_LEN].text
        else:
            # else can't proceed
            return data

        preds = self.predict({
            "title": ti,
            "abstract": ab
        }, auto_use_ptyp=False)[0]

        # NOTE(review): "decision_score" carries the threshold value rather
        # than preds['score'] - kept as-is, confirm against consumers.
        structured_data = {
            "is_rct": preds['is_rct'],
            "decision_score": preds['threshold_value'],
            "model_class": preds['model'],
            "threshold_type": preds['threshold_type']
        }
        data.ml["rct"] = structured_data
        return data

    def predict(self,
                X,
                get_raw=False,
                ensemble_type="svm_cnn",
                threshold_type="sensitive",
                auto_use_ptyp=True):
        """
        Score one article dict or a list of them.

        Each article is read through `title` and `abstract`; with
        `auto_use_ptyp` it must also carry `use_ptyp` (and `ptyp`).

        Returns a list with one dict per article holding `model`, `score`,
        `threshold_type`, `threshold_value`, `is_rct`, `is_rct_precise`,
        `is_rct_balanced`, `is_rct_sensitive` and `ptyp_rct`. With
        `get_raw=True` the unscaled svm / cnn / ptyp arrays are returned
        in a dict instead.
        """
        if isinstance(X, dict):
            X = [X]

        if auto_use_ptyp:
            pt_mask = np.array([self._process_ptyp(r) for r in X])
        else:
            # don't add for any of them
            pt_mask = np.array([-1 for r in X])

        preds_l = {}

        # calculate ptyp for all
        ptyp_scale = (pt_mask - self.constants['scales']['ptyp']['mean']
                      ) / self.constants['scales']['ptyp']['std']
        # but set to 0 if not using
        ptyp_scale[pt_mask == -1] = 0
        preds_l['ptyp'] = ptyp_scale

        # thresholds vary per article
        ptyp_model = "{}_ptyp".format(ensemble_type)
        thresholds_all = {
            t: [
                self.constants['thresholds'][
                    ptyp_model if r != -1 else ensemble_type][t]
                for r in pt_mask
            ]
            for t in ['precise', 'balanced', 'sensitive']
        }

        X_ti_str = [article.get('title', '') for article in X]
        X_ab_str = [
            '{}\n\n{}'.format(article.get('title', ''),
                              article.get('abstract', '')) for article in X
        ]

        if "svm" in ensemble_type:
            X_ti = lil_matrix(self.svm_vectorizer.transform(X_ti_str))
            X_ab = lil_matrix(self.svm_vectorizer.transform(X_ab_str))
            svm_preds = self.svm_clf.decision_function(hstack([X_ab, X_ti]))
            svm_scale = (svm_preds - self.constants['scales']['svm']['mean']
                         ) / self.constants['scales']['svm']['std']
            preds_l['svm'] = svm_scale
            preds_l['svm_ptyp'] = preds_l['svm'] + preds_l['ptyp']
            preds_l['svm_raw'] = svm_preds.tolist()

        if "cnn" in ensemble_type:
            X_cnn = self.cnn_vectorizer.transform(X_ab_str)
            # one row of predictions per CNN
            cnn_preds = np.vstack(
                [clf.predict(X_cnn).T[0] for clf in self.cnn_clfs])
            cnn_scale = (cnn_preds - self.constants['scales']['cnn']['mean']
                         ) / self.constants['scales']['cnn']['std']
            preds_l['cnn'] = np.mean(cnn_scale, axis=0)
            preds_l['cnn_raw'] = cnn_preds.T.tolist()
            preds_l['cnn_ptyp'] = preds_l['cnn'] + preds_l['ptyp']

        if ensemble_type == "svm_cnn":
            weights = [self.constants['scales']['svm']['weight']
                       ] + ([self.constants['scales']['cnn']['weight']] *
                            len(self.cnn_clfs))
            preds_l['svm_cnn'] = np.average(np.vstack([svm_scale, cnn_scale]),
                                            axis=0,
                                            weights=weights)
            preds_l['svm_cnn_ptyp'] = preds_l['svm_cnn'] + preds_l['ptyp']

            # if svm_cnn then we can have probabilities
            # columns: svm score, one per CNN, then the ptyp flag (last)
            X_calib = np.hstack([
                svm_preds.reshape(-1, 1), cnn_preds.T,
                pt_mask.reshape(-1, 1)
            ])

            probs = []
            for r in X_calib:
                # Rows that carry ptyp information go to the calibrator that
                # takes the ptyp column; rows without it (-1) go to the plain
                # svm_cnn calibrator with that column dropped. The ptyp flag
                # is addressed as the last column so this does not depend on
                # how many CNN models were loaded.
                if r[-1] != -1:
                    probs.append(
                        self.calibration_lr['svm_cnn_ptyp'].predict_proba(
                            [r])[0][1])
                else:
                    probs.append(self.calibration_lr['svm_cnn'].predict_proba(
                        [r[:-1]])[0][1])
            preds_l['probability'] = probs

        if get_raw:
            return {"svm": svm_preds, "cnn": cnn_preds, "ptyp": pt_mask}

        # dict of lists -> list of dicts, i.e.
        # https://stackoverflow.com/questions/5558418/list-of-dicts-to-from-dict-of-lists
        preds_d = [dict(zip(preds_l, i)) for i in zip(*preds_l.values())]
        thresholds_T = [
            dict(zip(thresholds_all, t))
            for t in zip(*thresholds_all.values())
        ]

        out = []
        for pred, threshold, used_ptyp in zip(preds_d, thresholds_T, pt_mask):
            row = {}
            if used_ptyp != -1:
                row['model'] = "{}_ptyp".format(ensemble_type)
            else:
                row['model'] = ensemble_type
            row['score'] = float(pred[row['model']])
            row['threshold_type'] = threshold_type
            row['threshold_value'] = float(threshold[threshold_type])
            row['is_rct'] = bool(row['score'] >= threshold[threshold_type])
            row['is_rct_precise'] = bool(row['score'] >= threshold['precise'])
            row['is_rct_balanced'] = bool(
                row['score'] >= threshold['balanced'])
            row['is_rct_sensitive'] = bool(
                row['score'] >= threshold['sensitive'])
            row['ptyp_rct'] = int(used_ptyp)
            out.append(row)
        return out

    @staticmethod
    def get_marginalia(data):
        """
        Get marginalia formatted for Spa from structured data
        """
        # All three fields are numbered explicitly: str.format() refuses to
        # mix numbered fields ({0}, {1}) with an automatic one ({}).
        marginalia = [{
            "type":
            "Trial Design",
            "title":
            "Is an RCT?",
            "annotations": [],
            "description":
            "{0} (Decision score={1:0.2f} using {2} model)".format(
                data["rct"]["is_rct"], data["rct"]["decision_score"],
                data["rct"]["model_class"])
        }]
        return marginalia

    def predict_ris(self,
                    ris_data,
                    ensemble_type="svm_cnn",
                    threshold_type='sensitive',
                    auto_use_ptyp=False):
        """Simplify parsed RIS records and return `predict`'s output."""
        simplified = [ris.simplify(article) for article in ris_data]
        preds = self.predict(simplified,
                             ensemble_type=ensemble_type,
                             threshold_type=threshold_type,
                             auto_use_ptyp=auto_use_ptyp)
        return preds

    def filter_articles(self,
                        ris_string,
                        ensemble_type="svm_cnn",
                        threshold_type='sensitive',
                        auto_use_ptyp=True,
                        remove_non_rcts=True):
        """
        Parse a RIS string, predict each record and return a RIS string.

        Kept records gain the tags in `pred_key_map` below. With
        `remove_non_rcts` (default) records predicted not to be RCTs are
        dropped; otherwise every record is kept and tagged.
        """
        print('Parsing RIS data')
        ris_data = ris.loads(ris_string)
        # (a leftover debugging step that opened "debug.json" for writing but
        # never wrote to it was removed)

        preds = self.predict_ris(ris_data,
                                 ensemble_type=ensemble_type,
                                 threshold_type=threshold_type,
                                 auto_use_ptyp=auto_use_ptyp)
        out = []

        # prediction field -> RIS tag
        pred_key_map = {
            "score": "ZS",
            "model": "ZM",
            "threshold_type": "ZT",
            "threshold_value": "ZC",
            "is_rct": "ZR",
            "ptyp_rct": "ZP"
        }

        for ris_row, pred_row in zip(ris_data, preds):
            if pred_row['is_rct'] or not remove_non_rcts:
                # predict() also emits is_rct_precise / is_rct_balanced /
                # is_rct_sensitive, which have no RIS tag; skip unmapped
                # keys instead of failing with KeyError.
                ris_row.update({
                    pred_key_map[k]: v
                    for k, v in pred_row.items() if k in pred_key_map
                })
                out.append(ris_row)
        return ris.dumps(out)
class BiasRobot:
    """Predicts a risk-of-bias judgement per domain for a full-text trial
    report and picks the sentences that best support each judgement."""

    def __init__(self, top_k=3):
        """
        Set up the sentence-level and document-level bias classifiers.

        `top_k` is the 'top-k recall' setting: with top-1 the single most
        relevant sentence in the document is returned, with top-3 the
        three most relevant. The validation study assessed both top-1 and
        top-3, and top-3 is the suggested default.
        """
        sent_weights = robotreviewer.get_data('bias/bias_sent_level.npz')
        self.sent_clf = MiniClassifier(sent_weights)
        doc_weights = robotreviewer.get_data('bias/bias_doc_level.npz')
        self.doc_clf = MiniClassifier(doc_weights)

        self.vec = ModularVectorizer(
            norm=None,
            non_negative=True,
            binary=True,
            ngram_range=(1, 2),
            n_features=2**26,
        )

        self.top_k = top_k
        self.bias_domains = [
            'Random sequence generation',
            'Allocation concealment',
            'Blinding of participants and personnel',
            'Blinding of outcome assessment',
            'Incomplete outcome data',
            'Selective reporting',
        ]

    def pdf_annotate(self, data, top_k=None):
        """
        Annotate full text of clinical trial report.

        `top_k` overrides, for this call only, the default chosen in
        __init__. The per-domain results are stored under
        `data.ml["bias"]`; `data` is returned, untouched when it has no
        parsed text.
        """
        k = self.top_k if top_k is None else top_k

        parsed = data.get('parsed_text')
        if not parsed:
            # we've got to know the text at least..
            return data

        text_length = len(data['text'])

        sentences = list(parsed.sents)
        sent_texts = [s.text for s in sentences]
        sent_starts = [s.start_char for s in sentences]
        sent_ends = [s.end_char for s in sentences]

        results = []
        for domain in self.bias_domains:
            #
            # sentence feature set
            #
            self.vec.builder_clear()
            # uni-bigrams
            self.vec.builder_add_docs(sent_texts)
            # uni-bigrams/domain interactions
            self.vec.builder_add_docs(
                zip(sent_texts, [domain] * len(sent_texts)))
            sent_features = self.vec.builder_transform()

            sent_scores = self.sent_clf.decision_function(sent_features)

            # indices of the top k sentences, best first
            best = np.argsort(sent_scores)[::-1][:k]

            best_texts = [sent_texts[i] for i in best]
            best_starts = [sent_starts[i] for i in best]
            best_ends = [sent_ends[i] for i in best]

            # up to 20 characters of context before / after each sentence
            prefixes = [
                parsed.text[max(0, start - 20):start] for start in best_starts
            ]
            suffixes = [
                parsed.text[end:min(text_length, end + 20)]
                for end in best_ends
            ]

            #
            # document feature set
            #
            self.vec.builder_clear()
            # uni-bigrams
            self.vec.builder_add_docs([parsed.text])
            # uni-bigrams/domain interaction
            self.vec.builder_add_docs([(parsed.text, domain)])
            # uni-bigrams/relevance interaction
            self.vec.builder_add_docs([(" ".join(best_texts),
                                        "-s-" + domain)])
            doc_features = self.vec.builder_transform()

            prediction = self.doc_clf.predict(doc_features)
            judgement = ("high/unclear", "low")[prediction[0]]

            annotations = []
            for content, position, prefix, suffix in zip(
                    best_texts, best_starts, prefixes, suffixes):
                annotations.append({
                    "content": content,
                    "position": position,
                    "uuid": str(uuid.uuid1()),
                    "prefix": prefix,
                    "suffix": suffix,
                })

            results.append({
                "domain": domain,
                "judgement": judgement,
                "annotations": annotations,
            })

        data.ml["bias"] = results
        return data

    @staticmethod
    def get_marginalia(data):
        """
        Get marginalia formatted for Spa from structured data
        """
        return [{
            "type": "Risk of Bias",
            "title": entry['domain'],
            "annotations": entry['annotations'],
            "description": "**Overall risk of bias prediction**: {}".format(
                entry['judgement']),
        } for entry in data['bias']]

    @staticmethod
    def get_domains():
        """Return the display names of the six bias domains."""
        domains = [
            u'Random sequence generation',
            u'Allocation concealment',
            u'Blinding of participants and personnel',
            u'Blinding of outcome assessment',
            u'Incomplete outcome data',
            u'Selective reporting',
        ]
        return domains
class RCTRobot:
    """Classifies whether articles report a randomized controlled trial
    using an SVM, an ensemble of CNNs and optional publication-type data."""

    def __init__(self):
        """Load the classifiers, vectorizers and calibration constants."""
        self.svm_clf = MiniClassifier(
            os.path.join(robotreviewer.DATA_ROOT, 'rct/rct_svm_weights.npz'))

        # one CNN is loaded per .h5 file found in the rct data directory
        cnn_weight_files = glob.glob(
            os.path.join(robotreviewer.DATA_ROOT, 'rct/*.h5'))
        self.cnn_clfs = [
            load_model(cnn_weight_file) for cnn_weight_file in cnn_weight_files
        ]

        self.svm_vectorizer = HashingVectorizer(binary=False,
                                                ngram_range=(1, 1),
                                                stop_words='english')
        self.cnn_vectorizer = KerasVectorizer(
            vocab_map_file=os.path.join(robotreviewer.DATA_ROOT,
                                        'rct/cnn_vocab_map.pck'),
            stop_words='english')

        with open(
                os.path.join(robotreviewer.DATA_ROOT,
                             'rct/rct_model_calibration.json'), 'r') as f:
            self.constants = json.load(f)

    def _process_ptyp(self, data_row, strict=True):
        """
        Takes in a data row which might include rct_ptyp or ptyp fields.
        `strict` is accepted but not currently used.

        Returns:
            1 = ptyp is RCT
            0 = ptyp is NOT RCT
            -1 = no ptyp information present

        Raises Exception when `use_ptyp` is neither True nor False.
        """
        if data_row['use_ptyp'] == False:
            return -1
        elif data_row['use_ptyp'] == True:
            return 1 if any(
                (tag in data_row['ptyp']
                 for tag in ["Randomized Controlled Trial", "D016449"])) else 0
        else:
            raise Exception("unexpcted value for 'use_ptyp'")

    ###
    #   Annotate function
    ##
    def annotate(self, data, filename):
        """
        Score one document and return a small result dict.

        Uses the title and abstract when both are present, otherwise the
        start of `parsed_text`. Returns a dict with `filename`, `is_rct`
        and `decision_score`; the last two stay -1 when no text is
        available.
        """
        # initialize empty output structure
        structured_data = {
            "filename": filename,
            "is_rct": -1,
            "decision_score": -1
        }

        # NOTE(review): the original set filter_class="svm_cnn",
        # threshold_class="balanced" and auto_use_ptyp=True here but never
        # passed them on, so predict() runs with its own defaults. Behaviour
        # is kept; confirm intent.
        if data.get("abstract") is not None and data.get("title") is not None:
            ti = data["title"]
            ab = data["abstract"]
        elif data.get("parsed_text") is not None:
            # then just use the start of the document
            TI_LEN = 30
            AB_LEN = 500
            # best guesses based on sample of RCT abstracts + aiming for 95% centile
            ti = data['parsed_text'][:TI_LEN].text
            ab = data['parsed_text'][:AB_LEN].text
        else:
            # else can't proceed
            return structured_data

        preds = self.predict({
            "title": ti,
            "abstract": ab
        }, auto_use_ptyp=False)[0]

        # NOTE(review): "decision_score" carries the threshold value rather
        # than preds['score'] - kept as-is, confirm against consumers.
        structured_data.update({
            "is_rct": preds['is_rct'],
            "decision_score": preds['threshold_value']
        })
        print(structured_data)
        return structured_data

    ###
    #   Predict function
    ##
    def predict(self,
                X,
                filter_class="svm",
                filter_type="sensitive",
                auto_use_ptyp=True):
        """
        Score one article dict or a list of them.

        `filter_class` is one of "svm", "cnn" or "svm_cnn"; anything else
        raises ValueError. With `auto_use_ptyp` each article must carry
        `use_ptyp` (and `ptyp`).

        Returns a list with one dict per article holding `score`, `model`,
        `threshold_type`, `threshold_value`, `is_rct` and `ptyp_rct`.
        """
        if isinstance(X, dict):
            X = [X]

        if auto_use_ptyp:
            pt_mask = np.array([self._process_ptyp(r) for r in X])
        else:
            # don't add for any of them
            pt_mask = np.array([-1 for r in X])

        # calculate ptyp for all
        ptyp_scale = (pt_mask - self.constants['scales']['ptyp']['mean']
                      ) / self.constants['scales']['ptyp']['std']
        # but set to 0 if not using
        ptyp_scale[pt_mask == -1] = 0

        # thresholds vary per article
        ptyp_class = "{}_ptyp".format(filter_class)
        thresholds = [
            self.constants['thresholds'][
                ptyp_class if r != -1 else filter_class][filter_type]
            for r in pt_mask
        ]

        X_ti_str = [article.get('title', '') for article in X]
        X_ab_str = [
            '{}\n\n{}'.format(article.get('title', ''),
                              article.get('abstract', '')) for article in X
        ]

        if "svm" in filter_class:
            X_ti = lil_matrix(self.svm_vectorizer.transform(X_ti_str))
            X_ab = lil_matrix(self.svm_vectorizer.transform(X_ab_str))
            svm_preds = self.svm_clf.decision_function(hstack([X_ab, X_ti]))
            svm_scale = (svm_preds - self.constants['scales']['svm']['mean']
                         ) / self.constants['scales']['svm']['std']

        if "cnn" in filter_class:
            X_cnn = self.cnn_vectorizer.transform(X_ab_str)
            cnn_preds = []
            for i, clf in enumerate(self.cnn_clfs):
                print('\t{} of {}'.format(i + 1, len(self.cnn_clfs)))
                cnn_preds.append(clf.predict(X_cnn).T[0])
            # one row of predictions per CNN
            cnn_preds = np.vstack(cnn_preds)
            cnn_scale = (cnn_preds - self.constants['scales']['cnn']['mean']
                         ) / self.constants['scales']['cnn']['std']

        if filter_class == "svm":
            y_preds = svm_scale
        elif filter_class == "cnn":
            y_preds = np.mean(cnn_scale, axis=0)
        elif filter_class == "svm_cnn":
            weights = [self.constants['scales']['svm']['weight']
                       ] + ([self.constants['scales']['cnn']['weight']] *
                            len(self.cnn_clfs))
            y_preds = np.average(np.vstack([svm_scale, cnn_scale]),
                                 axis=0,
                                 weights=weights)
        else:
            # previously this fell through to an unbound `y_preds`
            raise ValueError(
                "unsupported filter_class: {!r}".format(filter_class))

        y_preds += ptyp_scale

        out = []
        for pred, threshold, used_ptyp in zip(y_preds, thresholds, pt_mask):
            row = {}
            row['score'] = float(pred)
            if used_ptyp != -1:
                row['model'] = "{}_ptyp".format(filter_class)
            else:
                row['model'] = filter_class
            row['threshold_type'] = filter_type
            row['threshold_value'] = float(threshold)
            row['is_rct'] = bool(pred >= threshold)
            row['ptyp_rct'] = int(used_ptyp)
            out.append(row)
        return out

    @staticmethod
    def get_marginalia(data):
        """
        Get marginalia formatted for Spa from structured data
        """
        # All three fields are numbered explicitly: str.format() refuses to
        # mix numbered fields ({0}, {1}) with an automatic one ({}).
        marginalia = [{
            "type":
            "Trial Design",
            "title":
            "Is an RCT?",
            "annotations": [],
            "description":
            "{0} (Decision score={1:0.2f} using {2} model)".format(
                data["rct"]["is_rct"], data["rct"]["decision_score"],
                data["rct"]["model_class"])
        }]
        return marginalia
class BiasAbRobot:
    """Predicts a risk-of-bias judgement per domain from an article's
    title and abstract."""

    def __init__(self, top_k=3):
        """
        Set up the document-level abstract bias classifier.

        `top_k` is the 'top-k recall' setting: with top-1 the single most
        relevant sentence in the document is returned, with top-3 the
        three most relevant. The validation study assessed both top-1 and
        top-3, and top-3 is the suggested default.
        """
        weights_file = os.path.join('bias_ab', 'bias_ab.npz')
        self.doc_clf = MiniClassifier(robotreviewer.get_data(weights_file))

        self.vec = ModularVectorizer(
            norm=None,
            non_negative=True,
            binary=True,
            ngram_range=(1, 2),
        )

        self.top_k = top_k
        self.bias_domains = [
            'random_sequence_generation',
            'allocation_concealment',
            'blinding_participants_personnel',
        ]

    def api_annotate(self, articles, top_k=None):
        """
        Annotate a list of article dicts from their titles and abstracts.

        Every article needs `ti` and `ab`, otherwise an Exception is
        raised. `top_k` can be overridden here, else defaults to the
        class default set in __init__.

        Returns one dict per article mapping each bias domain to
        {"judgement": "high/unclear" | "low"}.
        """
        for article in articles:
            if not (('ab' in article) and ('ti' in article)):
                raise Exception(
                    'Abstract bias model requires titles and abstracts to be able to complete annotation'
                )

        if top_k is None:
            top_k = self.top_k

        labels = ["high/unclear", "low"]
        results = []
        for article in articles:
            text = article['ti'] + "  " + article['ab']
            judgements = {}
            for domain in self.bias_domains:
                #
                # document feature set
                #
                self.vec.builder_clear()
                # uni-bigrams
                self.vec.builder_add_docs([text])
                # uni-bigrams/domain interaction
                self.vec.builder_add_docs([(text, domain)])
                features = self.vec.builder_transform()

                prediction = self.doc_clf.predict(features)
                judgements[domain] = {"judgement": labels[prediction[0]]}
            results.append(judgements)
        return results