def filter_articles(self, ris_string, ensemble_type="svm_cnn", threshold_type='sensitive', auto_use_ptyp=True, remove_non_rcts=True):
    """Annotate (and optionally filter) an RIS bibliography by RCT prediction.

    Parses ``ris_string``, scores each record with ``self.predict_ris``,
    copies the prediction fields into custom RIS tags, and returns the
    records re-serialized as an RIS string.

    Args:
        ris_string (str): Bibliography in RIS format.
        ensemble_type (str): Model ensemble to use (e.g. "svm", "cnn",
            "svm_cnn").
        threshold_type (str): Decision-threshold mode ('sensitive',
            'balanced', or 'precise').
        auto_use_ptyp (bool): Whether to let the model use PubMed
            publication-type metadata when available.
        remove_non_rcts (bool): If True, drop records not predicted to be
            RCTs; if False, keep every record (still annotated).

    Returns:
        str: RIS-serialized records annotated with the prediction tags
        below (ZS/ZM/ZT/ZC/ZR/ZP).
    """
    print('Parsing RIS data')
    ris_data = ris.loads(ris_string)
    # NOTE: removed leftover debug code that opened "debug.json" and called
    # json.dumps() without writing the result (the file was always empty).
    preds = self.predict_ris(ris_data,
                             ensemble_type=ensemble_type,
                             threshold_type=threshold_type,
                             auto_use_ptyp=auto_use_ptyp)
    # Custom RIS tags used to carry the prediction fields.
    pred_key_map = {
        "score": "ZS",
        "model": "ZM",
        "threshold_type": "ZT",
        "threshold_value": "ZC",
        "is_rct": "ZR",
        "ptyp_rct": "ZP",
    }
    out = []
    for ris_row, pred_row in zip(ris_data, preds):
        # Keep everything when filtering is off; otherwise only predicted RCTs.
        if not remove_non_rcts or pred_row['is_rct']:
            ris_row.update({pred_key_map[k]: v for k, v in pred_row.items()})
            out.append(ris_row)
    return ris.dumps(out)
def test_calibration():
    """Smoke-test RCTRobot predictions against published validation results.

    For every combination of ensemble class, threshold mode, and ptyp usage,
    run the model on the bundled PubMed test set and compare scores,
    classifications, and sensitivity/specificity against the expected
    results shipped with the package (from the validation paper). Results
    are printed for manual inspection rather than asserted.
    """
    print("Testing RobotSearch...")
    target_classes = ["svm", "cnn", "svm_cnn"]
    target_modes = ["balanced", "precise", "sensitive"]
    rct_bot = RCTRobot()

    print("Loading test PubMed file")
    with open(os.path.join(robotreviewer.DATA_ROOT, 'rct/pubmed_test.txt'), 'r') as f:
        ris_string = f.read()
    print('Parsing RIS data')
    ris_data = ris.loads(ris_string)

    print("Loading expected results (from validation paper)")
    with open(os.path.join(robotreviewer.DATA_ROOT, 'rct/pubmed_expected.json'), 'r') as f:
        expected_results = json.load(f)

    for target_class in target_classes:
        for target_mode in target_modes:
            for use_ptyp in [True, False]:
                expected_model_class = ("{}_ptyp".format(target_class)
                                        if use_ptyp else target_class)
                print("Testing {} model; use_ptyp={}; mode={}".format(
                    target_class, use_ptyp, target_mode))
                data = rct_bot.predict_ris(ris_data,
                                           ensemble_type=target_class,
                                           threshold_type=target_mode,
                                           auto_use_ptyp=use_ptyp)
                expected = expected_results[expected_model_class][target_mode]

                # Sanity check: input and expected rows must line up by PMID.
                # (Names were previously swapped: the list built from
                # ris_data is the input, the other comes from the
                # expected-results file.)
                input_pmids = [str(r['PMID'][0]) for r in ris_data]
                expected_pmids = [str(r['pmid']) for r in expected]
                print("Number matching PMIDS: {}".format(
                    sum(i == j for i, j in zip(input_pmids, expected_pmids))))

                obs_score = np.array([r['score'] for r in data])
                obs_clf = np.array([r['is_rct'] for r in data])
                exp_score = np.array([float(r['score']) for r in expected])
                exp_clf = np.array([r['is_rct'] for r in expected])
                print("Totals assessed: {} obs, {} exp".format(
                    len(obs_score), len(exp_score)))

                # Agreement between observed and expected classifications
                # (previously computed but never reported).
                match_clf = np.sum(np.equal(obs_clf, exp_clf))
                print("Matching classifications: {}/{}".format(
                    match_clf, len(exp_clf)))

                # Reference RCT labels from the 'hedges_is_rct' field —
                # presumably the manually-validated gold standard; confirm
                # against the validation paper.
                hedges_y = np.array(
                    [r['hedges_is_rct'] == '1' for r in expected])
                exp_sens = np.sum(exp_clf[hedges_y]) / np.sum(hedges_y)
                exp_spec = (np.sum(np.invert(exp_clf)[np.invert(hedges_y)])
                            / np.sum(np.invert(hedges_y)))
                obs_sens = np.sum(obs_clf[hedges_y]) / np.sum(hedges_y)
                obs_spec = (np.sum(np.invert(obs_clf)[np.invert(hedges_y)])
                            / np.sum(np.invert(hedges_y)))
                print("Expected: sens {} spec {}".format(exp_sens, exp_spec))
                print("Observed: sens {} spec {}".format(obs_sens, obs_spec))