def __init__(self, top_k=3):
        """
        `top_k` refers to 'top-k recall'.

        top-1 recall will return the single most relevant sentence
        in the document, and top-3 recall the 3 most relevant.

        The validation study assessed the accuracy of top-3 and top-1,
        and we suggest top-3 as the default.
        """

        self.sent_clf = MiniClassifier(
            robotreviewer.get_data('bias/bias_sent_level.npz'))
        self.doc_clf = MiniClassifier(
            robotreviewer.get_data('bias/bias_doc_level.npz'))

        self.vec = ModularVectorizer(norm=None,
                                     non_negative=True,
                                     binary=True,
                                     ngram_range=(1, 2),
                                     n_features=2**26)

        self.bias_domains = [
            'Random sequence generation', 'Allocation concealment',
            'Blinding of participants and personnel',
            'Blinding of outcome assessment', 'Incomplete outcome data',
            'Selective reporting'
        ]

        self.top_k = top_k
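# Standalone sketch (not part of the snippet above) of what top-k recall means in
# practice: keep the k sentences with the highest decision scores, mirroring the
# argsort slice used in BiasRobot.pdf_annotate later in these examples.
import numpy as np

def top_k_sentences(sentences, decision_scores, top_k=3):
    top_idx = np.argsort(decision_scores)[:-top_k - 1:-1]  # indices of the k best, best first
    return [sentences[i] for i in top_idx]

# e.g. top_k_sentences(["a", "b", "c", "d"], [0.1, 0.9, 0.4, 0.7], top_k=2) -> ["b", "d"]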
Example #2
    def __init__(self):
        self.svm_clf = MiniClassifier(os.path.join(robotreviewer.DATA_ROOT, 'rct/rct_svm_weights.npz'))

        cnn_weight_files = glob.glob(os.path.join(robotreviewer.DATA_ROOT, 'rct/*.h5'))
        json_filename = os.path.join(robotreviewer.DATA_ROOT, 'rct/rct_cnn_structure.json')
        self.cnn_clfs = [get_model(json_filename, cnn_weight_file) for cnn_weight_file in cnn_weight_files]
        self.svm_vectorizer = HashingVectorizer(binary=False, ngram_range=(1, 1), stop_words='english')
        self.cnn_vectorizer = KerasVectorizer(vocab_map_file=os.path.join(robotreviewer.DATA_ROOT, 'rct/rct_cnn_vocab_map.pck'))

        self.scale_constants = {
            'cnn': {
                'mean': 0.15592811611054261,
                'std': 0.22405916984696986,
                'weight': 1.6666666666666667
            },
            'ptyp': {
                'mean': 0.055155532891381948,
                'std': 0.22828359573751594
            },
            'svm': {
                'mean': -0.75481403525485891,
                'std': 0.7812955939364481,
                'weight': 10.0
            }
        }
        # Weighted in the mean since we use only 1 SVM model (produces near-identical
        # results to binning 10) and 6 CNN models (runs faster, with no further
        # reduction in variance from additional models).

        self.thresholds = {
            'cnn': {
                'precise': 2.1340457758193034,
                'sensitive': -0.076709540491855063
            },
            'cnn_ptyp': {
                'precise': 3.529609848417909,
                'sensitive': 0.083502632442633312
            },
            'svm': {
                'precise': 1.9185522606237164,
                'sensitive': 0.093273630980694439
            },
            'svm_cnn': {
                'precise': 1.8749128673557529,
                'sensitive': 0.064481902000491614
            },
            'svm_cnn_ptyp': {
                'precise': 3.7674045603568755,
                'sensitive': 0.1952449060483534
            },
            'svm_ptyp': {
                'precise': 3.7358855328111837,
                'sensitive': 0.42992224964656178
            }
        }
        # All precise models have been calibrated to 97.6% sensitivity
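# Worked standalone sketch of how scale_constants is applied (mirrors the ensembling
# logic in the annotate/predict methods later in these examples): z-scale each model's
# raw score, then take a weighted mean (SVM weight 10.0, each of the 6 CNNs ~1.67).
import numpy as np

def ensemble_score_sketch(svm_raw, cnn_raws, scale_constants):
    svm_scale = (svm_raw - scale_constants['svm']['mean']) / scale_constants['svm']['std']
    cnn_scale = [(c - scale_constants['cnn']['mean']) / scale_constants['cnn']['std']
                 for c in cnn_raws]
    weights = ([scale_constants['svm']['weight']] +
               [scale_constants['cnn']['weight']] * len(cnn_raws))
    return np.average([svm_scale] + cnn_scale, weights=weights)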
Example #3
 def __init__(self):
     from keras.preprocessing import sequence
     from keras.models import load_model
     from keras.models import Sequential
     from keras.layers import Dense, Dropout, Activation, Lambda, Input, merge, Flatten
     from keras.layers import Embedding
     from keras.layers import Convolution1D, MaxPooling1D
     from keras import backend as K
     from keras.models import Model
     from keras.regularizers import l2
     global sequence, load_model, Sequential, Dense, Dropout, Activation, Lambda, Input, merge, Flatten
     global Embedding, Convolution1D, MaxPooling1D, K, Model, l2
     self.svm_clf = MiniClassifier(
         os.path.join(robotreviewer.DATA_ROOT, 'rct/rct_svm_weights.npz'))
     cnn_weight_files = glob.glob(
         os.path.join(robotreviewer.DATA_ROOT, 'rct/*.h5'))
     self.cnn_clfs = [
         load_model(cnn_weight_file) for cnn_weight_file in cnn_weight_files
     ]
     self.svm_vectorizer = HashingVectorizer(binary=False,
                                             ngram_range=(1, 1),
                                             stop_words='english')
     self.cnn_vectorizer = KerasVectorizer(vocab_map_file=os.path.join(
         robotreviewer.DATA_ROOT, 'rct/cnn_vocab_map.pck'),
                                           stop_words='english')
     with open(
             os.path.join(robotreviewer.DATA_ROOT,
                          'rct/rct_model_calibration.json'), 'r') as f:
         self.constants = json.load(f)
Example #4
    def __init__(self, top_k=3):
        """
        `top_k` refers to 'top-k recall'.

        top-1 recall will return the single most relevant sentence
        in the document, and top-3 recall the 3 most relevant.

        The validation study assessed the accuracy of top-3 and top-1,
        and we suggest top-3 as the default.

        """
        self.bias_domains = ['Random sequence generation']
        self.top_k = top_k

        self.bias_domains = {
            'RSG': 'Random sequence generation',
            'AC': 'Allocation concealment',
            'BPP': 'Blinding of participants and personnel',
            'BOA': 'Blinding of outcome assessment',
            'IOD': 'Incomplete outcome data',
            'SR': 'Selective reporting'
        }

        ###
        # Here we take a simple ensembling approach in which we combine the
        # predictions made by our rationaleCNN model and the JAMIA (linear)
        # multi task variant.
        ###

        self.all_domains = ['RSG', 'AC', 'BPP', 'BOA']

        # CNN domains
        vectorizer_str = 'robotreviewer/data/keras/vectorizers/{}.pickle'
        arch_str = 'robotreviewer/data/keras/models/{}.json'
        weight_str = 'robotreviewer/data/keras/models/{}.hdf5'
        self.CNN_models = OrderedDict()
        for bias_domain in ['RSG', 'AC', 'BPP', 'BOA']:
            # Load vectorizer and keras model
            vectorizer_loc = vectorizer_str.format(bias_domain)
            arch_loc = arch_str.format(bias_domain)
            weight_loc = weight_str.format(bias_domain)
            preprocessor = pickle.load(open(vectorizer_loc, 'rb'))
            self.CNN_models[bias_domain] = RationaleCNN(
                preprocessor,
                document_model_architecture_path=arch_loc,
                document_model_weights_path=weight_loc)

        # Linear domains (these are joint models!)
        self.linear_sent_clf = MiniClassifier(
            robotreviewer.get_data('bias/bias_sent_level.npz'))
        self.linear_doc_clf = MiniClassifier(
            robotreviewer.get_data('bias/bias_doc_level.npz'))
        self.linear_vec = ModularVectorizer(norm=None,
                                            non_negative=True,
                                            binary=True,
                                            ngram_range=(1, 2),
                                            n_features=2**26)
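# The "simple ensembling approach" comment above refers to combining, per domain, the
# rationaleCNN prediction with the linear (JAMIA) model's prediction. A toy standalone
# sketch; the unweighted mean is an assumption, not the original combination rule
# (which lives in the class's annotate method, not shown in this snippet).
def combine_bias_probs_sketch(cnn_prob, linear_prob):
    return (cnn_prob + linear_prob) / 2.0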
Example #5
    def __init__(self, top_k=2, min_k=1):
        """
        In most cases, a fixed number of sentences (top_k) will be
        returned for each document, *except* when the decision
        scores are below a threshold (i.e. the implication being
        that none of the sentences are relevant).

        top_k = the default number of sentences to retrieve per
                document
        min_k = ensure that at least min_k sentences are
                always returned


        """

        logging.debug("Loading PICO classifiers")

        self.P_clf = MiniClassifier(robotreviewer.get_data("pico/P_model.npz"))
        self.I_clf = MiniClassifier(robotreviewer.get_data("pico/I_model.npz"))
        self.O_clf = MiniClassifier(robotreviewer.get_data("pico/O_model.npz"))

        logging.debug("PICO classifiers loaded")

        logging.debug("Loading IDF weights")
        with open(robotreviewer.get_data("pico/P_idf.npz"), 'rb') as f:
            self.P_idf = diags(
                np.load(f, allow_pickle=True,
                        encoding='latin1').item().todense().A1, 0)

        with open(robotreviewer.get_data("pico/I_idf.npz"), 'rb') as f:
            self.I_idf = diags(
                np.load(f, allow_pickle=True,
                        encoding='latin1').item().todense().A1, 0)

        with open(robotreviewer.get_data("pico/O_idf.npz"), 'rb') as f:
            self.O_idf = diags(
                np.load(f, allow_pickle=True,
                        encoding='latin1').item().todense().A1, 0)

        logging.debug("IDF weights loaded")

        self.vec = PICO_vectorizer()
        self.models = [self.P_clf, self.I_clf, self.O_clf]
        self.idfs = [self.P_idf, self.I_idf, self.O_idf]
        self.PICO_domains = ["Population", "Intervention", "Outcomes"]

        # if config.USE_METAMAP:
        #     self.metamap = MetaMap.get_instance()

        self.top_k = top_k
        self.min_k = min_k
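# Standalone sketch of the selection rule described in the docstring above: keep the
# top_k highest-scoring sentences, drop any whose score falls below a relevance
# threshold, but always return at least min_k. The threshold value and exact rule are
# assumptions; the real logic lives in the class's annotate method (not shown here).
import numpy as np

def select_sentences_sketch(scores, top_k=2, min_k=1, threshold=0.0):
    order = np.argsort(scores)[::-1]                  # highest decision score first
    keep = [i for i in order[:top_k] if scores[i] > threshold]
    if len(keep) < min_k:                             # guarantee at least min_k sentences
        keep = list(order[:min_k])
    return keep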
Example #6
 def __init__(self):
     self.svm_clf = MiniClassifier(
         os.path.join(robotreviewer.DATA_ROOT, 'rct/rct_svm_weights.npz'))
     cnn_weight_files = glob.glob(
         os.path.join(robotreviewer.DATA_ROOT, 'rct/*.h5'))
     self.cnn_clfs = [
         load_model(cnn_weight_file) for cnn_weight_file in cnn_weight_files
     ]
     self.svm_vectorizer = HashingVectorizer(binary=False,
                                             ngram_range=(1, 1),
                                             stop_words='english')
     self.cnn_vectorizer = KerasVectorizer(vocab_map_file=os.path.join(
         robotreviewer.DATA_ROOT, 'rct/cnn_vocab_map.pck'),
                                           stop_words='english')
     with open(
             os.path.join(robotreviewer.DATA_ROOT,
                          'rct/rct_model_calibration.json'), 'r') as f:
         self.constants = json.load(f)
Example #7
    def __init__(self, top_k=None):
        
        self.bias_domains = ['Random sequence generation']
        self.top_k = top_k

        self.bias_domains = {'RSG': 'Random sequence generation',
                             'AC': 'Allocation concealment',
                             'BPP': 'Blinding of participants and personnel',
                             'BOA': 'Blinding of outcome assessment'
        }

        ###
        # Here we take a simple ensembling approach in which we combine the
        # predictions made by our rationaleCNN model and the JAMIA (linear)
        # multi task variant.
        ###

        self.all_domains = ['RSG', 'AC', 'BPP', 'BOA']

        # CNN domains
        vectorizer_str = 'robotreviewer/data/keras/vectorizers/{}.pickle'
        arch_str = 'robotreviewer/data/keras/models/{}.json'
        weight_str = 'robotreviewer/data/keras/models/{}.hdf5'
        self.CNN_models = OrderedDict()

        for bias_domain in ['RSG', 'AC', 'BPP', 'BOA']:
            # Load vectorizer and keras model
            vectorizer_loc = vectorizer_str.format(bias_domain)
            arch_loc = arch_str.format(bias_domain)
            weight_loc = weight_str.format(bias_domain)
            preprocessor = pickle.load(open(vectorizer_loc, 'rb'))

            preprocessor.tokenizer.oov_token = None

            self.CNN_models[bias_domain] = RationaleCNN(
                preprocessor,
                document_model_architecture_path=arch_loc,
                document_model_weights_path=weight_loc)

        # Linear domains (these are joint models!)
        self.linear_sent_clf = MiniClassifier(robotreviewer.get_data('bias/bias_sent_level.npz'))
        self.linear_doc_clf = MiniClassifier(robotreviewer.get_data('bias/bias_doc_level.npz'))
        self.linear_vec = ModularVectorizer(norm=None, non_negative=True, binary=True, ngram_range=(1, 2),
                                                n_features=2**26)
    def __init__(self, top_k=3):
        """
        `top_k` refers to 'top-k recall'.

        top-1 recall will return the single most relevant sentence
        in the document, and top-3 recall the 3 most relevant.

        The validation study assessed the accuracy of top-3 and top-1,
        and we suggest top-3 as the default.
        """

        self.doc_clf = MiniClassifier(
            robotreviewer.get_data(os.path.join('bias_ab', 'bias_ab.npz')))
        self.vec = ModularVectorizer(norm=None,
                                     non_negative=True,
                                     binary=True,
                                     ngram_range=(1, 2))
        self.bias_domains = [
            'random_sequence_generation', 'allocation_concealment',
            'blinding_participants_personnel'
        ]
        self.top_k = top_k
Example #9
class TestMiniClassifier(unittest.TestCase):

    doc_clf = MiniClassifier(robotreviewer.get_data('bias/bias_doc_level.npz'))
    util = Utilities()

    def test_init(self):
        ''' test for MiniClassifier.__init__() '''
        self.assertTrue(isinstance(self.doc_clf.coef, np.ndarray))
        self.assertTrue(isinstance(self.doc_clf.intercept, float))

    def test_decision_function(self):
        ''' test for MiniClassifier.decision_function(X) '''
        X = self.util.load_sparse_csr("X_data.npz")
        dec = self.doc_clf.decision_function(X)  # [ 1.50563252]
        decTest = np.array([1.50563252])
        ''' can't do:
            print(np.array_equal(dec, y))
            print(np.array_equiv(dec, y))
            since as decimals these will not pass
        '''
        self.assertTrue(np.allclose(dec, decTest))

    def test_predict(self):
        ''' test for MiniClassifier.predict(X) '''
        X = self.util.load_sparse_csr("X_data.npz")
        pred = self.doc_clf.predict(X)  # [1]
        self.assertEqual(pred, 1)

    def test_predict_proba(self):
        ''' tests for MiniClassifier.predict_proba(X) '''
        with open(ex_path + "rationale_robot_data.json", "r",
                  encoding="utf-8") as data:
            data = json.load(data)
        bpl = data["bias_prob_linear"]
        X = self.util.load_sparse_csr("X_data.npz")
        bpl_test = self.doc_clf.predict_proba(X)[0]
        self.assertTrue(abs(bpl - bpl_test) < 0.01)
class RCTRobot:
    def __init__(self):
        self.svm_clf = MiniClassifier(
            os.path.join(robotreviewer.DATA_ROOT, 'rct/rct_svm_weights.npz'))

        cnn_weight_files = glob.glob(
            os.path.join(robotreviewer.DATA_ROOT, 'rct/*.h5'))
        json_filename = os.path.join(robotreviewer.DATA_ROOT,
                                     'rct/rct_cnn_structure.json')
        self.cnn_clfs = [
            get_model(json_filename, cnn_weight_file)
            for cnn_weight_file in cnn_weight_files
        ]
        self.svm_vectorizer = HashingVectorizer(binary=False,
                                                ngram_range=(1, 1),
                                                stop_words='english')
        self.cnn_vectorizer = KerasVectorizer(vocab_map_file=os.path.join(
            robotreviewer.DATA_ROOT, 'rct/rct_cnn_vocab_map.pck'))

        self.scale_constants = {
            'cnn': {
                'mean': 0.15592811611054261,
                'std': 0.22405916984696986,
                'weight': 1.6666666666666667
            },
            'ptyp': {
                'mean': 0.055155532891381948,
                'std': 0.22828359573751594
            },
            'svm': {
                'mean': -0.75481403525485891,
                'std': 0.7812955939364481,
                'weight': 10.0
            }
        }  # Weighted in the mean since we use only 1 SVM model (produces near-identical results to binning 10) and 6 CNN models (runs faster, with no further reduction in variance from additional models)

        self.thresholds = {
            'cnn': {
                'precise': 2.1340457758193034,
                'sensitive': -0.076709540491855063
            },
            'cnn_ptyp': {
                'precise': 3.529609848417909,
                'sensitive': 0.083502632442633312
            },
            'svm': {
                'precise': 1.9185522606237164,
                'sensitive': 0.093273630980694439
            },
            'svm_cnn': {
                'precise': 1.8749128673557529,
                'sensitive': 0.064481902000491614
            },
            'svm_cnn_ptyp': {
                'precise': 3.7674045603568755,
                'sensitive': 0.1952449060483534
            },
            'svm_ptyp': {
                'precise': 3.7358855328111837,
                'sensitive': 0.42992224964656178
            }
        }  # All precise models have been calibrated to 97.6% sensitivity
        # All sensitive models have been calibrated to 99.1% sensitivity

    def annotate(self, data):

        # use the best performing models from the validation paper (in draft...)
        filter_class = "svm_cnn_ptyp"
        threshold_class = "precise"

        if data.get("abstract") is not None and data.get("title") is not None:
            ti = data["title"]
            ab = data["abstract"]
        elif data.get("parsed_text") is not None:
            # then just use the start of the document
            TI_LEN = 30
            AB_LEN = 500
            # best guesses based on sample of RCT abstracts + aiming for 95% centile
            ti = data['parsed_text'][:TI_LEN].text
            ab = data['parsed_text'][:AB_LEN].text
        else:
            # else can't proceed
            return data

        if "pubmed" in data.data:
            ptyp = 1.0
        else:
            ptyp = 0.0

        X_ti_str = [ti]
        X_ab_str = ['{}\n\n{}'.format(ti, ab)]

        if "svm" in filter_class:

            X_ti = lil_matrix(self.svm_vectorizer.transform(X_ti_str))
            X_ab = lil_matrix(self.svm_vectorizer.transform(X_ab_str))

            svm_preds = self.svm_clf.decision_function(hstack([X_ti, X_ab]))
            svm_scale = (svm_preds - self.scale_constants['svm']['mean']
                         ) / self.scale_constants['svm']['std']

        if "ptyp" in filter_class:
            ptyp = np.array([ptyp])
            ptyp_scale = (ptyp - self.scale_constants['ptyp']['mean']
                          ) / self.scale_constants['ptyp']['std']

        if "cnn" in filter_class:
            X_cnn = self.cnn_vectorizer.transform(X_ab_str)
            cnn_preds = [clf.predict(X_cnn).T[0] for clf in self.cnn_clfs]
            cnn_preds = np.vstack(cnn_preds)
            cnn_scale = (cnn_preds - self.scale_constants['cnn']['mean']
                         ) / self.scale_constants['cnn']['std']

        if filter_class == "svm":
            y_preds = svm_scale
        elif filter_class == "svm_ptyp":
            y_preds = svm_scale + ptyp_scale
        elif filter_class == "ptyp":
            y_preds = ptyp_scale
        elif filter_class == "svm_cnn_ptyp":
            weights = [self.scale_constants['svm']['weight']] + (
                [self.scale_constants['cnn']['weight']] * len(self.cnn_clfs))
            # stack the SVM score first so each row lines up with its weight
            y_preds = np.average(np.vstack([svm_scale, cnn_scale]),
                                 axis=0,
                                 weights=weights) + ptyp_scale

        structured_data = {
            "is_rct":
            bool(y_preds[0] > self.thresholds[filter_class][threshold_class]),
            "decision_score":
            y_preds[0],
            "model_class":
            filter_class
        }

        data.ml["rct"] = structured_data
        return data

    @staticmethod
    def get_marginalia(data):
        """
        Get marginalia formatted for Spa from structured data
        """
        marginalia = [{
            "type":
            "Trial Design",
            "title":
            "Is an RCT?",
            "annotations": [],
            "description":
            "{0} (Decision score={1:0.2f} using {2} model)".format(
                data["rct"]["is_rct"], data["rct"]["decision_score"],
                data["rct"]["model_class"])
        }]
        return marginalia
Example #11
class RCTRobot:
    def __init__(self):
        from keras.preprocessing import sequence
        from keras.models import load_model
        from keras.models import Sequential
        from keras.preprocessing import sequence
        from keras.layers import Dense, Dropout, Activation, Lambda, Input, merge, Flatten
        from keras.layers import Embedding
        from keras.layers import Convolution1D, MaxPooling1D
        from keras import backend as K
        from keras.models import Model
        from keras.regularizers import l2
        global sequence, load_model, Sequential, Dense, Dropout, Activation, Lambda, Input, merge, Flatten
        global Embedding, Convolution1D, MaxPooling1D, K, Model, l2
        self.svm_clf = MiniClassifier(
            os.path.join(robotreviewer.DATA_ROOT, 'rct/rct_svm_weights.npz'))
        cnn_weight_files = glob.glob(
            os.path.join(robotreviewer.DATA_ROOT, 'rct/*.h5'))
        self.cnn_clfs = [
            load_model(cnn_weight_file) for cnn_weight_file in cnn_weight_files
        ]
        self.svm_vectorizer = HashingVectorizer(binary=False,
                                                ngram_range=(1, 1),
                                                stop_words='english')
        self.cnn_vectorizer = KerasVectorizer(vocab_map_file=os.path.join(
            robotreviewer.DATA_ROOT, 'rct/cnn_vocab_map.pck'),
                                              stop_words='english')
        with open(
                os.path.join(robotreviewer.DATA_ROOT,
                             'rct/rct_model_calibration.json'), 'r') as f:
            self.constants = json.load(f)

        self.calibration_lr = {}
        with open(
                os.path.join(robotreviewer.DATA_ROOT,
                             'rct/svm_cnn_ptyp_calibration.pck'), 'rb') as f:
            self.calibration_lr['svm_cnn_ptyp'] = pickle.load(f)

        with open(
                os.path.join(robotreviewer.DATA_ROOT,
                             'rct/svm_cnn_calibration.pck'), 'rb') as f:
            self.calibration_lr['svm_cnn'] = pickle.load(f)

    def _process_ptyp(self, data_row, strict=True):
        """
        Takes in a data row which might include rct_ptyp
        or ptyp fields.
        If strict=True, then raises exception when passed any
        contradictory data
        Returns: 1 = ptyp is RCT
                 0 = ptyp is NOT RCT
                 -1 = no ptyp information present
        """
        if data_row['use_ptyp'] == False:
            return -1
        elif data_row['use_ptyp'] == True:
            return 1 if any(
                (tag in data_row['ptyp']
                 for tag in ["Randomized Controlled Trial", "D016449"])) else 0
        else:
            raise Exception("unexpected value for 'use_ptyp'")

    def api_annotate(self, articles):

        # use the best performing models from the validation paper (in draft...)
        ensemble_type = "svm_cnn"
        threshold_class = "balanced"
        auto_use_ptyp = True

        # require a title and abstract. ptyp optional

        if not (all(
            ("ti" in article and "ab" in article for article in articles))):
            raise Exception("RCT robot requires a full title and abstract")

        prepared_data = [{
            "title": r['ti'],
            "abstract": r['ab'],
            "ptyp": r.get('ptyp'),
            "use_ptyp": "ptyp" in r
        } for r in articles]
        preds = self.predict(prepared_data,
                             ensemble_type=ensemble_type,
                             threshold_type=threshold_class,
                             auto_use_ptyp=auto_use_ptyp)

        return preds

    def pdf_annotate(self, data):

        # use the best performing models from the validation paper (in draft...)
        ensemble_type = "svm_cnn"
        threshold_class = "balanced"
        auto_use_ptyp = True

        if data.get("abstract") is not None and data.get("title") is not None:
            ti = data["title"]
            ab = data["abstract"]
        elif data.get("parsed_text") is not None:
            # then just use the start of the document
            TI_LEN = 30
            AB_LEN = 500
            # best guesses based on sample of RCT abstracts + aiming for 95% centile
            ti = data['parsed_text'][:TI_LEN].text
            ab = data['parsed_text'][:AB_LEN].text
        else:
            # else can't proceed
            return data

        preds = self.predict({
            "title": ti,
            "abstract": ab
        },
                             auto_use_ptyp=False)[0]

        structured_data = {
            "is_rct": preds['is_rct'],
            "decision_score": preds['threshold_value'],
            "model_class": preds['model'],
            "threshold_type": preds['threshold_type']
        }

        data.ml["rct"] = structured_data
        return data

    def predict(self,
                X,
                get_raw=False,
                ensemble_type="svm_cnn",
                threshold_type="sensitive",
                auto_use_ptyp=True):

        if isinstance(X, dict):
            X = [X]

        if auto_use_ptyp:
            pt_mask = np.array([self._process_ptyp(r) for r in X])
        else:
            # don't add for any of them
            pt_mask = np.array([-1 for r in X])

        preds_l = {}
        # calculate ptyp for all
        ptyp_scale = (pt_mask - self.constants['scales']['ptyp']['mean']
                      ) / self.constants['scales']['ptyp']['std']
        # but set to 0 if not using
        ptyp_scale[pt_mask == -1] = 0
        preds_l['ptyp'] = ptyp_scale

        # thresholds vary per article
        thresholds_all = {k: [] for k in ['precise', 'balanced', 'sensitive']}

        for t in ['precise', 'balanced', 'sensitive']:
            for r in pt_mask:
                if r != -1:
                    thresholds_all[t].append(self.constants['thresholds'][
                        "{}_ptyp".format(ensemble_type)][t])
                else:
                    thresholds_all[t].append(
                        self.constants['thresholds'][ensemble_type][t])

        X_ti_str = [article.get('title', '') for article in X]
        X_ab_str = [
            '{}\n\n{}'.format(article.get('title', ''),
                              article.get('abstract', '')) for article in X
        ]

        if "svm" in ensemble_type:
            X_ti = lil_matrix(self.svm_vectorizer.transform(X_ti_str))
            X_ab = lil_matrix(self.svm_vectorizer.transform(X_ab_str))
            svm_preds = self.svm_clf.decision_function(hstack([X_ab, X_ti]))
            svm_scale = (svm_preds - self.constants['scales']['svm']['mean']
                         ) / self.constants['scales']['svm']['std']
            preds_l['svm'] = svm_scale
            preds_l['svm_ptyp'] = preds_l['svm'] + preds_l['ptyp']
            preds_l['svm_raw'] = svm_preds.tolist()

        if "cnn" in ensemble_type:
            X_cnn = self.cnn_vectorizer.transform(X_ab_str)
            cnn_preds = []
            for i, clf in enumerate(self.cnn_clfs):
                cnn_preds.append(clf.predict(X_cnn).T[0])

            cnn_preds = np.vstack(cnn_preds)
            cnn_scale = (cnn_preds - self.constants['scales']['cnn']['mean']
                         ) / self.constants['scales']['cnn']['std']
            preds_l['cnn'] = np.mean(cnn_scale, axis=0)
            preds_l['cnn_raw'] = cnn_preds.T.tolist()
            preds_l['cnn_ptyp'] = preds_l['cnn'] + preds_l['ptyp']

        if ensemble_type == "svm_cnn":
            weights = [self.constants['scales']['svm']['weight']
                       ] + ([self.constants['scales']['cnn']['weight']] *
                            len(self.cnn_clfs))
            preds_l['svm_cnn'] = np.average(np.vstack([svm_scale, cnn_scale]),
                                            axis=0,
                                            weights=weights)
            preds_l['svm_cnn_ptyp'] = preds_l['svm_cnn'] + preds_l['ptyp']

            # if svm_cnn then we can have probabilities

            X_calib = np.hstack([
                svm_preds.reshape(-1, 1), cnn_preds.T,
                pt_mask.reshape(-1, 1)
            ])
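            # X_calib columns: [0] the raw SVM score, then one raw score per CNN model,
            # then the ptyp flag last; the hard-coded indices below (r[:11], r[11])
            # assume 10 CNN weight files were loaded.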
            probs = []

            for r in X_calib:

                if r[11] != -1:
                    probs.append(self.calibration_lr['svm_cnn'].predict_proba(
                        [r[:11]])[0][1])
                else:
                    probs.append(
                        self.calibration_lr['svm_cnn_ptyp'].predict_proba(
                            [r])[0][1])

            preds_l['probability'] = probs

        if get_raw:
            return {"svm": svm_preds, "cnn": cnn_preds, "ptyp": pt_mask}

        preds_d = [dict(zip(preds_l, i)) for i in zip(*preds_l.values())]

        out = []

        thresholds_T = [
            dict(zip(thresholds_all, t)) for t in zip(*thresholds_all.values())
        ]
        # i.e. https://stackoverflow.com/questions/5558418/list-of-dicts-to-from-dict-of-lists
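        # e.g. {'precise': [2.1, 1.9], 'sensitive': [0.1, 0.2]}
        #      -> [{'precise': 2.1, 'sensitive': 0.1}, {'precise': 1.9, 'sensitive': 0.2}]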

        for pred, threshold, used_ptyp in zip(preds_d, thresholds_T, pt_mask):
            row = {}
            if used_ptyp != -1:
                row['model'] = "{}_ptyp".format(ensemble_type)
            else:
                row['model'] = ensemble_type
            row['score'] = float(pred[row['model']])
            row['threshold_type'] = threshold_type
            row['threshold_value'] = float(threshold[threshold_type])
            row['is_rct'] = bool(row['score'] >= threshold[threshold_type])
            row['is_rct_precise'] = bool(row['score'] >= threshold['precise'])
            row['is_rct_balanced'] = bool(
                row['score'] >= threshold['balanced'])
            row['is_rct_sensitive'] = bool(
                row['score'] >= threshold['sensitive'])
            row['ptyp_rct'] = int(used_ptyp)
            out.append(row)
        return out
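    # Usage sketch for predict (placeholder article values; fields follow api_annotate above):
    #   robot = RCTRobot()
    #   preds = robot.predict([{"title": "...", "abstract": "...", "use_ptyp": False}],
    #                         ensemble_type="svm_cnn", threshold_type="balanced")
    #   preds[0]["is_rct"], preds[0]["score"], preds[0]["model"]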

    @staticmethod
    def get_marginalia(data):
        """
        Get marginalia formatted for Spa from structured data
        """
        marginalia = [{
            "type":
            "Trial Design",
            "title":
            "Is an RCT?",
            "annotations": [],
            "description":
            "{0} (Decision score={1:0.2f} using {} model)".format(
                data["rct"]["is_rct"], data["rct"]["decision_score"],
                data["rct"]["model_class"])
        }]
        return marginalia

    def predict_ris(self,
                    ris_data,
                    ensemble_type="svm_cnn",
                    threshold_type='sensitive',
                    auto_use_ptyp=False):

        simplified = [ris.simplify(article) for article in ris_data]
        preds = self.predict(simplified,
                             ensemble_type=ensemble_type,
                             threshold_type=threshold_type,
                             auto_use_ptyp=auto_use_ptyp)
        return preds

    def filter_articles(self,
                        ris_string,
                        ensemble_type="svm_cnn",
                        threshold_type='sensitive',
                        auto_use_ptyp=True,
                        remove_non_rcts=True):

        print('Parsing RIS data')
        ris_data = ris.loads(ris_string)
        # write the parsed RIS data out for debugging
        with open("debug.json", 'w') as f:
            json.dump(ris_data, f)
        preds = self.predict_ris(ris_data,
                                 ensemble_type=ensemble_type,
                                 threshold_type=threshold_type,
                                 auto_use_ptyp=auto_use_ptyp)
        out = []

        pred_key_map = {
            "score": "ZS",
            "model": "ZM",
            "threshold_type": "ZT",
            "threshold_value": "ZC",
            "is_rct": "ZR",
            "ptyp_rct": "ZP"
        }

        for ris_row, pred_row in zip(ris_data, preds):
            if remove_non_rcts == False or pred_row['is_rct']:
                ris_row.update(
                    {pred_key_map[k]: v
                     for k, v in pred_row.items()})

                out.append(ris_row)
        return ris.dumps(out)
Example #12
class BiasRobot:

    def __init__(self, top_k=3):
        """
        `top_k` refers to 'top-k recall'.

        top-1 recall will return the single most relevant sentence
        in the document, and top-3 recall the 3 most relevant.

        The validation study assessed the accuracy of top-3 and top-1,
        and we suggest top-3 as the default.
        """


        self.sent_clf = MiniClassifier(robotreviewer.get_data('bias/bias_sent_level.npz'))
        self.doc_clf = MiniClassifier(robotreviewer.get_data('bias/bias_doc_level.npz'))

        self.vec = ModularVectorizer(norm=None, non_negative=True, binary=True, ngram_range=(1, 2), n_features=2**26)

        self.bias_domains = ['Random sequence generation', 'Allocation concealment', 'Blinding of participants and personnel', 'Blinding of outcome assessment', 'Incomplete outcome data', 'Selective reporting']

        self.top_k = top_k

    def pdf_annotate(self, data, top_k=None):

        """
        Annotate full text of clinical trial report
        `top_k` can be overridden here, else defaults to the class
        default set in __init__
        """
        if top_k is None:
            top_k = self.top_k


        doc_text = data.get('parsed_text')

        if not doc_text:
            # we've got to know the text at least..
            return data

        doc_len = len(data['text'])

        doc_sents = [sent.text for sent in doc_text.sents]
        doc_sent_start_i = [sent.start_char for sent in doc_text.sents]
        doc_sent_end_i = [sent.end_char for sent in doc_text.sents]

        structured_data = []

        for domain in self.bias_domains:

            doc_domains = [domain] * len(doc_sents)
            doc_X_i = zip(doc_sents, doc_domains)

            #
            # build up sentence feature set
            #
            self.vec.builder_clear()

            # uni-bigrams
            self.vec.builder_add_docs(doc_sents)

            # uni-bigrams/domain interactions
            self.vec.builder_add_docs(doc_X_i)

            doc_sents_X = self.vec.builder_transform()
            doc_sents_preds = self.sent_clf.decision_function(doc_sents_X)

            high_prob_sent_indices = np.argsort(doc_sents_preds)[:-top_k-1:-1]  # top k, highest scoring first
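            # e.g. doc_sents_preds=[0.1, 0.9, 0.4, 0.7] with top_k=3 -> indices [1, 3, 2]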
            high_prob_sents = [doc_sents[i] for i in high_prob_sent_indices]
            high_prob_start_i = [doc_sent_start_i[i] for i in high_prob_sent_indices]
            high_prob_end_i = [doc_sent_end_i[i] for i in high_prob_sent_indices]
            high_prob_prefixes = [doc_text.text[max(0, offset-20):offset] for offset in high_prob_start_i]
            high_prob_suffixes = [doc_text.text[offset: min(doc_len, offset+20)] for offset in high_prob_end_i]
            high_prob_sents_j = " ".join(high_prob_sents)
            sent_domain_interaction = "-s-" + domain

            #
            # build up document feature set
            #
            self.vec.builder_clear()

            # uni-bigrams
            self.vec.builder_add_docs([doc_text.text])

            # uni-bigrams/domain interaction
            self.vec.builder_add_docs([(doc_text.text, domain)])

            # uni-bigrams/relevance interaction
            self.vec.builder_add_docs([(high_prob_sents_j, sent_domain_interaction)])

            X = self.vec.builder_transform()

            bias_pred = self.doc_clf.predict(X)
            bias_class = ["high/unclear", "low"][bias_pred[0]]
            annotation_metadata = [{"content": sent[0],
                                    "position": sent[1],
                                    "uuid": str(uuid.uuid1()),
                                    "prefix": sent[2],
                                    "suffix": sent[3]} for sent in zip(high_prob_sents, high_prob_start_i,
                                       high_prob_prefixes,
                                       high_prob_suffixes)]

            structured_data.append({
                "domain": domain,
                "judgement": bias_class,
                "annotations": annotation_metadata})
        data.ml["bias"] = structured_data
        return data

    @staticmethod
    def get_marginalia(data):
        """
        Get marginalia formatted for Spa from structured data
        """
        marginalia = []

        for row in data['bias']:
            marginalia.append({
                        "type": "Risk of Bias",
                        "title": row['domain'],
                        "annotations": row['annotations'],
                        "description": "**Overall risk of bias prediction**: {}".format(row['judgement'])
                        })
        return marginalia

    @staticmethod
    def get_domains():
        return [u'Random sequence generation',
                u'Allocation concealment',
                u'Blinding of participants and personnel',
                u'Blinding of outcome assessment',
                u'Incomplete outcome data',
                u'Selective reporting']
Example #13
class RCTRobot:
    def __init__(self):
        self.svm_clf = MiniClassifier(
            os.path.join(robotreviewer.DATA_ROOT, 'rct/rct_svm_weights.npz'))
        cnn_weight_files = glob.glob(
            os.path.join(robotreviewer.DATA_ROOT, 'rct/*.h5'))
        self.cnn_clfs = [
            load_model(cnn_weight_file) for cnn_weight_file in cnn_weight_files
        ]
        self.svm_vectorizer = HashingVectorizer(binary=False,
                                                ngram_range=(1, 1),
                                                stop_words='english')
        self.cnn_vectorizer = KerasVectorizer(vocab_map_file=os.path.join(
            robotreviewer.DATA_ROOT, 'rct/cnn_vocab_map.pck'),
                                              stop_words='english')
        with open(
                os.path.join(robotreviewer.DATA_ROOT,
                             'rct/rct_model_calibration.json'), 'r') as f:
            self.constants = json.load(f)

    def _process_ptyp(self, data_row, strict=True):
        """
        Takes in a data row which might include rct_ptyp
        or ptyp fields.
        If strict=True, then raises exception when passed any
        contradictory data
        Returns: 1 = ptyp is RCT
                 0 = ptyp is NOT RCT
                 -1 = no ptyp information present
        """
        if data_row['use_ptyp'] == False:
            return -1
        elif data_row['use_ptyp'] == True:
            return 1 if any(
                (tag in data_row['ptyp']
                 for tag in ["Randomized Controlled Trial", "D016449"])) else 0
        else:
            raise Exception("unexpcted value for 'use_ptyp'")

    ###
    # Annotate function
    ##
    def annotate(self, data, filename):

        # use the best performing models from the validation paper (in draft...)

        # initialize empty output structure
        structured_data = {
            "filename": filename,
            "is_rct": -1,
            "decision_score": -1
        }
        filter_class = "svm_cnn"
        threshold_class = "balanced"
        auto_use_ptyp = True

        if data.get("abstract") is not None and data.get("title") is not None:
            ti = data["title"]
            ab = data["abstract"]
        elif data.get("parsed_text") is not None:
            # then just use the start of the document
            TI_LEN = 30
            AB_LEN = 500
            # best guesses based on sample of RCT abstracts + aiming for 95% centile
            ti = data['parsed_text'][:TI_LEN].text
            ab = data['parsed_text'][:AB_LEN].text
        else:
            # else can't proceed
            return structured_data

        preds = self.predict({
            "title": ti,
            "abstract": ab
        },
                             auto_use_ptyp=False)[0]

        structured_data.update({
            "is_rct": preds['is_rct'],
            "decision_score": preds['threshold_value']
        })
        print(structured_data)
        return structured_data

    ###
    # Predict function
    ##
    def predict(self,
                X,
                filter_class="svm",
                filter_type="sensitive",
                auto_use_ptyp=True):

        if isinstance(X, dict):
            X = [X]

        if auto_use_ptyp:
            pt_mask = np.array([self._process_ptyp(r) for r in X])
        else:
            # don't add for any of them
            pt_mask = np.array([-1 for r in X])

        # calculate ptyp for all
        #ptyp = np.copy(pt_mask)
        # ptyp = np.array([(article.get('rct_ptyp')==True)*1. for article in X])
        ptyp_scale = (pt_mask - self.constants['scales']['ptyp']['mean']
                      ) / self.constants['scales']['ptyp']['std']
        # but set to 0 if not using
        ptyp_scale[pt_mask == -1] = 0

        # thresholds vary per article
        thresholds = []
        for r in pt_mask:
            if r != -1:
                thresholds.append(self.constants['thresholds'][
                    "{}_ptyp".format(filter_class)][filter_type])
            else:
                thresholds.append(
                    self.constants['thresholds'][filter_class][filter_type])

        X_ti_str = [article.get('title', '') for article in X]
        X_ab_str = [
            '{}\n\n{}'.format(article.get('title', ''),
                              article.get('abstract', '')) for article in X
        ]

        if "svm" in filter_class:
            X_ti = lil_matrix(self.svm_vectorizer.transform(X_ti_str))
            X_ab = lil_matrix(self.svm_vectorizer.transform(X_ab_str))
            svm_preds = self.svm_clf.decision_function(hstack([X_ab, X_ti]))
            svm_scale = (svm_preds - self.constants['scales']['svm']['mean']
                         ) / self.constants['scales']['svm']['std']

        if "cnn" in filter_class:
            X_cnn = self.cnn_vectorizer.transform(X_ab_str)
            cnn_preds = []
            for i, clf in enumerate(self.cnn_clfs):
                print('\t{} of {}'.format(i + 1, len(self.cnn_clfs)))
                cnn_preds.append(clf.predict(X_cnn).T[0])

            cnn_preds = np.vstack(cnn_preds)
            cnn_scale = (cnn_preds - self.constants['scales']['cnn']['mean']
                         ) / self.constants['scales']['cnn']['std']

        if filter_class == "svm":
            y_preds = svm_scale
        elif filter_class == "cnn":
            y_preds = np.mean(cnn_scale, axis=0)
        elif filter_class == "svm_cnn":
            weights = [self.constants['scales']['svm']['weight']
                       ] + ([self.constants['scales']['cnn']['weight']] *
                            len(self.cnn_clfs))
            y_preds = np.average(np.vstack([svm_scale, cnn_scale]),
                                 axis=0,
                                 weights=weights)

        y_preds += ptyp_scale

        out = []
        for pred, threshold, used_ptyp in zip(y_preds, thresholds, pt_mask):
            row = {}
            row['score'] = float(pred)
            if used_ptyp != -1:
                row['model'] = "{}_ptyp".format(filter_class)
            else:
                row['model'] = filter_class
            row['threshold_type'] = filter_type
            row['threshold_value'] = float(threshold)
            row['is_rct'] = bool(pred >= threshold)
            row['ptyp_rct'] = int(used_ptyp)
            out.append(row)
        return out

    @staticmethod
    def get_marginalia(data):
        """
        Get marginalia formatted for Spa from structured data
        """
        marginalia = [{
            "type":
            "Trial Design",
            "title":
            "Is an RCT?",
            "annotations": [],
            "description":
            "{0} (Decision score={1:0.2f} using {} model)".format(
                data["rct"]["is_rct"], data["rct"]["decision_score"],
                data["rct"]["model_class"])
        }]
        return marginalia
class BiasAbRobot:
    def __init__(self, top_k=3):
        """
        `top_k` refers to 'top-k recall'.

        top-1 recall will return the single most relevant sentence
        in the document, and top-3 recall the 3 most relevant.

        The validation study assessed the accuracy of top-3 and top-1,
        and we suggest top-3 as the default.
        """

        self.doc_clf = MiniClassifier(
            robotreviewer.get_data(os.path.join('bias_ab', 'bias_ab.npz')))
        self.vec = ModularVectorizer(norm=None,
                                     non_negative=True,
                                     binary=True,
                                     ngram_range=(1, 2))
        self.bias_domains = [
            'random_sequence_generation', 'allocation_concealment',
            'blinding_participants_personnel'
        ]
        self.top_k = top_k

    def api_annotate(self, articles, top_k=None):
        """
        Annotate the title and abstract of a clinical trial report
        `top_k` can be overridden here, else defaults to the class
        default set in __init__
        """
        if not all(
            ('ab' in article) and ('ti' in article) for article in articles):
            raise Exception(
                'Abstract bias model requires titles and abstracts to be able to complete annotation'
            )

        if top_k is None:
            top_k = self.top_k

        out = []

        for article in articles:
            doc_text = article['ti'] + "  " + article['ab']
            row = {}
            for domain in self.bias_domains:

                #
                # build up document feature set
                #
                self.vec.builder_clear()

                # uni-bigrams
                self.vec.builder_add_docs([doc_text])

                # uni-bigrams/domain interaction
                self.vec.builder_add_docs([(doc_text, domain)])
                x = self.vec.builder_transform()
                bias_pred = self.doc_clf.predict(x)
                bias_class = ["high/unclear", "low"][bias_pred[0]]
                row[domain] = {"judgement": bias_class}
            out.append(row)
        return out
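    # Usage sketch for api_annotate (placeholder values; loading the model requires the
    # robotreviewer data files):
    #   robot = BiasAbRobot()
    #   out = robot.api_annotate([{"ti": "Example trial title", "ab": "Example abstract ..."}])
    #   out[0]["random_sequence_generation"]["judgement"]  # "low" or "high/unclear"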