def __init__(self, top_k=3):
    """
    Load the sentence- and document-level models and the vectorizer.

    `top_k` is the 'top-k recall' setting: top-1 recall returns the
    single most relevant sentence in the document, top-3 recall the
    3 most relevant.

    The validation study assessed the accuracy of top-3 and top-1;
    top-3 is the suggested default.
    """
    self.top_k = top_k

    # One entry per risk-of-bias domain handled by this robot.
    self.bias_domains = [
        'Random sequence generation',
        'Allocation concealment',
        'Blinding of participants and personnel',
        'Blinding of outcome assessment',
        'Incomplete outcome data',
        'Selective reporting',
    ]

    # Sentence-level model first, then document-level model.
    sent_level_resource = robotreviewer.get_data('bias/bias_sent_level.npz')
    self.sent_clf = MiniClassifier(sent_level_resource)
    doc_level_resource = robotreviewer.get_data('bias/bias_doc_level.npz')
    self.doc_clf = MiniClassifier(doc_level_resource)

    # Uni-/bigram vectorizer configuration.
    vectorizer_options = {
        'norm': None,
        'non_negative': True,
        'binary': True,
        'ngram_range': (1, 2),
        'n_features': 2**26,
    }
    self.vec = ModularVectorizer(**vectorizer_options)
예제 #2
0
    def __init__(self, top_k=3):
        """
        Load the per-domain CNN models and the joint linear models.

        `top_k` refers to 'top-k recall'.

        top-1 recall will return the single most relevant sentence
        in the document, and top-3 recall the 3 most relevant.

        The validation study assessed the accuracy of top-3 and top-1
        and we suggest top-3 as default
        """
        self.top_k = top_k

        # Maps the short domain code to its full display name.
        self.bias_domains = {
            'RSG': 'Random sequence generation',
            'AC': 'Allocation concealment',
            'BPP': 'Blinding of participants and personnel',
            'BOA': 'Blinding of outcome assessment',
            'IOD': 'Incomplete outcome data',
            'SR': 'Selective reporting'
        }

        ###
        # Here we take a simple ensembling approach in which we combine the
        # predictions made by our rationaleCNN model and the JAMIA (linear)
        # multi task variant.
        ###

        self.all_domains = ['RSG', 'AC', 'BPP', 'BOA']

        # CNN domains
        vectorizer_str = 'robotreviewer/data/keras/vectorizers/{}.pickle'
        arch_str = 'robotreviewer/data/keras/models/{}.json'
        weight_str = 'robotreviewer/data/keras/models/{}.hdf5'
        self.CNN_models = OrderedDict()
        for bias_domain in ['RSG', 'AC', 'BPP', 'BOA']:
            # Load vectorizer and keras model
            vectorizer_loc = vectorizer_str.format(bias_domain)
            arch_loc = arch_str.format(bias_domain)
            weight_loc = weight_str.format(bias_domain)

            # Close the file handle deterministically instead of leaking it.
            # NOTE: unpickling executes arbitrary code, so these vectorizer
            # files must only ever come from a trusted source.
            with open(vectorizer_loc, 'rb') as vectorizer_file:
                preprocessor = pickle.load(vectorizer_file)

            self.CNN_models[bias_domain] = RationaleCNN(
                preprocessor,
                document_model_architecture_path=arch_loc,
                document_model_weights_path=weight_loc)

        # Linear domains (these are joint models!)
        self.linear_sent_clf = MiniClassifier(
            robotreviewer.get_data('bias/bias_sent_level.npz'))
        self.linear_doc_clf = MiniClassifier(
            robotreviewer.get_data('bias/bias_doc_level.npz'))
        self.linear_vec = ModularVectorizer(norm=None,
                                            non_negative=True,
                                            binary=True,
                                            ngram_range=(1, 2),
                                            n_features=2**26)
예제 #3
0
 def test_init(self):
     ''' test for ModularVectorizer.__init__() '''
     m = ModularVectorizer(norm=None,
                           non_negative=True,
                           binary=True,
                           ngram_range=(1, 2),
                           n_features=2**26)
     # Dedicated assertion reports the offending value on failure.
     self.assertIsNotNone(m.vec)
     # Exact type match (not isinstance): the wrapped vectorizer must be
     # precisely an InteractionHashingVectorizer.
     self.assertEqual(type(m.vec), InteractionHashingVectorizer)
예제 #4
0
class TestModularVectorizer(unittest.TestCase):
    ''' unit tests for ModularVectorizer '''

    util = Utilities()
    # Shared by the tests below; each test that uses it calls
    # builder_clear() first so state does not carry over between tests.
    m = ModularVectorizer(norm=None,
                          non_negative=True,
                          binary=True,
                          ngram_range=(1, 2),
                          n_features=2**26)

    def test_init(self):
        ''' test for ModularVectorizer.__init__() '''
        m = ModularVectorizer(norm=None,
                              non_negative=True,
                              binary=True,
                              ngram_range=(1, 2),
                              n_features=2**26)
        self.assertIsNotNone(m.vec)
        self.assertEqual(type(m.vec), InteractionHashingVectorizer)

    def test_combine_matrices(self):
        ''' test for ModularVectorizer.combine_matrices(X_part) '''
        self.m.builder_clear()
        X_part = self.util.load_sparse_csr("vec_builder.npz")
        self.m._combine_matrices(X_part)
        # Stored values are set to 1 before comparing against the builder
        # matrix -- presumably mirroring the binary vectorizer; TODO confirm.
        X_part.data.fill(1)
        self.assertEqual((X_part != self.m.X).nnz, 0)
        X_part2 = self.util.load_sparse_csr("vec_builder.npz")
        self.m._combine_matrices(X_part2)
        save = X_part + X_part2
        self.assertEqual((save != self.m.X).nnz, 0)

    def test_builder_clear(self):
        ''' test for ModularVectorizer.builder_clear() '''
        self.m.builder_clear()
        self.assertIsNone(self.m.X)
        self.m.X = ["anything"]
        self.m.builder_clear()
        self.assertIsNone(self.m.X)

    def test_builder_add_docs(self):
        ''' test for ModularVectorizer.builder_add_docs() '''
        self.m.builder_clear()
        # Keep the file handle and the parsed payload under separate names.
        with open(ex_path + "vector_si.json") as fixture_file:
            data = json.load(fixture_file)
        X_si = [(data["X_si0"], data["X_si1"])]
        self.assertIsNone(self.m.X)
        self.m.builder_add_docs(X_si)
        self.assertIsNotNone(self.m.X)

    def test_builder_transform(self):
        ''' test for  ModularVectorizer.builder_transform '''
        self.m.builder_clear()
        self.assertIsNone(self.m.builder_transform())
        self.m.X = ["anything"]
        # assertTrue(value, expected) would treat `expected` as the failure
        # message and only check truthiness; compare the values instead.
        self.assertEqual(self.m.builder_transform(), ["anything"])
    def __init__(self, top_k=3):
        """
        Load the abstract-level bias model and its vectorizer.

        `top_k` is the 'top-k recall' setting: top-1 recall returns the
        single most relevant sentence in the document, top-3 recall the
        3 most relevant.

        The validation study assessed the accuracy of top-3 and top-1;
        top-3 is the suggested default.
        """
        model_resource = os.path.join('bias_ab', 'bias_ab.npz')
        self.doc_clf = MiniClassifier(robotreviewer.get_data(model_resource))

        # Uni-/bigram vectorizer configuration.
        vectorizer_options = {
            'norm': None,
            'non_negative': True,
            'binary': True,
            'ngram_range': (1, 2),
        }
        self.vec = ModularVectorizer(**vectorizer_options)

        # Domain identifiers handled by this model.
        self.bias_domains = [
            'random_sequence_generation',
            'allocation_concealment',
            'blinding_participants_personnel',
        ]
        self.top_k = top_k
예제 #6
0
    def __init__(self, top_k=None):
        """
        Load the per-domain CNN models and the joint linear models.

        `top_k` is stored unchanged on the instance as `self.top_k`
        (default None); how it is used is decided by the other methods
        of the class.
        """
        self.top_k = top_k

        # Maps the short domain code to its full display name.
        self.bias_domains = {'RSG': 'Random sequence generation',
                             'AC': 'Allocation concealment',
                             'BPP': 'Blinding of participants and personnel',
                             'BOA': 'Blinding of outcome assessment'
        }

        ###
        # Here we take a simple ensembling approach in which we combine the
        # predictions made by our rationaleCNN model and the JAMIA (linear)
        # multi task variant.
        ###

        self.all_domains = ['RSG', 'AC', 'BPP', 'BOA']

        # CNN domains
        vectorizer_str = 'robotreviewer/data/keras/vectorizers/{}.pickle'
        arch_str = 'robotreviewer/data/keras/models/{}.json'
        weight_str = 'robotreviewer/data/keras/models/{}.hdf5'
        self.CNN_models = OrderedDict()

        for bias_domain in ['RSG', 'AC', 'BPP', 'BOA']:
            # Load vectorizer and keras model
            vectorizer_loc = vectorizer_str.format(bias_domain)
            arch_loc = arch_str.format(bias_domain)
            weight_loc = weight_str.format(bias_domain)

            # Close the file handle deterministically instead of leaking it.
            # NOTE: unpickling executes arbitrary code, so these vectorizer
            # files must only ever come from a trusted source.
            with open(vectorizer_loc, 'rb') as vectorizer_file:
                preprocessor = pickle.load(vectorizer_file)

            # NOTE(review): presumably needed because the pickled tokenizer
            # lacks (or must not use) an out-of-vocabulary token -- confirm.
            preprocessor.tokenizer.oov_token = None

            self.CNN_models[bias_domain] = RationaleCNN(preprocessor,
                                                    document_model_architecture_path=arch_loc,
                                                    document_model_weights_path=weight_loc)

        # Linear domains (these are joint models!)
        self.linear_sent_clf = MiniClassifier(robotreviewer.get_data('bias/bias_sent_level.npz'))
        self.linear_doc_clf = MiniClassifier(robotreviewer.get_data('bias/bias_doc_level.npz'))
        self.linear_vec = ModularVectorizer(norm=None, non_negative=True, binary=True, ngram_range=(1, 2),
                                                n_features=2**26)
예제 #7
0
class BiasRobot:
    """
    Risk-of-bias annotator for full-text reports.

    For every bias domain it ranks the document's sentences with a
    sentence-level classifier, then predicts a document-level judgement
    with a second classifier.
    """

    def __init__(self, top_k=3):
        """
        `top_k` refers to 'top-k recall'.

        top-1 recall will return the single most relevant sentence
        in the document, and top-3 recall the 3 most relevant.

        The validation study assessed the accuracy of top-3 and top-1
        and we suggest top-3 as default
        """
        self.sent_clf = MiniClassifier(robotreviewer.get_data('bias/bias_sent_level.npz'))
        self.doc_clf = MiniClassifier(robotreviewer.get_data('bias/bias_doc_level.npz'))

        self.vec = ModularVectorizer(norm=None, non_negative=True, binary=True, ngram_range=(1, 2), n_features=2**26)

        self.bias_domains = ['Random sequence generation', 'Allocation concealment', 'Blinding of participants and personnel', 'Blinding of outcome assessment', 'Incomplete outcome data', 'Selective reporting']

        self.top_k = top_k

    def pdf_annotate(self, data, top_k=None):
        """
        Annotate full text of clinical trial report
        `top_k` can be overridden here, else defaults to the class
        default set in __init__

        `data` must offer `.get('parsed_text')` and an `ml` mapping
        attribute. The parsed text must expose `.text` and `.sents`, whose
        items carry `text`, `start_char` and `end_char`.

        Returns `data`. When parsed text is present, `data.ml["bias"]` is
        set to a list with one entry per bias domain, each holding the
        domain, the judgement and the top-k sentence annotations. When it
        is missing or empty, `data` is returned untouched.
        """
        if top_k is None:
            top_k = self.top_k

        doc_text = data.get('parsed_text')

        if not doc_text:
            # we've got to know the text at least..
            return data

        # Sentence offsets index into the parsed document's own text, so
        # every context window is cut from that string. Slices clamp at the
        # end of the string, so no explicit length bound is needed.
        full_text = doc_text.text

        # Walk the sentences once and reuse them for text and offsets.
        sentences = list(doc_text.sents)
        doc_sents = [sent.text for sent in sentences]
        doc_sent_start_i = [sent.start_char for sent in sentences]
        doc_sent_end_i = [sent.end_char for sent in sentences]

        structured_data = []

        for domain in self.bias_domains:
            #
            # build up sentence feature set
            #
            self.vec.builder_clear()

            # uni-bigrams
            self.vec.builder_add_docs(doc_sents)

            # uni-bigrams/domain interactions
            self.vec.builder_add_docs([(sent, domain) for sent in doc_sents])

            doc_sents_X = self.vec.builder_transform()
            doc_sents_preds = self.sent_clf.decision_function(doc_sents_X)

            # top k, with no 1 first
            high_prob_sent_indices = np.argsort(doc_sents_preds)[:-top_k-1:-1]
            high_prob_sents = [doc_sents[i] for i in high_prob_sent_indices]
            high_prob_sents_j = " ".join(high_prob_sents)
            sent_domain_interaction = "-s-" + domain

            #
            # build up document feature set
            #
            self.vec.builder_clear()

            # uni-bigrams
            self.vec.builder_add_docs([full_text])

            # uni-bigrams/domain interaction
            self.vec.builder_add_docs([(full_text, domain)])

            # uni-bigrams/relevance interaction
            self.vec.builder_add_docs([(high_prob_sents_j, sent_domain_interaction)])

            X = self.vec.builder_transform()

            bias_pred = self.doc_clf.predict(X)
            bias_class = ["high/unclear", "low"][bias_pred[0]]

            # One annotation per selected sentence, with up to 20 characters
            # of surrounding text on either side.
            annotation_metadata = []
            for i in high_prob_sent_indices:
                start = doc_sent_start_i[i]
                end = doc_sent_end_i[i]
                annotation_metadata.append({
                    "content": doc_sents[i],
                    "position": start,
                    "uuid": str(uuid.uuid1()),
                    "prefix": full_text[max(0, start - 20):start],
                    "suffix": full_text[end:end + 20]})

            structured_data.append({
                "domain": domain,
                "judgement": bias_class,
                "annotations": annotation_metadata})
        data.ml["bias"] = structured_data
        return data

    @staticmethod
    def get_marginalia(data):
        """
        Get marginalia formatted for Spa from structured data

        `data['bias']` is iterated; each row must provide 'domain',
        'annotations' and 'judgement'.
        """
        return [{
            "type": "Risk of Bias",
            "title": row['domain'],
            "annotations": row['annotations'],
            "description": "**Overall risk of bias prediction**: {}".format(row['judgement'])
        } for row in data['bias']]

    @staticmethod
    def get_domains():
        """Return the full names of the bias domains as a new list."""
        return [u'Random sequence generation',
                u'Allocation concealment',
                u'Blinding of participants and personnel',
                u'Blinding of outcome assessment',
                u'Incomplete outcome data',
                u'Selective reporting']
class BiasAbRobot:
    """
    Risk-of-bias annotator that works from titles and abstracts only.

    For each article and each bias domain, a document-level classifier
    predicts a "high/unclear" or "low" judgement.
    """

    def __init__(self, top_k=3):
        """
        `top_k` refers to 'top-k recall'.

        top-1 recall will return the single most relevant sentence
        in the document, and top-3 recall the 3 most relevant.

        The validation study assessed the accuracy of top-3 and top-1
        and we suggest top-3 as default

        Note: this class only stores the value as `self.top_k`.
        """

        self.doc_clf = MiniClassifier(
            robotreviewer.get_data(os.path.join('bias_ab', 'bias_ab.npz')))
        self.vec = ModularVectorizer(norm=None,
                                     non_negative=True,
                                     binary=True,
                                     ngram_range=(1, 2))
        self.bias_domains = [
            'random_sequence_generation', 'allocation_concealment',
            'blinding_participants_personnel'
        ]
        self.top_k = top_k

    def api_annotate(self, articles, top_k=None):
        """
        Annotate titles and abstracts of clinical trial reports

        `articles` is a sequence of mappings, each with a 'ti' (title)
        and an 'ab' (abstract) entry. It is iterated twice, so pass a
        list rather than a one-shot iterator.

        `top_k` is accepted for signature compatibility but is not used
        by this method.

        Returns a list with one dict per article, mapping each bias
        domain to `{"judgement": "high/unclear" | "low"}`.

        Raises ValueError if any article lacks 'ti' or 'ab'.
        """
        if not all(
            ('ab' in article) and ('ti' in article) for article in articles):
            # ValueError is still caught by callers using `except Exception`.
            raise ValueError(
                'Abstract bias model requires titles and abstracts to be able to complete annotation'
            )

        out = []

        for article in articles:
            doc_text = article['ti'] + "  " + article['ab']
            row = {}
            for domain in self.bias_domains:

                #
                # build up document feature set
                #
                self.vec.builder_clear()

                # uni-bigrams
                self.vec.builder_add_docs([doc_text])

                # uni-bigrams/domain interaction
                self.vec.builder_add_docs([(doc_text, domain)])
                x = self.vec.builder_transform()
                bias_pred = self.doc_clf.predict(x)
                bias_class = ["high/unclear", "low"][bias_pred[0]]
                row[domain] = {"judgement": bias_class}
            out.append(row)
        return out