def __init__(self, top_k=3):
    """
    Set up the sentence-level and document-level classifiers, the feature
    vectorizer and the list of bias domains.

    `top_k` is the 'top-k recall' setting: top-1 recall returns the single
    most relevant sentence in the document, top-3 recall the three most
    relevant. The validation study assessed both top-3 and top-1; top-3 is
    the suggested default.
    """
    # Sentence-level model first, then document-level model.
    sent_level_data = robotreviewer.get_data('bias/bias_sent_level.npz')
    self.sent_clf = MiniClassifier(sent_level_data)

    doc_level_data = robotreviewer.get_data('bias/bias_doc_level.npz')
    self.doc_clf = MiniClassifier(doc_level_data)

    # Vectorizer settings are collected in one place and unpacked below.
    vectorizer_options = dict(
        norm=None,
        non_negative=True,
        binary=True,
        ngram_range=(1, 2),
        n_features=2**26,
    )
    self.vec = ModularVectorizer(**vectorizer_options)

    self.bias_domains = [
        'Random sequence generation',
        'Allocation concealment',
        'Blinding of participants and personnel',
        'Blinding of outcome assessment',
        'Incomplete outcome data',
        'Selective reporting',
    ]
    self.top_k = top_k
def __init__(self, top_k=3):
    """
    Set up the ensemble of risk-of-bias models.

    `top_k` refers to 'top-k recall'. top-1 recall will return the single
    most relevant sentence in the document, and top-3 recall the 3 most
    relevant. The validation study assessed the accuracy of top-3 and
    top-1 and we suggest top-3 as default.

    The constructor loads, for each of the domains RSG, AC, BPP and BOA,
    a pickled vectorizer plus a model architecture/weights pair and wraps
    them in a `RationaleCNN`. It also loads the sentence-level and
    document-level `MiniClassifier` models and creates the
    `ModularVectorizer` used with them.

    NOTE: the vectorizer/architecture/weight paths are relative, so they
    are resolved against the current working directory.
    """
    self.top_k = top_k

    # Short code -> human-readable domain name.
    self.bias_domains = {
        'RSG': 'Random sequence generation',
        'AC': 'Allocation concealment',
        'BPP': 'Blinding of participants and personnel',
        'BOA': 'Blinding of outcome assessment',
        'IOD': 'Incomplete outcome data',
        'SR': 'Selective reporting',
    }

    ###
    # Here we take a simple ensembling approach in which we combine the
    # predictions made by our rationaleCNN model and the JAMIA (linear)
    # multi task variant.
    ###
    self.all_domains = ['RSG', 'AC', 'BPP', 'BOA']

    # CNN domains
    vectorizer_str = 'robotreviewer/data/keras/vectorizers/{}.pickle'
    arch_str = 'robotreviewer/data/keras/models/{}.json'
    weight_str = 'robotreviewer/data/keras/models/{}.hdf5'

    self.CNN_models = OrderedDict()
    for bias_domain in ['RSG', 'AC', 'BPP', 'BOA']:
        # Load vectorizer and keras model
        vectorizer_loc = vectorizer_str.format(bias_domain)
        arch_loc = arch_str.format(bias_domain)
        weight_loc = weight_str.format(bias_domain)

        # Close the pickle file deterministically instead of leaving the
        # handle to the garbage collector.
        # NOTE(review): pickle executes code on load; these files are
        # read from the package data directory and must stay trusted.
        with open(vectorizer_loc, 'rb') as vectorizer_file:
            preprocessor = pickle.load(vectorizer_file)

        self.CNN_models[bias_domain] = RationaleCNN(
            preprocessor,
            document_model_architecture_path=arch_loc,
            document_model_weights_path=weight_loc)

    # Linear domains (these are joint models!)
    self.linear_sent_clf = MiniClassifier(
        robotreviewer.get_data('bias/bias_sent_level.npz'))
    self.linear_doc_clf = MiniClassifier(
        robotreviewer.get_data('bias/bias_doc_level.npz'))
    self.linear_vec = ModularVectorizer(norm=None,
                                        non_negative=True,
                                        binary=True,
                                        ngram_range=(1, 2),
                                        n_features=2**26)
def test_init(self):
    ''' ModularVectorizer.__init__() must create the wrapped vectorizer '''
    constructor_kwargs = dict(
        norm=None,
        non_negative=True,
        binary=True,
        ngram_range=(1, 2),
        n_features=2**26,
    )
    m = ModularVectorizer(**constructor_kwargs)

    wrapped = m.vec
    self.assertIsNotNone(wrapped)
    # Exact type match, not an isinstance check.
    self.assertEqual(type(wrapped), InteractionHashingVectorizer)
class TestModularVectorizer(unittest.TestCase):
    ''' Unit tests for ModularVectorizer and its builder_* methods '''

    util = Utilities()
    # One vectorizer instance is shared by all tests in this class; each
    # test that relies on builder state calls builder_clear() first.
    m = ModularVectorizer(norm=None, non_negative=True, binary=True,
                          ngram_range=(1, 2), n_features=2**26)

    def test_init(self):
        ''' test for ModularVectorizer.__init__() '''
        m = ModularVectorizer(norm=None, non_negative=True, binary=True,
                              ngram_range=(1, 2), n_features=2**26)
        self.assertIsNotNone(m.vec)
        self.assertEqual(type(m.vec), InteractionHashingVectorizer)

    def test_combine_matrices(self):
        ''' test for ModularVectorizer.combine_matrices(X_part) '''
        self.m.builder_clear()

        # First part: after combining into an empty builder, the builder
        # matrix must equal the part with every stored value set to 1.
        X_part = self.util.load_sparse_csr("vec_builder.npz")
        self.m._combine_matrices(X_part)
        X_part.data.fill(1)
        self.assertEqual((X_part != self.m.X).nnz, 0)

        # Second part: the builder matrix must equal the sum of both parts.
        X_part2 = self.util.load_sparse_csr("vec_builder.npz")
        self.m._combine_matrices(X_part2)
        save = X_part + X_part2
        self.assertEqual((save != self.m.X).nnz, 0)

    def test_builder_clear(self):
        ''' test for ModularVectorizer.builder_clear() '''
        self.m.builder_clear()
        self.assertIsNone(self.m.X)
        self.m.X = ["anything"]
        self.m.builder_clear()
        self.assertIsNone(self.m.X)

    def test_builder_add_docs(self):
        ''' test for ModularVectorizer.builder_add_docs() '''
        self.m.builder_clear()
        # Keep the file handle and the parsed payload under separate names.
        with open(ex_path + "vector_si.json") as fixture:
            data = json.load(fixture)
        X_si = [(data["X_si0"], data["X_si1"])]
        self.assertIsNone(self.m.X)
        self.m.builder_add_docs(X_si)
        self.assertIsNotNone(self.m.X)

    def test_builder_transform(self):
        ''' test for ModularVectorizer.builder_transform '''
        self.m.builder_clear()
        self.assertIsNone(self.m.builder_transform())
        self.m.X = ["anything"]
        # assertTrue(value, expected) would treat `expected` as the failure
        # message and only check truthiness; compare the values instead.
        self.assertEqual(self.m.builder_transform(), ["anything"])
def __init__(self, top_k=3):
    """
    Set up the document-level classifier, the feature vectorizer and the
    list of bias domains.

    `top_k` is the 'top-k recall' setting: top-1 recall returns the single
    most relevant sentence in the document, top-3 recall the three most
    relevant. The validation study assessed both top-3 and top-1; top-3 is
    the suggested default.
    """
    model_relpath = os.path.join('bias_ab', 'bias_ab.npz')
    model_data = robotreviewer.get_data(model_relpath)
    self.doc_clf = MiniClassifier(model_data)

    # Vectorizer settings are collected in one place and unpacked below.
    vectorizer_options = dict(
        norm=None,
        non_negative=True,
        binary=True,
        ngram_range=(1, 2),
    )
    self.vec = ModularVectorizer(**vectorizer_options)

    self.bias_domains = [
        'random_sequence_generation',
        'allocation_concealment',
        'blinding_participants_personnel',
    ]
    self.top_k = top_k
def __init__(self, top_k=None):
    """
    Set up the ensemble of risk-of-bias models.

    `top_k` is stored unchanged on the instance as `self.top_k`; the
    constructor itself does not interpret it.

    The constructor loads, for each of the domains RSG, AC, BPP and BOA,
    a pickled vectorizer plus a model architecture/weights pair and wraps
    them in a `RationaleCNN`. It also loads the sentence-level and
    document-level `MiniClassifier` models and creates the
    `ModularVectorizer` used with them.

    NOTE: the vectorizer/architecture/weight paths are relative, so they
    are resolved against the current working directory.
    """
    self.top_k = top_k

    # Short code -> human-readable domain name.
    self.bias_domains = {
        'RSG': 'Random sequence generation',
        'AC': 'Allocation concealment',
        'BPP': 'Blinding of participants and personnel',
        'BOA': 'Blinding of outcome assessment',
    }

    ###
    # Here we take a simple ensembling approach in which we combine the
    # predictions made by our rationaleCNN model and the JAMIA (linear)
    # multi task variant.
    ###
    self.all_domains = ['RSG', 'AC', 'BPP', 'BOA']

    # CNN domains
    vectorizer_str = 'robotreviewer/data/keras/vectorizers/{}.pickle'
    arch_str = 'robotreviewer/data/keras/models/{}.json'
    weight_str = 'robotreviewer/data/keras/models/{}.hdf5'

    self.CNN_models = OrderedDict()
    for bias_domain in ['RSG', 'AC', 'BPP', 'BOA']:
        # Load vectorizer and keras model
        vectorizer_loc = vectorizer_str.format(bias_domain)
        arch_loc = arch_str.format(bias_domain)
        weight_loc = weight_str.format(bias_domain)

        # Close the pickle file deterministically instead of leaving the
        # handle to the garbage collector.
        # NOTE(review): pickle executes code on load; these files are
        # read from the package data directory and must stay trusted.
        with open(vectorizer_loc, 'rb') as vectorizer_file:
            preprocessor = pickle.load(vectorizer_file)

        # NOTE(review): presumably needed because the pickled tokenizer
        # predates the `oov_token` attribute - confirm.
        preprocessor.tokenizer.oov_token = None

        self.CNN_models[bias_domain] = RationaleCNN(
            preprocessor,
            document_model_architecture_path=arch_loc,
            document_model_weights_path=weight_loc)

    # Linear domains (these are joint models!)
    self.linear_sent_clf = MiniClassifier(
        robotreviewer.get_data('bias/bias_sent_level.npz'))
    self.linear_doc_clf = MiniClassifier(
        robotreviewer.get_data('bias/bias_doc_level.npz'))
    self.linear_vec = ModularVectorizer(norm=None,
                                        non_negative=True,
                                        binary=True,
                                        ngram_range=(1, 2),
                                        n_features=2**26)
class BiasRobot:
    """
    Risk-of-bias annotator for the parsed full text of a clinical trial
    report.

    For every bias domain it picks the `top_k` highest-scoring sentences
    with a sentence-level classifier, then feeds the document text plus
    those sentences to a document-level classifier to obtain a
    "low" / "high/unclear" judgement.
    """

    # Single source of truth for the domains handled by this robot; used by
    # both __init__ and get_domains().
    BIAS_DOMAINS = (
        'Random sequence generation',
        'Allocation concealment',
        'Blinding of participants and personnel',
        'Blinding of outcome assessment',
        'Incomplete outcome data',
        'Selective reporting',
    )

    # Number of characters of surrounding text stored as prefix/suffix for
    # each annotated sentence.
    _CONTEXT_CHARS = 20

    def __init__(self, top_k=3):
        """
        `top_k` refers to 'top-k recall'.

        top-1 recall will return the single most relevant sentence
        in the document, and top-3 recall the 3 most relevant.

        The validation study assessed the accuracy of top-3 and top-1
        and we suggest top-3 as default
        """
        self.sent_clf = MiniClassifier(
            robotreviewer.get_data('bias/bias_sent_level.npz'))
        self.doc_clf = MiniClassifier(
            robotreviewer.get_data('bias/bias_doc_level.npz'))

        self.vec = ModularVectorizer(norm=None, non_negative=True,
                                     binary=True, ngram_range=(1, 2),
                                     n_features=2**26)

        self.bias_domains = list(self.BIAS_DOMAINS)
        self.top_k = top_k

    def pdf_annotate(self, data, top_k=None):
        """
        Annotate full text of clinical trial report

        `top_k` can be overridden here, else defaults to the class
        default set in __init__

        Reads `data.get('parsed_text')`; when that is missing or falsy,
        `data` is returned untouched. Otherwise a list with one entry per
        bias domain (keys "domain", "judgement", "annotations") is stored
        in `data.ml["bias"]` and `data` is returned.
        """
        if top_k is None:
            top_k = self.top_k

        doc_text = data.get('parsed_text')
        if not doc_text:
            # we've got to know the text at least..
            return data

        full_text = doc_text.text
        context = self._CONTEXT_CHARS

        # Walk the sentence spans once and derive the parallel lists from
        # the materialised spans.
        sentences = list(doc_text.sents)
        doc_sents = [sent.text for sent in sentences]
        doc_sent_start_i = [sent.start_char for sent in sentences]
        doc_sent_end_i = [sent.end_char for sent in sentences]

        structured_data = []
        for domain in self.bias_domains:
            #
            # build up sentence feature set
            #
            self.vec.builder_clear()

            # uni-bigrams
            self.vec.builder_add_docs(doc_sents)

            # uni-bigrams/domain interactions
            # (a real list rather than a one-shot zip iterator)
            self.vec.builder_add_docs(
                [(sent, domain) for sent in doc_sents])

            doc_sents_X = self.vec.builder_transform()
            doc_sents_preds = self.sent_clf.decision_function(doc_sents_X)

            # top k, with no 1 first
            high_prob_sent_indices = np.argsort(
                doc_sents_preds)[:-top_k - 1:-1]

            high_prob_sents = [doc_sents[i]
                               for i in high_prob_sent_indices]
            high_prob_start_i = [doc_sent_start_i[i]
                                 for i in high_prob_sent_indices]
            high_prob_end_i = [doc_sent_end_i[i]
                               for i in high_prob_sent_indices]

            # Context windows are cut from the same string the character
            # offsets refer to; slicing clamps at the end of the string,
            # so no explicit upper bound is needed.
            high_prob_prefixes = [full_text[max(0, offset - context):offset]
                                  for offset in high_prob_start_i]
            high_prob_suffixes = [full_text[offset:offset + context]
                                  for offset in high_prob_end_i]

            high_prob_sents_j = " ".join(high_prob_sents)
            sent_domain_interaction = "-s-" + domain

            #
            # build up document feature set
            #
            self.vec.builder_clear()

            # uni-bigrams
            self.vec.builder_add_docs([full_text])

            # uni-bigrams/domain interaction
            self.vec.builder_add_docs([(full_text, domain)])

            # uni-bigrams/relevance interaction
            self.vec.builder_add_docs(
                [(high_prob_sents_j, sent_domain_interaction)])

            X = self.vec.builder_transform()

            bias_pred = self.doc_clf.predict(X)
            # The prediction is used as an index into the label list.
            bias_class = ["high/unclear", "low"][bias_pred[0]]

            annotation_metadata = [
                {"content": content,
                 "position": position,
                 "uuid": str(uuid.uuid1()),
                 "prefix": prefix,
                 "suffix": suffix}
                for content, position, prefix, suffix in zip(
                    high_prob_sents, high_prob_start_i,
                    high_prob_prefixes, high_prob_suffixes)]

            structured_data.append({
                "domain": domain,
                "judgement": bias_class,
                "annotations": annotation_metadata})

        data.ml["bias"] = structured_data
        return data

    @staticmethod
    def get_marginalia(data):
        """
        Get marginalia formatted for Spa from structured data

        `data['bias']` must be an iterable of rows carrying the keys
        'domain', 'annotations' and 'judgement'. Returns one marginalia
        dict per row, in the same order.
        """
        return [
            {
                "type": "Risk of Bias",
                "title": row['domain'],
                "annotations": row['annotations'],
                "description":
                    "**Overall risk of bias prediction**: {}".format(
                        row['judgement']),
            }
            for row in data['bias']
        ]

    @staticmethod
    def get_domains():
        """Return a fresh list of the bias domain names."""
        return list(BiasRobot.BIAS_DOMAINS)
class BiasAbRobot:
    """
    Risk-of-bias classifier that works from article titles and abstracts.

    For each article and each bias domain it produces a document-level
    "low" / "high/unclear" judgement.
    """

    def __init__(self, top_k=3):
        """
        Load the document-level classifier and create the vectorizer.

        `top_k` is stored on the instance as `self.top_k`. Nothing else
        in this class reads it.
        """
        self.doc_clf = MiniClassifier(
            robotreviewer.get_data(os.path.join('bias_ab', 'bias_ab.npz')))
        self.vec = ModularVectorizer(norm=None,
                                     non_negative=True,
                                     binary=True,
                                     ngram_range=(1, 2))
        self.bias_domains = [
            'random_sequence_generation',
            'allocation_concealment',
            'blinding_participants_personnel',
        ]
        self.top_k = top_k

    def api_annotate(self, articles, top_k=None):
        """
        Annotate articles from their titles and abstracts.

        `articles` is an iterable of mappings; each must contain the keys
        'ti' (title) and 'ab' (abstract).

        `top_k` is accepted so the signature stays unchanged, but it is
        not used by this method.

        Returns a list with one dict per article, mapping each bias
        domain to `{"judgement": "low"}` or `{"judgement": "high/unclear"}`.

        Raises `Exception` when any article lacks 'ti' or 'ab'.
        """
        if not all(('ab' in article) and ('ti' in article)
                   for article in articles):
            raise Exception(
                'Abstract bias model requires titles and abstracts to be able to complete annotation'
            )

        out = []
        for article in articles:
            doc_text = article['ti'] + " " + article['ab']
            row = {}
            for domain in self.bias_domains:
                #
                # build up document feature set
                #
                self.vec.builder_clear()

                # uni-bigrams
                self.vec.builder_add_docs([doc_text])

                # uni-bigrams/domain interaction
                self.vec.builder_add_docs([(doc_text, domain)])

                x = self.vec.builder_transform()
                bias_pred = self.doc_clf.predict(x)
                # The prediction is used as an index into the label list.
                bias_class = ["high/unclear", "low"][bias_pred[0]]
                row[domain] = {"judgement": bias_class}
            out.append(row)
        return out