def _get_stmt_row(stmt, source, model, cur_counts, test_corpus=None, path_counts=None, cur_dict=None, with_evid=False):
    """Build a single display row tuple for a statement.

    Parameters
    ----------
    stmt : indra.statements.Statement
        The statement to render.
    source : str
        Source identifier passed through to the evidence link.
    model : str
        Model identifier passed through to the evidence link.
    cur_counts : dict
        Curation counts keyed by statement hash (string keys, based on usage).
    test_corpus : Optional[str]
        If given, added to the evidence link query string.
    path_counts : Optional[dict]
        Path counts keyed by statement hash; missing hashes count as 0.
    cur_dict : Optional[dict]
        Curation dict consumed by ``_format_evidence_text``; evidence text is
        only rendered when this is provided together with ``with_evid``.
    with_evid : bool
        If True (and ``cur_dict`` is given), include up to 10 formatted
        evidence entries in the row.

    Returns
    -------
    list of tuple
        A one-element list containing
        ``(stmt_hash, english, evidence, evidence_count, badges)``.
    """
    stmt_hash = str(stmt.get_hash())
    english = _format_stmt_text(stmt)
    evid_count = len(stmt.evidence)
    evid = []
    if with_evid and cur_dict is not None:
        # Cap the inline evidence list at 10 entries to keep rows small.
        evid = _format_evidence_text(
            stmt, cur_dict, ['correct', 'act_vs_amt', 'hypothesis'])[:10]
    params = {
        'stmt_hash': stmt_hash,
        'source': source,
        'model': model,
        'format': 'json'
    }
    if test_corpus:
        params.update({'test_corpus': test_corpus})
    url_param = parse.urlencode(params)
    json_link = f'/evidence?{url_param}'
    path_count = 0
    if path_counts:
        # Bug fix: the original used .get(stmt_hash) with no default, which
        # yielded None (not 0) for hashes absent from path_counts and passed
        # None into _make_badges.
        path_count = path_counts.get(stmt_hash, 0)
    badges = _make_badges(evid_count, json_link, path_count,
                          cur_counts.get(stmt_hash))
    stmt_row = [(stmt.get_hash(), english, evid, evid_count, badges)]
    return stmt_row
def test_tag_bad_text():
    """Agent tagging must tolerate markup-like characters in the raw text."""
    evidence = Evidence(
        "bogus", text="<Foo> binds Bar& (<10 & >20)",
        annotations={"agents": {"raw_text": ["<Foo>", "Bar&"]}})
    complex_stmt = Complex([Agent("Foo"), Agent("Bar")], evidence=[evidence])
    formatted = _format_evidence_text(complex_stmt)[0]
    expected = (
        "<span class=\"badge badge-other\"><Foo>"
        "</span> binds <span class=\"badge badge-other\">"
        "Bar&</span> (<10 & >20)")
    assert formatted['text'] == expected
def get_tests_by_hash(test_corpus, hash_val):
    """Return a JSON view of the test statement matching ``hash_val``.

    Curations for the hash are grouped by (statement hash, source hash) and
    folded into the formatted evidence. If no test in the corpus matches,
    the statement entry is an empty dict.
    """
    corpus_tests = _load_tests_from_cache(test_corpus)
    # Group curations by (statement hash, evidence source hash).
    curations_by_key = defaultdict(list)
    for curation in get_curations(pa_hash=hash_val):
        curations_by_key[(curation.pa_hash, curation.source_hash)].append(
            {'error_type': curation.tag})
    stmt_json = {}
    for corpus_test in corpus_tests:
        if str(corpus_test.stmt.get_hash()) != str(hash_val):
            continue
        # NOTE: no early exit — if several tests share the hash, the last
        # match wins, matching the original behavior.
        stmt_json = corpus_test.stmt.to_json()
        stmt_json['evidence'] = _format_evidence_text(
            corpus_test.stmt, curations_by_key,
            ['correct', 'act_vs_amt', 'hypothesis'])
    return {'statements': {hash_val: stmt_json}}
def test_format_evidence_text():
    """A single piece of evidence is formatted into the expected dict."""
    statement = make_stmt()
    formatted = _format_evidence_text(statement)
    assert len(formatted) == 1
    entry = formatted[0]
    assert isinstance(entry, dict)
    expected_keys = {
        'source_api', 'text_refs', 'text', 'source_hash', 'pmid',
        'num_curations', 'num_correct', 'num_incorrect'
    }
    assert set(entry.keys()) == expected_keys
    assert entry['source_api'] == 'test'
    assert entry['text_refs']['PMID'] == '1234567'
    expected_text = (
        'We noticed that the '
        '<span class="badge badge-subject">Src kinase</span> '
        'was able to phosphorylate '
        '<span class="badge badge-object">'
        'Ras proteins</span>.')
    assert entry['text'] == expected_text, entry['text']
def decorator(*args, **kwargs):
    """Run a wrapped DB-query endpoint and format its results.

    Workflow: parse pagination/format options from the request args, resolve
    the caller's content entitlements (elsevier/medscan), execute the wrapped
    ``get_db_query`` function, redact/censor evidence the user may not see,
    optionally attach curation counts, and return either an HTML page or a
    JSON payload as a Flask ``Response``.

    NOTE(review): ``decorator`` here is the inner wrapper of a decorator
    factory — ``get_db_query``, ``TESTING``, ``MAX_STATEMENTS`` etc. are
    closure/module names defined outside this view.
    """
    tracker = LogTracker()
    start_time = datetime.now()
    logger.info("Got query for %s at %s!" % (get_db_query.__name__,
                                             start_time))

    # Pull pagination and formatting options out of the query string.
    web_query = request.args.copy()
    offs = _pop(web_query, 'offset', type_cast=int)
    ev_lim = _pop(web_query, 'ev_limit', type_cast=int)
    best_first = _pop(web_query, 'best_first', True, bool)
    # Cap the statement count at the server-wide maximum.
    max_stmts = min(_pop(web_query, 'max_stmts', MAX_STATEMENTS, int),
                    MAX_STATEMENTS)
    fmt = _pop(web_query, 'format', 'json')
    w_english = _pop(web_query, 'with_english', False, bool)
    w_cur_counts = _pop(web_query, 'with_cur_counts', False, bool)

    # Figure out authorization.
    has = dict.fromkeys(['elsevier', 'medscan'], False)
    if not TESTING:
        user, roles = resolve_auth(web_query)
        for role in roles:
            for resource in has.keys():
                # A permission granted by any role is sufficient.
                has[resource] |= role.permissions.get(resource, False)
        logger.info('Auths: %s' % str(has))
    else:
        # Test mode: strip credentials and deny all restricted content.
        # NOTE(review): ``user`` is never bound on this path; the html
        # branch below references it — presumably html is not used while
        # TESTING. Confirm.
        web_query.pop('api_key', None)
        has['elsevier'] = False
        has['medscan'] = False

    # Actually run the function.
    logger.info("Running function %s after %s seconds."
                % (get_db_query.__name__, sec_since(start_time)))
    db_query = get_db_query(web_query, *args, **kwargs)
    if isinstance(db_query, Response):
        # The endpoint may short-circuit with a ready-made response.
        return db_query
    elif not isinstance(db_query, QueryCore):
        raise RuntimeError("Result should be a child of QueryCore.")

    # Default evidence limit: generous for single-hash lookups, small
    # otherwise.
    if ev_lim is None:
        if get_db_query is get_statement_by_hash:
            ev_lim = 10000
        else:
            ev_lim = 10

    # Exclude medscan-only content for users without medscan access.
    if not has['medscan']:
        minus_q = ~HasOnlySource('medscan')
        db_query &= minus_q
        ev_filter = minus_q.ev_filter()
    else:
        ev_filter = None

    result = db_query.get_statements(offset=offs, limit=max_stmts,
                                     ev_limit=ev_lim,
                                     best_first=best_first,
                                     evidence_filter=ev_filter)
    logger.info("Finished function %s after %s seconds."
                % (get_db_query.__name__, sec_since(start_time)))

    # Handle any necessary redactions
    res_json = result.json()
    stmts_json = res_json.pop('results')
    elsevier_redactions = 0
    source_counts = result.source_counts
    if not all(has.values()) or fmt == 'json-js' or w_english:
        # Iterate over a copy: evidence lists may be mutated in place.
        for h, stmt_json in stmts_json.copy().items():
            if w_english:
                stmt = stmts_from_json([stmt_json])[0]
                stmt_json['english'] = _format_stmt_text(stmt)
                stmt_json['evidence'] = _format_evidence_text(stmt)

            # Fully-entitled users needing no JS/english fixups can skip
            # the per-evidence pass.
            if has['elsevier'] and fmt != 'json-js' and not w_english:
                continue

            if not has['medscan']:
                source_counts[h].pop('medscan', 0)

            for ev_json in stmt_json['evidence'][:]:
                if fmt == 'json-js':
                    # JS consumers need string hashes (64-bit ints lose
                    # precision in JavaScript).
                    ev_json['source_hash'] = str(ev_json['source_hash'])

                # Check for elsevier and redact if necessary
                if not has['elsevier'] and \
                        get_source(ev_json) == 'elsevier':
                    text = ev_json['text']
                    if len(text) > 200:
                        ev_json['text'] = text[:200] + REDACT_MESSAGE
                        elsevier_redactions += 1
    logger.info(f"Redacted {elsevier_redactions} pieces of elsevier "
                f"evidence.")
    logger.info("Finished redacting evidence for %s after %s seconds."
                % (get_db_query.__name__, sec_since(start_time)))

    # Get counts of the curations for the resulting statements.
    if w_cur_counts:
        curations = get_curations(pa_hash=set(stmts_json.keys()))
        logger.info("Found %d curations" % len(curations))
        cur_counts = {}
        for curation in curations:
            # Update the overall counts.
            if curation.pa_hash not in cur_counts:
                cur_counts[curation.pa_hash] = 0
            cur_counts[curation.pa_hash] += 1

            # Work these counts into the evidence dict structure.
            for ev_json in stmts_json[curation.pa_hash]['evidence']:
                if str(ev_json['source_hash']) == str(
                        curation.source_hash):
                    ev_json['num_curations'] = \
                        ev_json.get('num_curations', 0) + 1
                    break
        res_json['num_curations'] = cur_counts

    # Add derived values to the res_json.
    res_json['offset'] = offs
    res_json['evidence_limit'] = ev_lim
    res_json['statement_limit'] = MAX_STATEMENTS
    res_json['statements_returned'] = len(stmts_json)
    res_json['end_of_statements'] = (len(stmts_json) < MAX_STATEMENTS)
    res_json['statements_removed'] = 0
    res_json['evidence_returned'] = result.returned_evidence

    if fmt == 'html':
        title = TITLE + ': ' + 'Results'
        ev_totals = res_json.pop('evidence_totals')
        stmts = stmts_from_json(stmts_json.values())
        html_assembler = HtmlAssembler(stmts, res_json, ev_totals,
                                       source_counts, title=title,
                                       db_rest_url=request.url_root[:-1])
        idbr_template = env.get_template('idbr_statements_view.html')
        identity = user.identity() if user else None
        content = html_assembler.make_model(idbr_template,
                                            identity=identity)
        if tracker.get_messages():
            # Surface a summary of any log warnings in the page itself.
            level_stats = [
                '%d %ss' % (n, lvl.lower())
                for lvl, n in tracker.get_level_stats().items()
            ]
            msg = ' '.join(level_stats)
            content = html_assembler.append_warning(msg)
        mimetype = 'text/html'
    else:
        # Return JSON for all other values of the format argument
        res_json.update(tracker.get_level_stats())
        res_json['statements'] = stmts_json
        res_json['source_counts'] = source_counts
        content = json.dumps(res_json)
        mimetype = 'application/json'

    resp = Response(content, mimetype=mimetype)
    # NOTE(review): 'total_evidence' is expected to come from result.json();
    # it is not set anywhere in this function — confirm upstream.
    logger.info("Exiting with %d statements with %d/%d evidence of size "
                "%f MB after %s seconds."
                % (res_json['statements_returned'],
                   res_json['evidence_returned'],
                   res_json['total_evidence'],
                   sys.getsizeof(resp.data) / 1e6,
                   sec_since(start_time)))
    return resp
def process_entries(self, result):
    """Post-process query result entries in place for the current caller.

    Depending on the caller's entitlements (``self.has``), output format
    (``self.fmt``) and options (``self.w_english``, ``self.strict``), this:
    builds English representations, filters strict-agent mismatches,
    censors medscan-derived counts, redacts long elsevier evidence text,
    and stringifies hashes for JS consumers. Mutates ``result`` in place
    and returns None.

    Parameters
    ----------
    result : QueryResult-like
        Must expose ``result_type``, ``results``, ``source_counts`` and
        ``evidence_counts`` (based on usage here).
    """
    if result.result_type == 'hashes':
        # There is really nothing to do for hashes.
        return
    elsevier_redactions = 0
    if not all(self.has.values()) or self.fmt == 'json-js' \
            or self.w_english:
        # Iterate over a copy since entries may be popped mid-loop.
        for key, entry in result.results.copy().items():
            # Build english reps of each result (unless they're just hashes)
            if self.w_english and result.result_type != 'hashes':
                stmt = None
                # Fix the agent order
                if self.strict:
                    if result.result_type == 'statements':
                        stmt = stmts_from_json([entry])[0]
                        # Fix: use isinstance rather than a direct type
                        # comparison (original was `type(stmt) == Complex`),
                        # so Complex subclasses get the same ordering.
                        if isinstance(stmt, Complex):
                            id_lookup = {v: int(k)
                                         for k, v in self.agent_dict.items()}
                            # Unknown agents sort to the end (sentinel 10).
                            stmt.members.sort(
                                key=lambda ag: id_lookup.get(ag.name, 10)
                            )
                        agent_set = {ag.name for ag in stmt.agent_list()
                                     if ag is not None}
                    else:
                        agent_set = set(entry['agents'].values())
                        if result.result_type == 'relations' \
                                and entry['type'] == 'Complex':
                            entry['agents'] = self.agent_dict
                    # Drop entries whose agents are a strict subset of the
                    # queried agents.
                    if agent_set < self.agent_set:
                        result.results.pop(key, None)
                        continue

                # Construct the english.
                if result.result_type == 'statements':
                    if stmt is None:
                        stmt = stmts_from_json([entry])[0]
                    eng = _format_stmt_text(stmt)
                    entry['evidence'] = _format_evidence_text(stmt)
                else:
                    eng = _make_english_from_meta(entry['agents'],
                                                  entry.get('type'))
                if not eng:
                    logger.warning(f"English not formed for {key}:\n"
                                   f"{entry}")
                entry['english'] = eng

            # Filter out medscan if user does not have medscan privileges.
            if not self.has['medscan']:
                if result.result_type == 'statements':
                    result.source_counts[key].pop('medscan', 0)
                else:
                    result.evidence_counts[key] -= \
                        entry['source_counts'].pop('medscan', 0)
                    entry['total_count'] = result.evidence_counts[key]
                    if not entry['source_counts']:
                        logger.warning("Censored content present.")

            # In most cases we can stop here
            if self.has['elsevier'] and self.fmt != 'json-js' \
                    and not self.w_english:
                continue

            if result.result_type == 'statements':
                # If there is evidence, loop through it if necessary.
                for ev_json in entry['evidence'][:]:
                    if self.fmt == 'json-js':
                        # JS cannot represent 64-bit int hashes exactly.
                        ev_json['source_hash'] = str(ev_json['source_hash'])

                    # Check for elsevier and redact if necessary
                    if not self.has['elsevier'] and \
                            get_source(ev_json) == 'elsevier':
                        text = ev_json['text']
                        if len(text) > 200:
                            ev_json['text'] = text[:200] + REDACT_MESSAGE
                            elsevier_redactions += 1
            elif result.result_type != 'hashes' and self.fmt == 'json-js':
                # Stringify lists of hashes.
                if 'hashes' in entry and entry['hashes'] is not None:
                    entry['hashes'] = [str(h) for h in entry['hashes']]
                elif 'hash' in entry:
                    entry['hash'] = str(entry['hash'])

    if result.result_type == 'statements':
        logger.info(f"Redacted {elsevier_redactions} pieces of elsevier "
                    f"evidence.")

    logger.info(f"Process entries for {self.__class__.__name__} after "
                f"{sec_since(self.start_time)} seconds.")
    return