# Module-level imports assumed by the helpers below; get_headers(), get_text(),
# log(), application, and util are defined elsewhere in this repo.
import base64
import datetime
import json
import re
import time
import uuid

import dateparser  # third-party date parsing for FHIR timestamps
import requests
from bson.json_util import dumps  # Mongo-friendly serializer used in get_results()
from flask import Response


def submit_test(nlpql):
    """ Testing ClarityNLP job """
    url = util.claritynlp_url + 'nlpql_tester'
    log('URL from submit_test: "{0}"'.format(url))
    token, oauth = util.app_token()
    response = requests.post(url, headers=get_headers(token), data=nlpql)
    if response.status_code == 200:
        data = response.json()
        if 'success' in data:
            if not data['success']:
                log(data['error'], util.ERROR)
                return False, data['error']
        if 'valid' in data:
            if not data['valid']:
                log(data['valid'])
                return False, data['valid']
        # log("\n\nJob Response:\n")
        # log(data)
        return True, data
    else:
        log(response.status_code)
        log(response.reason)
        return False, {
            'success': False,
            'status_code': response.status_code,
            'reason': str(response.reason),
            'valid': False
        }

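# Usage sketch (hypothetical, not part of the service API): validate a
# minimal NLPQL snippet against a running ClarityNLP instance. The NLPQL
# text below is an illustrative placeholder, not a query from this repo.
def _example_submit_test():
    sample_nlpql = 'phenotype "NLPaaS smoke test" version "1";'
    ok, result = submit_test(sample_nlpql)
    if ok:
        log('NLPQL is valid')
    else:
        log('NLPQL rejected: {}'.format(result), util.ERROR)
    return ok
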
def submit_job(nlpql_json):
    """ Submitting ClarityNLP job """
    url = util.claritynlp_url + 'phenotype?background=false'
    log('URL from submit_job: "{0}"'.format(url))
    phenotype_string = json.dumps(nlpql_json)
    log("POSTing phenotype:")
    log(nlpql_json.get('name'))
    log("")
    token, oauth = util.app_token()
    response = requests.post(url,
                             headers=get_headers(token),
                             data=phenotype_string)
    if response.status_code == 200:
        data = response.json()
        if 'success' in data:
            if not data['success']:
                log(data['error'], util.ERROR)
                return False, data['error']
        # log("\n\nJob Response:\n")
        # log(data)
        return True, data
    else:
        log(response.status_code)
        log(response.reason)
        return False, response.reason

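# Usage sketch (hypothetical): submit a phenotype defined as a JSON dict.
# The minimal structure shown here is an assumption; submit_job only requires
# that the dict be JSON-serializable and carry a 'name' for logging.
def _example_submit_job():
    phenotype = {
        'name': 'example-phenotype',
        'nlpql': 'phenotype "example" version "1";'
    }
    ok, result = submit_job(phenotype)
    if ok:
        log('job accepted: {}'.format(result))
    return ok
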
def get_reports(source_id):
    """ Get reports based on generated source """
    url = '{}/select?indent=on&q=source:{}&wt=json&rows=1000'.format(
        util.solr_url, source_id)
    token, oauth = util.app_token()
    response = requests.get(url, headers=get_headers(token))
    if response.status_code == 200:
        res = response.json()['response']
        if not res:
            res = {'docs': []}
        return True, res['docs']
    else:
        return False, {'reason': response.reason}

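# Usage sketch (hypothetical): fetch the documents previously uploaded under
# a generated source id and count them.
def _example_get_reports(source_id):
    ok, docs = get_reports(source_id)
    if ok:
        log('{} docs found for source {}'.format(len(docs), source_id))
        return docs
    log('Solr query failed: {}'.format(docs['reason']), util.ERROR)
    return []
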
def delete_report(source_id):
    """ Deleting reports based on generated source """
    url = util.solr_url + 'update?commit=true'
    log('URL from delete_report: "{0}"'.format(url))
    # Use Solr's JSON delete-by-query command. The previous code JSON-encoded
    # an XML <delete> string, which just produces a quoted string that Solr's
    # JSON update handler rejects.
    data = {'delete': {'query': 'source:{}'.format(source_id)}}
    token, oauth = util.app_token()
    response = requests.post(url,
                             headers=get_headers(token),
                             data=json.dumps(data, indent=4))
    if response.status_code == 200:
        return True, response.reason
    else:
        return False, response.reason

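# Usage sketch (hypothetical): typical cleanup after a job finishes, so
# temporary NLPaaS documents do not accumulate in Solr.
def _example_cleanup(source_id):
    ok, reason = delete_report(source_id)
    if not ok:
        log('cleanup failed for source {}: {}'.format(source_id, reason),
            util.ERROR)
    return ok
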
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/4.3.1/css/bootstrap.min.css">
</head>
<body>
<div class="container">
    <h1>Upload CSV for NLPQL Form Parsing</h1>
    <br>
    <div>
        <a href="https://github.com/ClarityNLP/Utilities/blob/master/custom_query/afib.csv" target="_blank">
            (See sample CSV)
        </a>
    </div>
    <form method=post enctype=multipart/form-data>
        <br>
        <input type=text name="formname" class="form-control" placeholder="Form Name">
        <br>
        <input type=file name=file class="form-control-file">
        <br>
        <input type=submit value=Upload class="btn btn-primary">
    </form>
</div>
</body>
'''


if __name__ == '__main__':
    util.app_token()
    # Attach the logger before run(); application.run() blocks, so anything
    # placed after it only executes on shutdown.
    util.set_logger(application.logger)
    application.run(host='0.0.0.0', port=5000, debug=True)

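# Hypothetical smoke test for the CSV upload form rendered above, assuming
# the service is running locally on port 5000. '<form-route>' stands in for
# the Flask route that serves this form; it is not shown in this snippet.
#
#   curl -F "formname=afib" -F "file=@afib.csv" http://localhost:5000/<form-route>
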
def get_results(job_id: int,
                source_data=None,
                report_ids=None,
                return_only_if_complete=False,
                patient_id=-1,
                name="NLPAAS Job"):
    """
    Reading Results from Mongo
    TODO use API endpoint
    """
    log('** JOB ID **')
    log(job_id)
    if not source_data:
        source_data = list()
    if not report_ids:
        report_ids = list()

    # Keep the status path separate from the 'status' value parsed below.
    status_path = "status/%s" % job_id
    url = util.claritynlp_url + status_path
    log('URL from get_results: "{0}"'.format(url))

    # Polling for job completion
    status = ''
    while True:
        time.sleep(3.0)
        token, oauth = util.app_token()
        r = oauth.get(url)
        if r.status_code != 200:
            return Response(
                json.dumps({
                    'message': 'Could not query job status from NLP API. '
                               'Reason: ' + r.reason
                }, indent=4),
                status=500,
                mimetype='application/json'), False
        try:
            status = r.json()["status"]
        except Exception as ex1:
            log(ex1, util.ERROR)
        if status == "COMPLETED":
            break
        if return_only_if_complete:
            break

    if return_only_if_complete and status != "COMPLETED":
        # The old string template crashed here: str.format() chokes on the
        # literal JSON braces. Build the body with json.dumps instead.
        return json.dumps({
            'job_completed': False,
            'job_id': job_id,
            'status': status
        }, indent=4), False

    time.sleep(3)

    # Fetch paged results from
    # /phenotype_paged_results/<int:job_id>/<string:phenotype_final_str>,
    # pulling both final ('true') and intermediate ('false') pages.
    results = list()
    final_list = list()
    n = 0
    r_formatted = ''
    while n < 10:
        url = "{}phenotype_paged_results/{}/{}".format(
            util.claritynlp_url, job_id, 'true')
        url2 = "{}phenotype_paged_results/{}/{}".format(
            util.claritynlp_url, job_id, 'false')
        response = oauth.get(url)
        response2 = oauth.get(url2)
        if response.status_code == 200:
            results.extend(response.json()['results'])
        if response2.status_code == 200:
            results.extend(response2.json()['results'])
        if len(results) > 0:
            break
        else:
            time.sleep(2.0)
            n += 1

    try:
        log('')
        log('total results for {}: {}'.format(name, len(results)))
        log('')
        log('')
        if len(results) == 0:
            return json.dumps({
                'success': True,
                'message': 'No results found for job id {}'.format(job_id)
            }), False
        for r in results:
            # log('** REPORT (R)**', util.INFO)
            # log(r, util.INFO)
            r_formatted = json.dumps(r, indent=4)
            report_id = r['report_id']
            source = r['source']

            # Three types of result objects 'r' to handle:
            # 1) Result objects from ClarityNLP, computed via NLPQL.
            #    These have been ingested into Solr, static;
            #    all the expected fields are present.
            # 2) Result objects temporarily loaded into Solr via JSON blob.
            #    The JSON blob is POSTed to NLPaaS; the doc_index and source
            #    fields are constructed differently from the normal Solr
            #    ingest process.
            # 3) Result objects obtained from a FHIR server via a CQL call.
            #    CQLExecutionTask returns this data; there is no underlying
            #    source document at all, so no report_text.
            pipeline_type = r['pipeline_type'].lower()
            if 'cqlexecutiontask' == pipeline_type:
                # no source docs, data obtained via CQL query
                r['report_text'] = ''
                r['report_type'] = r.get('resourceType', 'Unknown')
                final_list.append(r)
                continue

            if len(report_ids) > 0 and report_id not in report_ids:
                continue

            # compute the doc_index encoded in the source field
            try:
                doc_index = int(
                    report_id.replace(source, '').replace('_', '')) - 1
            except ValueError as ve:
                doc_index = -1
                log("non-integer source index", util.ERROR)
                log(ve, util.ERROR)
                log(r_formatted)
            if doc_index == -1 and patient_id != -1:
                r['report_text'] = ''
            elif len(source_data) > 0 and doc_index < len(source_data):
                source_doc = source_data[doc_index]
                r['report_text'] = source_doc['report_text']
            else:
                r['report_text'] = r['sentence']
            final_list.append(r)

        result_string = dumps(final_list)
        return result_string, True
    except Exception as ex:
        log(ex, util.ERROR)
        log(r_formatted)
        return json.dumps({
            'success': False,
            'message': str(ex)
        }), False

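# Usage sketch (hypothetical): poll a submitted job once and bail out early
# if it has not completed, rather than blocking in the polling loop.
def _example_poll_once(job_id):
    body, ok = get_results(job_id, return_only_if_complete=True)
    if not ok:
        log('job {} not complete yet'.format(job_id))
    return body, ok
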
def upload_reports(data, access_token=None):
    """ Uploading reports with unique source """
    url = util.solr_url + 'update?commit=true&commitWithin=10000'
    log('URL from upload_reports: "{0}"'.format(url))

    # Generating a source_id
    rand_uuid = uuid.uuid1()
    source_id = str(rand_uuid)

    payload = list()
    report_list = list()
    nlpaas_id = 1
    fhir_resource = False
    # print('**DATA**')
    # print(json.dumps(data, indent=4))
    for report in data['reports']:
        report_id = '{}_{}'.format(source_id, str(nlpaas_id))
        json_body = {
            "report_type": "ClarityNLPaaS Document",
            "id": report_id,
            "report_id": report_id,
            "source": source_id,
            "nlpaas_id": str(nlpaas_id),
            "subject": "ClarityNLPaaS Subject",
            "report_date": "1970-01-01T00:00:00Z",
            'original_report_id': ''
        }
        if isinstance(report, str):
            json_body["report_text"] = report
        else:
            resource_type = ''
            if 'resource' in report:
                report_resource = report.get('resource')
                report_resource['fullUrl'] = report.get('fullUrl')
                report = report_resource
            if 'resourceType' in report:
                resource_type = report['resourceType']

            report_date = None
            if 'created' in report:
                report_date = dateparser.parse(report['created'])
            if not report_date and 'indexed' in report:
                report_date = dateparser.parse(report['indexed'])
            if report_date:
                # The 'report_date' variable is a python 'datetime' object.
                # For Solr ingest, need to:
                #     1) convert to UTC
                #     2) format as Solr wants it
                utc_report_date = report_date.astimezone(
                    tz=datetime.timezone.utc)
                json_body['report_date'] = utc_report_date.strftime(
                    '%Y-%m-%dT%H:%M:%SZ')

            if 'subject' in report:
                if 'reference' in report['subject']:
                    subject = report['subject']['reference']
                else:
                    subject = str(report['subject'])
                if '/' in subject:
                    # subject usually returned as 'Patient/12345' or similar
                    subject = subject.split('/')[-1]
                json_body['subject'] = subject

            if 'id' in report:
                json_body['original_report_id'] = str(report['id'])
            if 'type' in report:
                if 'coding' in report['type'] and len(
                        report['type']['coding']) > 0:
                    coded_type = report['type']['coding'][0]
                    if 'display' in coded_type:
                        json_body['report_type'] = coded_type['display']
            if resource_type in ('DocumentReference', 'DiagnosticReport'):
                fhir_resource = True

            txt = ''
            # log('** REPORT **')
            # log(report)
            if 'content' in report:
                for c in report['content']:
                    attachment = c.get('attachment', None)
                    if attachment:
                        content_type = attachment.get('contentType',
                                                      'text/plain')
                        report_data = attachment.get('data', None)
                        if content_type and report_data:
                            decoded_txt = base64.b64decode(
                                report_data).decode("utf-8")
                            if content_type == 'application/pdf':
                                if 'url' in c['attachment']:
                                    # Use a distinct name here; reusing 'url'
                                    # would clobber the Solr update URL above.
                                    attachment_url = attachment.get('url')
                                    types = [
                                        'application/json+fhir',
                                        content_type
                                    ]
                                    txt = ''
                                    for t in list(set(types)):
                                        if txt == '':
                                            if access_token:
                                                headers = {
                                                    'Accept': t,
                                                    'Authorization':
                                                        'Bearer {}'.format(
                                                            access_token)
                                                }
                                            else:
                                                headers = {'Accept': t}
                                            if 'json' in t:
                                                txt = get_text(
                                                    attachment_url,
                                                    headers,
                                                    key='content')
                                            else:
                                                txt = get_text(
                                                    attachment_url, headers)
                            elif 'xml' in content_type or 'html' in content_type:
                                clean_txt = re.sub('<[^<]+?>', '',
                                                   decoded_txt)
                                txt += clean_txt
                                txt += '\n'
                            else:
                                txt += decoded_txt
                                txt += '\n'
                        elif 'data' in c['attachment']:
                            decoded_txt = base64.b64decode(
                                report_data).decode("utf-8")
                            txt += decoded_txt
                            txt += '\n'
                json_body["report_text"] = txt
            else:
                json_body["report_text"] = str(report)

        if len(json_body["report_text"]) > 0:
            payload.append(json_body)
            report_list.append(report_id)
            nlpaas_id += 1

    log('{} total documents'.format(len(payload)))
    # log('** PAYLOAD **', util.INFO)
    # log(payload, util.INFO)
    if len(payload) > 0:
        token, oauth = util.app_token()
        log('uploading solr docs...')
        response = requests.post(url,
                                 headers=get_headers(token),
                                 data=json.dumps(payload, indent=4))
        if response.status_code == 200:
            the_time = 0
            while True:
                log('checking for solr upload...')
                # Build a Solr JSON query for the uploaded docs; use a new
                # name so the 'data' argument is not clobbered.
                solr_query = dict()
                solr_query['query'] = "*:*"
                solr_query['params'] = {'wt': 'json'}
                solr_query['filter'] = 'source:"{}"'.format(source_id)
                doc_results = 0
                try:
                    post_data = json.dumps(solr_query, indent=4)
                    response = requests.post(util.solr_url + '/select',
                                             headers=get_headers(token),
                                             data=post_data)
                    # log(response.text)
                    res = response.json().get('response', None)
                    if res:
                        doc_results = int(res.get('numFound', 0))
                except Exception as ex:
                    log(ex, util.ERROR)
                    log("unable to query docs", util.ERROR)
                if doc_results > 0:
                    log("documents uploaded {}".format(doc_results),
                        util.INFO)
                    break
                the_time += 1
                time.sleep(1)
                if the_time > 15:
                    log("documents not yet loaded in 15 sec", util.ERROR)
                    break
            return True, source_id, report_list, fhir_resource, payload
        else:
            return False, response.reason, report_list, fhir_resource, payload
    else:
        return True, "All documents were empty or invalid, or no documents were passed in.", \
            report_list, fhir_resource, payload

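# Usage sketch (hypothetical): end-to-end flow tying the helpers above
# together - upload documents, run a phenotype, read results, then clean up.
# The input shape ({'reports': [...]}) matches what upload_reports expects;
# the phenotype dict and the 'job_id' field on the job response are
# assumptions for illustration.
def _example_pipeline(report_texts, phenotype_json):
    ok, source_id, report_list, fhir_resource, payload = upload_reports(
        {'reports': report_texts})
    if not ok or len(payload) == 0:
        return None
    try:
        submitted, job = submit_job(phenotype_json)
        if not submitted:
            return None
        # Assumes the ClarityNLP job response carries a 'job_id' field.
        results, success = get_results(int(job['job_id']),
                                       source_data=payload,
                                       report_ids=report_list)
        return results if success else None
    finally:
        # Always remove the temporary Solr documents.
        delete_report(source_id)
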