def get_nct_list_from_zip(input_zip, mile_range=50):
    '''
    find the trials that list at least one site zip code near a zip code

    :param input_zip: zip code to search around
    :param mile_range: search radius in miles handed to
        ``ZipCodeDatabase.get_zipcodes_around_radius`` (default 50)
    :return: ``[trial_list, zip_list]`` -- ``trial_list`` holds
        ``'<nct_id>;<n>'`` strings (``n`` is a 1-based running number, kept
        for the current front end) and ``zip_list`` holds the nearby zip codes
    '''
    zcdb = ZipCodeDatabase()
    zip_list = [
        z.zip for z in zcdb.get_zipcodes_around_radius(input_zip, mile_range)
    ]
    # set lookup: every (trial, zip) row below is tested against it
    nearby_zips = set(zip_list)

    # recursive CTE that splits the '|'-separated zip_codes column into one
    # (nct_id, zip) row per zip code
    sql = '''
        ;with cte (code, DeclarationItem, Declaration) as
        (
            select nct_id,
                cast(left(zip_codes, charindex('|',zip_codes+'|')-1) as varchar(50)) DeclarationItem,
                stuff(zip_codes, 1, charindex('|',zip_codes+'|'), '') Declaration
            from dbo.aact_trial_info
            union all
            select code,
                cast(left(Declaration, charindex('|',Declaration+'|')-1) as varchar(50)) DeclarationItem,
                stuff(Declaration, 1, charindex('|',Declaration+'|'), '') Declaration
            from cte
            where Declaration > ''
        )
        select code as nct_id, DeclarationItem as zip
        from cte
        order by nct_id asc
        option (maxrecursion 0);
    '''
    conn = general_pool_criteria.connection()
    try:
        cur = conn.cursor()
        try:
            cur.execute(sql)
            trial_zips = cur.fetchall()
        finally:
            cur.close()
    finally:
        conn.close()

    # compare nearby zip codes to trial zip codes; rows arrive ordered by
    # nct_id, so keeping first-seen order makes the numbering deterministic
    seen = set()
    nearby_nct_list = []
    for row in trial_zips:
        nct_id, raw_zip = row[0], row[1]
        if raw_zip is None:
            continue
        # some zip codes are stored as 'xxxxx-yyyy'; only the primary
        # 5-digit part can match the zip code database
        primary_zip = raw_zip.split('-')[0].strip()
        if primary_zip in nearby_zips and nct_id not in seen:
            seen.add(nct_id)
            nearby_nct_list.append(nct_id)

    # temporary numbering of trials, to be removed in the future
    temp_return_list = [
        '%s;%d' % (nct_id, number)
        for number, nct_id in enumerate(nearby_nct_list, 1)
    ]
    # the nearby zip code list is returned as well
    return [temp_return_list, zip_list]
def statistics():
    '''
    render the usage statistics page from the dbo.access_stat table

    :return: the rendered 'statistics.html' template; its ``parameters``
        dict carries the per-zip-code, per-age and per-answer counts
    '''

    def fetch_column(cursor, column):
        # ``column`` is always one of the literals below, never user input
        cursor.execute("select %s from dbo.access_stat" % column)
        return [row[0] for row in cursor.fetchall()]

    def to_ascii(values):
        # NOTE(review): under Python 3 ``encode`` yields bytes, so the
        # counters below are keyed by bytes -- kept as-is because the
        # template is not visible from here; confirm before changing
        return [value.encode('ascii', 'ignore') for value in values]

    conn = general_pool_criteria.connection()
    try:
        cur = conn.cursor()
        try:
            locations = fetch_column(cur, 'location')
            trial_types = fetch_column(cur, 'trial_type')
            ages = fetch_column(cur, 'age')
            exposures = fetch_column(cur, 'exposure')
            domains = fetch_column(cur, 'domain')
            stats = fetch_column(cur, 'stat')
            pregs = fetch_column(cur, 'preg')
        finally:
            # close the cursor before the connection goes back to the pool
            cur.close()
    finally:
        conn.close()

    parameters = {}

    # zip codes, least frequent first
    zip_counts = Counter(str(value) for value in to_ascii(locations))
    by_count = sorted(zip_counts.items(), key=lambda pair: pair[1])
    parameters['zip_code_range'] = [pair[0] for pair in by_count]
    parameters['zip_code_count'] = [pair[1] for pair in by_count]

    parameters['type_trials_dict'] = Counter(to_ascii(trial_types))

    # ages, youngest first; 99990 is dropped from the chart
    # (presumably the bucketed "age unknown" marker -- TODO confirm)
    age_counts = Counter(int(value) for value in ages)
    age_counts.pop(99990, None)
    by_age = sorted(age_counts.items(), key=lambda pair: pair[0])
    parameters['age_range'] = [pair[0] for pair in by_age]
    parameters['age_count'] = [pair[1] for pair in by_age]

    parameters['exposure_dict'] = Counter(to_ascii(exposures))
    parameters['domain_dict'] = Counter(to_ascii(domains))
    parameters['stat_dict'] = Counter(to_ascii(stats))
    parameters['preg_dict'] = Counter(to_ascii(pregs))

    return render_template('statistics.html', parameters=parameters)
def start_question_detail():
    '''
    record the pre-question answers and return the first dynamic question

    Reads the answers from the request query string, stores one row in
    dbo.access_stat, then builds the working trial list from the session
    and asks for its first question.

    :return: JSON with ``question_answer_list`` and ``working_nct_id_list``
    '''
    args = request.args
    age = args.get('age')
    exposure = args.get('exposure')
    domain = args.get('domain')
    user_picked_time = args.get('user_picked_time')
    stat = args.get('stat')
    preg = args.get('preg')
    miles = args.get('miles')
    locn = args.get('locn')
    trial_type = args.get('trial_type')
    # the filter receives the answers exactly as submitted
    pre_quest_answers = [age, exposure, domain, user_picked_time, stat, preg]

    # Save user access info into the database
    rnd = ''.join(str(random.choice(range(6))) for _ in range(6))
    session_id = rnd + time.strftime('%Y%m%d%H%M%S')
    # ages are stored per decade; everything below 10 goes into bucket 10
    age_number = int(age)
    if 0 <= age_number < 10:
        age_bucket = 10
    else:
        age_bucket = int(age_number / 10) * 10

    # every value comes from the query string, so it is bound as a
    # parameter; str() keeps storing 'None' for a missing answer as before
    sql = "insert into dbo.access_stat values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
    row = [
        session_id,
        str(locn),
        str(miles),
        str(trial_type),
        str(age_bucket),
        str(exposure),
        str(domain),
        user_picked_time or '',
        str(stat),
        str(preg),
    ]
    conn = general_pool_criteria.connection()
    try:
        cur = conn.cursor()
        try:
            cur.execute(sql, row)
            conn.commit()
        finally:
            cur.close()
    finally:
        conn.close()

    # get trials and tags
    rnct = session.get('working_nct_list_pre_process')
    working_nct_id_list = qst.init_working_nct_id_list(rnct, pre_quest_answers)
    question_answer_list = []
    if len(working_nct_id_list) > 0:
        question_answer_list = qst.find_new_question(question_answer_list,
                                                     working_nct_id_list)
    log.info('%s -- first question', request.remote_addr)
    return jsonify(question_answer_list=question_answer_list,
                   working_nct_id_list=working_nct_id_list)
def find_nct_loc_within_range(loc_latlng_list, nct_id_list,
                              input_zipcode_latlng, input_miles):
    '''
    keep the trial locations that lie within a distance of a zip code

    :param loc_latlng_list: [{'lat': , 'lng': }]; an entry may be None when
        the location has no coordinates
    :param nct_id_list: trial ids, index-aligned with loc_latlng_list
    :param input_zipcode_latlng: {'lat': , 'lng': }
    :param input_miles: number
    :return: [locations, nct_ids, titles] for the locations in range; a
        title is None when the trial has no row in dbo.aact_trial_info
    '''
    lat1 = input_zipcode_latlng['lat']
    lng1 = input_zipcode_latlng['lng']
    # NOTE(review): the original comment reads "allow distance error +3",
    # but adding 3 to each distance shrinks the effective radius -- confirm
    distance_list = [
        haversine(lng1, lat1, loc['lng'], loc['lat']) + 3
        if loc is not None else 999999
        for loc in loc_latlng_list
    ]
    filter_index = np.where(np.array(distance_list) <= input_miles)[0]
    loc_within_range = list(np.array(loc_latlng_list)[filter_index])
    nct_within_range = list(np.array(nct_id_list)[filter_index])
    if len(filter_index) == 0:
        # nothing in range: skip the title lookup entirely
        return [loc_within_range, nct_within_range, []]

    # get trial title
    conn = general_pool_criteria.connection()
    try:
        cur = conn.cursor()
        try:
            cur.execute('''
                select nct_id, official_title
                from dbo.aact_trial_info
            ''')
            trial_title = cur.fetchall()
        finally:
            cur.close()
    finally:
        conn.close()

    nct_id_title = {row[0]: row[1] for row in trial_title}
    nct_title_within_range = [
        nct_id_title.get(nct_id) for nct_id in nct_within_range
    ]
    return [loc_within_range, nct_within_range, nct_title_within_range]
def get_nct_list_from_keywords(keyword_string):
    '''
    find the trials whose text fields mention every given keyword

    A keyword matches a trial when it is a case-insensitive substring of
    the condition names, intervention names, official title or outcome
    measure. Blank keywords are ignored; with no keyword at all every
    trial is returned.

    :param keyword_string: keywords separated by ';'
    :return: list of nct ids in ascending order
    '''
    keywords = [part.strip().lower() for part in keyword_string.split(';')]
    keywords = [keyword for keyword in keywords if keyword]

    searched_columns = ('condition_names', 'intervention_names',
                        'official_title', 'outcome_measure')
    # one "(col like ? or col like ? ...)" group per keyword, all ANDed
    keyword_clause = '(%s)' % ' or '.join(
        'lower(%s) like ?' % column for column in searched_columns)

    key_search = ''
    params = []
    if keywords:
        key_search = 'where ' + ' and '.join([keyword_clause] * len(keywords))
        for keyword in keywords:
            # keywords are bound as parameters, never spliced into the SQL;
            # '%' and '_' inside a keyword still act as LIKE wildcards
            params.extend(['%' + keyword + '%'] * len(searched_columns))

    sql = '''
        select nct_id
        from dbo.aact_trial_info
        %s
        order by nct_id asc
    ''' % key_search

    # send query to server
    conn = general_pool_criteria.connection()
    try:
        cur = conn.cursor()
        try:
            cur.execute(sql, params)
            key_trials = cur.fetchall()
        finally:
            cur.close()
    finally:
        conn.close()

    return [row[0] for row in key_trials]
def query_trial_info_for_modal(nct_id):
    '''
    load the descriptive fields of one trial for the detail modal

    :param nct_id: id of the trial to look up
    :return: the selected columns of the trial as a list, in the order of
        the select below, or ['Error'] unless exactly one row matches
    '''
    # nct_id is bound as a parameter instead of being formatted into the SQL
    sql = '''
        select official_title, study_type, primary_purpose, study_description,
            gender, minimum_age, maximum_age, healthy_volunteers, phase,
            allocation, intervention_model, observation_model, masking,
            outcome_measure, outcome_description, facilities_and_contacts,
            intervention_names, central_contacts
        from dbo.aact_trial_info
        where nct_id = ?
    '''
    conn = general_pool_criteria.connection()
    try:
        cur = conn.cursor()
        try:
            cur.execute(sql, [nct_id])
            trial_data = cur.fetchall()
        finally:
            cur.close()
    finally:
        conn.close()

    if len(trial_data) == 1:
        return list(trial_data[0])
    return ['Error']
def find_annotated_nct_id_list():
    '''
    find annotated working nct id list

    :param: none
    :return: the ids of the trials in dbo.aact_trial_info_us with status
        'Recruiting' that also appear in dbo.all_criteria
    '''
    sql = '''
        select distinct(nct_id)
        from dbo.aact_trial_info_us
        where status = 'Recruiting'
        and nct_id in (
            select distinct(nct_id) from dbo.all_criteria
        );
    '''
    conn = general_pool_criteria.connection()
    try:
        cur = conn.cursor()
        try:
            cur.execute(sql)
            nctids = cur.fetchall()
        finally:
            # the cursor is closed before the connection is released
            cur.close()
    finally:
        conn.close()
    return [str(row[0]) for row in nctids]
def update_working_nct_id_list(question_answer_list, working_nct_id_list):
    '''
    update working_nct_id_list by comparing question_answer_list with
    criteria knowledge base

    Only the last entry of question_answer_list is evaluated. Every record
    whose trial matches the query and whose third field is still 0 gets
    that field set to the number of the question, in place.

    :param question_answer_list: list of
        {'question': {'entity_text': , 'domain': }, 'answer': {...}} dicts
    :param working_nct_id_list: list of mutable records (lists) with the
        nct id at index 0 and the question marker at index 2
    :return: the same working_nct_id_list object
    :raises ValueError: if the domain is not a plain identifier, or a
        numeric answer cannot be converted to a number
    '''
    question_number = len(question_answer_list)
    if question_number == 0:
        return working_nct_id_list

    this_qa = question_answer_list[-1]
    this_entity_text = this_qa['question']['entity_text']
    this_domain = this_qa['question']['domain']
    if 'answer' not in this_qa:
        return working_nct_id_list
    this_answer = this_qa['answer']
    this_include = this_answer['include']

    # the domain becomes part of a table name, which cannot be bound as a
    # parameter, so it is restricted to letters, digits and underscores
    if not this_domain.replace('_', '').isalnum():
        raise ValueError('invalid question domain: %r' % (this_domain,))
    table_name = 'dbo.dquest_omop_clean_' + this_domain

    if this_domain.lower() != 'measurement':
        if this_include == 'INC':
            rangeend = float(this_answer.get('rangeend', 0))
            sql = '''
                select distinct nctid
                from %s
                where concept_cluster_name in (?)
                and (
                    (flag = 0 and beforedays >= ?)
                    or (flag = 1 and beforedays < ?)
                )
            ''' % table_name
            params = [this_entity_text, rangeend, rangeend]
        else:
            sql = '''
                select distinct nctid
                from %s
                where concept_cluster_name in (?)
                and (flag = 1)
            ''' % table_name
            params = [this_entity_text]
    elif 'measurement_value' in this_answer and this_include == 'INC':
        measurement_value = float(this_answer['measurement_value'])
        sql = '''
            select distinct nctid
            from %s
            where concept_cluster_name in (?)
            and (
                (flag = 0 and (min <= ? and max >= ?))
                or (flag = 1 and (min > ? or max < ?))
            )
        ''' % table_name
        params = [this_entity_text] + [measurement_value] * 4
    else:
        # a measurement question without a usable value filters nothing
        # (this used to be a "select top(0)" round trip)
        return working_nct_id_list

    conn = general_pool_criteria.connection()
    try:
        cur = conn.cursor()
        try:
            cur.execute(sql, params)
            details = cur.fetchall()
        finally:
            cur.close()
    finally:
        conn.close()

    filtered_nct_id = set(row[0] for row in details)
    for nct_record in working_nct_id_list:
        if nct_record[2] == 0 and nct_record[0] in filtered_nct_id:
            nct_record[2] = question_number
    return working_nct_id_list
def find_new_question(question_answer_list, working_nct_id_list, domain='all'):
    '''
    find new question by frequency. alternatively, information entropy
    should be considered sum(plog(p))

    The concept mentioned by the most still-open trials (third field of the
    record equal to 0) and not asked before becomes the next question.
    When nothing is left to ask the question text is 'NQF'.

    :param question_answer_list: questions already answered or skipped,
        each a {'question': {'domain': , 'entity_text': }, ...} dict
    :param working_nct_id_list: records with the nct id at index 0 and the
        question marker at index 2
    :param domain: 'all' (only dbo.dquest_omop_clean_condition is queried)
        or the suffix of the dbo.dquest_omop_clean_<domain> table to use
    :return: question_answer_list itself, with the new question appended
    :raises ValueError: if domain is not a plain identifier
    '''
    # the server rejects statements with much more than 2000 parameters, so
    # only the first 2000 open trials are considered
    max_ids = 2000

    domain = domain.lower()
    single_domain = domain != 'all'
    if single_domain and not domain.replace('_', '').isalnum():
        # the domain ends up in a table name and cannot be bound
        raise ValueError('invalid question domain: %r' % (domain,))

    open_ids = [
        record[0] for record in working_nct_id_list if record[2] == 0
    ][:max_ids]

    if single_domain:
        table_name = 'dbo.dquest_omop_clean_' + domain
        asked = [
            qa['question']['entity_text'] for qa in question_answer_list
            if qa['question']['domain'] == domain
        ]
        fallback_domain = domain
    else:
        table_name = 'dbo.dquest_omop_clean_condition'
        asked = [qa['question']['entity_text'] for qa in question_answer_list]
        fallback_domain = 'condition'

    next_concept = []
    if open_ids:
        # an empty id list would render as "in ()", which is invalid SQL
        filters = [
            'nctid in (%s)' % ','.join('?' * len(open_ids)),
            'concept_cluster_name is NOT NULL',
        ]
        params = list(open_ids)
        if single_domain:
            filters.append('domain = ?')
            params.append(domain)
        if asked:
            filters.append(
                'concept_cluster_name not in (%s)' % ','.join('?' * len(asked)))
            params.extend(asked)
        columns = 'concept_cluster_name'
        if not single_domain:
            columns = 'concept_cluster_name, domain'
        sql = '''
            select top(1) count(distinct nctid) AS count, %s
            from %s
            where %s
            group by %s
            order by count(distinct nctid) desc
        ''' % (columns, table_name, ' and '.join(filters), columns)

        conn = general_pool_criteria.connection()
        try:
            cur = conn.cursor()
            try:
                cur.execute(sql, params)
                next_concept = cur.fetchall()
            finally:
                cur.close()
        finally:
            conn.close()

    if len(next_concept) > 0:
        best = next_concept[0]
        question_domain = domain if single_domain else best[2].lower()
        this_q = {'question': {'domain': question_domain,
                               'entity_text': best[1]}}
    else:
        this_q = {'question': {'domain': fallback_domain,
                               'entity_text': 'NQF'}}
    question_answer_list.append(this_q)
    return question_answer_list
def find_active_nct_id_list(active_restriction, trial_type='all'):
    '''
    find the annotated trials, optionally restricted by status and type

    :param active_restriction: 'true' (or True) keeps only trials whose
        status is 'Recruiting', 'Enrolling by Invitation' or 'Available';
        anything else accepts every status present in dbo.aact_trial_info
    :param trial_type: 'all', 'intervention' or 'observation'
    :return: list of '<nct_id_desc>;<value>;<pt_cohort>' strings, where
        <value> is the piece of nct_id_desc (split on '_') that matched a
        trial id
    :raises ValueError: if trial_type is not one of the supported values
    '''
    study_types_by_trial_type = {
        'intervention': ['interventional', 'expanded access'],
        'observation': ['observational', 'observational [patient registry]'],
    }
    if trial_type != 'all' and trial_type not in study_types_by_trial_type:
        # an unknown type used to produce "in ()", i.e. invalid SQL
        raise ValueError('unsupported trial_type: %r' % (trial_type,))

    # creating set of trial statuses based on user entry
    active_only = active_restriction is True or active_restriction == 'true'
    params = []
    if active_only:
        statuses = ['Recruiting', 'Enrolling by Invitation', 'Available']
        trial_filter = 'status in (%s)' % ','.join('?' * len(statuses))
        params.extend(statuses)
    else:
        trial_filter = ('status in '
                        '(select distinct status from dbo.aact_trial_info)')

    # managing different trial type based on user entry
    if trial_type != 'all':
        study_types = study_types_by_trial_type[trial_type]
        trial_filter += ' and lower(study_type) in (%s)' % ','.join(
            '?' * len(study_types))
        params.extend(study_types)

    sql = '''
        select distinct nct_id_desc, pt_cohort, value
        from dbo.key_criteria_v2
        cross apply string_split(nct_id_desc, '_')
        where [value] in (
            select nct_id from dbo.aact_trial_info where %s
        )
        order by nct_id_desc;
    ''' % trial_filter

    conn = general_pool_criteria.connection()
    try:
        cur = conn.cursor()
        try:
            cur.execute(sql, params)
            nctids = cur.fetchall()
        finally:
            cur.close()
    finally:
        conn.close()

    return [
        str(row[0]) + ';' + str(row[2]) + ';' + str(row[1]) for row in nctids
    ]
def update_working_nct_id_list(question_answer_list, working_nct_id_list):
    '''
    update working_nct_id_list by comparing question_answer_list with
    criteria knowledge base (dbo.all_criteria_v2)

    Only the last entry of question_answer_list is evaluated. Every record
    whose trial is returned by the query gets its fourth field (index 3)
    set to the number of the question, in place.

    :param question_answer_list: list of
        {'question': {'entity_text': , 'domain': }, 'answer': {...}} dicts
    :param working_nct_id_list: list of mutable records (lists) with the
        nct id at index 0 and the question marker at index 3
    :return: the same working_nct_id_list object
    :raises ValueError: if 'rangeend' cannot be converted to a number
    '''
    import math

    question_number = len(question_answer_list)
    if question_number == 0:
        return working_nct_id_list

    this_qa = question_answer_list[-1]
    this_entity_text = this_qa['question']['entity_text']
    this_domain = this_qa['question']['domain']
    if 'answer' not in this_qa:
        return working_nct_id_list
    this_answer = this_qa['answer']
    this_include = this_answer['include']

    # every answer-derived value is bound with '?' so that quotes in a
    # concept name or answer cannot change the statement
    if this_domain.lower() != 'measurement':
        if this_include == 'INC':
            rangeend = float(this_answer.get('rangeend', 0))
            sql = '''
                select distinct nct_id_original
                from dbo.all_criteria_v2
                where (LOWER (concept_name) = LOWER (?)
                       and is_exclusion = 0
                       and concept_group_id is null
                       and (1 = case when before_days != 0 and (? > before_days)
                                then 1 else 0 end))
                   OR (lower(concept_name) = lower(?) and is_exclusion = 1)
            '''
            params = [this_entity_text, rangeend, this_entity_text]
        else:
            sql = '''
                select distinct nct_id_original
                from dbo.all_criteria_v2
                where lower(concept_name) = lower(?) and is_exclusion = 0
            '''
            params = [this_entity_text]
    elif 'measurement_value' in this_answer and this_include == 'INC':
        value_text = str(this_answer['measurement_value']).strip()
        try:
            numeric_value = float(value_text)
        except ValueError:
            numeric_value = None
        if numeric_value is not None and (math.isnan(numeric_value)
                                          or math.isinf(numeric_value)):
            # 'nan' / 'inf' parse as floats but are not measurements
            numeric_value = None

        if numeric_value is not None:
            sql = '''
                select distinct nct_id_original
                from dbo.all_criteria_v2
                where (lower(concept_name) = lower(?)
                       and ? <= numeric_att_max and ? >= numeric_att_min
                       AND is_exclusion = 1)
                   OR (lower(concept_name) = lower(?)
                       and (? > numeric_att_max or ? < numeric_att_min)
                       AND is_exclusion = 0 and concept_group_id is null)
            '''
            params = [this_entity_text, numeric_value, numeric_value,
                      this_entity_text, numeric_value, numeric_value]
        else:
            # qualitative result: a trial is filtered out when it requires
            # the opposite result or excludes the reported one
            answer_key = value_text.lower()
            if answer_key in ('pos', 'positive'):
                negative_flag, positive_flag = 0, 1
            elif answer_key in ('neg', 'negative'):
                negative_flag, positive_flag = 1, 0
            else:
                # neither numeric nor positive/negative: nothing can match
                return working_nct_id_list
            sql = '''
                select distinct nct_id_original
                from dbo.all_criteria_v2
                where lower(concept_name) = lower(?)
                and ((lower(numeric_source_text) = 'negative'
                      and is_exclusion = ?)
                  or (lower(numeric_source_text) like '%positiv%'
                      and is_exclusion = ?))
            '''
            params = [this_entity_text, negative_flag, positive_flag]
    else:
        # a measurement question without a usable value filters nothing
        # (this used to be a "select top(0)" round trip)
        return working_nct_id_list

    conn = general_pool_criteria.connection()
    try:
        cur = conn.cursor()
        try:
            cur.execute(sql, params)
            details = cur.fetchall()
        finally:
            cur.close()
    finally:
        conn.close()

    filtered_nct_id = set(row[0] for row in details)
    for nct_record in working_nct_id_list:
        if nct_record[0] in filtered_nct_id:
            nct_record[3] = question_number
    return working_nct_id_list
def filter_nct_ids_by_pre_questions(answer_list):
    '''
    find working nct id list after filter the answers to pre-questions

    The answer-dependent CASE expressions of the former query are resolved
    here, so the SQL contains only fixed fragments plus bound parameters
    and no answer text is ever spliced into the statement.

    :param answer_list: the list of the answers to pre-questions, in the
        order [age, exposure, domain, user_picked_time, stat, preg]
    :return: list of [nct_id, pt_cohort] pairs; empty when an answer is not
        one of the values the filter knows
    :raises ValueError: if the age is not a number
    '''
    answer_list = [str(x) for x in answer_list]
    age = answer_list[0]
    exposure = answer_list[1]
    domain = answer_list[2]
    user_picked_time = answer_list[3]
    stat = answer_list[4]
    preg = answer_list[5]

    def flag_filters(column):
        # yes -> flag 1, no -> flag 0, idk -> either; an unset flag always passes
        return {
            'yes': '(%s = 1 or %s is null)' % (column, column),
            'no': '(%s = 0 or %s is null)' % (column, column),
            'idk': '(%s in (1, 0) or %s is null)' % (column, column),
        }

    disease_filters = {
        'yes': "keyc.disease_status in ('yes', 'all')",
        'no': "keyc.disease_status in ('no', 'all')",
        'cleared': "keyc.disease_status in ('cleared', 'all')",
        'idk': "keyc.disease_status in ('yes','no','cleared', 'all')",
    }
    preg_filters = flag_filters('keyc.preg_status')
    preg_filters['n/a'] = preg_filters['no']

    conditions = []
    params = []

    # 99999 means "no age filter"
    age_value = float(age)
    if age_value != 99999:
        conditions.append(
            '(? >= aact.minimum_age or aact.minimum_age is null)'
            ' and (? <= aact.maximum_age or aact.maximum_age is null)')
        params.extend([age_value, age_value])

    answered = (
        (domain, disease_filters),
        (exposure, flag_filters('keyc.exposure_status')),
        (stat, flag_filters('keyc.is_hospitalized')),
        (preg, preg_filters),
    )
    for answer, filters in answered:
        condition = filters.get(answer.strip().lower())
        if condition is None:
            # the former CASE fell through to 0 here, i.e. matched no trial
            return []
        conditions.append(condition)

    if user_picked_time.strip().lower() not in ('none', ''):
        conditions.append(
            '(DATEDIFF(day, CONVERT(VARCHAR, ?, 101),'
            ' CONVERT(VARCHAR, getdate(), 101)) <= keyc.days_to_disease'
            ' or keyc.days_to_disease is null)')
        params.append(user_picked_time)

    sql = '''
        select distinct keyc.nct_id, keyc.pt_cohort
        from dbo.key_criteria_v2 as keyc
        left join dbo.aact_trial_info as aact
        on keyc.nct_id = aact.nct_id
        where %s
    ''' % ' and '.join('(%s)' % condition for condition in conditions)

    # query database filter nctids
    conn = general_pool_criteria.connection()
    try:
        cur = conn.cursor()
        try:
            cur.execute(sql, params)
            nctids = cur.fetchall()
        finally:
            cur.close()
    finally:
        conn.close()

    return [[row[0], row[1]] for row in nctids]
def find_new_question(question_answer_list, working_nct_id_list, domain='all'):
    '''
    find new question by information entropy, sum(-p*log(p))

    For every concept in dbo.all_criteria_v2 that is flagged to_display and
    not asked before, p is the share of the still-open trials (fourth field
    of the record equal to 0) that mention it; the concept with the highest
    sum becomes the next question. When nothing is left to ask the question
    text is 'NQF'.

    :param question_answer_list: questions already answered or skipped,
        each a {'question': {'domain': , 'entity_text': }, ...} dict
    :param working_nct_id_list: records with the nct id at index 0 and the
        question marker at index 3
    :param domain: 'all' or the domain to restrict the concepts to
    :return: question_answer_list itself, with the new question appended
    '''
    # the server rejects statements with much more than 2000 parameters, so
    # only the first 2000 open trials are sampled
    max_ids = 2000
    table_name = 'dbo.all_criteria_v2'

    domain = domain.lower()
    single_domain = domain != 'all'

    open_ids = [
        record[0] for record in working_nct_id_list if record[3] == 0
    ][:max_ids]
    asked = [
        qa['question']['entity_text'] for qa in question_answer_list
        if not single_domain or qa['question']['domain'] == domain
    ]

    next_concept = []
    if open_ids:
        # with no open trial the statement would contain "in ()" and divide
        # by zero, so the database is only asked when there are candidates
        filters = [
            'nct_id_original in (%s)' % ','.join('?' * len(open_ids)),
            'concept_name is NOT NULL',
        ]
        params = list(open_ids)
        if single_domain:
            filters.append('lower(domain) = ?')
            params.append(domain)
        if asked:
            filters.append(
                'concept_name not in (%s)' % ','.join('?' * len(asked)))
            params.extend(asked)
        filters.append('to_display = 1')

        if single_domain:
            inner_columns = 'concept_name, include'
            outer_columns = 'concept_name'
        else:
            inner_columns = 'concept_name, include, domain'
            outer_columns = 'concept_name, domain, include'

        # the denominator is the number of trials actually sampled
        sql = '''
            SELECT TOP(1) sum(PlogP) AS IE, {outer_columns}
            FROM (
                select {inner_columns}, count,
                    -(count/{total})*LOG((count/{total})) AS PlogP
                FROM (
                    select CAST(count(distinct nct_id_original) AS [float]) AS count,
                        {inner_columns}
                    from {table_name}
                    where {where}
                    group by {inner_columns}
                ) X
            ) X
            GROUP BY {outer_columns}
            ORDER BY sum(X.PlogP) DESC
        '''.format(outer_columns=outer_columns,
                   inner_columns=inner_columns,
                   total=len(open_ids),
                   table_name=table_name,
                   where=' and '.join(filters))

        conn = general_pool_criteria.connection()
        try:
            cur = conn.cursor()
            try:
                cur.execute(sql, params)
                next_concept = cur.fetchall()
            finally:
                cur.close()
        finally:
            conn.close()

    if len(next_concept) > 0:
        best = next_concept[0]
        question_domain = domain if single_domain else best[2]
        this_q = {'question': {'domain': question_domain,
                               'entity_text': best[1]}}
    else:
        this_q = {'question': {'domain': domain, 'entity_text': 'NQF'}}
    question_answer_list.append(this_q)
    return question_answer_list
import os
import pickle

import googlemaps
import pandas as pd
import pyodbc
from DBUtils.PooledDB import PooledDB

from app import general_pool_criteria

# Geocode trial locations with the googlemaps package.
# The Google Maps API key is read from the GOOGLE_MAPS_API_KEY environment
# variable so that it does not have to be stored in this file; when the
# variable is unset the key is empty, exactly as before.
gmap_api_key = os.environ.get('GOOGLE_MAPS_API_KEY', '')
gmaps = googlemaps.Client(key=gmap_api_key)

# load the facility text of every trial once, at import time
conn = general_pool_criteria.connection()
try:
    cur = conn.cursor()
    try:
        sql = '''
            select nct_id, facilities_and_contacts
            from dbo.aact_trial_info
        '''
        cur.execute(sql)
        trial_data = cur.fetchall()
    finally:
        # close the cursor before the connection goes back to the pool
        cur.close()
finally:
    conn.close()


def split_locs_into_list(x):
    '''
    split a '|'-separated location string into a list
    :param x: the location string, or None
    :return: the list of locations; empty when x is None
    '''
    if x is None:
        return []
    return x.split('|')


i = 0
def filter_nct_ids_by_pre_questions(answer_list):
    '''
    find working nct id list after filter the answers to pre-questions

    The answer-dependent CASE expressions of the former query are resolved
    here, so the SQL contains only fixed fragments plus bound parameters
    and no answer text is ever spliced into the statement.

    :param answer_list: the list of the answers to pre-questions, in the
        order [age, gender, domain, user_picked_time, exposure, stat, preg]
    :return: list of nct ids (strings); empty when an answer is not one of
        the values the filter knows
    :raises ValueError: if the age is not a number
    '''
    answer_list = [str(x) for x in answer_list]
    age = answer_list[0]
    gender = answer_list[1]
    domain = answer_list[2]
    user_picked_time = answer_list[3]
    exposure = answer_list[4]
    stat = answer_list[5]
    preg = answer_list[6]

    def flag_filters(column):
        # yes -> flag 1, no -> flag 0, idk -> either; an unset flag always passes
        return {
            'yes': '(%s = 1 or %s is null)' % (column, column),
            'no': '(%s = 0 or %s is null)' % (column, column),
            'idk': '(%s in (1, 0) or %s is null)' % (column, column),
        }

    gender_filters = {
        'male': "aact.gender in ('male', 'all')",
        'female': "aact.gender in ('female', 'all')",
        'other': "aact.gender in ('male', 'female', 'all')",
    }
    disease_filters = {
        'yes': "keyc.disease_status in ('yes', 'all')",
        'no': "keyc.disease_status in ('no', 'all')",
        'cleared': "keyc.disease_status in ('cleared', 'all')",
        'all': "keyc.disease_status in ('yes','no','cleared', 'all')",
    }
    # in this table the exposure status is stored as text, not as a flag
    exposure_filters = {
        'yes': "(keyc.exposure_status = 'yes' or keyc.exposure_status is null)",
        'no': "(keyc.exposure_status = 'no' or keyc.exposure_status is null)",
        'idk': "(keyc.exposure_status in ('yes', 'no')"
               " or keyc.exposure_status is null)",
    }
    preg_filters = flag_filters('keyc.preg_status')
    preg_filters['n/a'] = preg_filters['no']

    age_value = float(age)
    conditions = [
        '? >= aact.minimum_age or aact.minimum_age is null',
        '? <= aact.maximum_age or aact.maximum_age is null',
    ]
    params = [age_value, age_value]

    answered = (
        (gender, gender_filters),
        (domain, disease_filters),
        (exposure, exposure_filters),
        (stat, flag_filters('keyc.is_hospitalized')),
        (preg, preg_filters),
    )
    for answer, filters in answered:
        condition = filters.get(answer.strip().lower())
        if condition is None:
            # the former CASE fell through to 0 here, i.e. matched no trial
            return []
        conditions.append(condition)

    if user_picked_time.strip().lower() in ('none', ''):
        # without a date only trials with no time window qualify
        conditions.append('keyc.days_to_disease is null')
    else:
        conditions.append(
            'DATEDIFF(day, CONVERT(VARCHAR, ?, 101),'
            ' CONVERT(VARCHAR, getdate(), 101)) <= keyc.days_to_disease'
            ' or keyc.days_to_disease is null')
        params.append(user_picked_time)

    sql = '''
        select distinct aact.nct_id
        from dbo.aact_trial_info_us as aact
        left outer join dbo.key_criteria as keyc
        on aact.nct_id = keyc.nct_id
        where %s
    ''' % ' and '.join('(%s)' % condition for condition in conditions)

    # query database filter nctids
    conn = general_pool_criteria.connection()
    try:
        cur = conn.cursor()
        try:
            cur.execute(sql, params)
            nctids = cur.fetchall()
        finally:
            cur.close()
    finally:
        conn.close()

    return [str(row[0]) for row in nctids]