def test_get_visits():
    """
    Check udb.get_visits on the staging host, both by count and by visit id.

    The id lookup is expected to return the visits in the order requested,
    each paired with its known subject id.
    """
    udb.set_host("staging")

    # asking for N visits should return exactly N rows
    counted_df = udb.get_visits(n=13)
    assert len(counted_df) == 13

    # known (visit_id, subject_id) pairs, in the order they are requested
    common_subject = '66999214-048e-4a41-a51b-4e8197c2e668'
    expected_pairs = [
        ('1b3cf7c8-e45c-41f3-87c3-c416c78f6305', common_subject),
        ('be6b4716-db1b-42bc-b614-7982f4614deb', common_subject),
        ('90a88ed5-0bf5-4ffd-b46c-91720d868cef', common_subject),
        ('98315020-68a4-46e6-b07f-599acf0c507d', common_subject),
        ('8e0444d4-9b86-47b4-a02c-d0df06005441', common_subject),
        ('837edd81-4b82-4111-920b-46e2e2726aeb', common_subject),
        ('fa65a8a0-425f-4cc4-9c75-a873627248c1', common_subject),
        ('bded41a8-d478-4f96-9706-30a833e431b1', common_subject),
        ('52929e2f-fa57-4e5a-a8aa-c2b890f4c200',
         'b1d62c65-c6bb-4839-ac57-59ab4ea73fc7'),
        ('971a1e47-dc29-48dd-b6fd-6ee3172fcb5c', common_subject),
        ('20f61c59-0312-4cd5-b4c3-df34b5cfe204', common_subject),
        ('cfd1f60b-5b3d-442b-a115-33b5023389c8',
         '675bcea5-2bbd-4cdd-b2b3-abae53b33a80'),
        ('417a7d6e-75c7-4211-be0c-4d07a0f30258', common_subject),
    ]
    test_visit_ids = [visit for visit, _ in expected_pairs]
    test_subject_ids = [subject for _, subject in expected_pairs]

    # get specific visits
    selected_df = udb.get_visits(visit_id=test_visit_ids)
    assert selected_df['visit_id'].tolist() == test_visit_ids
    assert selected_df['subject_id'].tolist() == test_subject_ids
def test_get_visit_exams():
    """
    Check that the visit_exams of one known visit are loaded.

    Covers the get_visits -> get_visit_exams chain needed for app.py.
    """
    print("test loading visits")

    expected_exams = (
        'TrailMaking2',
        'CategoryFluency',
        'LetterFluency',
        'TrailMaking',
        'Prosaccade',
    )

    visits_df = udb.get_visits(visit_id="4347d97f-cd56-4865-a505-3506246e9ed7")
    visit_exams_df = udb.get_visit_exams(visits_df)

    # the visit is expected to hold exactly five visit_exams
    assert len(visit_exams_df) == 5

    # and each returned exam must be one of the expected names
    assert all(exam_name in expected_exams
               for exam_name in visit_exams_df['exam'])
def concatenate_history(subj_series):
    """
    Collect every visit_exam of the same exam type taken by one subject.

    :param subj_series: row-like object providing 'subject_id' and 'exam'
    :return: the visit_exams of that subject whose exam equals
        subj_series['exam']
    """
    # todo: check get_visits is working with user_id here
    subject_visits = udb.get_visits(subject_id=subj_series['subject_id'])
    history_df = udb.get_visit_exams(subject_visits)

    # rows without an exam name cannot match, drop them first
    history_df.dropna(subset=['exam'], inplace=True)

    wanted_exam = subj_series['exam']
    return history_df[history_df['exam'] == wanted_exam]
def get_visit_exams_per_user(one_per_user=True):
    """
    Load all visit_exams of non-test subjects, ordered by visit date.

    :param one_per_user: when True, reduce the result to one row per
        (subject_id, exam) pair using groupby(...).last() on the
        date-sorted data
    :return: dataframe of visit_exams
    """
    every_visit = utils_db.get_visits(n=np.inf)
    exams_df = utils_db.get_visit_exams(every_visit)

    # remove anyone who is a test subject
    excluded_subjects = utils_db.get_test_subjects()

    def _is_real_subject(subject_id):
        return subject_id not in excluded_subjects

    keep_mask = exams_df['subject_id'].apply(_is_real_subject)
    exams_df = exams_df[keep_mask]

    # order by visit date so that "last" below means "most recent"
    exams_df = exams_df.sort_values('created_date_visit', axis=0)

    if not one_per_user:
        return exams_df

    # just pick the last exam of each type that each subject took
    latest_df = exams_df.groupby(['subject_id', 'exam']).last()
    return latest_df.reset_index()
def concatenate_control(subj_series, n=CONTROL_SUBJECT_QTY, year_range=20, max_controls=False):
    """
    Build the control group for one visit_exam and append that visit_exam
    itself as the final row.

    Controls are visit_exams of the same exam type taken by other subjects.
    Unless max_controls is set, they are further restricted to the same
    gender and to an age (at visit time) within +- year_range years of the
    subject's age, and capped at n rows.

    :param subj_series: the series for the visit_exam we are looking to get
        controls for
    :param n: maximum number of controls (not applied when max_controls is
        True)
    :param year_range: +- year range around which control subjects will be
        taken
    :param max_controls: True if there are no restrictions on who is a
        control - except that it not be the subject
    :return: dataframe holding the control rows followed by subj_series
    """
    visits = udb.get_visits(n=np.inf)
    # todo: test getting a single exam in the query
    visit_exams_df = udb.get_visit_exams(visits)
    # non-inplace so the frame handed back by udb is never mutated
    visit_exams_df = visit_exams_df.dropna(subset=['exam', 'exam_version'])

    # just the visit_exams with our exam ...
    same_exam = visit_exams_df['exam'] == subj_series['exam']
    # ... *not* including any from our subject
    other_subject = visit_exams_df['subject_id'] != subj_series['subject_id']
    visit_exams_df = visit_exams_df[same_exam & other_subject]

    if max_controls:
        control_df = visit_exams_df
    elif pd.isnull(subj_series['date_of_birth']) or pd.isnull(subj_series['gender']):
        # do not return any controls until we know the person's date of
        # birth and gender; pd.isnull also catches NaN / NaT, not only None
        control_df = pd.DataFrame()
        logger.warning("missing DOB or Gender for: %s", subj_series['subject_id'])
    else:
        visit_exams_df = visit_exams_df.dropna(subset=['date_of_birth', 'gender'])

        same_gender = visit_exams_df['gender'] == subj_series['gender']

        # age at visit time, kept as a local series so that no temporary
        # 'age' column ever reaches downstream code
        age_window = datetime.timedelta(days=year_range * 365.25)
        subj_series_age = (subj_series['created_date_visit']
                           - subj_series['date_of_birth'])
        control_age = (visit_exams_df['created_date_visit']
                       - visit_exams_df['date_of_birth'])
        in_age_window = ((control_age > subj_series_age - age_window)
                         & (control_age < subj_series_age + age_window))

        control_df = visit_exams_df[same_gender & in_age_window].iloc[:n]

    # DataFrame.append was removed in pandas 2.0; build the one-row frame
    # the same way append used to and concatenate it after the controls
    subject_row = subj_series.to_frame().T.infer_objects()
    subject_row.index.name = control_df.index.name
    control_and_subject_df = pd.concat([control_df, subject_row], sort=False)
    return control_and_subject_df
def get_processed_df_for_visit(visit_id, params=None):
    """
    Load the processed visit_exams belonging to one visit.

    :param visit_id: id of the visit, passed to udb.get_visits
    :param params: processing options handed to
        udb.load_visit_exams_processed; defaults to a fresh empty dict
    :return: whatever udb.load_visit_exams_processed returns for the
        visit's visit_exams
    """
    # a new dict per call: a shared `{}` default would carry any mutation
    # made downstream over into later calls
    if params is None:
        params = {}
    visit_exams_df = udb.get_visit_exams(udb.get_visits(visit_id=visit_id))
    processed_df = udb.load_visit_exams_processed(visit_exams_df, params)
    return processed_df
def test_load_processed():
    """
    Exercise udb.load_visit_exams_processed under each 'overwrite' setting.

    The processed S3 object's 'LastModified' stamp is compared against
    reference times to decide whether a file was rewritten:

    * overwrite=True       -> every file must be newer than the test start
    * overwrite=False      -> no file may be touched again
    * overwrite='batch'    -> files newer than 'batch_begin_time' are left
                              alone, older ones are reprocessed
    """
    print("testing load processed")
    test_begin_time = datetime.datetime.now(datetime.timezone.utc)
    visit_id = "6feeef57-4047-4c2d-b5f1-7e02f60a0188"
    # todo: add each exam as it is finished
    visit_exams_df = udb.get_visit_exams(udb.get_visits(visit_id=visit_id))

    # filter out any we know haven't passed the test yet
    # todo: remove this filter because everything should pass the test
    exams_to_test = [
        'TrailMaking', 'TrailMaking2', 'CategoryFluency', 'LetterFluency'
    ]  # , 'Prosaccade']
    visit_exams_df = visit_exams_df[visit_exams_df['exam'].isin(exams_to_test)]

    # note: host is AWS so that this doesn't leave a mess
    params = {
        'videos': False,
        'host': 'aws',
        'control_subj_quantity': 2,
        'exams': exams_to_test,
        'overwrite': True
    }

    print("Test recomputing results")
    processed_df = udb.load_visit_exams_processed(visit_exams_df, params)

    # Check that all the status passed
    for idx, subj_series in processed_df.iterrows():
        exam_name = str(subj_series['exam'])
        assert 'has_error' in subj_series, "missing error status field"
        # explicit `== False`: a missing (None) status must fail as well
        assert subj_series['has_error'] == False, \
            "error occured in: " + exam_name
        assert 'has_error' in subj_series['processed'], \
            "missing status element in exam: " + exam_name
        # has_error is not a string, so it is converted before being joined
        # into the message; otherwise a failure here raises TypeError
        # instead of reporting the assertion
        assert subj_series['processed']['has_error'] == False, \
            "processing has error in exam: " + exam_name + \
            ", has_error: " + str(subj_series['processed']['has_error'])

    # test that the processed files have all been modified by that process
    for idx, subj_series in processed_df.iterrows():
        response = udb.load_s3_object(udb.get_processed_path(subj_series))
        assert response['LastModified'] > test_begin_time, \
            "overwrite: True, processed file was not modified when it should have been for exam: " + \
            str(subj_series['exam'])

    # tests below measure a few cases of the overwrite_now function
    # check that the files don't get reprocessed again if param is set to false
    test_begin_time = datetime.datetime.now(datetime.timezone.utc)
    params['overwrite'] = False
    processed_df2 = udb.load_visit_exams_processed(visit_exams_df, params)
    for idx, subj_series in processed_df2.iterrows():
        response = udb.load_s3_object(udb.get_processed_path(subj_series))
        assert response['LastModified'] < test_begin_time, \
            "overwrite:'False' processed file was modified when it shouldn't have been for exam: " + \
            str(subj_series['exam'])

    # test batch overwrite: the files don't get reprocessed again for a
    # long-running batch
    params['overwrite'] = 'batch'
    # pretend the batch started 2 hours ago
    test_begin_time = datetime.datetime.now(datetime.timezone.utc)
    params['batch_begin_time'] = test_begin_time - datetime.timedelta(hours=2)
    processed_df2 = udb.load_visit_exams_processed(visit_exams_df, params)
    for idx, subj_series in processed_df2.iterrows():
        response = udb.load_s3_object(udb.get_processed_path(subj_series))
        assert response['LastModified'] < test_begin_time, \
            "overwrite:'batch' processed file was modified when it shouldn't have been for exam: " + \
            str(subj_series['exam'])

    # Test batch overwrite: the files *do* get reprocessed if older than
    # when the batch started
    params['batch_begin_time'] = datetime.datetime.now(datetime.timezone.utc)
    processed_df2 = udb.load_visit_exams_processed(visit_exams_df, params)
    for idx, subj_series in processed_df2.iterrows():
        response = udb.load_s3_object(udb.get_processed_path(subj_series))
        assert response['LastModified'] > params['batch_begin_time'], \
            "overwrite:'batch' processed file was not modified when it should have been for exam: " + \
            str(subj_series['exam'])
def test_processing_functions():
    """
    Test results of the processing functions against known values.

    Loads the processed visit_exams of one fixed visit (without
    overwriting) and compares selected metrics / transcripts per exam type
    with hard-coded expectations.
    """
    visit_id = "6feeef57-4047-4c2d-b5f1-7e02f60a0188"
    # todo: add each exam as it is finished
    exams_to_test = [
        'TrailMaking2', 'CategoryFluency', 'LetterFluency', 'TrailMaking'
    ]  # , 'Prosaccade']

    # note: assumption is that the processing was already re-done in the
    # earlier test function.
    params = {
        'videos': False,
        'host': 'aws',
        'control_subj_quantity': 2,
        'exams': exams_to_test,
        'overwrite': False
    }

    visit_exams_df = udb.get_visit_exams(udb.get_visits(visit_id=visit_id))
    processed_df = udb.load_visit_exams_processed(visit_exams_df, params)

    # filter out any we know haven't passed the test yet
    # todo: remove this filter because everything should pass the test
    processed_df = processed_df[processed_df['exam'].isin(exams_to_test)]

    for idx, subj_series in processed_df.iterrows():
        exam_name = subj_series['exam']
        processed = subj_series['processed']

        # NOTE(review): 'active time' is compared with exact float equality;
        # presumably the stored value is already rounded - confirm
        if exam_name == 'TrailMaking':
            assert processed['metrics']['error_count'] == 13, \
                "trailmaking produced wrong error count"
            assert processed['metrics']['repeat_count'] == 14, \
                "trailmaking produced wrong repeat count"
            assert processed['active time'] == 22.0552, \
                "trailmaking produced wrong active time"

        elif exam_name == 'TrailMaking2':
            assert processed['metrics']['error_count'] == 16, \
                "trailmaking2 produced wrong error count"
            assert processed['metrics']['repeat_count'] == 11, \
                "trailmaking2 produced wrong repeat count"
            assert processed['active time'] == 19.5775, \
                "trailmaking2 produced wrong active time"

        elif exam_name == 'CategoryFluency':
            responses = processed['data']['responses']
            assert responses.iloc[0]['transcript'] == 'cucumbers', \
                "transcript first word incorrect"
            assert responses.iloc[1]['transcript'] == 'carrots', \
                "transcript second word incorrect"
            # message fixed: this checks the third word, not the second
            assert responses.iloc[2]['transcript'] == 'celery', \
                "transcript third word incorrect"
            assert processed['metrics']['num_correct'] == 6, \
                "number correct wrong"

        elif exam_name == 'LetterFluency':
            responses = processed['data']['responses']
            assert responses.iloc[0]['transcript'] == 'apples', \
                "letter fluency transcript first word incorrect"
            assert responses.iloc[1]['transcript'] == 'asparagus', \
                "letter fluency transcript second word incorrect"
            assert responses.iloc[2]['transcript'] == 'australia', \
                "letter fluency transcript third word incorrect"
            assert processed['metrics']['num_correct'] == 4, \
                "letter fluency number correct wrong"