def extract_scores(input_folder, output_file, study_id_var, subject_prefix, subjects=None, from_date=None, non_long=False):
    """Extract CBCL scores from ASEBA score reports and write a REDCap import csv.

    Scans input_folder for files named <subject_prefix><digits>_<visit>.<ext>,
    converts any pdf report lacking a matching csv via the tabula jar, parses the
    raw / T / percentile score rows out of each csv, derives clinical/borderline
    checkbox columns from the T-score suffixes, and writes the flattened result
    to output_file.

    Args:
        input_folder: folder containing the score reports (pdf and/or csv)
        output_file: path of the csv to write results to
        study_id_var: name of the REDCap study id column
        subject_prefix: filename prefix identifying subjects
        subjects: optional list of subject ids to restrict extraction to
        from_date: optional datetime; pdf reports modified before it are skipped
        non_long: if True, flatten again to add session prefixes for
            non-longitudinally stored projects

    Raises:
        RuntimeError: if no score rows could be extracted from any report.
    """
    # BUGFIX: was a mutable default argument (subjects=[]); use a None sentinel
    subjects = subjects if subjects is not None else []

    filename_format = r'(({}\d+)_(\w+).\w+)'.format(subject_prefix)
    # get all the converted score reports (capturing the filename and the visit type)
    all_files = [re.search(filename_format, f, flags=re.IGNORECASE) for f in listdir(input_folder)]
    all_files = [result.groups() for result in all_files if result]
    # BUGFIX: anchor the dot so e.g. "xpdf" extensions don't match
    pdf_files = [f for f in all_files if re.search(r'\.pdf$', f[0], flags=re.IGNORECASE)]
    # build the filename lookup once instead of rebuilding a list per loop iteration
    known_filenames = {f[0] for f in all_files}

    index_cols = [study_id_var, 'redcap_event_name', 'score_type']
    results = []  # array of arrays of scores (raw/t/percentile) for each CBCL measure
    for file_info in pdf_files:
        result = {}
        if subjects and file_info[1] not in subjects:
            continue
        if from_date and datetime.fromtimestamp(stat(join(input_folder, file_info[0])).st_mtime) < from_date:
            continue

        root, ext = splitext(file_info[0])
        csv_file = root + '.csv'
        if csv_file not in known_filenames:
            print('No csv file: {}. Converting pdf to csv...'.format(csv_file))
            call(['java', '-jar', TABULA_JAR, '-l', '-o', join(input_folder, csv_file), '--pages', 'all', join(input_folder, file_info[0])])

        with open(join(input_folder, csv_file), 'r') as f:
            print('Processing file: {}'.format(csv_file))
            reader = csv.reader(f)
            for row in reader:
                search_res = re.match('(Total Score|T Score|Percentile)', row[0])
                if search_res:
                    score_type = search_res.group(1)
                    if score_type not in result:
                        result[score_type] = []
                    result[score_type] += list(filter(None, row[1:]))

        # derive clinical/borderline flags from the -C / -B suffixes on T scores
        # (renamed loop vars: previously shadowed builtins `range` and `type`)
        for score_range in ('clinical', 'borderline'):
            result[score_range] = [int('-' + score_range[0].upper() in score) for score in result['T Score']]
        result['T Score'] = [score.split('-')[0] for score in result['T Score']]

        for score_type in result.keys():
            results.append([file_info[1], file_info[2], score_type] + result[score_type])

    if not results:
        # previously crashed with IndexError on results[0] below
        raise RuntimeError('No score data extracted from {}'.format(input_folder))

    columns = index_cols + ALL_MEASURES
    if len(columns) - len(results[0]) == 4:
        print('Assuming that competence scale measures were not collected....')
        columns = [col for col in columns if col in index_cols or col not in COMP_SCALE1 + COMP_SCALE2]

    df = pd.DataFrame(data=results, columns=columns).set_index(index_cols)
    df = df.rename(index={'Total Score': 'raw', 'T Score': 't', 'Percentile': 'per'})
    df = df.replace('nc', np.nan).dropna(axis=1, how='all')  # drop columns that are all NaN (not all studies collect competence scale data)
    df = redcap_common.flatten(df, sort=False, prefix='cbcl_')

    # rename clinical/borderline columns to REDCap checkbox variables (measure index as suffix)
    checkbox_cols = [re.search(r'(cbcl_(?:borderline|clinical)_(\w+))', col) for col in df.columns]
    checkbox_col_info = [match.groups() for match in checkbox_cols if match]
    checkbox_col_renames = {group[0]: '{}___{}'.format(group[0].rsplit('_', 1)[0], ALL_MEASURES.index(group[1]) + 1) for group in checkbox_col_info}
    df = df.rename(columns=checkbox_col_renames)

    if non_long:
        df = redcap_common.flatten(df)  # flatten again to add session prefix to columns if non-longitudinally stored
    df.to_csv(output_file)
def redcap2spss(input_file, output_file):
    """Convert between REDCap (long) and SPSS (wide) layouts.

    Direction is inferred from the data: if every row has a unique identifier
    the input is assumed to be SPSS-style (wide) and is expanded; otherwise it
    is assumed to be REDCap-style (long) and is flattened.
    """
    df = redcap_common.create_df(input_file)

    unique_ids = df.iloc[:, 0].unique()
    if df.shape[0] == len(unique_ids):
        # One row per identifier: assume SPSS to REDCap.
        # Columns suffixed with a session (e.g. "var_s1") get the session moved
        # to a prefix instead (the format expand expects).
        rename_map = {}
        for col in df.columns:
            if re.search(r'_s\d$', col):
                rename_map[col] = '{}_{}'.format(col[-2:], col[:-3])
        if rename_map:
            df = df.rename(columns=rename_map)
        df, non_session_cols = redcap_common.expand(df.set_index(df.columns[0]))
        df = df.set_index(df.columns[0])
    else:
        # Multiple rows per identifier: assume REDCap to SPSS.
        session_prefix = 's' if df[df.columns[1]].dtype == 'int64' else ''
        indexed = df.set_index([df.columns[0], df.columns[1]])
        df = redcap_common.flatten(indexed, sort=True, prefix=session_prefix)

    redcap_common.write_results_and_open(df, output_file)
def gen_import_file(datafile, varfile, study_name, form_type, flatten=False):
    """Convert an ASEBA export spreadsheet into a REDCap import csv.

    Reads the ASEBA data from datafile (.xlsx) and a variable-mapping csv
    (varfile, columns aseba_var / redcap_var / fill_value), derives
    clinical/borderline checkbox values from T-score ranges, renames columns to
    REDCap variables, splits a multipart ASEBA id into study id + event name,
    and writes the result next to datafile as <root>_import.csv.

    Args:
        datafile: path to the ASEBA .xlsx export
        varfile: csv mapping ASEBA variables to REDCap variables
        study_name: unused here; kept for interface compatibility
        form_type: unused here; kept for interface compatibility
        flatten: flatten dataframe for multi-session databases NOT stored
            in longitudinal format
    """
    df = pd.read_excel(datafile)
    change_df = pd.read_csv(varfile)

    # Remove irrelevant columns
    drop_cols = [col for col in df.columns if col not in change_df['aseba_var'].values]
    df = df.drop(columns=drop_cols)

    # Determine clinical/borderline checkbox values
    for (var, trange) in CHECKBOX_MAP.items():
        if var not in change_df['aseba_var'].values:
            continue
        tscore_col = var.split('.')[0]
        t_min, t_max = trange  # BUGFIX: previously unpacked into `min, max`, shadowing builtins
        df[var] = df[tscore_col].apply(lambda x: int(t_min <= x <= t_max))

    # Rename columns to match REDCap variables
    df = df.rename(columns={row['aseba_var']: row['redcap_var'] for _, row in change_df.iterrows() if pd.notnull(row['redcap_var'])})

    # Assign value to static columns that need to be present
    assign_col_df = change_df[pd.notnull(change_df['fill_value'])]
    for _, row in assign_col_df.iterrows():
        df[row['redcap_var']] = row['fill_value']

    # Extract study_id/redcap_event_name and rename
    study_id_col = change_df.loc[change_df['aseba_var'] == ASEBA_ID].iloc[0]['redcap_var']
    # BUGFIX: positional maxsplit for Series.str.split was deprecated and then
    # removed in pandas 2.x — pass n=1 by keyword
    split_col_df = df[study_id_col].str.split('_', n=1, expand=True)
    if len(split_col_df.columns) > 1:  # assume longitudinal if multipart ID
        index_cols = [study_id_col, 'redcap_event_name']
        df[index_cols] = split_col_df
        df = df.set_index(index_cols)
    else:  # otherwise, just set ASEBA id to subject
        df = df.set_index(study_id_col)

    # Flatten dataframe for multi-session databases NOT in longitudinal format
    if flatten:
        df = redcap_common.flatten(df)

    outroot = os.path.splitext(datafile)[0]
    df.to_csv(outroot + '_import.csv')
def nih_toolbox_import(exports_folder, subjects):
    """Collect NIH toolbox csv exports into a single flattened result file.

    Reads every csv in exports_folder, splits parent rows from subject rows,
    maps instrument names to REDCap variables, flattens by session/instrument,
    applies the column-rename conventions for parent iq / cog_crystal /
    self_neuroqol variables, and writes the result to 'nih_result.csv'.

    Args:
        exports_folder: folder containing NIH toolbox csv export files
        subjects: optional list of newt_ids to restrict the result to

    Raises:
        RuntimeError: if no export contained the expected score columns.
    """
    result = None
    exports = [f for f in listdir(exports_folder) if f.endswith('.csv')]
    for export in exports:
        print('Processing:', export)
        df = pd.read_csv(join(exports_folder, export)).dropna(how='all')
        if UNADJUSTED not in df.columns:
            continue  # not a score export; skip
        df = df.rename(columns={'PIN': 'newt_id', 'RawScore': 'raw', 'TScore': 'tscore'})

        # split parent rows (id contains 'parent') from subject rows
        parent_df = df[df['newt_id'].str.contains('parent', flags=re.IGNORECASE)]
        subject_df = df[~df['newt_id'].isin(parent_df['newt_id'])]

        if not parent_df.empty:
            parent_df = replace_variables(parent_df, parent_vars)
            parent_df = extract_id_and_session(parent_df)
        else:
            parent_df = None

        if not subject_df.empty:
            # heuristic: more than 4 rows per id means the full subject battery,
            # otherwise the parent variable set was administered
            subject_df = subject_df.groupby('newt_id').apply(lambda x: replace_variables(x, subject_vars) if len(x) > 4 else replace_variables(x, parent_vars))
            subject_df = extract_id_and_session(subject_df)
        else:
            subject_df = None

        export_df = pd.concat([subject_df, parent_df], axis=0)
        result = pd.concat([result, export_df], axis=0)

    if result is None:
        # BUGFIX: previously fell through and crashed on None below with an
        # opaque TypeError/AttributeError
        raise RuntimeError('No NIH toolbox exports with score columns found in {}'.format(exports_folder))

    print('Formatting...')
    if subjects:
        result = result[result['newt_id'].isin(subjects)]
    result.rename(columns={UNADJUSTED: 'unadj', ADJUSTED: 'ageadj'}, inplace=True)
    result = result.drop_duplicates(subset=['newt_id', 'Inst', 'unadj'], keep='last')
    result.set_index(['newt_id', 'session_number', 'Inst'], inplace=True)

    score_cols = ['raw', 'tscore', 'unadj', 'ageadj']
    result = result[score_cols]
    result = result.dropna(how='all', subset=score_cols)
    result = redcap_common.flatten(redcap_common.flatten(result))

    # perform renames - no session numbers and changes suffixes for parent iq columns, cog_crystal order change for ageadj column,
    # remove raw suffix for self_neuroqol column
    result.rename(columns={col: col.replace('unadj', 'unc').replace('ageadj', 'ac') for col in result.columns if 'parent' in col}, inplace=True)
    result.rename(columns={col: col.replace('crystal_cog', 'cog_crystal') for col in result.columns if 'crystal_cog_ageadj' in col}, inplace=True)
    result.rename(columns={col: col[3:] for col in result.columns if 'parent' in col and 'neuroqol' not in col}, inplace=True)
    result.rename(columns={col: col[:-4] for col in result.columns if 'self_neuroqol_raw' in col}, inplace=True)

    # drop the intermediate raw/tscore columns except for neuroqol measures
    drop_cols = [col for col in result.columns if 'neuroqol' not in col and re.match(r'\w*_(raw|tscore)$', col)]
    result = result.drop(drop_cols, axis=1)

    redcap_common.write_results_and_open(result, 'nih_result.csv')
def format_track_data():
    """Parse CLI arguments and format TRACK data from a REDCap csv export."""
    # set up expected arguments and associated help text
    parser = GooeyParser(description='Formats TRACK data from REDCap csv export')
    required = parser.add_argument_group('Required Arguments', gooey_options={'columns': 1})
    required.add_argument('--input_file', required=True, widget='FileChooser', help='REDCap export file')
    required.add_argument('--output_file', required=True, widget='FileChooser', help='CSV file to store formatted data in')
    required.add_argument('--api_password', required=True, widget='PasswordField', help='Password to access API token')
    optional = parser.add_argument_group('Optional Arguments', gooey_options={'columns': 2})
    optional.add_argument('-c', '--consecutive', type=int, metavar='num_consecutive_years',
                          help='Limit results to participants with data for a number of consecutive years')
    optional.add_argument('-d', '--duration', action='store_true', help='Calculate diabetes diagnosis duration')
    variable_options = parser.add_argument_group(
        'Variable options',
        'Space-separated lists of data points (category, column prefix, and/or variable) participants must have data for in export',
        gooey_options={'columns': 1, 'show_border': True})
    variable_options.add_argument('--all', nargs='+', default=None,
                                  help='All specified data points required for participant to be included in result')
    variable_options.add_argument('--any', nargs='+', default=None,
                                  help='At least one specified data point required for participant to be included in result')
    format_options = parser.add_argument_group('Formatting options', gooey_options={'columns': 2, 'show_border': True})
    format_options.add_argument('-e', '--expand', action='store_true', help='Arrange data with one row per subject per session')
    format_options.add_argument('-t', '--transpose', action='store_true', help='Transpose the data')
    args = parser.parse_args()

    if not args.input_file.endswith('.csv') or not args.output_file.endswith('.csv'):
        parser.error('Input and output files must be of type csv')

    # create initial dataframe structure
    df = redcap_common.create_df(args.input_file)
    df = df[df[TRACK_STUDY_ID].str.contains(r'TRACK\d+')]  # remove test rows

    project = None
    # BUGFIX: args.duration is a store_true flag, so `arg is not None` was always
    # true and the project was created unconditionally — test truthiness instead
    if any([args.all, args.any, args.duration, args.consecutive]):
        # BUGFIX: the parser defines --api_password, so the namespace attribute is
        # api_password; args.api_token raised AttributeError here
        # NOTE(review): presumably get_redcap_project resolves the token from this
        # password — confirm against redcap_common
        project = redcap_common.get_redcap_project('track', args.api_password)
    if args.all:
        df = redcap_common.check_for_all(df, args.all, project)
    if args.any:
        df = redcap_common.check_for_any(df, args.any, project)

    fields = None
    if args.duration or args.consecutive:
        fields = redcap_common.get_matching_columns(project.field_names, r'\w*(' + '|'.join(DURATION_FIELDS) + ')')
        df = redcap_common.merge_api_data(df, project, fields, [TRACK_STUDY_ID])

    # expand/rename after api merge to ensure column names match up
    df, non_session_cols = redcap_common.expand(df.set_index(TRACK_STUDY_ID))
    df = redcap_common.rename_common_columns(df, RENAMES, False)
    df[redcap_common.SESSION_DATE] = pd.to_datetime(df[redcap_common.SESSION_DATE])
    df = df[pd.notnull(df[redcap_common.SESSION_DATE])]  # remove rows for non-attended sessions

    if args.duration:
        df = redcap_common.prepare_age_calc(df)
        dx_vars = {'dx_date': 'db_dx_date', 'dx_age': 'db_onset_age'}
        df['db_dx_date'] = pd.to_datetime(df['db_dx_date'])
        # diagnosis age is determined from the first session's data
        dx_age_df = df.loc[df[redcap_common.SESSION_NUMBER] == 's1'].apply(redcap_common.get_diagnosis_age, args=(dx_vars,), axis=1)
        df = df.groupby([redcap_common.STUDY_ID]).apply(redcap_common.calculate_diagnosis_duration, 'db', dx_age_df)
        df = df.drop('session_age', axis=1)

    if args.consecutive:
        df[redcap_common.SESSION_YEAR] = df[redcap_common.SESSION_DATE].apply(lambda x: x.year if x else None)
        df = df.groupby([redcap_common.STUDY_ID]).apply(redcap_common.get_consecutive_years, args.consecutive)
        df = df.drop([redcap_common.SESSION_YEAR], axis=1)

    df = redcap_common.rename_common_columns(df, RENAMES, True)  # rename common columns back to original names pre-flattening
    df = df.set_index([TRACK_STUDY_ID, redcap_common.SESSION_NUMBER])

    if not args.expand:
        # prefix non-session columns with 's1_' so flatten treats them uniformly
        non_session_cols = {col: 's1_' + col for col in df.columns if not re.match(r's\d_', col)}
        df = df.rename(columns=non_session_cols)
        df = redcap_common.flatten(df)  # always reflatten at end, unless expand flag is set

    if args.transpose:
        df = df.transpose()

    # clean up dataframe: strip the temporary 's1_' prefixes again
    revert_non_session_cols = {'s1_' + col: col for col in non_session_cols.keys()}
    df = df.rename(columns=revert_non_session_cols)
    if fields:
        drop_fields = fields if not args.expand else DURATION_FIELDS  # if leaving expanded, then the columns we brought in don't match the current columns
        df = redcap_common.cleanup_api_merge(df, drop_fields)

    redcap_common.write_results_and_open(df, args.output_file)
def format_wolfram_data():
    """Parse CLI arguments and format Wolfram data from a REDCap csv export.

    Optionally pulls missing fields from the REDCap API, calculates diagnosis
    durations, filters rows/participants, flattens by session or clinic year,
    and writes the result next to the input file with a descriptive suffix.
    """
    # set up expected arguments and associated help text
    parser = GooeyParser(description='Formats Wolfram data from REDCap csv export')
    required = parser.add_argument_group('Required Arguments', gooey_options={'columns': 1})
    required.add_argument('--input_file', required=True, widget='FileChooser', help='REDCap export file')
    optional = parser.add_argument_group('Optional Arguments', gooey_options={'columns': 1})
    optional.add_argument('-c', '--consecutive', type=int, metavar='num_consecutive_years',
                          help='Limit results to participants with data for a number of consecutive years')
    optional.add_argument('-d', '--duration', nargs='*', dest='dx_types', widget='Listbox', default=None,
                          choices=ALL_DX_TYPES, help='Calculate diagnosis duration for specified diagnosis types')
    # NOTE: a third choice ('MRI date if available, otherwise clinic date ("mri_or_clinic")')
    # is handled below but currently not offered in choices
    optional.add_argument('--duration-type', dest='duration_type', default='clinic date',
                          choices=['clinic date', 'MRI date'], help='Visit date to use when calculating dx durations')
    optional.add_argument('--drop_non_mri', action='store_true',
                          help='Drop all sessions that do not have an "mri_date" entry.')
    optional.add_argument('--old-db', action='store_true', help='whether data was sourced from old Wolfram database')
    optional.add_argument('--api_token', widget='PasswordField',
                          help='REDCap API token (if not specified, will not pull anything from REDCap)')
    variable_options = parser.add_argument_group(
        'Variable options',
        'Space-separated lists of data points (category, column prefix, and/or variable) participants must have data for in export',
        gooey_options={'columns': 1, 'show_border': True})
    variable_options.add_argument('--all', nargs='+', default=None,
                                  help='All specified data points required for participant to be included in result')
    variable_options.add_argument('--any', nargs='+', default=None,
                                  help='At least one specified data point required for participant to be included in result')
    format_options = parser.add_argument_group('Formatting options', gooey_options={'columns': 2, 'show_border': True})
    format_options.add_argument('-f', '--flatten', action='store_true',
                                help='Arrange all session data in single row for participant')
    format_options.add_argument('--flatten_by', default='session number', choices=['session number', 'clinic year'],
                                help='Flatten data by session number or clinic year')
    format_options.add_argument('-t', '--transpose', action='store_true', help='Transpose the data')
    format_options.add_argument('-s', '--sort_by', default='variable', choices=['variable', 'session'],
                                help='Sort flattened data by session or variable')
    args = parser.parse_args()

    dur_label = ''
    flatten_label = ''
    mri_label = ''

    if not args.old_db:
        print('### "old_db" not checked, only pulling data from the "new" database ###')

    if not args.input_file.endswith('.csv'):
        parser.error('Input file must be of type csv')

    # create dataframe from REDCap data
    df = redcap_common.create_df(args.input_file)
    df = df[df[WFS_STUDY_ID].str.contains(r'WOLF_\d{4}_.+')]  # remove Test and Wolf_AN rows

    num_clinic_years = len(df['redcap_event_name'].unique()) - 1  # FIXME: should be counting max number of sessions for participants (still may cause error because they might not be consecutive)
    print('### Number of clinic years detected = {} ###'.format(num_clinic_years))

    # get number of subjects in dataframe
    num_subjects = len(df[WFS_STUDY_ID].unique())
    print('### Number of subjects detected in {} = {} ###'.format(args.input_file, num_subjects))

    # only create API project if actions require it and data needed is not already present, AND if API token is given
    project = None

    # check for fields missing from csv df
    # always need to get session number if not in data (used to determine which rows to keep)
    fields = [WFS_SESSION_NUMBER, WFS_CLINIC_YEAR] if WFS_SESSION_NUMBER not in df.columns else []
    if MISSED_SESSION not in df.columns:
        fields.append(MISSED_SESSION)  # need missed_session var to remove rows for unattended session
    if args.dx_types is not None:
        for dx_type in args.dx_types:
            dx_age_field = get_dx_column(dx_type, 'best_age_calc')
            if dx_age_field not in df.columns:
                fields.append(dx_age_field)
        for non_dx_field in NON_DX_FIELDS_FOR_DURATION:
            if non_dx_field not in df.columns:
                fields.append(non_dx_field)

    if fields:  # missing some fields, go get from REDCap
        print('### need to get some fields from REDCap ###')
        # BUGFIX: argparse's default for --api_token is None, so the old
        # `== ""` comparison never fired; treat any falsy value as missing
        if not args.api_token:
            raise RuntimeError(
                'There are missing fields in the input csv, so we need to get data from REDCap, but no API token is given. Ask Jon about REDCap API access.')
        redcap_project_key = 'itrack' if not args.old_db else 'wolfram'
        project = project if project else redcap_common.get_redcap_project(redcap_project_key, args.api_token)
        df = redcap_common.merge_api_data(df, project, fields, [WFS_STUDY_ID, 'redcap_event_name'])

    # rename common columns after api merge to ensure column names match up
    df = redcap_common.rename_common_columns(df, RENAMES, False)

    if args.consecutive is not None and args.consecutive not in range(2, num_clinic_years + 1):
        parser.error('Consecutive years must be greater than 1 and cannot exceed number of clinic years ({})'.format(num_clinic_years))

    # stable-patient rows have no session number; fill with 0 so they survive the notnull filter
    df.loc[(df['redcap_event_name'] == 'stable_patient_cha_arm_1'), [redcap_common.SESSION_NUMBER]] = \
        df.loc[(df['redcap_event_name'] == 'stable_patient_cha_arm_1'), [redcap_common.SESSION_NUMBER]].fillna(0)

    # remove rows for sessions not attended (will have a flag saying they did not attend)
    df = df[pd.notnull(df[redcap_common.SESSION_NUMBER])]
    df = df[pd.isnull(df[MISSED_SESSION])]
    df[redcap_common.SESSION_NUMBER] = df[redcap_common.SESSION_NUMBER].astype(int)  # once NANs are gone, we can cast as int (nicer for flatten display)

    # if duration argument specified, calculate diagnosis duration for types specified or all (if none specified)
    if args.dx_types is not None:  # explicit None check because empty array is valid
        # this puts a 'session_age' field into the df using dob and session_date (where session_date is from clinic_date)
        df = redcap_common.prepare_age_calc(df)
        df = mri_age_calc(df)
        if args.duration_type == 'MRI date if available, otherwise clinic date ("mri_or_clinic")':
            df['mri_or_clinic_age'] = df.apply(lambda row: select_best_age(row), axis=1)
        for dx_type in args.dx_types:
            dx_vars = {'dx_age': get_dx_column(dx_type, 'best_age_calc')}
            dx_age_df = df.loc[df['redcap_event_name'] == 'stable_patient_cha_arm_1'].apply(
                redcap_common.get_diagnosis_age, args=(dx_vars,), axis=1)
            if args.duration_type == 'clinic date':
                dur_label = '_clinic_duration'
                dx_type_clinic = '_'.join([dx_type, 'clinic'])
                df = df.groupby([redcap_common.STUDY_ID]).apply(
                    redcap_common.calculate_diagnosis_duration, dx_type_clinic, dx_age_df, 'session_age')
                dx_dur_field = get_dx_column(dx_type, 'clinic_duration')
                df.loc[~(df[dx_dur_field] > 0), dx_dur_field] = np.nan  # negative/zero durations are invalid
            elif args.duration_type == 'MRI date':
                dur_label = '_mri_duration'
                dx_type_mri = '_'.join([dx_type, 'mri'])
                df = df.groupby([redcap_common.STUDY_ID]).apply(
                    redcap_common.calculate_diagnosis_duration, dx_type_mri, dx_age_df, 'mri_age')
                dx_mri_dur_field = get_dx_column(dx_type, 'mri_duration')
                df.loc[~(df[dx_mri_dur_field] > 0), dx_mri_dur_field] = np.nan
            elif args.duration_type == 'MRI date if available, otherwise clinic date ("mri_or_clinic")':
                dur_label = '_mri_or_clinic_duration'
                dx_type_mri_or_clinic = '_'.join([dx_type, 'mri_or_clinic'])
                df = df.groupby([redcap_common.STUDY_ID]).apply(
                    redcap_common.calculate_diagnosis_duration, dx_type_mri_or_clinic, dx_age_df, 'mri_or_clinic_age')
                dx_best_dur_field = get_dx_column(dx_type, 'mri_or_clinic_duration')
                df.loc[~(df[dx_best_dur_field] > 0), dx_best_dur_field] = np.nan
            else:
                raise Exception('ERROR: dx_types chosen, but no duration_type chosen')

    # if variables are specified, filter out rows that don't have data for them (if null or non-numeric)
    if args.all:
        df = redcap_common.check_for_all(df, args.all, project, True)
    if args.any:
        df = redcap_common.check_for_any(df, args.any, project, True)

    # remove session data for participants that did not occur in consecutive years
    if args.consecutive:
        df = df.groupby([redcap_common.STUDY_ID]).apply(redcap_common.get_consecutive_years, args.consecutive)

    if df.empty:
        stderr.write('No data to return. Selections have filtered out all rows.')
        exit(1)

    # add clinic_year
    df['clinic_year'] = df.apply(lambda row: get_clinic_year(row), axis=1)

    # rename common columns back to original names
    df = redcap_common.rename_common_columns(df, RENAMES, True)

    # if we have brought in dx info/demographics from the API, remove it after the calculation and rename columns that were suffixed due to merge
    if fields != [WFS_SESSION_NUMBER, WFS_CLINIC_YEAR] and args.api_token:  # don't need to go through deletion logic if only field is session number
        if WFS_SESSION_NUMBER in fields:
            fields.remove(WFS_SESSION_NUMBER)  # remove session number from fields
        df = redcap_common.cleanup_api_merge(df, fields)

    # rename session_age to clinic_age
    df = df.rename(columns={"session_age": "clinic_age"})

    # remove dob, clinic date and MRI date
    df = df.drop(['dob'], axis=1, errors="ignore")
    df = df.drop(['clinic_date'], axis=1, errors="ignore")
    df = df.drop(['mri_date'], axis=1, errors="ignore")
    df = df.drop(['redcap_event_name'], axis=1, errors="ignore")

    # drop non-MRI sessions
    if args.drop_non_mri:
        df = df[(df[MRI_AGE] > 0.0) | (df['clinic_year'] == 0)]
        mri_label = '_just_mri'

    # puts all sessions/clinic years for a participant on one line (suffixed with year/session)
    if args.flatten:
        if args.flatten_by == 'session number':
            flatten_by_column = 'wolfram_sessionnumber'
            flatten_label = '_flattened_by_session'
            flatten_group_prefix = 's'
        elif args.flatten_by == 'clinic year':
            flatten_by_column = 'clinic_year'
            flatten_label = '_flattened_by_clinic'
            flatten_group_prefix = 'c'
        else:
            raise Exception('ERROR: flatten_by check failed')
        sort = args.sort_by == 'session'
        df = redcap_common.flatten(df, flatten_by_column, sort, flatten_group_prefix)

    if args.transpose:
        df = df.transpose()

    # make output_file name
    output_file = args.input_file.replace('.csv', '{}{}{}.csv'.format(dur_label, flatten_label, mri_label))
    redcap_common.write_results_and_open(df, output_file)