# Example 1
def extract_scores(input_folder, output_file, study_id_var, subject_prefix, subjects=None, from_date=None, non_long=False):
	"""Extract CBCL scores from Tabula-converted pdf score reports into a csv for REDCap import.

	Args:
		input_folder: folder containing pdf score reports (and cached csv conversions).
		output_file: path of the csv file the resulting dataframe is written to.
		study_id_var: name of the study id variable (first index column).
		subject_prefix: filename prefix identifying subject ids.
		subjects: optional list of subject ids to restrict processing to (default: all).
		from_date: if given, skip pdfs last modified before this datetime.
		non_long: if True, flatten again to add session prefixes to columns
			for non-longitudinally stored databases.
	"""
	subjects = subjects or []  # avoid shared mutable default argument
	# match converted score reports, capturing (filename, subject id, visit type);
	# the extension dot is escaped so stray characters cannot match in its place
	filename_format = r'(({}\d+)_(\w+)\.\w+)'.format(subject_prefix)
	all_files = [ re.search(filename_format, f, flags=re.IGNORECASE) for f in listdir(input_folder) ]
	all_files = [ result.groups() for result in all_files if result ]
	pdf_files = [ f for f in all_files if re.search(r'\.pdf$', f[0], flags=re.IGNORECASE) ]
	known_files = { f[0] for f in all_files }  # hoisted set for O(1) membership checks

	index_cols = [ study_id_var, 'redcap_event_name', 'score_type']
	results = [] # array of arrays of scores (raw/t/percentile) for each CBCL measure
	for file_info in pdf_files:
		result = {}
		if subjects and file_info[1] not in subjects:
			continue
		if from_date and datetime.fromtimestamp(stat(join(input_folder, file_info[0])).st_mtime) < from_date:
			continue

		root, ext = splitext(file_info[0])
		csv_file = root + '.csv'
		if csv_file not in known_files:
			print('No csv file: {}. Converting pdf to csv...'.format(csv_file))
			call(['java', '-jar', TABULA_JAR, '-l', '-o', join(input_folder, csv_file), '--pages', 'all', join(input_folder, file_info[0])])

		with open(join(input_folder, csv_file), 'r') as f:
			print('Processing file: {}'.format(csv_file))
			reader = csv.reader(f)
			for row in reader:
				search_res = re.match('(Total Score|T Score|Percentile)', row[0])
				if search_res:
					score_type = search_res.group(1)
					if score_type not in result:
						result[score_type] = []
					result[score_type] += list(filter(None, row[1:]))

		# flag clinical/borderline ranges: T scores carry a -C or -B suffix
		for score_range in [ 'clinical', 'borderline' ]:  # renamed from 'range' to avoid shadowing the builtin
			result[score_range] = [ int('-' + score_range[0].upper() in score) for score in result['T Score'] ]

		result['T Score'] = [ score.split('-')[0] for score in result['T Score'] ] # strip range suffix
		for stype in result.keys():  # renamed from 'type' to avoid shadowing the builtin
			results.append([file_info[1], file_info[2], stype] + result[stype])

	if not results:
		# previously this fell through to an IndexError on results[0]
		raise ValueError('No score data extracted (check subject/date filters and input folder)')

	columns = index_cols + ALL_MEASURES
	if len(columns) - len(results[0]) == 4:
		print('Assuming that competence scale measures were not collected....')
		columns = [ col for col in columns if col in index_cols or col not in COMP_SCALE1 + COMP_SCALE2 ]

	df = pd.DataFrame(data=results, columns=columns).set_index(index_cols)
	df = df.rename(index={'Total Score': 'raw', 'T Score': 't', 'Percentile': 'per'})
	df = df.replace('nc', np.nan).dropna(axis=1, how='all') # drop columns that are all NaN (not all studies collect competence scale data)
	df = redcap_common.flatten(df, sort=False, prefix='cbcl_')

	# convert clinical/borderline columns to REDCap checkbox naming (var___N)
	checkbox_cols = [ re.search(r'(cbcl_(?:borderline|clinical)_(\w+))', col) for col in df.columns ]
	checkbox_col_info = [ result.groups() for result in checkbox_cols if result ]
	checkbox_col_renames = { group[0]: '{}___{}'.format(group[0].rsplit('_', 1)[0], ALL_MEASURES.index(group[1])+1) for group in checkbox_col_info }
	df = df.rename(columns=checkbox_col_renames)

	if non_long:
		df = redcap_common.flatten(df) # flatten again to add session prefix to columns if non-longitudinally stored

	df.to_csv(output_file)
# Example 2
def redcap2spss(input_file, output_file):
    """Convert between REDCap (long) and SPSS (wide) layouts, auto-detecting direction.

    When every row carries a unique identifier the data is assumed to already
    be in SPSS form (one row per subject) and is expanded to one row per
    session; otherwise the longitudinal REDCap data is flattened to one row
    per subject.
    """
    frame = redcap_common.create_df(input_file)

    rows_are_unique = frame.shape[0] == len(frame.iloc[:, 0].unique())
    if rows_are_unique:  # one row per identifier: spss -> redcap direction
        # move a trailing session suffix (e.g. foo_s1) to the front (s1_foo),
        # which is the layout expand expects
        rename_map = {}
        for column in frame.columns:
            if re.search(r'_s\d$', column):
                rename_map[column] = '_'.join([column[-2:], column[:-3]])
        if rename_map:
            frame = frame.rename(columns=rename_map)

        frame, non_session_cols = redcap_common.expand(
            frame.set_index(frame.columns[0]))
        frame = frame.set_index(frame.columns[0])
    else:  # repeated identifiers: redcap -> spss direction
        # numeric session identifiers receive an 's' prefix when flattened
        session_prefix = 's' if frame[frame.columns[1]].dtype == 'int64' else ''
        indexed = frame.set_index([frame.columns[0], frame.columns[1]])
        frame = redcap_common.flatten(indexed, sort=True, prefix=session_prefix)

    redcap_common.write_results_and_open(frame, output_file)
# Example 3
def gen_import_file(datafile, varfile, study_name, form_type, flatten=False):
    """Generate a REDCap import csv from an ASEBA Excel export.

    Args:
        datafile: Excel export from ASEBA with scores per subject.
        varfile: csv mapping ASEBA variables to REDCap variables
            (columns used: aseba_var, redcap_var, fill_value).
        study_name: name of the study (unused here; kept for interface compatibility).
        form_type: type of ASEBA form (unused here; kept for interface compatibility).
        flatten: if True, flatten multi-session data into one row per subject.

    Writes the result next to *datafile* with an '_import.csv' suffix.
    """
    df = pd.read_excel(datafile)
    change_df = pd.read_csv(varfile)

    # Remove columns that have no mapping in the variable file
    drop_cols = [
        col for col in df.columns if col not in change_df['aseba_var'].values
    ]
    df = df.drop(columns=drop_cols)

    # Determine clinical/borderline checkbox values from T-score ranges
    for (var, trange) in CHECKBOX_MAP.items():
        if var not in change_df['aseba_var'].values:
            continue
        tscore_col = var.split('.')[0]
        lo, hi = trange  # renamed from min/max to avoid shadowing the builtins
        df[var] = df[tscore_col].apply(lambda x: int(lo <= x <= hi))

    # Rename columns to match REDCap variables
    df = df.rename(
        columns={
            row['aseba_var']: row['redcap_var']
            for _, row in change_df.iterrows() if pd.notnull(row['redcap_var'])
        })

    # Assign value to static columns that need to be present
    assign_col_df = change_df[pd.notnull(change_df['fill_value'])]
    for _, row in assign_col_df.iterrows():
        df[row['redcap_var']] = row['fill_value']

    # Extract study_id/redcap_event_name from the multipart ASEBA id
    study_id_col = change_df.loc[change_df['aseba_var'] ==
                                 ASEBA_ID].iloc[0]['redcap_var']
    # keyword n= required: positional n was deprecated in pandas 1.4 and removed in 2.0
    split_col_df = df[study_id_col].str.split('_', n=1, expand=True)
    if len(split_col_df.columns) > 1:  # assume longitudinal if multipart ID
        index_cols = [study_id_col, 'redcap_event_name']
        df[index_cols] = split_col_df
        df = df.set_index(index_cols)
    else:  # otherwise, just set ASEBA id to subject
        df = df.set_index(study_id_col)

    # Flatten dataframe for multi-session databases NOT in longitudinal format
    if flatten:
        df = redcap_common.flatten(df)

    outroot = os.path.splitext(datafile)[0]
    df.to_csv(outroot + '_import.csv')
def nih_toolbox_import(exports_folder, subjects):
    """Combine NIH Toolbox csv exports into a single flattened score file.

    Args:
        exports_folder: folder containing NIH Toolbox csv export files.
        subjects: optional collection of newt ids; if truthy, output is
            restricted to those subjects.

    Writes the formatted scores to 'nih_result.csv' and opens the result;
    returns None.
    """
    result = None  # accumulator dataframe across all export files
    exports = [f for f in listdir(exports_folder) if f.endswith('.csv')]
    for export in exports:
        print('Processing:', export)
        df = pd.read_csv(join(exports_folder, export)).dropna(how='all')
        if UNADJUSTED not in df.columns:  # skip exports without the expected score columns
            continue

        df = df.rename(columns={
            'PIN': 'newt_id',
            'RawScore': 'raw',
            'TScore': 'tscore'
        })
        # rows whose id mentions 'parent' are parent-report measures
        parent_df = df[df['newt_id'].str.contains('parent',
                                                  flags=re.IGNORECASE)]
        subject_df = df[~df['newt_id'].isin(parent_df['newt_id'])]

        if not parent_df.empty:
            parent_df = replace_variables(parent_df, parent_vars)
            parent_df = extract_id_and_session(parent_df)
        else:
            parent_df = None

        if not subject_df.empty:
            # NOTE(review): groups with <= 4 rows are treated as parent-form
            # data — confirm this row-count heuristic against the export format
            subject_df = subject_df.groupby('newt_id').apply(
                lambda x: replace_variables(x, subject_vars)
                if len(x) > 4 else replace_variables(x, parent_vars))
            subject_df = extract_id_and_session(subject_df)
        else:
            subject_df = None

        # pd.concat silently drops None entries, so the first iteration
        # (result is None) and empty parent/subject frames are handled
        export_df = pd.concat([subject_df, parent_df], axis=0)
        result = pd.concat([result, export_df], axis=0)

    print('Formatting...')

    # NOTE(review): result stays None if no export had score columns, which
    # would raise below — confirm that case cannot happen in practice
    if subjects:
        result = result[result['newt_id'].isin(subjects)]
    result.rename(columns={
        UNADJUSTED: 'unadj',
        ADJUSTED: 'ageadj'
    },
                  inplace=True)
    # keep only the latest entry for duplicated (id, instrument, score) rows
    result = result.drop_duplicates(subset=['newt_id', 'Inst', 'unadj'],
                                    keep='last')
    result.set_index(['newt_id', 'session_number', 'Inst'], inplace=True)
    score_cols = ['raw', 'tscore', 'unadj', 'ageadj']
    result = result[score_cols]

    result = result.dropna(how='all', subset=score_cols)
    # flatten twice: once per instrument, once per session
    result = redcap_common.flatten(redcap_common.flatten(result))

    # perform renames - no session numbers and changes suffixes for parent iq columns, cog_crystal order change for ageadj column,
    #	remove raw suffix for self_neuroqol column
    # NOTE: the four renames below are order-dependent (parent columns are
    # re-suffixed before being stripped of their leading session prefix)
    result.rename(columns={
        col: col.replace('unadj', 'unc').replace('ageadj', 'ac')
        for col in result.columns if 'parent' in col
    },
                  inplace=True)
    result.rename(columns={
        col: col.replace('crystal_cog', 'cog_crystal')
        for col in result.columns if 'crystal_cog_ageadj' in col
    },
                  inplace=True)
    result.rename(columns={
        col: col[3:]
        for col in result.columns if 'parent' in col and 'neuroqol' not in col
    },
                  inplace=True)
    result.rename(columns={
        col: col[:-4]
        for col in result.columns if 'self_neuroqol_raw' in col
    },
                  inplace=True)

    # drop raw/tscore columns except for neuroqol measures
    drop_cols = [
        col for col in result.columns
        if not 'neuroqol' in col and re.match(r'\w*_(raw|tscore)$', col)
    ]
    result = result.drop(drop_cols, axis=1)

    redcap_common.write_results_and_open(result, 'nih_result.csv')

    return
def format_track_data():
    """Format TRACK data from a REDCap csv export (GUI entry point).

    Parses arguments via Gooey, optionally filters participants by required
    data points and consecutive-year attendance, optionally calculates
    diabetes diagnosis duration, and writes the formatted result to the
    requested output csv.
    """
    # set up expected arguments and associated help text
    parser = GooeyParser(
        description='Formats TRACK data from REDCap csv export')

    required = parser.add_argument_group('Required Arguments',
                                         gooey_options={'columns': 1})
    required.add_argument('--input_file',
                          required=True,
                          widget='FileChooser',
                          help='REDCap export file')
    required.add_argument('--output_file',
                          required=True,
                          widget='FileChooser',
                          help='CSV file to store formatted data in')
    required.add_argument('--api_password',
                          required=True,
                          widget='PasswordField',
                          help='Password to access API token')

    optional = parser.add_argument_group('Optional Arguments',
                                         gooey_options={'columns': 2})
    optional.add_argument(
        '-c',
        '--consecutive',
        type=int,
        metavar='num_consecutive_years',
        help=
        'Limit results to particpants with data for a number of consecutive years'
    )
    optional.add_argument('-d',
                          '--duration',
                          action='store_true',
                          help='Calculate diabetes diagnosis duration')

    variable_options = parser.add_argument_group(
        'Variable options',
        'Space-separated lists of data points (category, column prefix, and/or variable) participants must have data for in export',
        gooey_options={
            'columns': 1,
            'show_border': True
        })
    variable_options.add_argument(
        '--all',
        nargs='+',
        default=None,
        help=
        'All specified data points required for participant to be included in result'
    )
    variable_options.add_argument(
        '--any',
        nargs='+',
        default=None,
        help=
        'At least one specified data point required for participant to be included in result'
    )

    format_options = parser.add_argument_group('Formatting options',
                                               gooey_options={
                                                   'columns': 2,
                                                   'show_border': True
                                               })
    format_options.add_argument(
        '-e',
        '--expand',
        action='store_true',
        help='Arrange data with one row per subject per session')
    format_options.add_argument('-t',
                                '--transpose',
                                action='store_true',
                                help='Transpose the data')

    args = parser.parse_args()

    if not args.input_file.endswith('.csv') or not args.output_file.endswith(
            '.csv'):
        parser.error('Input and output files must be of type csv')

    # create initial dataframe structure
    df = redcap_common.create_df(args.input_file)
    df = df[df[TRACK_STUDY_ID].str.contains(r'TRACK\d+')]  # remove test rows

    # only connect to the API when an option actually requires it.
    # bugfix: the old check `arg is not None` was always truthy because
    # args.duration is a store_true flag defaulting to False (not None)
    project = None
    if args.all or args.any or args.duration or args.consecutive:
        # bugfix: the parser defines --api_password; args.api_token did not exist
        project = redcap_common.get_redcap_project('track', args.api_password)

    if args.all:
        df = redcap_common.check_for_all(df, args.all, project)

    if args.any:
        df = redcap_common.check_for_any(df, args.any, project)

    fields = None
    if args.duration or args.consecutive:
        fields = redcap_common.get_matching_columns(
            project.field_names, r'\w*(' + '|'.join(DURATION_FIELDS) + ')')
        df = redcap_common.merge_api_data(df, project, fields,
                                          [TRACK_STUDY_ID])

    # expand/rename after api merge to ensure column names match up
    df, non_session_cols = redcap_common.expand(df.set_index(TRACK_STUDY_ID))
    df = redcap_common.rename_common_columns(df, RENAMES, False)

    df[redcap_common.SESSION_DATE] = pd.to_datetime(
        df[redcap_common.SESSION_DATE])
    df = df[pd.notnull(df[
        redcap_common.SESSION_DATE])]  # remove rows for non-attended sessions

    if args.duration:
        df = redcap_common.prepare_age_calc(df)
        dx_vars = {'dx_date': 'db_dx_date', 'dx_age': 'db_onset_age'}
        df['db_dx_date'] = pd.to_datetime(df['db_dx_date'])
        # diagnosis age is derived from the first-session rows only
        dx_age_df = df.loc[df[redcap_common.SESSION_NUMBER] == 's1'].apply(
            redcap_common.get_diagnosis_age, args=(dx_vars, ), axis=1)
        df = df.groupby([redcap_common.STUDY_ID
                         ]).apply(redcap_common.calculate_diagnosis_duration,
                                  'db', dx_age_df)
        df = df.drop('session_age', axis=1)

    if args.consecutive:
        df[redcap_common.SESSION_YEAR] = df[redcap_common.SESSION_DATE].apply(
            lambda x: x.year if x else None)
        df = df.groupby([redcap_common.STUDY_ID
                         ]).apply(redcap_common.get_consecutive_years,
                                  args.consecutive)
        df = df.drop([redcap_common.SESSION_YEAR], axis=1)

    df = redcap_common.rename_common_columns(
        df, RENAMES,
        True)  # rename common columns back to original names pre-flattening
    df = df.set_index([TRACK_STUDY_ID, redcap_common.SESSION_NUMBER])

    if not args.expand:
        # prefix non-session columns so flatten treats them as session 1 data
        non_session_cols = {
            col: 's1_' + col
            for col in df.columns if not re.match(r's\d_', col)
        }
        df = df.rename(columns=non_session_cols)
        df = redcap_common.flatten(
            df)  # always reflatten at end, unless expand flag is set

    if args.transpose:
        df = df.transpose()

    # clean up dataframe
    # NOTE(review): when --expand is set, non_session_cols is whatever
    # redcap_common.expand returned above — confirm it is a mapping here
    revert_non_session_cols = {
        's1_' + col: col
        for col in non_session_cols.keys()
    }
    df = df.rename(columns=revert_non_session_cols)

    if fields:
        drop_fields = fields if not args.expand else DURATION_FIELDS  # if leaving expanded, then the columns we brought in don't match the current columns
        df = redcap_common.cleanup_api_merge(df, drop_fields)

    redcap_common.write_results_and_open(df, args.output_file)
def format_wolfram_data():
    """Format Wolfram clinic data from a REDCap csv export (GUI entry point).

    Builds a dataframe from the csv export, pulls any missing fields from the
    REDCap API (when a token is supplied), optionally computes diagnosis
    durations per diagnosis type, filters rows (required variables,
    consecutive years, MRI-only sessions), optionally flattens/transposes,
    and writes the result to a csv named after the input file.
    """
    # set up expected arguments and associated help text
    parser = GooeyParser(
        description='Formats Wolfram data from REDCap csv export')
    required = parser.add_argument_group('Required Arguments',
                                         gooey_options={'columns': 1})
    required.add_argument('--input_file',
                          required=True,
                          widget='FileChooser',
                          help='REDCap export file')
    # required.add_argument('--output_file', required=True, widget='FileChooser', help='CSV file to store formatted data in')

    optional = parser.add_argument_group('Optional Arguments',
                                         gooey_options={'columns': 1})
    optional.add_argument(
        '-c',
        '--consecutive',
        type=int,
        metavar='num_consecutive_years',
        help=
        'Limit results to particpants with data for a number of consecutive years'
    )
    optional.add_argument(
        '-d',
        '--duration',
        nargs='*',
        dest='dx_types',
        widget='Listbox',
        default=None,
        choices=ALL_DX_TYPES,
        help='Calculate diagnosis duration for specified diagnosis types')
    # optional.add_argument('--duration-type', dest='duration_type', default='clinic date', choices=['clinic date','MRI date','MRI date if available, otherwise clinic date ("mri_or_clinic")'], help='Visit date to use when calculating dx durations')
    optional.add_argument(
        '--duration-type',
        dest='duration_type',
        default='clinic date',
        choices=['clinic date', 'MRI date'],
        help='Visit date to use when calculating dx durations')
    optional.add_argument(
        '--drop_non_mri',
        action='store_true',
        help='Drop all sessions that do not have an "mri_date" entry.')
    optional.add_argument(
        '--old-db',
        action='store_true',
        help='whether data was sourced from old Wolfram database')
    optional.add_argument(
        '--api_token',
        widget='PasswordField',
        help=
        'REDCap API token (if not specified, will not pull anything from REDCap)'
    )

    variable_options = parser.add_argument_group(
        'Variable options',
        'Space-separated lists of data points (category, column prefix, and/or variable) participants must have data for in export',
        gooey_options={
            'columns': 1,
            'show_border': True
        })
    variable_options.add_argument(
        '--all',
        nargs='+',
        default=None,
        help=
        'All specified data points required for participant to be included in result'
    )
    variable_options.add_argument(
        '--any',
        nargs='+',
        default=None,
        help=
        'At least one specified data point required for participant to be included in result'
    )

    format_options = parser.add_argument_group('Formatting options',
                                               gooey_options={
                                                   'columns': 2,
                                                   'show_border': True
                                               })
    format_options.add_argument(
        '-f',
        '--flatten',
        action='store_true',
        help='Arrange all session data in single row for participant')
    format_options.add_argument(
        '--flatten_by',
        default='session number',
        choices=['session number', 'clinic year'],
        help='Flatten data by session number or clinic year')
    format_options.add_argument('-t',
                                '--transpose',
                                action='store_true',
                                help='Transpose the data')
    format_options.add_argument(
        '-s',
        '--sort_by',
        default='variable',
        choices=['variable', 'session'],
        help='Sort flattened data by session or variable')

    args = parser.parse_args()

    # labels appended to the output filename to reflect the chosen options
    dur_label = ''
    flatten_label = ''
    mri_label = ''

    if not args.old_db:
        print(
            '### "old_db" not checked, only pulling data from the "new" database ###'
        )

    if not args.input_file.endswith('.csv'):
        parser.error('Input file must be of type csv')

    # create dataframe from REDCap data
    df = redcap_common.create_df(args.input_file)
    df = df[df[WFS_STUDY_ID].str.contains(
        r'WOLF_\d{4}_.+')]  # remove Test and Wolf_AN rows
    num_clinic_years = len(
        df['redcap_event_name'].unique()
    ) - 1  # FIXME: should be counting max number of sessions for participants (still may cause error because they might not be consecutive)
    print('### Number of clinic years detected = {} ###'.format(
        num_clinic_years))

    # get number of subjects in dataframe
    num_subjects = len(df[WFS_STUDY_ID].unique())
    print('### Number of subjects detected in {} = {} ###'.format(
        args.input_file, num_subjects))

    # only create API project if actions require it and data needed is not already present, AND if API token is given
    project = None
    # check for fields missing from csv df
    fields = [
        WFS_SESSION_NUMBER, WFS_CLINIC_YEAR
    ] if WFS_SESSION_NUMBER not in df.columns else [
    ]  # always need to get session number if not in data (used to determine which rows to keep)
    if MISSED_SESSION not in df.columns:
        fields.append(
            MISSED_SESSION
        )  # need missed_session var to remove rows for unattended session
    if args.dx_types is not None:
        # duration calculations need per-diagnosis age fields plus some
        # non-diagnosis fields (e.g. dob / visit dates)
        for dx_type in args.dx_types:
            dx_age_field = get_dx_column(dx_type, 'best_age_calc')
            if dx_age_field not in df.columns:
                fields.append(dx_age_field)
        for non_dx_field in NON_DX_FIELDS_FOR_DURATION:
            if non_dx_field not in df.columns:
                fields.append(non_dx_field)
    if fields:  # missing some fields, go get from REDCap
        print('### need to get some fields from REDCap ###')
        # NOTE(review): typo "Thre" in the user-facing message below
        if args.api_token == "":
            raise RuntimeError(
                "Thre are missing fields in the input csv, so we need to get data from REDCap, but no API token is given. Ask Jon about REDCap API access."
            )
        else:
            redcap_project_key = 'itrack' if not args.old_db else 'wolfram'
            project = project if project else redcap_common.get_redcap_project(
                redcap_project_key, args.api_token)
            df = redcap_common.merge_api_data(
                df, project, fields, [WFS_STUDY_ID, 'redcap_event_name'])

    # rename common columns after api merge to ensure column names match up
    df = redcap_common.rename_common_columns(df, RENAMES, False)

    if args.consecutive is not None and args.consecutive not in range(
            2, num_clinic_years + 1):
        parser.error(
            'Consecutive years must be greater than 1 and cannot exceed number of clinic years ({})'
            .format(num_clinic_years))

    # stable patient characteristics rows carry no session number; fill with 0
    df.loc[(df['redcap_event_name'] == 'stable_patient_cha_arm_1'),
           [redcap_common.SESSION_NUMBER]] = df.loc[
               (df['redcap_event_name'] == 'stable_patient_cha_arm_1'),
               [redcap_common.SESSION_NUMBER]].fillna(0)
    # remove rows for sessions not attended (will have a flag saying they did not attend)
    df = df[pd.notnull(df[redcap_common.SESSION_NUMBER])]
    df = df[pd.isnull(df[MISSED_SESSION])]
    df[redcap_common.SESSION_NUMBER] = df[redcap_common.SESSION_NUMBER].astype(
        int
    )  # once NANs are gone, we can cast as int (nicer for flatten display)

    # if duration argument specified, calculate diagnosis duration for types specified or all (if none specified)
    if args.dx_types is not None:  # explicit None check because empty array is valid
        # this puts a 'session_age' field into the df using dob and session_date (where session_date is from clinic_date)
        df = redcap_common.prepare_age_calc(df)
        df = mri_age_calc(df)
        # NOTE(review): the 'MRI date if available...' branches below appear
        # unreachable with the current --duration-type choices (that option
        # is commented out above) — confirm before removing
        if args.duration_type == 'MRI date if available, otherwise clinic date ("mri_or_clinic")':
            df['mri_or_clinic_age'] = df.apply(
                lambda row: select_best_age(row), axis=1)
        for dx_type in args.dx_types:
            dx_vars = {'dx_age': get_dx_column(dx_type, 'best_age_calc')}
            # df[dx_vars['dx_date']] = pd.to_datetime(df[dx_vars['dx_date']], errors='coerce')
            dx_age_df = df.loc[df['redcap_event_name'] ==
                               'stable_patient_cha_arm_1'].apply(
                                   redcap_common.get_diagnosis_age,
                                   args=(dx_vars, ),
                                   axis=1)
            if args.duration_type == 'clinic date':
                dur_label = '_clinic_duration'
                dx_type_clinic = '_'.join([dx_type, 'clinic'])
                df = df.groupby([redcap_common.STUDY_ID]).apply(
                    redcap_common.calculate_diagnosis_duration, dx_type_clinic,
                    dx_age_df, 'session_age')
                dx_dur_field = get_dx_column(dx_type, 'clinic_duration')
                # non-positive durations are invalid; blank them out
                df.loc[~(df[dx_dur_field] > 0), dx_dur_field] = np.nan
            elif args.duration_type == 'MRI date':
                dur_label = '_mri_duration'
                dx_type_mri = '_'.join([dx_type, 'mri'])
                df = df.groupby([redcap_common.STUDY_ID]).apply(
                    redcap_common.calculate_diagnosis_duration, dx_type_mri,
                    dx_age_df, 'mri_age')
                dx_mri_dur_field = get_dx_column(dx_type, 'mri_duration')
                df.loc[~(df[dx_mri_dur_field] > 0), dx_mri_dur_field] = np.nan
            elif args.duration_type == 'MRI date if available, otherwise clinic date ("mri_or_clinic")':
                dur_label = '_mri_or_clinic_duration'
                dx_type_mri_or_clinic = '_'.join([dx_type, 'mri_or_clinic'])
                df = df.groupby([redcap_common.STUDY_ID]).apply(
                    redcap_common.calculate_diagnosis_duration,
                    dx_type_mri_or_clinic, dx_age_df, 'mri_or_clinic_age')
                dx_best_dur_field = get_dx_column(dx_type,
                                                  'mri_or_clinic_duration')
                df.loc[~(df[dx_best_dur_field] > 0),
                       dx_best_dur_field] = np.nan
            else:
                raise Exception(
                    "ERROR: dx_types chosen, but no duration_type chosen")
        # df = df.drop(['session_age', 'redcap_event_name'], axis=1)

    # if varaibles are specified, filter out rows that don't have data for them (if null or non-numeric)
    if args.all:
        df = redcap_common.check_for_all(df, args.all, project, True)
    if args.any:
        df = redcap_common.check_for_any(df, args.any, project, True)

    # remove session data for participants that did not occur in consecutive years
    if args.consecutive:
        df = df.groupby([redcap_common.STUDY_ID
                         ]).apply(redcap_common.get_consecutive_years,
                                  args.consecutive)

    if df.empty:
        stderr.write(
            'No data to return. Selections have filtered out all rows.')
        exit(1)

    # add clinic_year
    df['clinic_year'] = df.apply(lambda row: get_clinic_year(row), axis=1)

    # rename common columns back to original names
    df = redcap_common.rename_common_columns(df, RENAMES, True)

    # if we have brought in dx info/demographics from the API, remove it after the calculation and rename columns that were suffixed due to merge
    if not fields == [
            WFS_SESSION_NUMBER, WFS_CLINIC_YEAR
    ] and args.api_token:  # don't need to go through deletion logic if only field is session number
        if WFS_SESSION_NUMBER in fields:
            fields.remove(
                WFS_SESSION_NUMBER)  # remove session number from fields
        df = redcap_common.cleanup_api_merge(df, fields)

    # rename session_age to clinic_age
    df = df.rename(columns={"session_age": "clinic_age"})

    # remove dob, clinic date and MRI date
    df = df.drop(['dob'], axis=1, errors="ignore")
    df = df.drop(['clinic_date'], axis=1, errors="ignore")
    df = df.drop(['mri_date'], axis=1, errors="ignore")
    df = df.drop(['redcap_event_name'], axis=1, errors="ignore")

    # drop non-MRI sessions
    if args.drop_non_mri:
        df = df[(df[MRI_AGE] > 0.0) | (df['clinic_year'] == 0)]
        mri_label = '_just_mri'

    # df.to_csv(r'C:\temp\df_before_flatten.csv')

    # puts all sessions/clinic years for a participant on one line (suffixed with year/session)
    if args.flatten:
        # multi-index column for flattening
        if args.flatten_by == 'session number':
            flatten_by_column = 'wolfram_sessionnumber'
            flatten_label = '_flattened_by_session'
            # df.set_index([redcap_common.STUDY_ID, redcap_common.SESSION_NUMBER], inplace=True)
            flatten_group_prefix = 's'
        elif args.flatten_by == 'clinic year':
            flatten_by_column = 'clinic_year'
            flatten_label = '_flattened_by_clinic'
            # df.set_index([redcap_common.STUDY_ID, 'clinic_year'], inplace=True)
            flatten_group_prefix = 'c'
        else:
            raise Exception('ERROR: flatten_by check failed')

        sort = args.sort_by == 'session'
        df = redcap_common.flatten(df, flatten_by_column, sort,
                                   flatten_group_prefix)

    if args.transpose:
        df = df.transpose()

    # df.to_csv(r'C:\temp\df_right_before_save.csv')

    # make output_file name
    output_file = args.input_file.replace(
        '.csv', '{}{}{}.csv'.format(dur_label, flatten_label, mri_label))

    redcap_common.write_results_and_open(df, output_file)