Example #1
def parse_args():
    parser = GooeyParser(
        description='Format redcap CBCL export for ASEBA import')
    required = parser.add_argument_group('Required Arguments')
    required.add_argument('--redcap_export',
                          required=True,
                          widget='FileChooser',
                          help='Demographics + CBCL export from REDCap')
    required.add_argument('--study_name',
                          required=True,
                          choices=['NEWT', 'NT', 'other'])
    required.add_argument('--form_type',
                          required=True,
                          choices=['cbcl', 'ycbcl'])
    required.add_argument('--outdir',
                          widget='DirChooser',
                          required=True,
                          help='where to store output zip')

    other = parser.add_argument_group(
        'Other study options (can ignore if using named study)',
        gooey_options={'columns': 1})
    other.add_argument(
        '--varfile',
        widget='FileChooser',
        help=
        'csv file with REDCap to ASEBA mapping (see H:/REDCap Scripts/static/*cbcl_column_map.csv for examples)'
    )
    other.add_argument(
        '--wide',
        action='store_true',
        help=
        'if REDCap export contains multiple sessions per row (vs each session on own line)'
    )

    args = parser.parse_args()

    if args.study_name == 'other' and not args.varfile:
        parser.error('Must supply varfile if not using named study')

    return args
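
These parse_args() examples assume the usual Gooey wiring around them. A minimal sketch of that wiring, with a hypothetical program name and main() that are not part of the example above:

from gooey import Gooey, GooeyParser

@Gooey(program_name='CBCL Formatter')  # hypothetical name
def main():
    args = parse_args()
    # hand the validated args off to the formatting logic here

if __name__ == '__main__':
    main()
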
Example #2
def parse_args():
    parser = GooeyParser(
        description=
        'Converts data in REDCap format (one row per session) to SPSS format (one row per subject) and vice versa'
    )
    parser.add_argument(
        'input_file',
        widget='FileChooser',
        help=
        'csv file (full path) to be formatted (first column should contain subject ids)'
    )
    parser.add_argument(
        'output_file',
        help=
        'csv file (full path) to store formatted data (if file does not exist, it will be created)'
    )

    args = parser.parse_args()
    if not args.input_file.endswith('.csv') or not args.output_file.endswith(
            '.csv'):
        parser.error('Input and output files must be of type csv')

    return args
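
The long-to-wide conversion the description refers to boils down to a pandas pivot; a rough sketch under assumed column names (the script's actual implementation is not shown here):

import pandas as pd

def long_to_wide(df, id_col='study_id', session_col='session'):
    # one row per session -> one row per subject, session-prefixed columns
    wide = df.pivot(index=id_col, columns=session_col)
    wide.columns = ['{}_{}'.format(sess, var) for var, sess in wide.columns]
    return wide.reset_index()
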
Example #3
def parse_args():
    parser = GooeyParser(
        description='Format ASEBA score export for REDCap import')

    required = parser.add_argument_group('Required arguments')
    required.add_argument('--aseba_export',
                          required=True,
                          widget='FileChooser',
                          help='Excel scores export from ASEBA')
    required.add_argument('--study_name',
                          required=True,
                          choices=['NEWT', 'NT', 'other'])
    required.add_argument('--form_type',
                          required=True,
                          choices=['cbcl', 'ycbcl'])

    other = parser.add_argument_group(
        'Other study options (can ignore if using named study)',
        gooey_options={'columns': 1})
    other.add_argument(
        '--varfile',
        widget='FileChooser',
        help=
        'csv file with REDCap to ASEBA mapping (see H:/REDCap Scripts/static/*cbcl_column_map.csv for examples)'
    )
    other.add_argument(
        '--wide',
        action='store_true',
        help=
        'if REDCap has multiple sessions per row (vs each session on own line)'
    )
    args = parser.parse_args()

    if args.study_name == 'other' and not args.varfile:
        parser.error('Must supply varfile if not using named study')

    return args
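
The study_name/varfile check above is the stock argparse idiom for a conditionally required argument: parser.error() prints the usage string plus the message and exits with status 2, so parse_args() never returns a half-validated namespace. A self-contained demonstration of the same pattern with plain argparse (names are hypothetical):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--study_name', required=True, choices=['NEWT', 'NT', 'other'])
parser.add_argument('--varfile')
args = parser.parse_args(['--study_name', 'other'])
if args.study_name == 'other' and not args.varfile:
    parser.error('Must supply varfile if not using named study')  # SystemExit(2)
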
def format_track_data():
    # set up expected arguments and associated help text
    parser = GooeyParser(
        description='Formats TRACK data from REDCap csv export')

    required = parser.add_argument_group('Required Arguments',
                                         gooey_options={'columns': 1})
    required.add_argument('--input_file',
                          required=True,
                          widget='FileChooser',
                          help='REDCap export file')
    required.add_argument('--output_file',
                          required=True,
                          widget='FileChooser',
                          help='CSV file to store formatted data in')
    required.add_argument('--api_password',
                          required=True,
                          widget='PasswordField',
                          help='Password to access API token')

    optional = parser.add_argument_group('Optional Arguments',
                                         gooey_options={'columns': 2})
    optional.add_argument(
        '-c',
        '--consecutive',
        type=int,
        metavar='num_consecutive_years',
        help=
        'Limit results to participants with data for a number of consecutive years'
    )
    optional.add_argument('-d',
                          '--duration',
                          action='store_true',
                          help='Calculate diabetes diagnosis duration')

    variable_options = parser.add_argument_group(
        'Variable options',
        'Space-separated lists of data points (category, column prefix, and/or variable) participants must have data for in export',
        gooey_options={
            'columns': 1,
            'show_border': True
        })
    variable_options.add_argument(
        '--all',
        nargs='+',
        default=None,
        help=
        'All specified data points required for participant to be included in result'
    )
    variable_options.add_argument(
        '--any',
        nargs='+',
        default=None,
        help=
        'At least one specified data point required for participant to be included in result'
    )

    format_options = parser.add_argument_group('Formatting options',
                                               gooey_options={
                                                   'columns': 2,
                                                   'show_border': True
                                               })
    format_options.add_argument(
        '-e',
        '--expand',
        action='store_true',
        help='Arrange data with one row per subject per session')
    format_options.add_argument('-t',
                                '--transpose',
                                action='store_true',
                                help='Transpose the data')

    args = parser.parse_args()

    if not args.input_file.endswith('.csv') or not args.output_file.endswith(
            '.csv'):
        parser.error('Input and output files must be of type csv')

    # create initial dataframe structure
    df = redcap_common.create_df(args.input_file)
    df = df[df[TRACK_STUDY_ID].str.contains(r'TRACK\d+')]  # remove test rows

    project = None
    # args.duration is a store_true flag (False, never None), so test
    # truthiness; the argument defined above is --api_password, not --api_token
    if any([args.all, args.any, args.duration, args.consecutive]):
        project = redcap_common.get_redcap_project('track', args.api_password)

    if args.all:
        df = redcap_common.check_for_all(df, args.all, project)

    if args.any:
        df = redcap_common.check_for_any(df, args.any, project)

    fields = None
    if args.duration or args.consecutive:
        fields = redcap_common.get_matching_columns(
            project.field_names, r'\w*(' + '|'.join(DURATION_FIELDS) + ')')
        df = redcap_common.merge_api_data(df, project, fields,
                                          [TRACK_STUDY_ID])

    # expand/rename after api merge to ensure column names match up
    df, non_session_cols = redcap_common.expand(df.set_index(TRACK_STUDY_ID))
    df = redcap_common.rename_common_columns(df, RENAMES, False)

    df[redcap_common.SESSION_DATE] = pd.to_datetime(
        df[redcap_common.SESSION_DATE])
    df = df[pd.notnull(df[
        redcap_common.SESSION_DATE])]  # remove rows for non-attended sessions

    if args.duration:
        df = redcap_common.prepare_age_calc(df)
        dx_vars = {'dx_date': 'db_dx_date', 'dx_age': 'db_onset_age'}
        df['db_dx_date'] = pd.to_datetime(df['db_dx_date'])
        dx_age_df = df.loc[df[redcap_common.SESSION_NUMBER] == 's1'].apply(
            redcap_common.get_diagnosis_age, args=(dx_vars, ), axis=1)
        df = df.groupby([redcap_common.STUDY_ID
                         ]).apply(redcap_common.calculate_diagnosis_duration,
                                  'db', dx_age_df)
        df = df.drop('session_age', axis=1)

    if args.consecutive:
        df[redcap_common.SESSION_YEAR] = df[redcap_common.SESSION_DATE].apply(
            lambda x: x.year if x else None)
        df = df.groupby([redcap_common.STUDY_ID
                         ]).apply(redcap_common.get_consecutive_years,
                                  args.consecutive)
        df = df.drop([redcap_common.SESSION_YEAR], axis=1)

    df = redcap_common.rename_common_columns(
        df, RENAMES,
        True)  # rename common columns back to original names pre-flattening
    df = df.set_index([TRACK_STUDY_ID, redcap_common.SESSION_NUMBER])

    if not args.expand:
        non_session_cols = {
            col: 's1_' + col
            for col in df.columns if not re.match(r's\d_', col)
        }
        df = df.rename(columns=non_session_cols)
        df = redcap_common.flatten(
            df)  # always reflatten at end, unless expand flag is set

    if args.transpose:
        df = df.transpose()

    # clean up dataframe
    revert_non_session_cols = {
        's1_' + col: col
        for col in non_session_cols.keys()
    }
    df = df.rename(columns=revert_non_session_cols)

    if fields:
        drop_fields = fields if not args.expand else DURATION_FIELDS  # if leaving expanded, then the columns we brought in don't match the current columns
        df = redcap_common.cleanup_api_merge(df, drop_fields)

    redcap_common.write_results_and_open(df, args.output_file)
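
redcap_common.expand and redcap_common.flatten are not shown in this excerpt; conceptually, flatten pivots one-row-per-(subject, session) data into one row per subject with session-prefixed columns. A sketch of that shape change (an assumption about the helper, not its actual code):

import pandas as pd

def flatten_sketch(df):
    # df is indexed by (study_id, session_number), e.g. ('TRACK0001', 's1')
    flat = df.unstack(level=1)
    flat.columns = ['{}_{}'.format(sess, var) for var, sess in flat.columns]
    return flat
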
def format_wolfram_data():
    # set up expected arguments and associated help text
    parser = GooeyParser(
        description='Formats Wolfram data from REDCap csv export')
    required = parser.add_argument_group('Required Arguments',
                                         gooey_options={'columns': 1})
    required.add_argument('--input_file',
                          required=True,
                          widget='FileChooser',
                          help='REDCap export file')
    # required.add_argument('--output_file', required=True, widget='FileChooser', help='CSV file to store formatted data in')

    optional = parser.add_argument_group('Optional Arguments',
                                         gooey_options={'columns': 1})
    optional.add_argument(
        '-c',
        '--consecutive',
        type=int,
        metavar='num_consecutive_years',
        help=
        'Limit results to participants with data for a number of consecutive years'
    )
    optional.add_argument(
        '-d',
        '--duration',
        nargs='*',
        dest='dx_types',
        widget='Listbox',
        default=None,
        choices=ALL_DX_TYPES,
        help='Calculate diagnosis duration for specified diagnosis types')
    # optional.add_argument('--duration-type', dest='duration_type', default='clinic date', choices=['clinic date','MRI date','MRI date if available, otherwise clinic date ("mri_or_clinic")'], help='Visit date to use when calculating dx durations')
    optional.add_argument(
        '--duration-type',
        dest='duration_type',
        default='clinic date',
        choices=['clinic date', 'MRI date'],
        help='Visit date to use when calculating dx durations')
    optional.add_argument(
        '--drop_non_mri',
        action='store_true',
        help='Drop all sessions that do not have an "mri_date" entry.')
    optional.add_argument(
        '--old-db',
        action='store_true',
        help='whether data was sourced from old Wolfram database')
    optional.add_argument(
        '--api_token',
        widget='PasswordField',
        help=
        'REDCap API token (if not specified, will not pull anything from REDCap)'
    )

    variable_options = parser.add_argument_group(
        'Variable options',
        'Space-separated lists of data points (category, column prefix, and/or variable) participants must have data for in export',
        gooey_options={
            'columns': 1,
            'show_border': True
        })
    variable_options.add_argument(
        '--all',
        nargs='+',
        default=None,
        help=
        'All specified data points required for participant to be included in result'
    )
    variable_options.add_argument(
        '--any',
        nargs='+',
        default=None,
        help=
        'At least one specified data point required for participant to be included in result'
    )

    format_options = parser.add_argument_group('Formatting options',
                                               gooey_options={
                                                   'columns': 2,
                                                   'show_border': True
                                               })
    format_options.add_argument(
        '-f',
        '--flatten',
        action='store_true',
        help='Arrange all session data in single row for participant')
    format_options.add_argument(
        '--flatten_by',
        default='session number',
        choices=['session number', 'clinic year'],
        help='Flatten data by session number or clinic year')
    format_options.add_argument('-t',
                                '--transpose',
                                action='store_true',
                                help='Transpose the data')
    format_options.add_argument(
        '-s',
        '--sort_by',
        default='variable',
        choices=['variable', 'session'],
        help='Sort flattened data by session or variable')

    args = parser.parse_args()

    dur_label = ''
    flatten_label = ''
    mri_label = ''

    if not args.old_db:
        print(
            '### "old_db" not checked, only pulling data from the "new" database ###'
        )

    if not args.input_file.endswith('.csv'):
        parser.error('Input file must be of type csv')

    # create dataframe from REDCap data
    df = redcap_common.create_df(args.input_file)
    df = df[df[WFS_STUDY_ID].str.contains(
        r'WOLF_\d{4}_.+')]  # remove Test and Wolf_AN rows
    num_clinic_years = len(
        df['redcap_event_name'].unique()
    ) - 1  # FIXME: should be counting max number of sessions for participants (still may cause error because they might not be consecutive)
    print('### Number of clinic years detected = {} ###'.format(
        num_clinic_years))

    # get number of subjects in dataframe
    num_subjects = len(df[WFS_STUDY_ID].unique())
    print('### Number of subjects detected in {} = {} ###'.format(
        args.input_file, num_subjects))

    # only create API project if actions require it and data needed is not already present, AND if API token is given
    project = None
    # check for fields missing from csv df
    fields = [
        WFS_SESSION_NUMBER, WFS_CLINIC_YEAR
    ] if WFS_SESSION_NUMBER not in df.columns else [
    ]  # always need to get session number if not in data (used to determine which rows to keep)
    if MISSED_SESSION not in df.columns:
        fields.append(
            MISSED_SESSION
        )  # need missed_session var to remove rows for unattended session
    if args.dx_types is not None:
        for dx_type in args.dx_types:
            dx_age_field = get_dx_column(dx_type, 'best_age_calc')
            if dx_age_field not in df.columns:
                fields.append(dx_age_field)
        for non_dx_field in NON_DX_FIELDS_FOR_DURATION:
            if non_dx_field not in df.columns:
                fields.append(non_dx_field)
    if fields:  # missing some fields, go get from REDCap
        print('### need to get some fields from REDCap ###')
        if args.api_token == "":
            raise RuntimeError(
                "Thre are missing fields in the input csv, so we need to get data from REDCap, but no API token is given. Ask Jon about REDCap API access."
            )
        else:
            redcap_project_key = 'itrack' if not args.old_db else 'wolfram'
            project = project if project else redcap_common.get_redcap_project(
                redcap_project_key, args.api_token)
            df = redcap_common.merge_api_data(
                df, project, fields, [WFS_STUDY_ID, 'redcap_event_name'])

    # rename common columns after api merge to ensure column names match up
    df = redcap_common.rename_common_columns(df, RENAMES, False)

    if args.consecutive is not None and args.consecutive not in range(
            2, num_clinic_years + 1):
        parser.error(
            'Consecutive years must be greater than 1 and cannot exceed number of clinic years ({})'
            .format(num_clinic_years))

    df.loc[(df['redcap_event_name'] == 'stable_patient_cha_arm_1'),
           [redcap_common.SESSION_NUMBER]] = df.loc[
               (df['redcap_event_name'] == 'stable_patient_cha_arm_1'),
               [redcap_common.SESSION_NUMBER]].fillna(0)
    # remove rows for sessions not attended (will have a flag saying they did not attend)
    df = df[pd.notnull(df[redcap_common.SESSION_NUMBER])]
    df = df[pd.isnull(df[MISSED_SESSION])]
    df[redcap_common.SESSION_NUMBER] = df[redcap_common.SESSION_NUMBER].astype(
        int
    )  # once NANs are gone, we can cast as int (nicer for flatten display)

    # if duration argument specified, calculate diagnosis duration for types specified or all (if none specified)
    if args.dx_types is not None:  # explicit None check because empty array is valid
        # this puts a 'session_age' field into the df using dob and session_date (where session_date is from clinic_date)
        df = redcap_common.prepare_age_calc(df)
        df = mri_age_calc(df)
        # this branch (and the matching elif below) is only reachable if the
        # commented-out 'mri_or_clinic' duration-type choice is restored
        if args.duration_type == 'MRI date if available, otherwise clinic date ("mri_or_clinic")':
            df['mri_or_clinic_age'] = df.apply(
                lambda row: select_best_age(row), axis=1)
        for dx_type in args.dx_types:
            dx_vars = {'dx_age': get_dx_column(dx_type, 'best_age_calc')}
            # df[dx_vars['dx_date']] = pd.to_datetime(df[dx_vars['dx_date']], errors='coerce')
            dx_age_df = df.loc[df['redcap_event_name'] ==
                               'stable_patient_cha_arm_1'].apply(
                                   redcap_common.get_diagnosis_age,
                                   args=(dx_vars, ),
                                   axis=1)
            if args.duration_type == 'clinic date':
                dur_label = '_clinic_duration'
                dx_type_clinic = '_'.join([dx_type, 'clinic'])
                df = df.groupby([redcap_common.STUDY_ID]).apply(
                    redcap_common.calculate_diagnosis_duration, dx_type_clinic,
                    dx_age_df, 'session_age')
                dx_dur_field = get_dx_column(dx_type, 'clinic_duration')
                df.loc[~(df[dx_dur_field] > 0), dx_dur_field] = np.nan
            elif args.duration_type == 'MRI date':
                dur_label = '_mri_duration'
                dx_type_mri = '_'.join([dx_type, 'mri'])
                df = df.groupby([redcap_common.STUDY_ID]).apply(
                    redcap_common.calculate_diagnosis_duration, dx_type_mri,
                    dx_age_df, 'mri_age')
                dx_mri_dur_field = get_dx_column(dx_type, 'mri_duration')
                df.loc[~(df[dx_mri_dur_field] > 0), dx_mri_dur_field] = np.nan
            elif args.duration_type == 'MRI date if available, otherwise clinic date ("mri_or_clinic")':
                dur_label = '_mri_or_clinic_duration'
                dx_type_mri_or_clinic = '_'.join([dx_type, 'mri_or_clinic'])
                df = df.groupby([redcap_common.STUDY_ID]).apply(
                    redcap_common.calculate_diagnosis_duration,
                    dx_type_mri_or_clinic, dx_age_df, 'mri_or_clinic_age')
                dx_best_dur_field = get_dx_column(dx_type,
                                                  'mri_or_clinic_duration')
                df.loc[~(df[dx_best_dur_field] > 0),
                       dx_best_dur_field] = np.nan
            else:
                raise ValueError('Unrecognized duration_type: {}'.format(
                    args.duration_type))
        # df = df.drop(['session_age', 'redcap_event_name'], axis=1)

    # if variables are specified, filter out rows that don't have data for them (if null or non-numeric)
    if args.all:
        df = redcap_common.check_for_all(df, args.all, project, True)
    if args.any:
        df = redcap_common.check_for_any(df, args.any, project, True)

    # remove session data for participants that did not occur in consecutive years
    if args.consecutive:
        df = df.groupby([redcap_common.STUDY_ID
                         ]).apply(redcap_common.get_consecutive_years,
                                  args.consecutive)

    if df.empty:
        stderr.write(
            'No data to return. Selections have filtered out all rows.\n')
        exit(1)

    # add clinic_year
    df['clinic_year'] = df.apply(lambda row: get_clinic_year(row), axis=1)

    # rename common columns back to original names
    df = redcap_common.rename_common_columns(df, RENAMES, True)

    # if we have brought in dx info/demographics from the API, remove it after the calculation and rename columns that were suffixed due to merge
    if fields != [
            WFS_SESSION_NUMBER, WFS_CLINIC_YEAR
    ] and args.api_token:  # no deletion logic needed if the only merged fields are session number/clinic year
        if WFS_SESSION_NUMBER in fields:
            fields.remove(
                WFS_SESSION_NUMBER)  # remove session number from fields
        df = redcap_common.cleanup_api_merge(df, fields)

    # rename session_age to clinic_age
    df = df.rename(columns={"session_age": "clinic_age"})

    # remove dob, clinic date, MRI date, and event name
    df = df.drop(['dob', 'clinic_date', 'mri_date', 'redcap_event_name'],
                 axis=1,
                 errors='ignore')

    # drop non-MRI sessions
    if args.drop_non_mri:
        df = df[(df[MRI_AGE] > 0.0) | (df['clinic_year'] == 0)]
        mri_label = '_just_mri'

    # df.to_csv(r'C:\temp\df_before_flatten.csv')

    # puts all sessions/clinic years for a participant on one line (suffixed with year/session)
    if args.flatten:
        # multi-index column for flattening
        if args.flatten_by == 'session number':
            flatten_by_column = 'wolfram_sessionnumber'
            flatten_label = '_flattened_by_session'
            # df.set_index([redcap_common.STUDY_ID, redcap_common.SESSION_NUMBER], inplace=True)
            flatten_group_prefix = 's'
        elif args.flatten_by == 'clinic year':
            flatten_by_column = 'clinic_year'
            flatten_label = '_flattened_by_clinic'
            # df.set_index([redcap_common.STUDY_ID, 'clinic_year'], inplace=True)
            flatten_group_prefix = 'c'
        else:
            raise Exception('ERROR: flatten_by check failed')

        sort = args.sort_by == 'session'
        df = redcap_common.flatten(df, flatten_by_column, sort,
                                   flatten_group_prefix)

    if args.transpose:
        df = df.transpose()

    # df.to_csv(r'C:\temp\df_right_before_save.csv')

    # make output_file name
    output_file = args.input_file.replace(
        '.csv', '{}{}{}.csv'.format(dur_label, flatten_label, mri_label))

    redcap_common.write_results_and_open(df, output_file)
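
get_dx_column is referenced throughout but not defined in this excerpt; judging by the field names it produces ('<dx_type>_best_age_calc', '<dx_type>_clinic_duration', ...), it is presumably a simple join along these lines (an assumption, not the actual helper):

def get_dx_column(dx_type, suffix):
    # e.g. get_dx_column('dm', 'best_age_calc') -> 'dm_best_age_calc'
    return '_'.join([dx_type, suffix])
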
def dot_dbs_import():
    parser = GooeyParser(description='Formats kinematics data for redcap import')
    required_group = parser.add_argument_group('Required Arguments', gooey_options={'columns': 1})
    required_group.add_argument('--folder', widget='DirChooser', required=True, help='Folder containing subject directories to be processed')
    optional_group = parser.add_argument_group('Optional Arguments', gooey_options={'columns': 1})
    optional_group.add_argument('-s', '--subjects', nargs='+', help='Space-separated list of subject ids to run for (if blank, runs all in folder)')
    args = parser.parse_args()

    if not exists(args.folder):
        parser.error('Specified folder does not exist.')

    script_dir = getcwd()
    chdir(args.folder)

    result = None
    subject_dirs = [ d for d in listdir(getcwd()) if re.match(r'DOTDBS(\d)+$', d) ] if not args.subjects else args.subjects

    if not subject_dirs:
        stderr.write('No subject directories found matching pattern: {}'.format('DOTDBS##'))
        exit(1)

    for i, subject in enumerate(subject_dirs):
        chdir(subject)
        summary_files = [ f for f in listdir(getcwd()) if f.endswith('.xlsx') ]
        if not summary_files:
            print('Subject {} does not have a summary file'.format(subject))
            chdir('..')
            continue

        subject_df = pd.read_excel(summary_files[0], index_col=0, sheet_name=[0,1,2])
        temp_df = None
        for sheet in subject_df.keys():
            print('Processing: {}, block {}'.format(subject, sheet+1))
            subject_df[sheet] = subject_df[sheet][subject_df[sheet].index.notnull()]
            measures = subject_df[sheet].index
            measures = [ ''.join([w[0].lower() for w in s ])
                            for s in [titlecase(m).split() for m in measures ]] # get strings of lowercased first letters for each word in each measure
            measures = [ '_'.join([m[:2], m[2:]]).replace('rt', 'at').replace('ft_pvcov', 'ft_pvcv').replace('b11h', '')
                            for m in measures ] # split var into action type and measure and rename mismatched var names
            subject_df[sheet].index = measures
            subject_df[sheet].insert(0, 'record_id', subject)
            subject_df[sheet].insert(1, 'block', 'Block' + str(sheet+1))
            subject_df[sheet].set_index(['record_id', 'block'], append=True, inplace=True)
            subject_df[sheet] = subject_df[sheet].reorder_levels([1,2,0])
            subject_df[sheet] = subject_df[sheet].dropna(axis=1, how='all')
            subject_df[sheet].columns = [ col.lower() for col in subject_df[sheet].columns ]
            subject_df[sheet] = subject_df[sheet].rename(columns={'left avg': 'Left', 'right avg': 'Right'})
            subject_df[sheet] = subject_df[sheet].drop([col for col in subject_df[sheet].columns if col not in ['Left', 'Right']], axis=1)
            subject_df[sheet] = subject_df[sheet].unstack([1,2]) # .sort_index(1, level=1)
            subject_df[sheet].columns = [ ('_'.join([tup[1], tup[2], tup[0]])).lower() for tup in subject_df[sheet].columns ]
            temp_df = subject_df[sheet] if sheet == 0 else pd.concat([temp_df, subject_df[sheet]], axis=1)

        result = temp_df if i == 0 else pd.concat([result, temp_df], axis=0)
        chdir('..')

    ft_mpv_columns = { col: col.replace('right', 'left_right') for col in result.columns if col.endswith('ft_mpv_right') }
    result.rename(columns=ft_mpv_columns, inplace=True) # hack to fix deviation from regular naming convention ('_left_right' instead of '_right')

    # unused_cols = [ col for col in result.columns if any(var in col for var in UNUSED_VARS) ]
    # result.drop(unused_cols, axis=1, inplace=True)

    for i in range(1,4):
        result['_'.join(['block', str(i), 'kinematics_complete'])] = 1

    redcap_common.write_results_and_open(result, join(script_dir, 'formatted_dotdbs.csv'))
    return
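
A worked example of the measure-renaming comprehensions above (the measure name is hypothetical; titlecase comes from the titlecase package):

from titlecase import titlecase

measure = 'finger tapping mean peak velocity'
abbrev = ''.join(w[0].lower() for w in titlecase(measure).split())  # 'ftmpv'
renamed = '_'.join([abbrev[:2], abbrev[2:]])  # 'ft_mpv'
# the real code then applies the rt->at / ft_pvcov->ft_pvcv / b11h fixups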