示例#1
0
def extract_subject_data(df, column_types):
    """
    Expand the json stored in the subject_data column into separate columns.

    Every subject_data cell holds a json object shaped like:
        {<subject_id>: {"key_1": "value_1", "key_2": "value_2", ...}}
    Only the first value of that object is used; its keys become columns.
    A "retired" key is discarded and an "id" key becomes "external_id".
    Column names are then sanitized and given the SUBJECT_PREFIX so they
    cannot be confused with the other df columns.

    The new columns are also recorded in column_types (modified in place)
    with type 'same', ordered after the last existing entry.

    Returns a new data frame in which subject_data has been replaced by
    the extracted columns.
    """
    parsed = df.subject_data.map(json.loads)
    records = parsed.apply(lambda obj: list(obj.values())[0]).tolist()
    subject = pd.DataFrame(records, index=df.index)

    if 'retired' in subject.columns:
        subject = subject.drop(['retired'], axis=1)
    if 'id' in subject.columns:
        subject = subject.rename(columns={'id': 'external_id'})

    # Sanitize names: runs of non-word characters collapse to one "_", then
    # leading underscores and a single trailing underscore are removed.
    renames = {}
    for old_name in subject.columns:
        new_name = re.sub(r'\W+', '_', old_name)
        new_name = re.sub(r'^_+|_$', '', new_name)
        renames[old_name] = SUBJECT_PREFIX + new_name
    subject = subject.rename(columns=renames)

    df = pd.concat([df.drop(['subject_data'], axis=1), subject], axis=1)

    # Every subject column gets the 'same' type, appended in column order.
    last = util.last_column_type(column_types)
    for offset, name in enumerate(subject.columns, start=1):
        column_types[name] = {
            'type': 'same',
            'order': last + offset,
            'name': name,
        }

    return df
示例#2
0
def append_column_type(column_types, key, column_type):
    """
    Register key in column_types, placing it after the current last column.

    Nothing happens when key is already present. Otherwise a new entry is
    stored in place whose order is the last column order plus
    util.COLUMN_ADD.
    """
    if key in column_types:
        return

    order = util.last_column_type(column_types) + util.COLUMN_ADD
    column_types[key] = {'type': column_type, 'order': order, 'name': key}
示例#3
0
def get_column_types(args, column_types):
    """
    Merge the column types listed in args.column_types into column_types.

    Each string in args.column_types is a comma separated list of
    "name:type" options. A name that already has a (truthy) entry keeps its
    existing order and has its type replaced; a new name is placed after the
    last known column. column_types is modified in place and also returned.
    """
    last = util.last_column_type(column_types)
    if not args.column_types:
        return column_types

    options = (opt for arg in args.column_types for opt in arg.split(','))
    for option in options:
        raw_name, raw_type = option.split(':')
        name = raw_name.strip()

        existing = column_types.get(name)
        if existing:
            order = existing['order']
        else:
            last += 1
            order = last

        column_types[name] = {
            'type': raw_type.strip(),
            'order': order,
            'name': name,
        }

    return column_types
示例#4
0
 def _append_column_type(key, type):
     """
     Add key to column_types, ordered one past the current last column.

     column_types is not a parameter; it is resolved from an outer scope.
     A key that is already present is left untouched.
     """
     if key in column_types:
         return
     order = util.last_column_type(column_types) + 1
     column_types[key] = {'type': type, 'order': order, 'name': key}