def get_open_payment_df(save=True):
    """Build the Open Payments dataframe and pass it to ``write_df``.

    ``fetch(OPEN_PAYMENTS_CONFIG)`` returns an indexable result: element 0 is
    scanned for input files, element 1 is forwarded to ``write_df``.

    Parameters
    ----------
    save : bool, default True
        Forwarded unchanged as the first argument of ``write_df``.

    Returns
    -------
    The frame produced by ``_process_df`` with ``Physician_Specialty`` cast
    to the ``category`` dtype.
    """
    locations = fetch(OPEN_PAYMENTS_CONFIG)
    payments = _process_df(_get_file_paths(locations[0]))
    specialty = payments['Physician_Specialty'].astype('category')
    payments['Physician_Specialty'] = specialty
    write_df(save, payments, locations[1], OPEN_PAYMENTS_CONFIG.main_file)
    return payments
示例#2
0
def get_federal_election_df(save=True):
    """Load the federal election contributions file and clean it.

    The pipe-separated ``itcont.txt`` file has no header row; column names
    are taken from the separately fetched ``indiv_header_file.csv``.

    Parameters
    ----------
    save : bool, default True
        Forwarded unchanged as the first argument of ``write_df``.

    Returns
    -------
    pandas.DataFrame
        Rows whose log-transformed absolute ``transaction_amt`` is strictly
        positive, with lower-cased column names, ``zip_code`` as ``str`` and
        ``memo_text`` as ``category``.
    """
    # data
    data_dir = fetch(FEDERAL_ELECTION_CONFIG)
    csv_path = os.path.join(data_dir[0], "itcont.txt")
    # header
    data_dir_header = fetch(FEDERAL_ELECTION_HEADER_CONFIG)
    csv_path_header = os.path.join(data_dir_header[0],
                                   "indiv_header_file.csv")

    df_header = pd.read_csv(csv_path_header)
    df = pd.read_csv(csv_path,
                     sep='|',
                     encoding='latin1',
                     header=None,
                     names=df_header.columns)
    # Some donations are negative
    df['TRANSACTION_AMT'] = df['TRANSACTION_AMT'].abs()
    # Predicting the log of the donation
    df['TRANSACTION_AMT'] = df['TRANSACTION_AMT'].apply(np.log)
    # Take an explicit copy: the boolean filter returns a slice, and the
    # in-place edits below must not target a temporary view of the old frame.
    df = df[df['TRANSACTION_AMT'] > 0].copy()
    df.rename(columns={col: col.lower() for col in df.columns}, inplace=True)
    df['zip_code'] = df['zip_code'].astype(str)

    # One known record carries a stray '{' in its city name. Use a single
    # .loc call (not chained indexing) so the write reaches ``df``, and skip
    # the fix if that row was filtered out or holds a non-string value.
    bad_row = 1378568
    if bad_row in df.index:
        city = df.loc[bad_row, 'city']
        if isinstance(city, str):
            df.loc[bad_row, 'city'] = city.replace('{', '')

    df['memo_text'] = df['memo_text'].astype('category')
    write_df(save, df, data_dir[1], FEDERAL_ELECTION_CONFIG.main_file)
    return df
示例#3
0
def get_traffic_violations_df(save=True):
    """Load the traffic violations CSV and normalise its columns.

    Steps, in order: convert ``Year`` with ``float_to_int``, replace the
    literal ``'NONE'`` in ``Make``/``Model`` by NaN, strip newline characters
    from every string cell, cast four columns to ``category`` and rename all
    columns to lower-case with spaces replaced by underscores.

    Parameters
    ----------
    save : bool, default True
        Forwarded unchanged as the first argument of ``write_df``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    data_dir = fetch(TRAFFIC_VIOLATIONS_CONFIG)
    file = os.listdir(data_dir[0])[0]
    csv_path = os.path.join(data_dir[0], file)
    df = pd.read_csv(csv_path)
    df['Year'] = float_to_int(df['Year'], df.index)

    # 'NONE' is used as a placeholder in these columns; store NaN instead.
    # The builtin ``object`` replaces the ``np.object`` alias, which no
    # longer exists in current NumPy releases.
    for c in ('Make', 'Model'):
        cleaned = [np.nan if elt == 'NONE' else elt for elt in df[c]]
        df[c] = pd.Series(cleaned, dtype=object, index=df.index)

    # Remove embedded newlines from string cells; the column dtype is kept.
    for c in df:
        stripped = [
            elt.replace('\n', '') if isinstance(elt, str) else elt
            for elt in df[c]
        ]
        df[c] = pd.Series(stripped, dtype=df[c].dtype, index=df.index)

    for c in ('VehicleType', 'Arrest Type', 'Race', 'Violation Type'):
        df[c] = df[c].astype('category')
    df.rename(
        columns={col: re.sub(' ', '_', col).lower()
                 for col in df.columns},
        inplace=True)
    write_df(save, df, data_dir[1], TRAFFIC_VIOLATIONS_CONFIG.main_file)
    return df
示例#4
0
def get_public_procurement_df(save=True):
    """Load the public procurement CSV and coerce malformed numeric fields.

    Known bad values in ``ID_LOT``, ``CRIT_PRICE_WEIGHT``, ``ID_LOT_AWARDED``
    and ``CONTRACT_NUMBER`` are replaced by NaN so the first three columns
    can be cast to float. Column names are then lower-cased and ``cae_name``
    is stored as strings truncated to 1023 characters.

    Parameters
    ----------
    save : bool, default True
        Forwarded unchanged as the first argument of ``write_df``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    # FIXME df.shape = (565163, 75) != from paper
    # FIXME nb category cae_name = 39623 != from paper
    # FIXME cae_name become str rather than category
    # (openml requirements)
    data_dir = fetch(PUBLIC_PROCUREMENT_CONFIG)
    file = os.listdir(data_dir[0])[0]
    csv_path = os.path.join(data_dir[0], file)
    df = pd.read_csv(csv_path)

    df.loc[df.ID_LOT == 'Zp 2130-64/15', 'ID_LOT'] = np.nan
    df.ID_LOT = df.ID_LOT.astype(float)
    df.loc[df.CRIT_PRICE_WEIGHT == '50 points', 'CRIT_PRICE_WEIGHT'] = np.nan
    df.loc[[("%" in str(price)) for price in df.CRIT_PRICE_WEIGHT.values],
           'CRIT_PRICE_WEIGHT'] = np.nan
    df.CRIT_PRICE_WEIGHT = df.CRIT_PRICE_WEIGHT.astype(float)

    # Collect the rows whose ID_LOT_AWARDED cannot be parsed as a float.
    # Only conversion failures are caught, so unrelated errors such as
    # KeyboardInterrupt still propagate.
    row_typo = []
    for row, id_lot in enumerate(df.ID_LOT_AWARDED):
        try:
            float(id_lot)
        except (TypeError, ValueError):
            row_typo.append(row)
    df.loc[row_typo, 'ID_LOT_AWARDED'] = np.nan  # 345 over 565163
    df.ID_LOT_AWARDED = df.ID_LOT_AWARDED.astype(float)
    df.loc[[39165, 39164], 'CONTRACT_NUMBER'] = np.nan
    df.rename(columns={col: col.lower() for col in df.columns}, inplace=True)
    # Kept as truncated strings rather than 'category' (see FIXME above).
    df['cae_name'] = df['cae_name'].astype(str).str[:1023]
    write_df(save, df, data_dir[1], PUBLIC_PROCUREMENT_CONFIG.main_file)
    return df
def get_midwest_survey_df(save=True):
    """Load the Midwest survey CSV, merge its columns and pass it on.

    The first entry returned by ``os.listdir`` on the fetched directory is
    read with ``RespondentID`` as index and run through ``merge_columns``.

    Parameters
    ----------
    save : bool, default True
        Forwarded unchanged as the first argument of ``write_df``.

    Returns
    -------
    The result of ``merge_columns``.
    """
    locations = fetch(MIDWEST_SURVEY_CONFIG)
    source_dir = locations[0]
    first_entry = os.listdir(source_dir)[0]
    raw = pd.read_csv(os.path.join(source_dir, first_entry),
                      index_col='RespondentID')
    survey = merge_columns(raw)
    write_df(save, survey, locations[1], MIDWEST_SURVEY_CONFIG.main_file)
    return survey
示例#6
0
def get_midwest_survey_df(save=True):
    """Load the Midwest survey CSV, merge its columns and rename one column.

    Parameters
    ----------
    save : bool, default True
        Forwarded unchanged as the first argument of ``write_df``.

    Returns
    -------
    The result of ``merge_columns`` with ``'Location (Census Region)'``
    renamed to ``'Location_Census_Region'``.
    """
    locations = fetch(MIDWEST_SURVEY_CONFIG)
    source_dir = locations[0]
    first_entry = os.listdir(source_dir)[0]
    raw = pd.read_csv(os.path.join(source_dir, first_entry),
                      index_col='RespondentID')
    survey = merge_columns(raw)
    write_df(save, survey, locations[1], MIDWEST_SURVEY_CONFIG.main_file)
    # NOTE(review): the rename runs after write_df, so write_df is called
    # while the column still has its original name -- confirm this is
    # intended.
    survey.rename(
        columns={'Location (Census Region)': 'Location_Census_Region'},
        inplace=True)
    return survey
示例#7
0
def get_drug_discovery_df(save=True):
    """Load the drug discovery file as a tab-separated, latin1 table.

    Parameters
    ----------
    save : bool, default True
        Forwarded unchanged as the first argument of ``write_df``.

    Returns
    -------
    pandas.DataFrame
        The frame with ``DRG Definition`` and ``Provider State`` cast to
        ``category``.
    """
    locations = fetch(DRUG_DISCOVERY_CONFIG)
    source_dir = locations[0]
    # NOTE(review): this takes the *second* entry of os.listdir, whose
    # ordering is not guaranteed -- confirm which file is expected.
    entry = os.listdir(source_dir)[1]
    frame = pd.read_csv(os.path.join(source_dir, entry),
                        sep='\t', encoding='latin1')
    for name in ('DRG Definition', 'Provider State'):
        frame[name] = frame[name].astype('category')
    write_df(save, frame, locations[1], DRUG_DISCOVERY_CONFIG.main_file)
    return frame
def get_medical_charge_df(save=True):
    """Load the first ``*.csv`` file of the medical charge data directory.

    Parameters
    ----------
    save : bool, default True
        Forwarded unchanged as the first argument of ``write_df``.

    Returns
    -------
    pandas.DataFrame
        The frame with ``DRG Definition`` and ``Provider State`` cast to
        ``category``.

    Raises
    ------
    IndexError
        If the directory contains no ``*.csv`` file.
    """
    data_dir = fetch(MEDICAL_CHARGE_CONFIG)
    # glob already returns paths that include the directory; joining them
    # with the directory again would duplicate the prefix whenever the
    # directory is a relative path. Sorting makes the choice deterministic.
    csv_path = sorted(glob.glob(os.path.join(data_dir[0], '*.csv')))[0]
    df = pd.read_csv(csv_path)
    cat_cols = ['DRG Definition', 'Provider State']
    for c in cat_cols:
        df[c] = df[c].astype('category')

    write_df(save, df, data_dir[1], MEDICAL_CHARGE_CONFIG.main_file)
    return df
示例#9
0
def get_medical_charge_df(save=True):
    """Load the medical charge CSV and normalise its column names.

    Parameters
    ----------
    save : bool, default True
        Forwarded unchanged as the first argument of ``write_df``.

    Returns
    -------
    pandas.DataFrame
        The frame with ``DRG Definition`` and ``Provider State`` cast to
        ``category`` and every column name lower-cased with spaces replaced
        by underscores.
    """
    locations = fetch(MEDICAL_CHARGE_CONFIG)
    source_dir = locations[0]
    # NOTE(review): this takes the *second* entry of os.listdir, whose
    # ordering is not guaranteed -- confirm which file is expected.
    entry = os.listdir(source_dir)[1]
    charges = pd.read_csv(os.path.join(source_dir, entry), sep=',')
    for name in ('DRG Definition', 'Provider State'):
        charges[name] = charges[name].astype('category')

    new_names = {}
    for old in charges.columns:
        new_names[old] = re.sub(' ', '_', old).lower()
    charges.rename(columns=new_names, inplace=True)

    write_df(save, charges, locations[1], MEDICAL_CHARGE_CONFIG.main_file)
    return charges
示例#10
0
def get_beer_reviews_df(save=True):
    """Load the beer reviews CSV and replace non-breaking spaces.

    Every string cell has ``'\\xa0'`` replaced by a regular space; each
    column is rebuilt with its original dtype and index.

    Parameters
    ----------
    save : bool, default True
        Forwarded unchanged as the first argument of ``write_df``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    def _plain_spaces(value):
        # Non-string cells (numbers, NaN) pass through untouched.
        if isinstance(value, str):
            return value.replace('\xa0', ' ')
        return value

    locations = fetch(BEER_REVIEWS_CONFIG)
    source_dir = locations[0]
    reviews = pd.read_csv(os.path.join(source_dir, os.listdir(source_dir)[0]))
    for name in reviews:
        column = reviews[name]
        reviews[name] = pd.Series([_plain_spaces(v) for v in column],
                                  dtype=column.dtype,
                                  index=reviews.index)
    write_df(save, reviews, locations[1], BEER_REVIEWS_CONFIG.main_file)
    return reviews
示例#11
0
def get_met_objects_df(save=True):
    """Load the Met objects CSV and clean its text columns.

    Steps, in order: strip control/ideographic-space characters from every
    string cell, replace non-string ``Period`` values by NaN, replace
    placeholder values in selected columns by NaN, cast three columns to
    ``category`` and rename all columns to lower-case with spaces replaced
    by underscores.

    Parameters
    ----------
    save : bool, default True
        Forwarded unchanged as the first argument of ``write_df``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    data_dir = fetch(MET_OBJECTS_CONFIG)
    file = os.listdir(data_dir[0])[0]
    csv_path = os.path.join(data_dir[0], file)
    df = pd.read_csv(csv_path, encoding='utf-8')
    cat_cols = ['Department', 'Dynasty', 'State']
    clean = [
        'Geography Type', 'State', 'Classification', 'Artist Role',
        'Artist Prefix', 'Artist Display Bio', 'Artist Suffix',
    ]
    # Values that stand for "no information" in the columns listed above.
    placeholders = ('|', '||', '(none assigned)')

    # The three replacements are applied in this fixed order to every
    # string cell; non-string cells pass through unchanged.
    for c in df:
        arr = [
            elt.replace('\r\n', '').replace('\u3000', ' ').replace('\x1e', '')
            if isinstance(elt, str) else elt
            for elt in df[c]
        ]
        df[c] = pd.Series(arr, dtype=df[c].dtype, index=df.index)

    # The builtin ``object`` replaces the ``np.object`` alias, which no
    # longer exists in current NumPy releases.
    period = [c if isinstance(c, str) else np.nan for c in df['Period']]
    df['Period'] = pd.Series(period, dtype=object, index=df.index)

    for c in clean:
        tab = [np.nan if elt in placeholders else elt for elt in df[c]]
        df[c] = pd.Series(tab, dtype=object, index=df.index)

    for c in cat_cols:
        df[c] = df[c].astype('category')
    df.rename(
        columns={col: re.sub(' ', '_', col).lower()
                 for col in df.columns},
        inplace=True)
    write_df(save, df, data_dir[1], MET_OBJECTS_CONFIG.main_file)
    return df
示例#12
0
def get_employee_salaries_df(save=True):
    """Load the employee salaries CSV and derive the hiring year.

    ``Year First Hired`` is computed by parsing ``Date First Hired`` with the
    ``'%m/%d/%Y'`` format; four descriptive columns are cast to ``category``.

    Parameters
    ----------
    save : bool, default True
        Forwarded unchanged as the first argument of ``write_df``.

    Returns
    -------
    pandas.DataFrame
        The augmented frame.
    """
    def _hire_year(date_text):
        return datetime.datetime.strptime(date_text, '%m/%d/%Y').year

    locations = fetch(EMPLOYEE_SALARIES_CONFIG)
    source_dir = locations[0]
    salaries = pd.read_csv(
        os.path.join(source_dir, os.listdir(source_dir)[0]))

    hire_years = []
    for date_text in salaries['Date First Hired']:
        hire_years.append(_hire_year(date_text))
    salaries['Year First Hired'] = hire_years

    categorical = ('Gender', 'Department', 'Department Name',
                   'Assignment Category')
    for name in categorical:
        salaries[name] = salaries[name].astype('category')

    write_df(save, salaries, locations[1], EMPLOYEE_SALARIES_CONFIG.main_file)
    return salaries
示例#13
0
def get_colleges_df(save=True):
    """Load the colleges table (tab-separated, latin1, indexed by UNITID).

    The ``Unnamed: 0`` column is dropped, ``State`` is cast to ``str``, a
    list of numeric-looking columns is passed through ``_clean_cols`` and six
    columns are cast to ``category``.

    Parameters
    ----------
    save : bool, default True
        Forwarded unchanged as the first argument of ``write_df``.

    Returns
    -------
    The frame returned by ``_clean_cols`` after the category casts.
    """
    data_dir = fetch(COLLEGES_CONFIG)
    file = os.listdir(data_dir[0])[0]
    csv_path = os.path.join(data_dir[0], file)
    df = pd.read_csv(csv_path, sep='\t', encoding='latin1', index_col='UNITID')
    # Name the axis via ``columns=``: recent pandas releases no longer
    # accept ``axis`` as a positional argument of ``drop``.
    df.drop(columns=["Unnamed: 0"], inplace=True)
    df['State'] = df['State'].astype(str)
    cols = [
        'Undergrad Size', 'Predominant Degree', 'Average Cost Academic Year',
        'Average Cost Program Year', 'Tuition (Instate)',
        'Tuition (Out of state)', 'Spend per student', 'Faculty Salary',
        'Mean Earnings 6 years', 'Median Earnings 6 years',
        'Mean Earnings 10 years', 'Median Earnings 10 years',
    ]
    df = _clean_cols(cols, df)

    cats = ['State', 'Predominant Degree', 'Highest Degree', 'Ownership',
            'Region', 'ZIP']
    for c in cats:
        df[c] = df[c].astype('category')

    write_df(save, df, data_dir[1], COLLEGES_CONFIG.main_file)
    return df
示例#14
0
def get_road_safety_df(save=True):
    """Build the road safety dataframe and normalise missing values.

    Steps, in order: replace every "missing" marker by NaN, pass a list of
    columns through ``float_to_int``, convert string vehicle references to
    ``int`` and drop the columns that contain nothing but NaN.

    Parameters
    ----------
    save : bool, default True
        Forwarded unchanged as the first argument of ``write_df``.

    Returns
    -------
    The cleaned frame.
    """
    data_dir = fetch(ROAD_SAFETY_CONFIG)
    files = _get_file_paths(data_dir[0])
    df = _process_df(files)
    f_to_i = [
        '1st_Road_Number', '2nd_Road_Number', 'Location_Easting_OSGR',
        'Location_Northing_OSGR', 'Number_of_Vehicles',
        'Number_of_Casualties', 'Speed_limit', 'accyr',
        'Engine_Capacity_(CC)_df', 'Age_of_Vehicle_df',
    ]
    str_to_i = ['Vehicle_Reference', 'Vehicle_Reference_df',
                'Vehicle_Reference_df_res']
    # Markers treated as missing; strings are compared case-insensitively.
    to_del = ['data missing or out of range', 'none', -1, 'unknown or other',
              'not known', 'unclassified', 'unknown', 'nan']

    # The builtin ``object`` replaces the ``np.object`` alias, which no
    # longer exists in current NumPy releases.
    for c in df:
        tab = [
            np.nan
            if (isinstance(elt, str) and elt.lower() in to_del)
            or elt in to_del
            else elt
            for elt in df[c]
        ]
        df[c] = pd.Series(tab, dtype=object, index=df.index)

    for c in f_to_i:
        df[c] = float_to_int(df[c], df.index)

    for c in str_to_i:
        tab = [int(elt) if isinstance(elt, str) else elt for elt in df[c]]
        df[c] = pd.Series(tab, dtype=object, index=df.index)

    # Collect the all-NaN columns first and drop them in one call, rather
    # than mutating the frame while iterating over it. ``columns=`` is used
    # because recent pandas no longer accepts a positional ``axis``.
    empty_cols = []
    for c in df:
        uniques = df[c].unique()
        if len(uniques) == 1 and str(uniques[0]) == 'nan':
            empty_cols.append(c)
    df.drop(columns=empty_cols, inplace=True)

    write_df(save, df, data_dir[1], ROAD_SAFETY_CONFIG.main_file)
    return df
示例#15
0
def get_crime_df(save=True):
    """Load the crime CSV, convert code columns and categorise text columns.

    The column index is printed to stdout right after loading. Seven code
    columns are passed through ``float_to_int``; seven descriptive columns
    are cast to ``category`` (after ``float_to_int`` when their dtype is
    float).

    Parameters
    ----------
    save : bool, default True
        Forwarded unchanged as the first argument of ``write_df``.

    Returns
    -------
    pandas.DataFrame
        The converted frame.
    """
    # FIXME dead link :s
    locations = fetch(CRIME_DATA_CONFIG)
    source_dir = locations[0]
    crimes = pd.read_csv(os.path.join(source_dir, os.listdir(source_dir)[0]))

    code_columns = ('Victim Age', 'Premise Code', 'Weapon Used Code',
                    'Crime Code 1', 'Crime Code 2', 'Crime Code 3',
                    'Crime Code 4')
    category_columns = ('Area Name', 'Victim Sex', 'Victim Descent',
                        'Premise Description', 'Weapon Description',
                        'Status Description', 'Crime Code Description')

    print(crimes.columns)
    for name in code_columns:
        crimes[name] = float_to_int(crimes[name], crimes.index)

    for name in category_columns:
        column = crimes[name]
        if column.dtype == float:
            column = float_to_int(column, crimes.index)
            crimes[name] = column
        crimes[name] = crimes[name].astype('category')

    write_df(save, crimes, locations[1], CRIME_DATA_CONFIG.main_file)
    return crimes