def main(filepath, output_path):
    # validate the filepath arguments before reading any data
    assert os.path.exists(filepath), "The input file path does not exist!"
    assert filepath.endswith('.csv'), "The input file path must point to a csv file!"
    assert not output_path.endswith("/"), "The output path should not end with '/'!"
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    # read the data and clean the column names
    data = pd.read_csv(filepath)
    data = clean_names(data)

    # separate the features and the target
    X = data.drop(columns=['quality'])
    y = data[['quality']]

    # split the data and save the splits
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.20,
                                                        random_state=1)
    X_train.to_csv(output_path + "/X_train.csv", index=False)
    X_test.to_csv(output_path + "/X_test.csv", index=False)
    y_train.to_csv(output_path + "/y_train.csv", index=False)
    y_test.to_csv(output_path + "/y_test.csv", index=False)
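# Hypothetical invocation, matching the argument checks above; the file and
# directory names are illustrative assumptions, not from the original script:
# main("data/raw/wine_quality.csv", "data/processed")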
Example #2
def get_table(res):
    stocks = pd.read_html(res.text,
                          attrs={"class": "quotes"},
                          header=0,
                          thousands=",",
                          decimal='.')[0]
    stocks = janitor.clean_names(stocks)
    return stocks
Example #3
def first_column_to_names(tbl):
    """
    First column of pd table to column names and clean names. Return pd table
    """
    tbl = tbl.dropna(how='all')
    tbl.columns = list(range(len(tbl.columns)))
    tbl = tbl.pivot_table(values=1, columns=0, aggfunc='first')
    return janitor.clean_names(tbl)
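# Hypothetical usage sketch (the demo table below is an assumption for
# illustration): a two-column (name, value) table is pivoted into a single
# wide row whose cleaned column names come from the first column.
# demo = pd.DataFrame({"name": ["Open", "High", "Low"], "value": [10.0, 12.5, 9.8]})
# first_column_to_names(demo)   # -> one row with columns 'high', 'low', 'open'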
Example #4
def preprocess(path):
    """
    converts a .csv file and cleans its columns
    path: str
        path to the file
    """
    dataframe = read_csv(path)
    dataframe = clean_names(dataframe)
    return dataframe
Example #5
def test_clean_names_functional(dataframe):
    df = clean_names(dataframe)
    expected_columns = [
        "a",
        "bell_chart",
        "decorated_elephant",
        "animals@#$%^",
        "cities",
    ]
    assert set(df.columns) == set(expected_columns)
Example #6
def test_clean_names_strip_underscores_r(multiindex_dataframe):
    df = clean_names(multiindex_dataframe, strip_underscores='r')

    levels = [['a', 'bell_chart', 'decorated_elephant'],
              ['b', 'normal_distribution', 'r_i_p_rhino']]

    labels = [[1, 0, 2], [1, 0, 2]]

    expected_columns = pd.MultiIndex(levels=levels, labels=labels)
    assert set(df.columns) == set(expected_columns)
Example #7
def test_clean_names_preserve_case_true(multiindex_dataframe):
    df = multiindex_dataframe.rename(columns=lambda x: '_' + x)
    df = clean_names(multiindex_dataframe, preserve_case=True)

    levels = [['a', 'Bell_Chart', 'decorated_elephant'],
              ['b', 'Normal_Distribution', 'r_i_p_rhino_']]

    labels = [[1, 0, 2], [1, 0, 2]]

    expected_columns = pd.MultiIndex(levels=levels, labels=labels)
    assert set(df.columns) == set(expected_columns)
Example #8
def test_clean_names_strip_underscores_r(multiindex_dataframe):
    df = clean_names(multiindex_dataframe, strip_underscores="r")

    levels = [
        ["a", "bell_chart", "decorated_elephant"],
        ["b", "normal_distribution", "r_i_p_rhino"],
    ]

    labels = [[1, 0, 2], [1, 0, 2]]

    expected_columns = pd.MultiIndex(levels=levels, labels=labels)
    assert set(df.columns) == set(expected_columns)
Example #9
def test_clean_names_preserve_case_true(multiindex_dataframe):
    df = multiindex_dataframe.rename(columns=lambda x: "_" + x)
    df = clean_names(multiindex_dataframe, case_type="preserve")

    levels = [
        ["a", "Bell_Chart", "decorated_elephant"],
        ["b", "Normal_Distribution", "r_i_p_rhino_"],
    ]

    labels = [[1, 0, 2], [1, 0, 2]]

    expected_columns = pd.MultiIndex(levels=levels, labels=labels)
    assert set(df.columns) == set(expected_columns)
Example #10
def make_state_agency_dict2():
    tribe = "Tribe"
    in_df = read_csv('data/revenue_vehicle_condition.csv')
    in_df = clean_names(in_df)
    in_df = in_df.query("reporting_module == @tribe").reset_index()
    d1 = (zip(in_df['state'], in_df['agency_name']))

    res = defaultdict(set)
    for i, j in d1:
        res[i].add(j)

    for k, v in res.items():
        v.add(' All ')
        res[k] = sorted(v)

    res = OrderedDict(sorted(res.items()))
    res['All'] = [' All ']

    return res
Example #11
def clean_movies(df_movies: pd.DataFrame):
    # clean data
    # df_movies = pd.read_csv('../data/generated/df_movies.csv')

    print('Removing columns where more than 80% of the values are NaN...')
    print('Shape before cleaning: {}'.format(df_movies.shape))
    df_cleaned = df_movies.dropna(axis=1, how='all')  # drop fully empty columns
    print('Null count per column:', df_cleaned.isnull().sum())
    # keep only the columns whose NaN count is below 80% of the number of rows
    df_cleaned_two = df_cleaned.loc[:, df_cleaned.isnull().sum()
                                       < 0.8 * df_cleaned.shape[0]]
    print('Shape after cleaning is done: {}'.format(df_cleaned_two.shape))

    print('Columns before renaming: {}'.format(df_cleaned_two.columns))
    df = janitor.clean_names(df_cleaned_two)
    print('Columns after renaming: {}'.format(df.columns))
    df = df.fillna('missing')
    import re
    # extract the four-digit year from the original air date
    df['original_air_date'] = df['original_air_date'].apply(
        lambda x: re.findall(r'\d{4}', x))

    return df
Example #12
                                    axis=1)
    interim_fin_statements.append(df_fin_info_interim)

    # wait! : https://interactivebrokers.github.io/tws-api/fundamentals.html
    time.sleep(10)

# merge everything in one table
fin_statement_info_all = pd.concat(fin_statement_info, axis=0)
coamap_all = pd.concat(coamap, axis=0)
coamap_all = coamap_all.drop_duplicates(ignore_index=True)
coamap_all.set_axis(coamap_all.columns.str.replace(r'@|#', ''),
                    axis='columns',
                    inplace=True)
# annual financial statements
annual_fin_statements_all = pd.concat(annual_fin_statements, axis=0)
annual_fin_statements_all = janitor.clean_names(annual_fin_statements_all)
annual_fin_statements_all.set_axis(
    annual_fin_statements_all.columns.str.replace(r'@|#', ''),
    axis='columns',
    inplace=True)
# interim financial statements
interim_fin_statements_all = pd.concat(interim_fin_statements, axis=0)
interim_fin_statements_all = janitor.clean_names(interim_fin_statements_all)
interim_fin_statements_all.set_axis(
    interim_fin_statements_all.columns.str.replace(r'@|#', ''),
    axis='columns',
    inplace=True)

# add to database
write_to_db(fin_statement_info_all, "odvjet12_stocks", "fundaments_usa_info")
write_to_db(coamap_all, "odvjet12_stocks", "fundaments_usa_coamap")
Example #13
     "selDatum": today,
     "btnSave": "Show",
     "IsItPregledPovijest": "yes",
     "rbScope": "svi"
 }
 response = requests_retry_session().post(
     'https://zse.hr/default.aspx?id=26523', data=formData)
 tblByDate = pd.read_html(response.text,
                          attrs={'id': 'dnevna_trgovanja'},
                          thousands=".",
                          decimal=',')[0]
 if len(tblByDate.index) == 0:
     print("There are no trade data for ZSE.")
 else:
     # clean table
     tblByDate = janitor.clean_names(tblByDate)
     tblByDate = tblByDate.rename(columns={
         'ticker': 'symbol',
         'change_%': 'change'
     })
     tblByDate['change'] = tblByDate['change'].str.extract(r'(.\d+,\d+)')
     tblByDate['change'] = tblByDate['change'].str.replace(',', '.')
     tblByDate.loc[:, 'close':'turnover'] = tblByDate.loc[:, 'close':'turnover']\
         .applymap(lambda x: pd.to_numeric(x, errors='coerce'))
     scrapDate = pd.DataFrame([pd.Timestamp.today()] * len(tblByDate),
                              columns=['date'])
     tblByDate = pd.concat([
         scrapDate.reset_index(drop=True),
         tblByDate.reset_index(drop=True)
     ],
                           axis=1)
Example #14
#%%
# set up
import pandas as pd
from matplotlib import pyplot as plt
from janitor import clean_names
import numpy as np
import seaborn as sns
plt.style.use('seaborn-whitegrid')
#%%
# EDA
# set up
tt_path = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018/2018-10-16/recent-grads.csv"
recent_grad = pd.read_csv(tt_path)

recent_grad = clean_names(recent_grad)
recent_grad['major'] = recent_grad['major'].str.title()
recent_grad.head()
#%%
by_major_category = recent_grad \
    .assign(median_weighted = recent_grad['median'] * recent_grad['sample_size'])\
    .groupby('major_category')[['total', 'women', 'men', 'median_weighted', 'sample_size']].sum()\
    .assign(share_women = lambda x: x['women']/x['total'],
            median_weight = lambda x: x['median_weighted']/x['total'])

p1 = by_major_category.sort_values(by='total')[['women',
                                                'men']].plot(kind="barh",
                                                             stacked=True)
p1.set_xticks(np.arange(0, 1000000, 500000))
plt.xlabel("")
plt.suptitle("What is the most popular major category?", fontsize=10)
plt.title("Contribution by gender", fontsize=16)
Example #15
!pip install -q pyjanitor
import janitor

"""4. Import dataset and clean names of the dataset and Importing CSV files



---

Now we can import our dataset with the read_csv function from pandas, giving it the URL for our CSV file, and assigning the resulting dataframe to Vital_Statistics.raw. We can then overwrite the Vital_Statistics_raw with a version that has all the column names reformatted by using the clean_names function from janitor.

"""

Vital_Statistics_raw = pd.read_csv('https://drive.google.com/uc?export=download&id=1ByQvRMFCcL6ZtynM7SxeczmtkpaaMVgN')
Vital_Statistics_raw = janitor.clean_names(Vital_Statistics_raw)

"""5. Previewing dataframes



---

Now we have a dataframe of our pluto data called Vital_Statistics_raw. In python objects that you create like this dataframe have functions associated with them called "methods" and you can use these like dataframe_name.method_name()


"""

Vital_Statistics_raw.head()
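# Two other commonly used DataFrame methods, shown purely as an illustration;
# the exact output depends on the columns in the downloaded file:
Vital_Statistics_raw.info()      # column names, dtypes and non-null counts
Vital_Statistics_raw.describe()  # summary statistics for the numeric columns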

"""6. In addition to step-5
Example #16






########################################################################################################################
########################################################################################################################
########################################################################################################################
### Automatic Data Cleaning 

from janitor import clean_names, remove_empty

# clean_names standardizes the column names and remove_empty drops rows and
# columns that are entirely empty (a short sketch of the renaming behavior
# follows the three blocks below)
df_hospital_2 = clean_names(df_hospital)
df_hospital_2 = remove_empty(df_hospital_2)

df_inpatient_2 = clean_names(df_inpatient)
df_inpatient_2 = remove_empty(df_inpatient_2)

df_outpatient_2 = clean_names(df_outpatient)
df_outpatient_2 = remove_empty(df_outpatient_2)
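
# Minimal, self-contained sketch of what clean_names does to column labels
# (the demo frame is an illustration only, not part of the original workflow):
import pandas as pd
demo = pd.DataFrame(columns=["Hospital Name", "Provider ID"])
print(clean_names(demo).columns.tolist())   # ['hospital_name', 'provider_id']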



### Save the cleaned datasets for visualization: 

df_hospital_2.to_csv('/Users/hantswilliams/Dropbox/Biovirtua/Python_Projects/ahi/df_hospital_2.csv', index=False, encoding='utf-8-sig')
df_inpatient_2.to_csv('/Users/hantswilliams/Dropbox/Biovirtua/Python_Projects/ahi/df_inpatient_2.csv', index=False, encoding='utf-8-sig')
df_outpatient_2.to_csv('/Users/hantswilliams/Dropbox/Biovirtua/Python_Projects/ahi/df_outpatient_2.csv', index=False, encoding='utf-8-sig')
Example #17
filtered = (filtered.groupby(['species_other', 'lake']).agg({
    'values': 'sum'
}).rename({
    'values': 'total_production'
}, axis=1).reset_index())

tile_plot = (ggplot(
    filtered, aes(x='lake', y='species_other', fill='total_production')) +
             geom_tile() + labs(x='Lake', y='Species', fill='') +
             scale_fill_gradient2(low="white", high="darkblue") +
             theme(subplots_adjust={'right': 0.8}, figure_size=(12, 8)))

print(tile_plot)

stocked_clean = clean_names(stocked)
stocked_clean

stocked_clean['site'].value_counts()
stocked_clean['st_site'].value_counts()
stocked_clean['species'].value_counts()

hist_year = (ggplot(stocked_clean, aes(x='year')) + geom_histogram())

print(hist_year)

stocked_clean['stage'].value_counts()
stocked_clean['offset_length'] = stocked_clean['length'] + 1

hist_length = (ggplot(stocked_clean.loc[~stocked_clean['length'].isna()],
                      aes(x='offset_length')) + geom_histogram() +
Example #18
def test_clean_names_functional(dataframe):
    df = clean_names(dataframe)
    expected_columns = ['a', 'bell_chart', 'decorated_elephant']

    assert set(df.columns) == set(expected_columns)
Example #19
def test_incorrect_strip_underscores(multiindex_dataframe):
    with pytest.raises(JanitorError):
        df = clean_names(multiindex_dataframe,
                         strip_underscores="hello")  # noqa: E501, F841
Example #20
def test_incorrect_strip_underscores(multiindex_dataframe):
    with pytest.raises(janitor.errors.JanitorError):
        df = clean_names(multiindex_dataframe, strip_underscores="hello")
Example #21
def main(input, output):

    basicConfig(level=DEBUG)

    # Load data.

    info('Loading data')

    X = pd.read_excel(input)

    debug(f'Result: {X.shape}')

    # Rename columns.

    info('Renaming columns')

    X = jn.clean_names(X, strip_underscores='both')

    # Remove withdrawn patients.

    info('Removing withdrawn patients')

    X = X.query('diagnosis_status != "WITH"')

    debug(f'Result: {X.shape}')

    # Drop null visits.

    info('Dropping null visits')

    X = X.loc[X['visit_name'].notnull()]

    debug(f'Result: {X.shape}')

    # Change subject IDs to integers.

    info('Changing subject IDs to integers')

    X['subject_id'] = X['subject_id'].astype(int)

    # Map visits to visit IDs.

    info('Mapping visit names to IDs')

    X['visit_id'] = X['visit_name'].apply(VISITS.__getitem__)

    debug(f'Result: {X.shape}')

    # Calculate diagnoses.

    info('Calculating diagnoses')

    X = X.set_index(['subject_id', 'visit_id'])

    raw_diagnoses = pd.Series(np.where(X['diagnosis_status'] == 'COR',
                                       X['diagnosis'], X['cor_diagnosis']),
                              index=X.index,
                              name='diagnosis')

    extended_diagnoses = pd.Series(raw_diagnoses.apply(
        EXTENDED_DIAGNOSES.__getitem__),
                                   name='extended_diagnosis')

    base_diagnoses = pd.Series(raw_diagnoses.map(DIAGNOSES.__getitem__),
                               name='diagnosis')

    diagnoses = base_diagnoses.to_frame().join(extended_diagnoses.to_frame())

    debug(f'Result: {diagnoses.shape}')

    # Write output.

    info('Writing output')

    diagnoses.to_csv(output)
Example #22
import re
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import janitor
import pymysql

if __name__ == '__main__':
    # scrap stocks from: https://dev.to/ssbozy/python-requests-with-retries-4p03
    res = get_request("https://zse.hr/default.aspx?id=26486")
    stocks = pd.read_html(res.text,
                          attrs={"class": "dnevna_trgovanja"},
                          header=1,
                          parse_dates=['Listing date'],
                          thousands=".")[0]
    stocks = janitor.clean_names(stocks)

    # scrap additional info for every stock
    bs = BeautifulSoup(res.text, "html.parser")
    links = bs.find_all(href=True)
    links = [tag['href'] for tag in links]
    links = [link for link in links if re.search(r"^\?id.*dionica=.*", link)]
    links = ["https://zse.hr/default.aspx" + link for link in links]
    stockDetails = []
    for link in links:
        print(link)
        r = get_request(link)
        bs = BeautifulSoup(r.text, 'html.parser')
        sector_code = bs.select('span.toolTip a')[0].get_text()
        sector_code = pd.DataFrame({'nacerev': [sector_code]})
        try:
    'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
    'KitchenQual', 'TotRmsAbvGrd', 'Fireplaces', 'FireplaceQu', 'WoodDeckSF',
    'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'YrSold',
    'SaleType', 'SaleCondition', 'Electrical', "HeatingQC", "Fireplaces",
    "FireplaceQu", "BsmtQual", "BsmtFinType1", "BsmtFinType2", 'LotFrontage',
    'LotArea', 'GarageCars', 'OverallCond', 'SalePrice'
]

# + pycharm={"is_executing": false, "name": "#%%\n"}
# Index will be the ID of the house sale
initial_df = pd.read_csv(filepath_or_buffer=file_path,
                         usecols=import_list,
                         index_col=0)

# convert column names to lowercase and replace spaces with underscores
cleaned_df = jn.clean_names(initial_df)

# + pycharm={"is_executing": false, "name": "#%%\n"}
cleaned_df.sample(5)

# remove the two outliers identified in the Pre_process draft
# (Cook's distance greater than 3 times the average; a sketch of that
# screen follows the drop below); this step is performed prior to imputation
cleaned_df = cleaned_df.drop([1299, 524], axis="rows")
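# Sketch of how such a Cook's distance screen might be computed (assumptions:
# `X_num` is a fully numeric, already-imputed feature matrix and `y` the sale
# price; the actual Pre_process draft is not shown here):
# import statsmodels.api as sm
# ols_fit = sm.OLS(y, sm.add_constant(X_num)).fit()
# cooks_d = ols_fit.get_influence().cooks_distance[0]
# outlier_rows = X_num.index[cooks_d > 3 * cooks_d.mean()]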

# + pycharm={"is_executing": false, "name": "#%%\n"}
# cap basement sqft at 3000 to limit the effect of outliers: a few houses have
# very large basements that are not reflected in a higher sale price
cleaned_df["totalbsmtsf"] = cleaned_df["totalbsmtsf"].apply(lambda x: 3000
                                                            if x > 3000 else x)