def main(filepath, output_path):
    # validate the format and existence of the filepath arguments before reading
    assert os.path.exists(filepath), "The input file path does not exist!"
    assert filepath[-4:] == '.csv', "The input file path must point to a csv file!"
    assert output_path[-1:] != "/", "The output path should not end with '/'!"
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    # read the data and clean the column names
    data = pd.read_csv(filepath)
    data = clean_names(data)

    X = data.drop(columns=['quality'])
    y = data[['quality']]

    # split the data and save the splits
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=1)
    X_train.to_csv(output_path + "/X_train.csv", index=False)
    X_test.to_csv(output_path + "/X_test.csv", index=False)
    y_train.to_csv(output_path + "/y_train.csv", index=False)
    y_test.to_csv(output_path + "/y_test.csv", index=False)
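# A hypothetical invocation of main() above; the paths are invented for
# illustration. The input must be an existing .csv and the output directory
# path must not end with '/'.
main("data/raw/wine_quality.csv", "data/processed")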
def get_table(res):
    stocks = pd.read_html(res.text,
                          attrs={"class": "quotes"},
                          header=0,
                          thousands=",",
                          decimal='.')[0]
    stocks = janitor.clean_names(stocks)
    return stocks
def first_column_to_names(tbl):
    """First column of pd table to column names and clean names. Return pd table."""
    tbl = tbl.dropna(how='all')
    tbl.columns = list(range(len(tbl.columns)))
    tbl = tbl.pivot_table(values=1, columns=0, aggfunc='first')
    return janitor.clean_names(tbl)
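# A minimal usage sketch of first_column_to_names, with a toy two-column table
# invented for illustration: column 0 carries the future header names.
import pandas as pd
import janitor

raw = pd.DataFrame({0: ['Full Name', 'Birth Year'], 1: ['Ada Lovelace', '1815']})
named = first_column_to_names(raw)
print(named.columns.tolist())  # ['birth_year', 'full_name'] after clean_names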
def preprocess(path):
    """Read a .csv file and clean its column names.

    path: str
        Path to the file.
    """
    dataframe = read_csv(path)
    dataframe = clean_names(dataframe)
    return dataframe
def test_clean_names_functional(dataframe):
    df = clean_names(dataframe)
    expected_columns = [
        "a",
        "bell_chart",
        "decorated_elephant",
        "animals@#$%^",
        "cities",
    ]
    assert set(df.columns) == set(expected_columns)
def test_clean_names_strip_underscores_r(multiindex_dataframe):
    df = clean_names(multiindex_dataframe, strip_underscores='r')
    levels = [['a', 'bell_chart', 'decorated_elephant'],
              ['b', 'normal_distribution', 'r_i_p_rhino']]
    codes = [[1, 0, 2], [1, 0, 2]]  # `labels=` was renamed to `codes=` in pandas 0.24
    expected_columns = pd.MultiIndex(levels=levels, codes=codes)
    assert set(df.columns) == set(expected_columns)
def test_clean_names_preserve_case_true(multiindex_dataframe):
    df = clean_names(multiindex_dataframe, preserve_case=True)
    levels = [['a', 'Bell_Chart', 'decorated_elephant'],
              ['b', 'Normal_Distribution', 'r_i_p_rhino_']]
    codes = [[1, 0, 2], [1, 0, 2]]  # `labels=` was renamed to `codes=` in pandas 0.24
    expected_columns = pd.MultiIndex(levels=levels, codes=codes)
    assert set(df.columns) == set(expected_columns)
def test_clean_names_strip_underscores_r(multiindex_dataframe):
    df = clean_names(multiindex_dataframe, strip_underscores="r")
    levels = [
        ["a", "bell_chart", "decorated_elephant"],
        ["b", "normal_distribution", "r_i_p_rhino"],
    ]
    codes = [[1, 0, 2], [1, 0, 2]]  # `labels=` was renamed to `codes=` in pandas 0.24
    expected_columns = pd.MultiIndex(levels=levels, codes=codes)
    assert set(df.columns) == set(expected_columns)
def test_clean_names_preserve_case_true(multiindex_dataframe):
    df = clean_names(multiindex_dataframe, case_type="preserve")
    levels = [
        ["a", "Bell_Chart", "decorated_elephant"],
        ["b", "Normal_Distribution", "r_i_p_rhino_"],
    ]
    codes = [[1, 0, 2], [1, 0, 2]]  # `labels=` was renamed to `codes=` in pandas 0.24
    expected_columns = pd.MultiIndex(levels=levels, codes=codes)
    assert set(df.columns) == set(expected_columns)
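# Taken together, these MultiIndex tests pin down the clean_names keyword
# semantics. A minimal flat-dataframe sketch of the same options; the column
# names are invented for illustration, and .copy() guards against in-place
# mutation across calls.
import pandas as pd
from janitor import clean_names

df = pd.DataFrame([[1, 2]], columns=['Bell Chart_', '_Decorated Elephant_'])
print(clean_names(df.copy()).columns.tolist())                         # ['bell_chart_', '_decorated_elephant_']
print(clean_names(df.copy(), strip_underscores='r').columns.tolist())  # trailing underscores stripped
print(clean_names(df.copy(), case_type='preserve').columns.tolist())   # original casing kept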
def make_state_agency_dict2():
    tribe = "Tribe"
    in_df = read_csv('data/revenue_vehicle_condition.csv')
    in_df = clean_names(in_df)
    in_df = in_df.query("reporting_module == @tribe").reset_index()
    d1 = zip(in_df['state'], in_df['agency_name'])
    res = defaultdict(set)
    for i, j in d1:
        res[i].add(j)
    for k, v in res.items():
        v.add(' All ')
        res[k] = sorted(v)
    res = OrderedDict(sorted(res.items()))
    res['All'] = [' All ']
    return res
def clean_movies(df_movies: pd.DataFrame):
    # clean data
    # df_movies = pd.read_csv('../data/generated/df_movies.csv')
    print('Removing columns with more than 80% NaNs..')
    print('Shape before cleaning: {}'.format(df_movies.shape))
    df_cleaned = df_movies.dropna(axis=1, how='all')
    print('Sum of isNull for all columns: ', df_cleaned.isnull().sum())
    # keep only the columns whose NaN count is below 80% of the row count
    df_cleaned_two = df_cleaned.loc[:, df_cleaned.isnull().sum() < 0.8 * df_cleaned.shape[0]]
    print('Shape after cleaning is done: {}'.format(df_cleaned_two.shape))
    print('Columns before renaming: {}'.format(df_cleaned_two.columns))
    df = janitor.clean_names(df_cleaned_two)
    print('Columns after renaming: {}'.format(df.columns))
    df = df.fillna('missing')
    # extract the four-digit year from the air date
    import re
    df['original_air_date'] = df['original_air_date'].apply(
        lambda x: re.findall(r'\d{4}', x))
    return df
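# A small sketch of the 80% NaN column filter used in clean_movies, on a toy
# frame invented for illustration: a column survives only if its NaN count is
# strictly below 80% of the row count.
import numpy as np
import pandas as pd

toy = pd.DataFrame({'mostly_there': [1, 2, 3, 4, np.nan],
                    'mostly_nan': [np.nan, np.nan, np.nan, np.nan, 5]})
mask = toy.isnull().sum() < 0.8 * toy.shape[0]  # 1 < 4.0 is True; 4 < 4.0 is False
print(toy.loc[:, mask].columns.tolist())        # ['mostly_there']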
        axis=1)
    interim_fin_statements.append(df_fin_info_interim)
    # wait between requests: https://interactivebrokers.github.io/tws-api/fundamentals.html
    time.sleep(10)

# merge everything into one table
fin_statement_info_all = pd.concat(fin_statement_info, axis=0)
coamap_all = pd.concat(coamap, axis=0)
coamap_all = coamap_all.drop_duplicates(ignore_index=True)
coamap_all.columns = coamap_all.columns.str.replace(r'@|#', '', regex=True)

# annual financial statements
annual_fin_statements_all = pd.concat(annual_fin_statements, axis=0)
annual_fin_statements_all = janitor.clean_names(annual_fin_statements_all)
annual_fin_statements_all.columns = \
    annual_fin_statements_all.columns.str.replace(r'@|#', '', regex=True)

# interim financial statements
interim_fin_statements_all = pd.concat(interim_fin_statements, axis=0)
interim_fin_statements_all = janitor.clean_names(interim_fin_statements_all)
interim_fin_statements_all.columns = \
    interim_fin_statements_all.columns.str.replace(r'@|#', '', regex=True)

# add to database
write_to_db(fin_statement_info_all, "odvjet12_stocks", "fundaments_usa_info")
write_to_db(coamap_all, "odvjet12_stocks", "fundaments_usa_coamap")
"selDatum": today, "btnSave": "Show", "IsItPregledPovijest": "yes", "rbScope": "svi" } response = requests_retry_session().post( 'https://zse.hr/default.aspx?id=26523', data=formData) tblByDate = pd.read_html(response.text, attrs={'id': 'dnevna_trgovanja'}, thousands=".", decimal=',')[0] if len(tblByDate.index) == 0: print("There are no trade data for ZSE.") else: # clean table tblByDate = janitor.clean_names(tblByDate) tblByDate = tblByDate.rename(columns={ 'ticker': 'symbol', 'change_%': 'change' }) tblByDate['change'] = tblByDate['change'].str.extract('(.\d+,\d+)') tblByDate['change'] = tblByDate['change'].str.replace(',', '.') tblByDate.loc[:, 'close':'turnover'] = tblByDate.loc[:, 'close':'turnover']\ .applymap(lambda x: pd.to_numeric(x, errors='coerce')) scrapDate = pd.DataFrame([pd.Timestamp.today()] * len(tblByDate), columns=['date']) tblByDate = pd.concat([ scrapDate.reset_index(drop=True), tblByDate.reset_index(drop=True) ], axis=1)
#%%
# set up
import pandas as pd
from matplotlib import pyplot as plt
from janitor import clean_names
import numpy as np
import seaborn as sns

plt.style.use('seaborn-whitegrid')

#%%
# EDA
tt_path = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018/2018-10-16/recent-grads.csv"
recent_grad = pd.read_csv(tt_path)
recent_grad = clean_names(recent_grad)
recent_grad['major'] = recent_grad['major'].str.title()
recent_grad.head()

#%%
by_major_category = recent_grad \
    .assign(median_weighted=recent_grad['median'] * recent_grad['sample_size']) \
    .groupby('major_category')[['total', 'women', 'men', 'median_weighted', 'sample_size']].sum() \
    .assign(share_women=lambda x: x['women'] / x['total'],
            median_weight=lambda x: x['median_weighted'] / x['total'])

p1 = by_major_category.sort_values(by='total')[['women', 'men']] \
    .plot(kind="barh", stacked=True)
p1.set_xticks(np.arange(0, 1000000, 500000))
plt.xlabel("")
plt.suptitle("What is the most popular major category?", fontsize=10)
plt.title("Contribution by gender", fontsize=16)
!pip install -q pyjanitor
import janitor

"""4. Importing the dataset and cleaning its column names

---

Now we can import our dataset with the read_csv function from pandas, giving it the URL for our CSV file, and assigning the resulting dataframe to Vital_Statistics_raw. We can then overwrite Vital_Statistics_raw with a version that has all the column names reformatted by using the clean_names function from janitor.
"""

Vital_Statistics_raw = pd.read_csv('https://drive.google.com/uc?export=download&id=1ByQvRMFCcL6ZtynM7SxeczmtkpaaMVgN')
Vital_Statistics_raw = janitor.clean_names(Vital_Statistics_raw)

"""5. Previewing dataframes

---

Now we have a dataframe of our vital statistics data called Vital_Statistics_raw. In Python, objects that you create, like this dataframe, have functions associated with them called "methods", and you can use these like dataframe_name.method_name().
"""

Vital_Statistics_raw.head()

"""6. In addition to step-5
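# A small before/after sketch of what janitor.clean_names does to column
# labels; the messy names below are invented for illustration.
import pandas as pd
import janitor

messy = pd.DataFrame([[1, 2]], columns=['Birth Year', 'County of Residence'])
print(messy.columns.tolist())                       # ['Birth Year', 'County of Residence']
print(janitor.clean_names(messy).columns.tolist())  # ['birth_year', 'county_of_residence']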
########################################################################################################################
########################################################################################################################
########################################################################################################################
### Automatic Data Cleaning

from janitor import clean_names, remove_empty

# clean_names standardizes the column names; remove_empty drops rows and
# columns that are entirely empty
df_hospital_2 = clean_names(df_hospital)
df_hospital_2 = remove_empty(df_hospital_2)

df_inpatient_2 = clean_names(df_inpatient)
df_inpatient_2 = remove_empty(df_inpatient_2)

df_outpatient_2 = clean_names(df_outpatient)
df_outpatient_2 = remove_empty(df_outpatient_2)

### Save the cleaned datasets for visualization:
df_hospital_2.to_csv('/Users/hantswilliams/Dropbox/Biovirtua/Python_Projects/ahi/df_hospital_2.csv', index=False, encoding='utf-8-sig')
df_inpatient_2.to_csv('/Users/hantswilliams/Dropbox/Biovirtua/Python_Projects/ahi/df_inpatient_2.csv', index=False, encoding='utf-8-sig')
df_outpatient_2.to_csv('/Users/hantswilliams/Dropbox/Biovirtua/Python_Projects/ahi/df_outpatient_2.csv', index=False, encoding='utf-8-sig')
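# A minimal sketch of remove_empty on a toy frame invented for illustration:
# the all-NaN row and the all-NaN column are both dropped.
import numpy as np
import pandas as pd
from janitor import remove_empty

toy = pd.DataFrame({'a': [1.0, np.nan, 3.0],
                    'all_nan': [np.nan, np.nan, np.nan]})
print(remove_empty(toy).shape)  # (2, 1): row 1 and column 'all_nan' removed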
filtered = (filtered
            .groupby(['species_other', 'lake'])
            .agg({'values': 'sum'})
            .rename({'values': 'total_production'}, axis=1)
            .reset_index())

tile_plot = (ggplot(filtered,
                    aes(x='lake', y='species_other', fill='total_production'))
             + geom_tile()
             + labs(x='Lake', y='Species', fill='')
             + scale_fill_gradient2(low="white", high="darkblue")
             + theme(subplots_adjust={'right': 0.8}, figure_size=(12, 8)))
print(tile_plot)

stocked_clean = clean_names(stocked)
stocked_clean

stocked_clean['site'].value_counts()
stocked_clean['st_site'].value_counts()
stocked_clean['species'].value_counts()

hist_year = (ggplot(stocked_clean, aes(x='year')) + geom_histogram())
print(hist_year)

stocked_clean['stage'].value_counts()

stocked_clean['offset_length'] = stocked_clean['length'] + 1
hist_length = (ggplot(stocked_clean.loc[~stocked_clean['length'].isna()],
                      aes(x='offset_length'))
               + geom_histogram() +
def test_clean_names_functional(dataframe):
    df = clean_names(dataframe)
    expected_columns = ['a', 'bell_chart', 'decorated_elephant']
    assert set(df.columns) == set(expected_columns)
def test_incorrect_strip_underscores(multiindex_dataframe):
    with pytest.raises(JanitorError):
        df = clean_names(multiindex_dataframe, strip_underscores="hello")  # noqa: E501, F841
def test_incorrect_strip_underscores(multiindex_dataframe):
    with pytest.raises(janitor.errors.JanitorError):
        df = clean_names(multiindex_dataframe, strip_underscores="hello")
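# A quick sketch of the failure mode both tests pin down: an unrecognized
# strip_underscores value raises JanitorError. The toy dataframe is invented
# for illustration.
import pandas as pd
import janitor
from janitor.errors import JanitorError

df = pd.DataFrame([[1]], columns=['a'])
try:
    janitor.clean_names(df, strip_underscores="hello")
except JanitorError as err:
    print(err)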
def main(input, output):
    basicConfig(level=DEBUG)

    # Load data.
    info('Loading data')
    X = pd.read_excel(input)
    debug(f'Result: {X.shape}')

    # Rename columns.
    info('Renaming columns')
    X = jn.clean_names(X, strip_underscores='both')

    # Remove withdrawn patients.
    info('Removing withdrawn patients')
    X = X.query('diagnosis_status != "WITH"')
    debug(f'Result: {X.shape}')

    # Drop null visits.
    info('Dropping null visits')
    X = X.loc[X['visit_name'].notnull()]
    debug(f'Result: {X.shape}')

    # Change subject IDs to integers.
    info('Changing subject IDs to integers')
    X['subject_id'] = X['subject_id'].astype(int)

    # Map visits to visit IDs.
    info('Mapping visit names to IDs')
    X['visit_id'] = X['visit_name'].apply(VISITS.__getitem__)
    debug(f'Result: {X.shape}')

    # Calculate diagnoses.
    info('Calculating diagnoses')
    X = X.set_index(['subject_id', 'visit_id'])
    raw_diagnoses = pd.Series(np.where(X['diagnosis_status'] == 'COR',
                                       X['diagnosis'], X['cor_diagnosis']),
                              index=X.index, name='diagnosis')
    extended_diagnoses = pd.Series(
        raw_diagnoses.apply(EXTENDED_DIAGNOSES.__getitem__),
        name='extended_diagnosis')
    base_diagnoses = pd.Series(raw_diagnoses.map(DIAGNOSES.__getitem__),
                               name='diagnosis')
    diagnoses = base_diagnoses.to_frame().join(extended_diagnoses.to_frame())
    debug(f'Result: {diagnoses.shape}')

    # Write output.
    info('Writing output')
    diagnoses.to_csv(output)
import re

from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import janitor
import pymysql

if __name__ == '__main__':
    # scrape stocks; retrying requests after: https://dev.to/ssbozy/python-requests-with-retries-4p03
    res = get_request("https://zse.hr/default.aspx?id=26486")
    stocks = pd.read_html(res.text,
                          attrs={"class": "dnevna_trgovanja"},
                          header=1,
                          parse_dates=['Listing date'],
                          thousands=".")[0]
    stocks = janitor.clean_names(stocks)

    # scrape additional info for every stock
    bs = BeautifulSoup(res.text, "html.parser")
    links = bs.find_all(href=True)
    links = [tag['href'] for tag in links]
    links = [link for link in links if re.search(r"^\?id.*dionica=.*", link)]
    links = ["https://zse.hr/default.aspx" + link for link in links]
    stockDetails = []
    for link in links:
        print(link)
        r = get_request(link)
        bs = BeautifulSoup(r.text, 'html.parser')
        sector_code = bs.select('span.toolTip a')[0].get_text()
        sector_code = pd.DataFrame({'nacerev': [sector_code]})
        try:
    'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
    'KitchenQual', 'TotRmsAbvGrd', 'Fireplaces', 'FireplaceQu', 'WoodDeckSF',
    'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'YrSold',
    'SaleType', 'SaleCondition', 'Electrical', 'HeatingQC', 'BsmtQual',
    'BsmtFinType1', 'BsmtFinType2', 'LotFrontage', 'LotArea', 'GarageCars',
    'OverallCond', 'SalePrice'
]

# + pycharm={"is_executing": false, "name": "#%%\n"}
# Index will be the ID of the house sale
initial_df = pd.read_csv(filepath_or_buffer=file_path,
                         usecols=import_list,
                         index_col=0)

# convert column names to lowercase and replace spaces with underscores
cleaned_df = jn.clean_names(initial_df)

# + pycharm={"is_executing": false, "name": "#%%\n"}
cleaned_df.sample(5)

# remove the outliers flagged in the Pre_process draft (Cook's distance greater
# than 3 times the mean absolute average); this step is performed prior to the
# imputation steps
cleaned_df = cleaned_df.drop([1299, 524], axis="rows")

# + pycharm={"is_executing": false, "name": "#%%\n"}
# cap basement sqft at 3000 to limit the effect of outliers that have high
# basement sf but no corresponding increase in price
cleaned_df["totalbsmtsf"] = cleaned_df["totalbsmtsf"].apply(lambda x: 3000 if x > 3000 else x)
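# The capping lambda above has a built-in pandas equivalent; a one-line sketch
# using Series.clip, assuming the same cleaned_df.
cleaned_df["totalbsmtsf"] = cleaned_df["totalbsmtsf"].clip(upper=3000)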