def get_titanic(col_names=None, onehot=False, only_n_rows=None, seed=None, original=False):
    """Load the Titanic training CSV and return it via fn.transform_dataset.

    :param col_names: optional list of columns to keep (after dropping id-like columns)
    :param onehot: if True, one-hot encode all columns (Survived rendered as 'True'/'False')
    :param only_n_rows: if set and smaller than the frame, subsample that many rows
    :param seed: random_state for the subsampling
    :param original: if True, return the raw CSV frame without any cleaning
    :return: whatever fn.transform_dataset returns for the cleaned frame
    """
    path = os.path.dirname(os.path.realpath(__file__)) + "/../../_data/titanic/train.csv"
    df = pd.read_csv(path)
    if original:
        return df
    # Drop identifier-like columns that carry no generalizable signal.
    df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"], inplace=True)
    if col_names is not None:
        df = df[col_names]
    # Fill missing values. Assign the result instead of chained
    # `df["col"].fillna(..., inplace=True)`, which acts on a temporary and is
    # deprecated (a no-op under pandas copy-on-write).
    df["Age"] = df["Age"].fillna(int(df["Age"].mean()))
    df["Embarked"] = df["Embarked"].fillna("S")
    if only_n_rows and only_n_rows < len(df):
        df = df.sample(only_n_rows, random_state=seed)
    if onehot:
        # Cast to str so get_dummies also expands the target column.
        df['Survived'] = df['Survived'].astype(bool).astype(str)
        df = pd.get_dummies(df)
    return fn.transform_dataset(df)
def get_play_store(one_hot=True):
    """Load the Google Play Store dataset, discretize its numeric columns and
    return a one-hot encoded version.

    :param one_hot: unused; kept for backward compatibility of the signature
    :return: tuple (onehot_df, value_dict_onehot, parametric_types_onehot)
    """
    path = os.path.dirname(os.path.realpath(__file__)) + "/../../_data/play_store/googleplaystore.csv"
    df = pd.read_table(path, sep=',')
    # Filter a known corrupted row (Category parsed as "1.9").
    df = df[~(df.Category == "1.9")]

    # Feature engineering: parse "$x.yz" price strings to float.
    def _parse(x):
        try:
            f = float(x.split('$')[-1])
        except ValueError:
            f = np.nan  # np.NaN alias was removed in NumPy 2.0
        return f

    df.Price = df.Price.apply(_parse)
    df.Price = pd.cut(df.Price, [-np.inf, 0, 1, 2, 3, 5, 10, 20, 50, np.inf],
                      labels=[
                          '0', '0-1', '1-2', '2-3', '3-5', '5-10', '10-20', '20-50', '50+'
                      ]).astype(str)
    # Keep only the 25 most frequent genres.
    df = df[df.Genres.isin(df.Genres.value_counts()[:25].index)]
    df.Reviews = pd.cut(df['Reviews'].astype(float),
                        [-np.inf, 0, 10, 100, 1000, 1e4, 1e5, 1e6, 1e7, 1e8],
                        labels=[
                            '0', '1+', '10+', '100+', '1,000+', '10,000+', '100,000+',
                            '1,000,000+', '10,000,000+'
                        ],
                        retbins=False, include_lowest=False).astype(str)
    df.Rating = pd.cut(df['Rating'].astype(float), [1, 2, 3, 4, 5],
                       labels=['1-2', '2-3', '3-4', '4-5'],
                       retbins=False, include_lowest=False).astype(str)
    # 'Genres' is largely identical with Category, so it is excluded.
    cols = [
        'Category', 'Price', 'Rating', 'Content Rating', 'Reviews', 'Installs'
    ]
    df = df[cols].dropna(axis=1)
    # One-hot encode the selected columns.
    onehot = pd.get_dummies(df[cols], prefix=cols, prefix_sep=': ')
    parametric_types_onehot = get_feature_types_from_dataset(onehot)
    # Only the value dict from the transform is needed for the return value.
    _, value_dict_onehot, _ = fn.transform_dataset(
        onehot, ['discrete'] * len(onehot.columns))
    return onehot, value_dict_onehot, parametric_types_onehot
def test_value_dict():
    """Smoke test: transform selected Titanic columns and print the resulting
    table, value dict and parametric types."""
    import os
    import pandas as pd
    from util import io

    path = os.path.dirname(os.path.realpath(__file__)) + "/../../_data/titanic/train.csv"
    frame = pd.read_csv(path)[["Survived", "Sex", "Age", "Fare", "Pclass"]]
    frame, val_dict, param_types = fn.transform_dataset(frame)
    io.print_pretty_table(frame)
    print(val_dict)
    print(param_types)
def get_rki_ed_2():
    """Aggregate the RKI emergency-department data per admission day.

    For every admission date, counts and means of the recorded vital signs
    (temperature, heart rate, breathing rate, systolic blood pressure) are
    computed together with weekday and month; days missing any vital type
    entirely are skipped.
    """
    df = get_rki_ed_data(column_names=["aufnahmezeitpunkt_datum", "leitsymptom",
                                       "leitsymptom_gruppe", "vitalwerte",
                                       "diagnosen"])  # , file_names=["epias_of_rki.2018-11.300000.json"]
    df["aufnahmezeitpunkt_wochentag"] = df["aufnahmezeitpunkt_datum"].apply(
        lambda s: datetime.datetime.strptime(s, '%Y-%m-%d').weekday())
    df["aufnahmezeitpunkt_monat"] = df["aufnahmezeitpunkt_datum"].apply(
        lambda s: datetime.datetime.strptime(s, '%Y-%m-%d').month)

    rows = []
    for day in df["aufnahmezeitpunkt_datum"].unique():
        day_df = df[df["aufnahmezeitpunkt_datum"] == day]
        if len(day_df) == 0:
            continue
        weekday = day_df["aufnahmezeitpunkt_wochentag"].iloc[0]
        month = day_df["aufnahmezeitpunkt_monat"].iloc[0]
        # Collect every occurrence of each vital-sign key for the day.
        collected = {'temperatur': [], 'herzfrequenz': [],
                     'atemfrequenz': [], 'blutdruck_systolisch': []}
        for vitals in day_df["vitalwerte"]:
            for vital in vitals:
                for key, bucket in collected.items():
                    if key in vital:
                        bucket.append(vital[key])
        temps = np.array(collected['temperatur'], dtype=np.float64)
        heart_rates = np.array(collected['herzfrequenz'], dtype=np.float64)
        breath_rates = np.array(collected['atemfrequenz'], dtype=np.float64)
        blood_pressures = np.array(collected['blutdruck_systolisch'], dtype=np.float64)
        # Skip days for which any vital type is entirely absent.
        if min(len(temps), len(heart_rates), len(breath_rates), len(blood_pressures)) == 0:
            continue
        rows.append([weekday, month,
                     len(temps), len(heart_rates), len(breath_rates), len(blood_pressures),
                     np.mean(temps), np.mean(heart_rates),
                     np.mean(breath_rates), np.mean(blood_pressures)])

    result = pd.DataFrame(rows, columns=["wochentag", "monat",
                                         "count_temperatur", "count_herzfrequenz",
                                         "count_atemfrequenz", "count_blutdruck",
                                         "avg_temperatur", "avg_herzfrequenz",
                                         "avg_atemfrequenz", "avg_blutdruck"])
    return fn.transform_dataset(result,
                                feature_types=["discrete", "discrete", "numeric",
                                               "numeric", "numeric", "numeric",
                                               "numeric", "numeric", "numeric",
                                               "numeric"])
def get_rki_ed_3():
    """Load RKI ED admissions with weekday/month columns derived from the
    admission date, dropping the raw date afterwards."""
    df = get_rki_ed_data(column_names=["aufnahmezeitpunkt_datum",
                                       "aufnahmezeitpunkt_stunde",
                                       "behandlung_fachabteilung", "geschlecht",
                                       "altersklasse", "zuweisungsart"],
                         file_names=["epias_of_rki.2018-11.300000.json"])

    day_names = ["Mon", "Tue", "Wed", "Thur", "Fri", "Sat", "Sun"]

    def weekday(date_str):
        # Map an ISO date string to its abbreviated weekday name
        # (weekday() always yields 0..6, indexing is safe).
        return day_names[datetime.datetime.strptime(date_str, '%Y-%m-%d').weekday()]

    df["aufnahmezeitpunkt_wochentag"] = df["aufnahmezeitpunkt_datum"].apply(weekday)
    df["aufnahmezeitpunkt_monat"] = df["aufnahmezeitpunkt_datum"].apply(
        lambda date_str: datetime.datetime.strptime(date_str, '%Y-%m-%d').month)
    df.drop(columns=["aufnahmezeitpunkt_datum"], inplace=True)
    return fn.transform_dataset(df)
def mini_titanic():
    """Build a tiny 10-row Titanic excerpt, print it as LaTeX and return the
    transformed dataset."""
    passenger_ids = [356, 255, 380, 859, 886, 248, 598, 372, 574, 820]
    survived = [True, True, True, False, False, True, False, False, False, True]
    embarked = ['Southampton', 'Cherbourg', 'Cherbourg', 'Cherbourg', 'Southampton',
                'Southampton', 'Cherbourg', 'Southampton', 'Southampton', 'Southampton']
    sex = ['female', 'female', 'female', 'male', 'male',
           'male', 'male', 'male', 'male', 'female']
    columns = ['Survived', 'Embarked', 'Sex']
    small = pd.DataFrame({'Survived': dict(zip(passenger_ids, survived)),
                          'Embarked': dict(zip(passenger_ids, embarked)),
                          'Sex': dict(zip(passenger_ids, sex))},
                         columns=columns)
    small = small.sort_values(['Survived', 'Sex', 'Embarked'])
    print(small.to_latex(index=False))
    return fn.transform_dataset(small)
def get_adult_41_items(onehot=False, only_n_rows=None, seed=None):
    '''
    UCI adult dataset, cleaned and in transactional form.

    The age was discretized; numeric columns (except age) were removed.
    The purpose of this test is to assure that the algorithm can deal with a
    small 2.2 MB (30k rows) data set reasonably efficiently.
    https://raw.githubusercontent.com/tommyod/Efficient-Apriori/master/efficient_apriori/tests/adult_data_cleaned.txt

    :param onehot: one-hot encode all columns if True
    :param only_n_rows: optionally subsample to this many rows
    :param seed: random_state for the subsampling
    :return: result of fn.transform_dataset on the (possibly encoded) frame
    '''
    path = os.path.dirname(os.path.realpath(__file__)) + "/../../_data/adult/adult_data_transactions.data"
    columns = [
        'education', 'marital-status', 'relationship', 'race', 'sex', 'income',
        'age'
    ]
    df = pd.read_table(path, sep=',', names=columns, skipinitialspace=True)
    if onehot:
        # Use builtin bool: the np.bool alias was removed in NumPy 1.24.
        df = pd.get_dummies(df.astype(str), prefix=None, prefix_sep='_', dtype=bool)
    if only_n_rows and only_n_rows < len(df):
        df = df.sample(only_n_rows, random_state=seed)
    return fn.transform_dataset(df)
def get_titanic_bins(col_names=None, onehot=False, only_n_rows=None, seed=None):
    """Titanic data with engineered, fully categorical columns.

    Family size is bucketed ('0', '1', '2', '3+'), embarkation ports are
    spelled out, and Age is discretized into
    child/young-adult/middle-aged/old/Unknown.

    :param col_names: optional column subset applied after dropping id columns
    :param onehot: one-hot encode everything (Survived rendered as str first)
    :param only_n_rows: optional subsample size
    :param seed: random_state for the sampling
    :return: result of fn.transform_dataset on the prepared frame
    """
    path = os.path.dirname(os.path.realpath(__file__)) + "/../../_data/titanic/train.csv"
    df = pd.read_csv(path)
    # Bucket family size; build the string column in one step instead of
    # assigning '3+' into an int column (dtype-mixing is deprecated in pandas).
    num_family = (df.SibSp + df.Parch).astype(int)
    df['NumFamily'] = np.where(num_family >= 3, '3+', num_family.astype(str))
    df.Embarked.replace(
        {
            'C': 'Cherbourg',
            'Q': 'Queenstown',
            'S': 'Southampton'
        }, inplace=True)
    # Assign instead of chained inplace fillna (deprecated, acts on a temporary).
    df["Embarked"] = df["Embarked"].fillna("Unknown")
    df.drop(columns=[
        "PassengerId", "Name", "Ticket", "Cabin", 'SibSp', 'Parch', 'Fare'
    ], inplace=True)
    if col_names is not None:
        df = df[col_names]
    df.Pclass = df.Pclass.astype(str)
    # Discretize age; rows not matched by any condition (missing ages) keep
    # the 'Unknown' default, so no separate fillna pass is needed.
    # NOTE(review): fractional ages strictly between 30 and 31 also fall
    # through to 'Unknown' because between() is inclusive on both ends —
    # confirm whether that gap is intended.
    df['Age_'] = 'Unknown'
    df.loc[df['Age'] < 16, 'Age_'] = 'child'
    df.loc[df['Age'].between(16, 30), 'Age_'] = 'young-adult'
    df.loc[df['Age'].between(31, 50), 'Age_'] = 'middle-aged'
    df.loc[df['Age'].between(50, df.Age.max()), 'Age_'] = 'old'
    df = df.drop(columns=['Age']).rename(columns={'Age_': 'Age'})
    if only_n_rows and only_n_rows < len(df):
        df = df.sample(only_n_rows, random_state=seed)
    if onehot:
        df['Survived'] = df['Survived'].astype(bool).astype(str)
        df = pd.get_dummies(df)
    return fn.transform_dataset(df)
def get_lending(
        only_n_rows=None,
        seed=None,
        onehot=True,
        original=False,
):
    '''
    https://www.kaggle.com/wendykan/lending-club-loan-data
    about 2.200.000 rows

    :param only_n_rows: optional subsample size (applied before cleaning)
    :param seed: random_state for the subsampling
    :param onehot: one-hot encode the final categorical frame
    :param original: return the raw (subsampled) data without cleaning
    :return: result of fn.transform_dataset on the prepared frame
    '''
    used_cols = [
        'loan_amnt', 'loan_status', 'term', 'purpose', 'int_rate', 'grade',
        'emp_length', 'home_ownership', 'annual_inc'
    ]
    with open('../../_data/lending/loan.csv', 'r', encoding='latin-1') as f:
        df = pd.read_csv(f, usecols=used_cols)
    df = _shorten_df(df, only_n_rows, seed=seed)
    if original:
        return df
    # Drop legacy credit-policy statuses and collapse related ones.
    df = df[~df.loan_status.isin([
        'Does not meet the credit policy. Status:Charged Off',
        'Does not meet the credit policy. Status:Fully Paid'
    ])]
    df.loan_status.replace(
        ['Late (31-120 days)', 'Late (16-30 days)', 'In Grace Period'],
        'Late', inplace=True)
    df.loan_status.replace('Default', 'Charged Off', inplace=True)
    df.emp_length.replace([str(i) + ' years' for i in range(2, 10)],
                          '1-10 years', inplace=True)
    df.emp_length.replace(['< 1 year', '1 year'], '<= 1 year', inplace=True)
    df.grade.replace(
        {
            'A': 'good',
            'B': 'good',
            'C': 'medium',
            'D': 'medium',
            'E': 'bad',
            'F': 'bad',
            'G': 'bad'
        }, inplace=True)
    # Keep only the ten most common purposes; the rest become NaN and are
    # removed by the dropna below.
    keep = (df.purpose.value_counts()[df.purpose.value_counts().head(10).index]
            ).index.to_list()
    df.purpose = df.purpose[df.purpose.isin(keep)]
    df.dropna(inplace=True)
    # Quartile-bin every numeric column into labelled ranges.
    # Builtin object replaces np.object (alias removed in NumPy 1.24).
    numeric_cols = df.columns[~(df.dtypes == object)]
    for c in numeric_cols:
        quantiles = np.round(df[c].quantile([0.25, 0.5, 0.75])).astype(int).tolist()
        q_labels = [
            x.format(low=quantiles[0], mid=quantiles[1], high=quantiles[2],
                     max=int(df[c].max()))
            for x in ['0 - {low}', '{low} - {mid}', '{mid} - {high}', '{high} - {max}']
        ]
        df[c] = pd.cut(
            df[c],
            bins=[-np.inf] + quantiles + [np.inf],
            labels=q_labels,
        ).astype(str)
    # remove rare items
    df = df[~df.home_ownership.isin(['OTHER', 'ANY'])]
    if onehot:
        df = pd.get_dummies(df)
    return fn.transform_dataset(df)
# mini SPN example: three categorical variables where Y depends on X and Z is
# independent of both.
x = np.random.choice([1, 2], int(1e4), replace=True, p=[0.3, 0.7])
cond_probs = {1: [0.9, 0.1, 0.], 2: [0., 0.9, 0.1]}
# Draw Y conditionally on X, one sample per row (same call sequence as a
# plain loop, so the random stream is unchanged).
y = np.array([np.random.choice([1, 2, 3], 1, replace=True, p=cond_probs[v])
              for v in x]).reshape(-1, )
z = np.random.choice([1, 2], int(1e4), replace=True, p=[0.4, 0.6])
df = pd.DataFrame({'X': x, 'Y': y, 'Z': z}).astype(str)
df, vd, pars = fn.transform_dataset(df)
spn = spn_handler.load_or_create_spn(df, vd, pars, 'mini_example', 0.4, 0.5,
                                     nrows=None, seed=1, force_create=True,
                                     clustering='km_rule_clustering')
spn = spn.children[1]
# Hand-built reference SPN over the same three variables.
manspn = (
    0.3 * (Categorical(p=[0.9, 0.1], scope=0) * Categorical(p=[0.55, 0.4, 0.05], scope=1))
    + 0.7 * (Categorical(p=[0., 1.], scope=0) * Categorical(p=[0.1, 0.2, 0.7], scope=1))
) * (Categorical(p=[0.4, 0.6], scope=2))
# plot leaves from example
min_instances_slice) #Print some statistics fn.print_statistics(spn) #Example value dict generation path = os.path.dirname( os.path.realpath(__file__)) + "/../../_data/titanic/train.csv" df = pd.read_csv(path) #print data (top 5 rows) io.print_pretty_table(df.head(5)) df = df[["Survived", "Sex", "Age", "Fare", "Pclass"]] df, val_dict, param_types = fn.transform_dataset(df) #print data after transformation (top 5 rows) io.print_pretty_table(df.head(5)) '''' SPN functions ''' #Load synthetic example SPN (very simple SPN) from simple_spn.example import example_spns spn = example_spns.get_gender_spn() #plot spn fn.plot_spn(spn, "sample_spn.pdf", value_dict) #generate samples