def main(opt):
    output_path = 'prov_results'
    # Specify where to save the processed files as savepath
    savepath = os.path.join(output_path, 'Testing')

    df = pd.DataFrame(
        {
            'B': ['B2', 'B3', 'B6', 'B7'],
            'D': ['D2', 'D3', 'D6', 'D7'],
            'F': ['F2', 'F3', 'F6', 'F7']
        },
        index=[2, 3, 6, 7])

    print('[' + time.strftime("%d/%m-%H:%M:%S") + '] Initialization')

    # Create a new provenance document
    if not opt:
        p = pr.Provenance(df, savepath)
    else:
        savepath = os.path.join(savepath, 'FP')
        p = pr_lib.Provenance(df, savepath)

    tracker = ProvenanceTracker.ProvenanceTracker(df, p)
    # tracker.df = tracker.df.dropna()

    df4 = pd.DataFrame(
        {
            'A': ['A0', 'A1', 'A2', 'A3'],
            'B': ['B0', 'B1', 'B2', 'B3'],
            'C': ['C0', np.nan, 'C2', 'C3'],
            'D': ['D0', 'D1', 'D2', 'D3']
        },
        index=[0, 1, 2, 3])

    tracker.add_second_df(df4)
    tracker.set_join_op(axis=0, on=None)
    tracker.df = pd.concat([df, df4], axis=0, sort=False)

    print(tracker.df)
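# A note on the concatenation above, as a self-contained sketch (illustration
# only, not part of the pipeline): with axis=0 and sort=False, pd.concat
# stacks the rows and takes the union of the columns in order of first
# appearance, filling the cells a frame lacks with NaN.
import numpy as np
import pandas as pd

left = pd.DataFrame({'B': ['B2'], 'D': ['D2']}, index=[2])
right = pd.DataFrame({'A': ['A0'], 'B': ['B0']}, index=[0])
out = pd.concat([left, right], axis=0, sort=False)
assert list(out.columns) == ['B', 'D', 'A']  # union, original order kept
assert out.isna().sum().sum() == 2           # missing cells become NaN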
def main(input_path, opt):
    # Specify where to save the processed files as savepath
    output_path = os.path.join('prov_results', os.path.basename(input_path))
    savepath = os.path.join(output_path, 'ST_prov')

    df = pd.read_csv(input_path)
    # Trade columns
    # ['T_ID', 'T_DTS', 'T_ST_ID', 'T_TT_ID', 'T_IS_CASH', 'T_S_SYMB', 'T_QTY',
    #  'T_BIDPRICE', 'C_ID', 'T_EXEX_NAME', 'T_TRADE_PRICE', 'T_CHRG', 'T_COMM',
    #  'T_TAX', 'ActionType', 'ActionTS', 'C_TAX_ID', 'C_L_NAME', 'C_F_NAME',
    #  'C_M_NAME', 'C_GNDR', 'C_TIER', 'C_DOB', 'C_ADLINE1', 'C_ADLINE2',
    #  'C_ZIPCODE', 'C_CITY', 'C_STATE_PROV', 'C_CTRY', 'C_CTRY_1', 'C_AREA_1',
    #  'C_LOCAL_1', 'C_EXT_1', 'C_CTRY_2', 'C_AREA_2', 'C_LOCAL_2', 'C_EXT_2',
    #  'C_CTRY_3', 'C_AREA_3', 'C_LOCAL_3', 'C_EXT_3', 'C_EMAIL_1', 'C_EMAIL_2',
    #  'C_LCL_TX_ID', 'C_NAT_TX_ID']
    print('[' + time.strftime("%d/%m-%H:%M:%S") + '] Initialization')

    # Create a new provenance document
    if opt:
        p = pr.Provenance(df, savepath)
    else:
        p = pr_lib.Provenance(df, savepath)

    print('[' + time.strftime("%d/%m-%H:%M:%S") +
          '] Input prov entities created and saved')

    # SPACE TRANSFORMATION: add column T_BOOL on trade
    # Check Null Commission. pd.isna also catches the NaN that read_csv
    # produces for empty cells, which the `s == None` test would miss.
    df['T_BOOL'] = [0 if pd.isna(s) else 1 for s in df.T_COMM]

    print('[' + time.strftime("%d/%m-%H:%M:%S") +
          '] Space Transformation done: Check Null Commission, T_BOOL column added')

    # GET PROVENANCE
    d = p.get_prov_space_transformation(df, ['T_COMM'])

    print('[' + time.strftime("%d/%m-%H:%M:%S") +
          '] Prov Space Transformation saved')
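# A vectorized sketch of the null-commission flag above (illustration only,
# not the pipeline's code): Series.notna() covers both None and NaN, and
# astype(int) yields the same 0/1 flag as the list comprehension.
import numpy as np
import pandas as pd

def t_bool_vectorized(trades: pd.DataFrame) -> pd.Series:
    # 1 where T_COMM is present, 0 where it is missing.
    return trades['T_COMM'].notna().astype(int)

assert t_bool_vectorized(
    pd.DataFrame({'T_COMM': [1.5, np.nan, 0.0]})).tolist() == [1, 0, 1]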
def main(input_path, opt):
    # Specify where to save the processed files as savepath
    output_path = os.path.join('prov_results', os.path.basename(input_path))
    savepath = os.path.join(output_path, 'IG_prov')

    df = pd.read_csv(input_path)
    print('[' + time.strftime("%d/%m-%H:%M:%S") + '] Initialization')

    # Create a new provenance document
    if opt:
        p = pr.Provenance(df, savepath)
    else:
        p = pr_lib.Provenance(df, savepath)

    print('[' + time.strftime("%d/%m-%H:%M:%S") +
          '] Input prov entities created and saved')

    # INSTANCE GENERATION: add one record to the dataframe.
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat is the supported way to add a row.
    valueMax_comm = df['T_COMM'].max()
    df = pd.concat([df, pd.DataFrame([{'T_COMM': valueMax_comm}])],
                   ignore_index=True)

    print('[' + time.strftime("%d/%m-%H:%M:%S") +
          '] Instance Generation done: added one record to dataframe')

    # GET PROVENANCE
    d = p.get_prov_instance_generation(df, ['T_COMM'])

    print('[' + time.strftime("%d/%m-%H:%M:%S") +
          '] Prov Instance Generation saved')
def main(input_path, opt):
    # Specify where to save the processed files as savepath
    output_path = os.path.join('prov_results', os.path.basename(input_path))
    savepath = os.path.join(output_path, 'FT_prov')

    df = pd.read_csv(input_path)
    # Trade columns
    # ['T_ID', 'T_DTS', 'T_ST_ID', 'T_TT_ID', 'T_IS_CASH', 'T_S_SYMB', 'T_QTY',
    #  'T_BIDPRICE', 'C_ID', 'T_EXEX_NAME', 'T_TRADE_PRICE', 'T_CHRG', 'T_COMM',
    #  'T_TAX', 'ActionType', 'ActionTS', 'C_TAX_ID', 'C_L_NAME', 'C_F_NAME',
    #  'C_M_NAME', 'C_GNDR', 'C_TIER', 'C_DOB', 'C_ADLINE1', 'C_ADLINE2',
    #  'C_ZIPCODE', 'C_CITY', 'C_STATE_PROV', 'C_CTRY', 'C_CTRY_1', 'C_AREA_1',
    #  'C_LOCAL_1', 'C_EXT_1', 'C_CTRY_2', 'C_AREA_2', 'C_LOCAL_2', 'C_EXT_2',
    #  'C_CTRY_3', 'C_AREA_3', 'C_LOCAL_3', 'C_EXT_3', 'C_EMAIL_1', 'C_EMAIL_2',
    #  'C_LCL_TX_ID', 'C_NAT_TX_ID']
    print('[' + time.strftime("%d/%m-%H:%M:%S") + '] Initialization')

    # Create a new provenance document
    if opt:
        p = pr.Provenance(df, savepath)
    else:
        p = pr_lib.Provenance(df, savepath)

    print('[' + time.strftime("%d/%m-%H:%M:%S") +
          '] Input prov entities created and saved')

    # FEATURE TRANSFORMATION: correct invalid gender.
    # Gender is uppercased. Values other than 'M' or 'F' are replaced with 'U'.
    # Note the membership test: the earlier `g is not 'F' or g is not 'M'`
    # was always true, so every value was mapped to 'U'.
    df['C_GNDR'] = df['C_GNDR'].str.upper()
    df['C_GNDR'] = ['U' if g not in ('F', 'M') else g for g in df.C_GNDR]

    print('[' + time.strftime("%d/%m-%H:%M:%S") +
          '] Feature Transformation done: correct invalid gender entities')

    # GET PROVENANCE
    d = p.get_prov_feature_transformation(df, ['C_GNDR'])

    print('[' + time.strftime("%d/%m-%H:%M:%S") +
          '] Prov Feature Transformation saved')
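# An equivalent vectorized form of the gender fix (a sketch, not the
# pipeline's code): keep values already in {'M', 'F'}, replace everything
# else, including NaN, with 'U'.
import numpy as np
import pandas as pd

g = pd.Series(['m', 'F', 'x', np.nan]).str.upper()
fixed = g.where(g.isin(['M', 'F']), 'U')
assert fixed.tolist() == ['M', 'F', 'U', 'U']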
def main(input_path, opt):
    # Specify where to save the processed files as savepath
    output_path = os.path.join('prov_results', os.path.basename(input_path))
    savepath = os.path.join(output_path, 'DM_prov')

    df = pd.read_csv(input_path)
    print('[' + time.strftime("%d/%m-%H:%M:%S") + '] Initialization')

    # Create a new provenance document
    if opt:
        p = pr.Provenance(df, savepath)
    else:
        p = pr_lib.Provenance(df, savepath)

    print('[' + time.strftime("%d/%m-%H:%M:%S") +
          '] Input prov entities created and saved')

    # DIMENSIONALITY REDUCTION: randomly remove one column from df.
    # randrange(len(columns)) draws over every column; the earlier
    # len(columns) - 1 bound could never pick the last one.
    columns = df.columns
    random_col = randrange(len(columns))
    to_delete = columns[random_col]
    df = df.drop([to_delete], axis=1)

    print('[' + time.strftime("%d/%m-%H:%M:%S") +
          '] Dimensionality Reduction done: ' + to_delete + ' column deleted')

    # GET PROVENANCE
    d = p.get_prov_dim_reduction(df)

    print('[' + time.strftime("%d/%m-%H:%M:%S") +
          '] Prov Dimensionality Reduction saved')
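# A pandas-native sketch of the same random column drop (illustration only):
# DataFrame.sample with axis=1 draws one column uniformly at random.
import pandas as pd

df_toy = pd.DataFrame({'a': [1], 'b': [2], 'c': [3]})
to_delete = df_toy.sample(n=1, axis=1).columns[0]
df_toy = df_toy.drop(columns=to_delete)
assert df_toy.shape[1] == 2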
def main(opt):
    output_path = 'prov_results'
    # Specify where to save the processed files as savepath
    savepath = os.path.join(output_path, 'Join')

    df = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2', 'K0'],
                       'key2': ['K0', 'K1', 'K0', 'K1', 'K0'],
                       'A': ['A0', 'A1', 'A2', 'A3', 'A4'],
                       'B': ['B0', 'B1', 'B2', 'B3', 'B4']})

    print('[' + time.strftime("%d/%m-%H:%M:%S") + '] Initialization')

    # Create a new provenance document
    if not opt:
        p = pr.Provenance(df, savepath)
    else:
        savepath = os.path.join(savepath, 'FP')
        p = pr_lib.Provenance(df, savepath)

    tracker = ProvenanceTracker.ProvenanceTracker(df, p)
    # tracker.df = tracker.df.dropna()

    right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                          'key2': ['K0', 'K0', 'K0', 'K0'],
                          'A': ['C0', 'C1', 'C2', 'C3'],
                          'D': ['D0', 'D1', 'D2', 'D3'],
                          'C': ['B0', 'B1', 'B2', 'B3']})

    tracker.add_second_df(right)
    tracker.set_join_op(axis=None, on=['key1', 'key2'])
    tracker.df = pd.merge(tracker.df, tracker.second_df,
                          on=['key1', 'key2'], how='right')

    print(tracker.df)
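# Note on the merge above: with how='right', every row of `right` is kept.
# The duplicated key pair ('K1', 'K0') joins its single matching left row
# twice, ('K0', 'K0') matches two left rows, and the unmatched ('K2', 'K0')
# row gets NaN in the left-hand columns; since both frames carry a column
# named 'A', pandas disambiguates them as 'A_x' and 'A_y'.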
def main(input_path, opt):
    # Specify where to save the processed files as savepath
    output_path = os.path.join('prov_results', os.path.basename(input_path))
    savepath = os.path.join(output_path, 'I_prov')

    df = pd.read_csv(input_path)
    print('[' + time.strftime("%d/%m-%H:%M:%S") + '] Initialization')

    # Create a new provenance document
    if opt:
        p = pr.Provenance(df, savepath)
    else:
        p = pr_lib.Provenance(df, savepath)

    print('[' + time.strftime("%d/%m-%H:%M:%S") +
          '] Input prov entities created and saved')

    # IMPUTATION: fill NaN elements of the T_COMM column with the average
    AVG_comm = df['T_COMM'].mean()
    df['T_COMM'] = df['T_COMM'].fillna(AVG_comm)

    print('[' + time.strftime("%d/%m-%H:%M:%S") +
          '] Imputation done: fill NaN elements of T_COMM column')

    # GET PROVENANCE
    d = p.get_prov_imputation(df, ['T_COMM'])

    print('[' + time.strftime("%d/%m-%H:%M:%S") + '] Prov Imputation saved')
def main(input_path, opt):
    # Specify where to save the processed files as savepath
    output_path = os.path.join('prov_results', os.path.basename(input_path))
    savepath = os.path.join(output_path, 'VT_prov')

    df = pd.read_csv(input_path)
    print('[' + time.strftime("%d/%m-%H:%M:%S") + '] Initialization')

    # Create a new provenance document
    if opt:
        p = pr.Provenance(df, savepath)
    else:
        p = pr_lib.Provenance(df, savepath)

    print('[' + time.strftime("%d/%m-%H:%M:%S") +
          '] Input prov entities created and saved')

    # VALUE TRANSFORMATION: remove invalid date of birth [C_DOB].
    # Invalid when DOB < Batch_Date - 100 years or DOB > Batch_Date.
    # The comparison relies on ISO-8601 strings ordering like dates; the
    # isinstance guard treats non-string entries (e.g. NaN) as invalid
    # instead of raising a TypeError.
    batch_Date_from = '1917-07-07'
    batch_Date_to = '2017-07-07'
    df['C_DOB'] = [
        g if isinstance(g, str) and batch_Date_from <= g <= batch_Date_to
        else np.nan for g in df.C_DOB
    ]

    print('[' + time.strftime("%d/%m-%H:%M:%S") +
          '] Value Transformation done: removed invalid DOB')

    # GET PROVENANCE
    d = p.get_prov_value_transformation(df, ['C_DOB'])

    print('[' + time.strftime("%d/%m-%H:%M:%S") +
          '] Prov Value Transformation saved')
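# An equivalent sketch using real datetime comparisons instead of ISO-string
# ordering (illustration only): errors='coerce' maps unparseable DOBs to NaT,
# and NaT never satisfies between(), so invalid entries end up as NaN too.
import numpy as np
import pandas as pd

dob_raw = pd.Series(['1980-01-01', '1900-12-31', None])
dob = pd.to_datetime(dob_raw, errors='coerce')
valid = dob.between('1917-07-07', '2017-07-07')
cleaned = dob_raw.where(valid, np.nan)
assert cleaned[0] == '1980-01-01' and pd.isna(cleaned[1]) and pd.isna(cleaned[2])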
def main(opt):
    input_path = '../real_world_pipeline/Datasets/compas.csv'
    output_path = 'prov_results'
    # Specify where to save the processed files as savepath
    savepath = os.path.join(output_path, 'Compas')

    df = pd.read_csv(input_path, header=0)
    print('[' + time.strftime("%d/%m-%H:%M:%S") + '] Initialization')

    # Create a new provenance document
    if opt:
        p = pr.Provenance(df, savepath)
    else:
        savepath = os.path.join(savepath, 'FP')
        p = pr_lib.Provenance(df, savepath)

    tracker = ProvenanceTracker.ProvenanceTracker(df, p)

    # OPERATION 0
    # Select relevant columns
    tracker.df = tracker.df[[
        'age', 'c_charge_degree', 'race', 'sex', 'priors_count',
        'days_b_screening_arrest', 'two_year_recid', 'c_jail_in', 'c_jail_out'
    ]]
    #d = p.get_prov_dim_reduction(df)

    # OPERATION 1
    # Remove missing values
    #tracker.df = tracker.df.dropna()
    #d = p.get_prov_dim_reduction(df)

    # OPERATION 2
    # Make race binary
    tracker.df.race = [0 if r != 'Caucasian' else 1 for r in tracker.df.race]

    # Imputation test
    avg_days = tracker.df['days_b_screening_arrest'].mean()
    tracker.df['days_b_screening_arrest'] = tracker.df[
        'days_b_screening_arrest'].fillna(avg_days)
    #d = p.get_prov_feature_transformation(df, ['race'])

    # OPERATION 3
    # Make two_year_recid the label
    tracker.df = tracker.df.rename({'two_year_recid': 'label'}, axis=1)
    # Reverse the label for consistency with the function defs:
    # 1 means no recid (good), 0 means recid (bad)
    tracker.df.label = [0 if l == 1 else 1 for l in tracker.df.label]
    #d = p.get_prov_feature_transformation(df, ['label'])

    # OPERATION 4
    # Convert jail time to days
    tracker.df['jailtime'] = (pd.to_datetime(tracker.df.c_jail_out) -
                              pd.to_datetime(tracker.df.c_jail_in)).dt.days
    tracker.stop_space_prov(['c_jail_in', 'c_jail_out'])
    # Get provenance of space transformation
    #d = p.get_prov_space_transformation(df, ['c_jail_out', 'c_jail_in'])

    # OPERATION 5
    # Drop jail in and out dates
    tracker.df = tracker.df.drop(['c_jail_in', 'c_jail_out'], axis=1)
    #d = p.get_prov_dim_reduction(df)

    # OPERATION 6
    # M: misdemeanor, F: felony
    tracker.df.c_charge_degree = [
        0 if s == 'M' else 1 for s in tracker.df.c_charge_degree
    ]
    #d = p.get_prov_feature_transformation(df, ['c_charge_degree'])

    print('[' + time.strftime("%d/%m-%H:%M:%S") + '] Prov saved')
def main(opt):
    input_path = '../real_world_pipeline/Datasets/census.csv'
    filename_ext = os.path.basename(input_path)
    filename, ext = os.path.splitext(filename_ext)
    output_path = 'prov_results'
    # Specify where to save the processed files as savepath
    savepath = os.path.join(output_path, filename)

    df = pd.read_csv(input_path)
    # Assign names to columns
    names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
             'marital-status', 'occupation', 'relationship', 'race', 'sex',
             'capital-gain', 'capital-loss', 'hours-per-week',
             'native-country', 'label']
    df.columns = names

    print('[' + time.strftime("%d/%m-%H:%M:%S") + '] Initialization')

    # Create a new provenance document
    if opt:
        p = pr.Provenance(df, savepath)
    else:
        savepath = os.path.join(savepath, 'FP')
        p = pr_lib.Provenance(df, savepath)

    tracker = ProvenanceTracker.ProvenanceTracker(df, p)

    # OPERATION 0
    # Clean up names from spaces
    col = ['workclass', 'education', 'marital-status', 'occupation',
           'relationship', 'race', 'sex', 'native-country', 'label']
    for c in col:
        tracker.df[c] = tracker.df[c].map(str.strip)
    # PROVENANCE 0
    #d = p.get_prov_feature_transformation(df, col, 'Cleanup names from spaces')

    # OPERATION 1
    # Replace the '?' character with a NaN value
    tracker.df = tracker.df.replace('?', np.nan)
    # PROVENANCE 1
    #d = p.get_prov_value_transformation(df, df.columns)

    # OPERATION 2-3
    # One-hot encode categorical variables
    col = ['workclass', 'education', 'marital-status', 'occupation',
           'relationship', 'race', 'native-country']
    for c in col:
        df_dummies = pd.get_dummies(tracker.df[c])
        tracker.df = pd.concat((tracker.df, df_dummies), axis=1)
        tracker.df = tracker.df.drop([c], axis=1)
        # PROVENANCE 2-3
        #d = p.get_prov_space_transformation(df, [c])

    # OPERATION 4
    # Assign sex and label binary values 0 and 1
    tracker.df.sex = tracker.df.sex.replace('Male', 1)
    tracker.df.sex = tracker.df.sex.replace('Female', 0)
    tracker.df.label = tracker.df.label.replace('<=50K', 0)
    tracker.df.label = tracker.df.label.replace('>50K', 1)
    # PROVENANCE 4
    #col = ['sex', 'label']
    #d = p.get_prov_feature_transformation(df, col, 'Assign sex and label binary values 0 and 1')

    # OPERATION 5
    # Drop fnlwgt variable
    tracker.df = tracker.df.drop(['fnlwgt'], axis=1)
    # PROVENANCE 5
    #d = p.get_prov_dim_reduction(df)

    print('[' + time.strftime("%d/%m-%H:%M:%S") + '] Prov saved')
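# The per-column loop above (and the analogous loop in the German pipeline
# below) could be collapsed into a single pandas call, sketched here. One
# difference: columns= prefixes each dummy with its source column name
# (e.g. 'workclass_Private'), while the loop emits bare category names; the
# loop form also keeps one tracked operation per encoded column.
import pandas as pd

df_toy = pd.DataFrame({'workclass': ['Private', 'State-gov'], 'age': [25, 38]})
encoded = pd.get_dummies(df_toy, columns=['workclass'])
assert 'workclass_Private' in encoded.columns and 'workclass' not in encoded.columns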
def main(opt):
    input_path = 'real_world_pipeline/Datasets/german.csv'
    output_path = 'prov_results'
    # Specify where to save the processed files as savepath
    savepath = os.path.join(output_path, 'German')

    df = pd.read_csv(input_path, header=0)
    print('[' + time.strftime("%d/%m-%H:%M:%S") + '] Initialization')

    # Create a new provenance document
    if opt:
        p = pr.Provenance(df, savepath)
    else:
        savepath = os.path.join(savepath, 'FP')
        p = pr_lib.Provenance(df, savepath)

    # OPERATION 0
    # Turn cryptic values into interpretable form
    df = df.replace(
        {'checking': {'A11': 'check_low', 'A12': 'check_mid',
                      'A13': 'check_high', 'A14': 'check_none'},
         'credit_history': {'A30': 'debt_none', 'A31': 'debt_noneBank',
                            'A32': 'debt_onSchedule', 'A33': 'debt_delay',
                            'A34': 'debt_critical'},
         'purpose': {'A40': 'pur_newCar', 'A41': 'pur_usedCar',
                     'A42': 'pur_furniture', 'A43': 'pur_tv',
                     'A44': 'pur_appliance', 'A45': 'pur_repairs',
                     'A46': 'pur_education', 'A47': 'pur_vacation',
                     'A48': 'pur_retraining', 'A49': 'pur_business',
                     'A410': 'pur_other'},
         'savings': {'A61': 'sav_small', 'A62': 'sav_medium',
                     'A63': 'sav_large', 'A64': 'sav_xlarge',
                     'A65': 'sav_none'},
         'employment': {'A71': 'emp_unemployed', 'A72': 'emp_lessOne',
                        'A73': 'emp_lessFour', 'A74': 'emp_lessSeven',
                        'A75': 'emp_moreSeven'},
         'other_debtors': {'A101': 'debtor_none', 'A102': 'debtor_coApp',
                           'A103': 'debtor_guarantor'},
         'property': {'A121': 'prop_realEstate', 'A122': 'prop_agreement',
                      'A123': 'prop_car', 'A124': 'prop_none'},
         'other_inst': {'A141': 'oi_bank', 'A142': 'oi_stores',
                        'A143': 'oi_none'},
         'housing': {'A151': 'hous_rent', 'A152': 'hous_own',
                     'A153': 'hous_free'},
         'job': {'A171': 'job_unskilledNR', 'A172': 'job_unskilledR',
                 'A173': 'job_skilled', 'A174': 'job_highSkill'},
         'phone': {'A191': 0, 'A192': 1},
         'foreigner': {'A201': 1, 'A202': 0},
         'label': {2: 0}})
    col = ['checking', 'credit_history', 'purpose', 'savings', 'employment',
           'other_debtors', 'property', 'other_inst', 'housing', 'job',
           'phone', 'foreigner', 'label']
    d = p.get_prov_feature_transformation(df, col)

    # OPERATION 1
    # More cryptic value translating
    df['status'] = np.where(df.personal_status == 'A91', 'divorced',
                   np.where(df.personal_status == 'A92', 'divorced',
                   np.where(df.personal_status == 'A93', 'single',
                   np.where(df.personal_status == 'A95', 'single',
                            'married'))))
    # Translate gender values
    df['gender'] = np.where(df.personal_status == 'A92', 0,
                   np.where(df.personal_status == 'A95', 0, 1))
    d = p.get_prov_space_transformation(df, ['personal_status'])

    # OPERATION 2
    # Drop personal_status column
    df = df.drop(['personal_status'], axis=1)
    d = p.get_prov_dim_reduction(df)

    # OPERATION 3-13
    # One-hot encode categorical columns
    col = ['checking', 'credit_history', 'purpose', 'savings', 'employment',
           'other_debtors', 'property', 'other_inst', 'housing', 'job',
           'status']
    for c in col:
        df_dummies = pd.get_dummies(df[c])
        df = pd.concat((df, df_dummies), axis=1)
        df = df.drop([c], axis=1)
        d = p.get_prov_space_transformation(df, [c])

    print('[' + time.strftime("%d/%m-%H:%M:%S") + '] Prov saved')