for i, row in obs_file.iterrows():
    if row['dim_item_id_3'] != '':
        obs_file.loc[i, 'dim_item_id_2'] = (
            obs_file.loc[i, 'dim_item_id_2'] + ' ' + obs_file.loc[i, 'dim_item_id_3'])
    if row['dim_item_id_4'] != '':
        obs_file.loc[i, 'dim_item_id_2'] = (
            obs_file.loc[i, 'dim_item_id_2'] + ' ' + obs_file.loc[i, 'dim_item_id_4'])
    if row['dim_item_id_5'] != '':
        obs_file.loc[i, 'dim_item_id_2'] = (
            obs_file.loc[i, 'dim_item_id_2'] + ' ' + obs_file.loc[i, 'dim_item_id_5'])

obs_file['dimension_item_label_eng_2'] = obs_file['dim_item_id_2']

# round observations to whole numbers =============================----------------------------------------------#
obs_file['observation'][1:-1] = obs_file['observation'][1:-1].map(float).map(lambda x: np.round(x, 0))

# Now get rid of .0, whole numbers are fine
obs_file['observation'] = obs_file['observation'].astype(str)
obs_file['observation'] = obs_file['observation'].map(lambda x: x.replace('.0', ''))

# finalise and output =============================----------------------------------------------#
obs_file = tf.dismiss(obs_file, ['dim_id_3', 'dim_id_4', 'dim_id_5', 'dim_id_6', 'dim_id_7'])

out_filename = 'transform' + load_file[4:]
vt.frame_checks(obs_file, out_filename)
obs_file.to_csv(out_filename, index=False)

# Now run the comparisons against past datasets
cp.compare(sys.argv[2], out_filename)
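# ---------------------------------------------------------------------------
# tf.dismiss is imported from the repo's shared transform helpers and is not
# shown in this excerpt. Below is a minimal sketch of its assumed behaviour,
# inferred from how it is called here: for each dim_id_N passed in, drop that
# column plus its matching item/label columns if present. The function name
# and column patterns are assumptions, not the repo's actual code.
def dismiss_sketch(df, dims_to_drop):
    """Drop each dim_id_N plus its related item/label columns (sketch)."""
    suffixes = [d.split('_')[-1] for d in dims_to_drop]  # e.g. 'dim_id_3' -> '3'
    cols = []
    for n in suffixes:
        cols += ['dim_id_' + n, 'dim_item_id_' + n,
                 'dimension_label_eng_' + n, 'dimension_item_label_eng_' + n]
    return df.drop(columns=[c for c in cols if c in df.columns])
# ---------------------------------------------------------------------------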
obs_file['dim_item_id_3'] = obs_file['dim_item_id_3'].map(lambda x: x.replace('COUNTRIES COUNTRIES', 'COUNTRIES')).astype(str)
obs_file['dim_item_id_3'] = obs_file['dim_item_id_3'].map(lambda x: x.replace('ASIA COUNTRIES', 'ASIA OTHER ASIAN COUNTRIES')).astype(str)
obs_file['dim_item_id_3'] = obs_file['dim_item_id_3'].map(lambda x: x.replace('EUROPE COUNTRIES', 'EUROPE OTHER EUROPEAN COUNTRIES')).astype(str)
obs_file['dim_item_id_3'] = obs_file['dim_item_id_3'].map(lambda x: x.replace('COUNTRIES COUNTRIES', 'COUNTRIES')).astype(str)

# TODO - crude string-replacement fix; needs a proper lookup
obs_file['dim_item_id_3'] = obs_file['dim_item_id_3'].map(lambda x: x.replace('OCEANIA', 'AUSTRALASIA & OCEANIA')).astype(str)
obs_file['dim_item_id_3'] = obs_file['dim_item_id_3'].map(lambda x: x.replace('AUSTRALASIA & AUSTRALASIA & OCEANIA', 'AUSTRALASIA & OCEANIA')).astype(str)
obs_file['dimension_item_label_eng_3'] = obs_file['dim_item_id_3']

# Get rid of any no lookup errors
obs_file = tf.remove_from_columns(obs_file, ['dim_item_id_3'], ['NoLookupError', 'of which '])
obs_file['dim_item_id_3'] = obs_file['dim_item_id_3'].map(str.strip)
obs_file['dimension_item_label_eng_3'] = obs_file['dim_item_id_3']

# Strip trailing .0 from time
obs_file['time_dim_item_id'] = obs_file['time_dim_item_id'].astype(str)
obs_file['time_dim_item_label_eng'] = obs_file['time_dim_item_label_eng'].astype(str)
obs_file['time_dim_item_id'] = obs_file['time_dim_item_id'].map(lambda x: x.replace('.0', '')).astype(str)
obs_file['time_dim_item_label_eng'] = obs_file['time_dim_item_label_eng'].map(lambda x: x.replace('.0', '')).astype(str)

# Remove the dimensions we don't need anymore: args = (dataframe, [dimensions to drop])
obs_file = tf.dismiss(obs_file, ['dim_id_2', 'dim_id_4', 'dim_id_5'])
obs_file.fillna('', inplace=True)

# Repair gap in dimension numbering
obs_file = tf.validateheaders(obs_file)

obs_file.to_csv(sys.argv[3] + '.csv', index=False)
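# ---------------------------------------------------------------------------
# tf.remove_from_columns is another shared helper not shown in this excerpt.
# Judging by the call above ("get rid of any no lookup errors"), it removes
# rows whose value in any of the named columns contains one of the unwanted
# strings - though it may equally blank the values rather than drop the rows.
# A hedged sketch of that assumed behaviour (the signature matches the call
# site; the body is an illustration, not the repo's code):
def remove_from_columns_sketch(df, columns, unwanted):
    for col in columns:
        for bad in unwanted:
            # keep only rows whose value does not contain the unwanted text
            df = df[~df[col].astype(str).str.contains(bad, regex=False)]
    return df.reset_index(drop=True)
# ---------------------------------------------------------------------------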
obs_file['dim_item_id_4'] = (obs_file['dim_item_id_4'].map(str.strip) + ' ' +
                             obs_file['dim_item_id_5'].map(str.strip) + ' ' +
                             obs_file['dim_item_id_6'].map(str.strip))
obs_file['dimension_item_label_eng_4'] = obs_file['dim_item_id_4']

# Manually fix mismatches from the locations closest-above script (i.e. Europe in USA etc)
obs_file['dim_item_id_4'] = obs_file['dim_item_id_4'].map(lambda x: x.replace('NEAR & MIDDLE EAST', 'NEAR & MIDDLE EAST COUNTRIES')).astype(str)
obs_file['dim_item_id_4'] = obs_file['dim_item_id_4'].map(lambda x: x.replace('AUSTRALASIA & OCEANIA ', 'AUSTRALASIA & OCEANIA')).astype(str)
obs_file['dim_item_id_4'] = obs_file['dim_item_id_4'].map(lambda x: x.replace('CENTRAL & EASTERN', 'CENTRAL & EASTERN EUROPE')).astype(str)
obs_file['dimension_item_label_eng_4'] = obs_file['dim_item_id_4']

# Get rid of any no lookup errors
obs_file = tf.remove_from_columns(obs_file, ['dim_item_id_4'], ['NoLookupError', 'of which '])
obs_file['dim_item_id_4'] = obs_file['dim_item_id_4'].map(str.strip)
obs_file['dimension_item_label_eng_4'] = obs_file['dim_item_id_4']

# Make Category Generic
obs_file = lookup.cat_lookup(obs_file, "dim_item_id_1")
obs_file['dimension_item_label_eng_1'] = obs_file['dim_item_id_1']

# Strip trailing .0 from time
obs_file['time_dim_item_id'] = obs_file['time_dim_item_id'].astype(str)
obs_file['time_dim_item_label_eng'] = obs_file['time_dim_item_label_eng'].astype(str)
obs_file['time_dim_item_id'] = obs_file['time_dim_item_id'].map(lambda x: x.replace('.0', '')).astype(str)
obs_file['time_dim_item_label_eng'] = obs_file['time_dim_item_label_eng'].map(lambda x: x.replace('.0', '')).astype(str)

# Remove the dimensions we don't need anymore: args = (dataframe, [dimensions to drop])
obs_file = tf.dismiss(obs_file, ['dim_id_5', 'dim_id_6'])
obs_file.fillna('', inplace=True)

obs_file.to_csv(sys.argv[3] + '.csv', index=False)
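# ---------------------------------------------------------------------------
# The ".0"-stripping idiom used throughout these scripts, x.replace('.0', ''),
# is safe for year strings like '2016.0', but str.replace removes every
# occurrence, so a value such as '10.01' would become '101'. A suggested
# (hypothetical) helper that only strips a genuinely trailing '.0':
def strip_trailing_point_zero(value):
    text = str(value)
    return text[:-2] if text.endswith('.0') else text

# usage sketch:
# obs_file['time_dim_item_id'] = obs_file['time_dim_item_id'].map(strip_trailing_point_zero)
# ---------------------------------------------------------------------------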
# Sort out the mean/median thing, one for every other instance of mean or median
flip = 0
for i, row in obs_file.iterrows():
    if row['dim_item_id_4'] == 'change':
        if flip == 0:
            obs_file.loc[i, 'dim_item_id_4'] = 'change in Median'
            flip = 1
        else:
            obs_file.loc[i, 'dim_item_id_4'] = 'change in Mean'
            flip = 0

# Clean and concatenate the columns for Titles 1, 2 & 3.
obs_file['dim_item_id_2'] = tf.strip_and_join(obs_file, ['dim_item_id_2', 'dim_item_id_3', 'dim_item_id_4'])
obs_file['dimension_item_label_eng_2'] = obs_file['dim_item_id_2']
obs_file = tf.dismiss(obs_file, ['dim_id_3', 'dim_id_4'])

# Get rid of the number after number of jobs, i.e. 'Number of Jobs 1' becomes 'Number of Jobs'
obs_file['dim_id_2'] = obs_file['dim_id_2'].str[:-2]
obs_file['dimension_label_eng_2'] = obs_file['dim_id_2']

"""
CONDITIONALS
This is the code that will change depending on the ASHE table number being transformed.
"""
# -----------------------------------------------------------------------------------------------------------------------------------------------
# -----------------------------------------------------------------------------------------------------------------------------------------------
if numASHE in ['1', '26']:
# set the final row's data marking to the running count
obs_file.at[len(obs_file) - 1, 'data_marking'] = count
obs_file['data_marking'] = obs_file['data_marking'].astype(str)
obs_file['data_marking'] = obs_file['data_marking'].map(lambda x: x.replace('.0', ''))
obs_file['data_marking'] = obs_file['data_marking'].map(lambda x: x.replace('nan', ''))

# Strip trailing .0 from time
obs_file['time_dim_item_id'] = obs_file['time_dim_item_id'].astype(str)
obs_file['time_dim_item_label_eng'] = obs_file['time_dim_item_label_eng'].astype(str)
obs_file['time_dim_item_id'] = obs_file['time_dim_item_id'].map(lambda x: x.replace('.0', ''))
obs_file['time_dim_item_label_eng'] = obs_file['time_dim_item_label_eng'].map(lambda x: x.replace('.0', ''))

# Add the outward/inward tag then get rid of dim 2
obs_file['dim_item_id_1'] = obs_file['dim_item_id_1'].astype(str)
obs_file['dim_item_id_2'] = obs_file['dim_item_id_2'].astype(str)
obs_file['dim_item_id_1'] = obs_file['dim_item_id_1'] + " (" + obs_file['dim_item_id_2'] + ")"
obs_file = tf.dismiss(obs_file, ['dim_id_2'])

# Sort out the index and clean nan values
obs_file.fillna('', inplace=True)
obs_file = obs_file.drop('index', axis=1)

# Clean the whitespace out of dimension 1
obs_file['dim_item_id_1'] = obs_file['dim_item_id_1'].map(str).map(lambda x: x.strip())
obs_file['dimension_item_label_eng_1'] = obs_file['dim_item_id_1']

# Clean out any spill over on to the final line
headers = list(obs_file.columns.values)
for header in headers[2:]: