Example #1
        if row['dim_item_id_3'] != '':
            obs_file.loc[i, 'dim_item_id_2'] = obs_file.loc[i, 'dim_item_id_2'] + ' ' + obs_file.loc[i, 'dim_item_id_3']
        if row['dim_item_id_4'] != '':
            obs_file.loc[i, 'dim_item_id_2'] = obs_file.loc[i, 'dim_item_id_2'] + ' ' + obs_file.loc[i, 'dim_item_id_4']
        if row['dim_item_id_5'] != '':
            obs_file.loc[i, 'dim_item_id_2'] = obs_file.loc[i, 'dim_item_id_2'] + ' ' + obs_file.loc[i, 'dim_item_id_5']
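
# A vectorized alternative to the row-wise .loc loop above (a sketch, commented
# out so the concatenation is not applied twice; it assumes the dim_item_id_*
# columns hold '' rather than NaN, as the != '' checks imply):
# for col in ['dim_item_id_3', 'dim_item_id_4', 'dim_item_id_5']:
#     mask = obs_file[col] != ''
#     obs_file.loc[mask, 'dim_item_id_2'] += ' ' + obs_file.loc[mask, col]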
                
obs_file['dimension_item_label_eng_2'] = obs_file['dim_item_id_2']


# round observations to the nearest whole number =============================----------------------------------------------#
obs_file.loc[obs_file.index[1:-1], 'observation'] = obs_file['observation'].iloc[1:-1].map(float).map(lambda x: np.round(x, 0))

# Now get rid of .0, whole numbers are fine
obs_file['observation'] = obs_file['observation'].astype(str)
obs_file['observation'] = obs_file['observation'].map(lambda x: x.replace('.0', ''))
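
# Note (a sketch): str.replace above removes every '.0' substring, so a value
# like '1.05' would become '15'. An end-anchored regex is safer if such values
# can occur:
# obs_file['observation'] = obs_file['observation'].str.replace(r'\.0$', '', regex=True)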
    

# finalise and output =============================----------------------------------------------#

obs_file = tf.dismiss(obs_file, ['dim_id_3', 'dim_id_4', 'dim_id_5', 'dim_id_6', 'dim_id_7'])
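
# tf.dismiss is a project helper that is not shown in this snippet. A minimal
# sketch of what it is assumed to do (an assumption, not the project code):
def dismiss_sketch(frame, dims):
    # Drop every column carrying a listed dimension's numeric suffix, e.g.
    # 'dim_id_3' also removes 'dim_item_id_3' and 'dimension_item_label_eng_3'.
    suffixes = [d.replace('dim_id', '') for d in dims]  # 'dim_id_3' -> '_3'
    doomed = [c for c in frame.columns if any(c.endswith(s) for s in suffixes)]
    return frame.drop(columns=doomed)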

out_filename = 'transform' + load_file[4:]
vt.frame_checks(obs_file, out_filename)
obs_file.to_csv(out_filename, index=False)

# Now run the comparisons against past datasets
cp.compare(sys.argv[2], out_filename)


Example #2
# Strip trailing .0 from time
obs_file['time_dim_item_id'] = obs_file['time_dim_item_id'].astype(str)
obs_file['time_dim_item_label_eng'] = obs_file[
    'time_dim_item_label_eng'].astype(str)
obs_file['time_dim_item_id'] = obs_file['time_dim_item_id'].map(
    lambda x: x.replace('.0', ''))
obs_file['time_dim_item_label_eng'] = obs_file['time_dim_item_label_eng'].map(
    lambda x: x.replace('.0', ''))

# Add the outward/inward tag then get rid of dim 2
obs_file['dim_item_id_1'] = obs_file['dim_item_id_1'].astype(str)
obs_file['dim_item_id_2'] = obs_file['dim_item_id_2'].astype(str)
obs_file['dim_item_id_1'] = obs_file['dim_item_id_1'] + " (" + obs_file[
    'dim_item_id_2'] + ")"
obs_file = tf.dismiss(obs_file, ['dim_id_2'])

# Sort out the index
obs_file.fillna('', inplace=True)
obs_file = obs_file.drop('index', axis=1)

# clean nan values
obs_file.fillna('', inplace=True)

# Clean the whitespace out of dimension 1
obs_file['dim_item_id_1'] = obs_file['dim_item_id_1'].map(str).map(
    lambda x: x.strip())
obs_file['dimension_item_label_eng_1'] = obs_file['dim_item_id_1']

# Clean out any spill-over onto the final line
headers = list(obs_file.columns.values)
Example #4
obs_file['dim_item_id_3'] = obs_file['dim_item_id_3'].map(lambda x: x.replace('COUNTRIES COUNTRIES', 'COUNTRIES')).astype(str)
obs_file['dim_item_id_3'] = obs_file['dim_item_id_3'].map(lambda x: x.replace('ASIA COUNTRIES', 'ASIA OTHER ASIAN COUNTRIES')).astype(str)
obs_file['dim_item_id_3'] = obs_file['dim_item_id_3'].map(lambda x: x.replace('EUROPE COUNTRIES', 'EUROPE OTHER EUROPEAN COUNTRIES')).astype(str)
obs_file['dim_item_id_3'] = obs_file['dim_item_id_3'].map(lambda x: x.replace('COUNTRIES COUNTRIES', 'COUNTRIES')).astype(str)

# TODO - really crude fix
obs_file['dim_item_id_3'] = obs_file['dim_item_id_3'].map(lambda x: x.replace('OCEANIA', 'AUSTRALASIA & OCEANIA')).astype(str)
obs_file['dim_item_id_3'] = obs_file['dim_item_id_3'].map(lambda x: x.replace('AUSTRALASIA & AUSTRALASIA & OCEANIA', 'AUSTRALASIA & OCEANIA')).astype(str)
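
# The chained .map(lambda ...) fixes above could be driven by one ordered list
# of (old, new) pairs instead (a sketch, commented out so the fixes are not
# applied twice; geo_fixes is a hypothetical name):
# geo_fixes = [
#     ('COUNTRIES COUNTRIES', 'COUNTRIES'),
#     ('ASIA COUNTRIES', 'ASIA OTHER ASIAN COUNTRIES'),
#     ('EUROPE COUNTRIES', 'EUROPE OTHER EUROPEAN COUNTRIES'),
#     ('COUNTRIES COUNTRIES', 'COUNTRIES'),
#     ('OCEANIA', 'AUSTRALASIA & OCEANIA'),
#     ('AUSTRALASIA & AUSTRALASIA & OCEANIA', 'AUSTRALASIA & OCEANIA'),
# ]
# for old, new in geo_fixes:
#     obs_file['dim_item_id_3'] = obs_file['dim_item_id_3'].str.replace(old, new, regex=False)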

obs_file['dimension_item_label_eng_3'] = obs_file['dim_item_id_3']

# Get rid of any no-lookup errors
obs_file = tf.remove_from_columns(obs_file, ['dim_item_id_3'], ['NoLookupError', 'of which '])
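
# tf.remove_from_columns is another project helper not shown here; it is
# assumed to drop any row whose listed columns contain one of the given
# substrings. A minimal sketch under that assumption:
def remove_from_columns_sketch(frame, columns, bad_substrings):
    for col in columns:
        for bad in bad_substrings:
            # Keep only rows whose value does not contain the bad substring.
            frame = frame[~frame[col].astype(str).str.contains(bad, regex=False)]
    return frame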
obs_file['dim_item_id_3'] = obs_file['dim_item_id_3'].map(str.strip)
obs_file['dimension_item_label_eng_3'] = obs_file['dim_item_id_3']

# Strip trailing .0 from time
obs_file['time_dim_item_id'] = obs_file['time_dim_item_id'].astype(str)
obs_file['time_dim_item_label_eng'] = obs_file['time_dim_item_label_eng'].astype(str)
obs_file['time_dim_item_id'] = obs_file['time_dim_item_id'].map(lambda x: x.replace('.0', '')).astype(str)
obs_file['time_dim_item_label_eng'] = obs_file['time_dim_item_label_eng'].map(lambda x: x.replace('.0', '')).astype(str)

# Remove the dimensions we don't need anymore: args = (dataframe, [dimensions to drop])
obs_file = tf.dismiss(obs_file, ['dim_id_2', 'dim_id_4', 'dim_id_5'])
obs_file.fillna('', inplace=True)

# Repair gap in dimension numbering
obs_file = tf.validateheaders(obs_file)
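
# tf.validateheaders is not shown either. Per the comment above, it is assumed
# to renumber the dimension columns so their suffixes are contiguous again
# after drops (e.g. dims 1, 3, 6 -> 1, 2, 3). A sketch of that idea:
def validateheaders_sketch(frame):
    import re
    # Numeric suffixes currently in use, e.g. {1, 3, 6}.
    dims = sorted({int(m.group(1)) for c in frame.columns
                   for m in [re.search(r'_(\d+)$', c)] if m})
    renames = {}
    for new, old in enumerate(dims, start=1):
        for c in frame.columns:
            if re.search(r'_%d$' % old, c):
                renames[c] = re.sub(r'\d+$', str(new), c)
    return frame.rename(columns=renames)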

obs_file.to_csv(sys.argv[3] + '.csv', index=False)
obs_file['dim_item_id_4'] = obs_file['dim_item_id_4'].map(str.strip) + ' ' + obs_file['dim_item_id_5'].map(str.strip) + ' ' + obs_file['dim_item_id_6'].map(str.strip)
obs_file['dimension_item_label_eng_4'] = obs_file['dim_item_id_4']

# Manually fix mismatches from the closest-above locations script (i.e. Europe in USA etc.)
obs_file['dim_item_id_4'] = obs_file['dim_item_id_4'].map(lambda x: x.replace('NEAR & MIDDLE EAST', 'NEAR & MIDDLE EAST COUNTRIES')).astype(str)
obs_file['dim_item_id_4'] = obs_file['dim_item_id_4'].map(lambda x: x.replace('AUSTRALASIA & OCEANIA ', 'AUSTRALASIA & OCEANIA')).astype(str)
obs_file['dim_item_id_4'] = obs_file['dim_item_id_4'].map(lambda x: x.replace('CENTRAL & EASTERN', 'CENTRAL & EASTERN EUROPE')).astype(str)
obs_file['dimension_item_label_eng_4'] = obs_file['dim_item_id_4']

# Get rid of any no-lookup errors
obs_file = tf.remove_from_columns(obs_file, ['dim_item_id_4'], ['NoLookupError', 'of which '])
obs_file['dim_item_id_4'] = obs_file['dim_item_id_4'].map(str.strip)
obs_file['dimension_item_label_eng_4'] = obs_file['dim_item_id_4']

# Make Category Generic
obs_file = lookup.cat_lookup(obs_file, "dim_item_id_1")
obs_file['dimension_item_label_eng_1'] = obs_file['dim_item_id_1']

# Strip trailing .0 from time
obs_file['time_dim_item_id'] = obs_file['time_dim_item_id'].astype(str)
obs_file['time_dim_item_label_eng'] = obs_file['time_dim_item_label_eng'].astype(str)

obs_file['time_dim_item_id'] = obs_file['time_dim_item_id'].map(lambda x: x.replace('.0', '')).astype(str)
obs_file['time_dim_item_label_eng'] = obs_file['time_dim_item_label_eng'].map(lambda x: x.replace('.0', '')).astype(str)

# Remove the dimensions we don't need anymore: args = (dataframe, [dimensions to drop])
obs_file = tf.dismiss(obs_file, ['dim_id_5', 'dim_id_6'])
obs_file.fillna('', inplace=True)

obs_file.to_csv(sys.argv[3] + '.csv', index=False)
Example #8
    
    # Sort out the mean/median thing - alternate between median and mean for each 'change' row
    flip = 0
    for i, row in obs_file.iterrows():
        if row['dim_item_id_4'] == 'change':
            if flip == 0:
                obs_file.loc[i, 'dim_item_id_4'] = 'change in Median'
                flip = 1
            else:
                obs_file.loc[i, 'dim_item_id_4'] = 'change in Mean'
                flip = 0
        
    # Clean and concatenate the columns for Titles 1,2 & 3.
    obs_file['dim_item_id_2'] = tf.strip_and_join(obs_file, ['dim_item_id_2', 'dim_item_id_3', 'dim_item_id_4'])
    obs_file['dimension_item_label_eng_2'] = obs_file['dim_item_id_2']
    obs_file = tf.dismiss(obs_file, ['dim_id_3', 'dim_id_4'])
    
    # Get rid of the number after number of jobs, i.e. 'Number of Jobs 1' becomes 'Number of Jobs'
    obs_file['dim_id_2'] = obs_file['dim_id_2'].str[:-2]
    obs_file['dimension_label_eng_2'] = obs_file['dim_id_2']
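
    # A sketch that strips only a trailing ' <digit>' instead of blindly cutting
    # the last two characters (commented out; safer if some labels lack the
    # numeric suffix):
    # obs_file['dim_id_2'] = obs_file['dim_id_2'].str.replace(r' \d$', '', regex=True)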

    
    """
    CONDITIONALS
    This is the code that will change depending on the ASHE table number being transformed.
    """

    # -----------------------------------------------------------------------------------------------------------------------------------------------
    # -----------------------------------------------------------------------------------------------------------------------------------------------

    if numASHE in ['1', '26']:
        obs_file.at[len(obs_file) - 1, 'data_marking'] = count
        obs_file['data_marking'] = obs_file['data_marking'].astype(str)
        obs_file['data_marking'] = obs_file['data_marking'].map(lambda x: x.replace('.0', ''))
        obs_file['data_marking'] = obs_file['data_marking'].map(lambda x: x.replace('nan', ''))

    # Strip trailing .0 from time
    obs_file['time_dim_item_id'] = obs_file['time_dim_item_id'].astype(str)
    obs_file['time_dim_item_label_eng'] = obs_file['time_dim_item_label_eng'].astype(str)
    obs_file['time_dim_item_id'] = obs_file['time_dim_item_id'].map(lambda x: x.replace('.0', ''))
    obs_file['time_dim_item_label_eng'] = obs_file['time_dim_item_label_eng'].map(lambda x: x.replace('.0', ''))

    # Add the outward/inward tag then get rid of dim 2
    obs_file['dim_item_id_1'] = obs_file['dim_item_id_1'].astype(str)
    obs_file['dim_item_id_2'] = obs_file['dim_item_id_2'].astype(str)
    obs_file['dim_item_id_1'] = obs_file['dim_item_id_1'] + " (" + obs_file['dim_item_id_2'] + ")"
    obs_file = tf.dismiss(obs_file, ['dim_id_2'])

    # Sort out the index
    obs_file.fillna('', inplace=True)
    obs_file = obs_file.drop('index', axis=1)

    # clean nan values
    obs_file.fillna('', inplace=True)

    # Clean the whitespace out of dimension 1
    obs_file['dim_item_id_1'] = obs_file['dim_item_id_1'].map(str).map(lambda x: x.strip())
    obs_file['dimension_item_label_eng_1'] = obs_file['dim_item_id_1']

    # Clean out any spill-over onto the final line
    headers = list(obs_file.columns.values)
    for header in headers[2:]: