예제 #1
0
 def combine_data(self, df):
     """Combine source columns into the standard date and float columns.

     The date columns (``vmc.datadatecol``) are combined and coerced first,
     then the ``utl.PRE`` rules are applied, and only afterwards are the
     float columns (``vmc.datafloatcol``) combined and coerced.

     :param df: dataframe to combine
     :returns: the combined dataframe after the final type coercion
     """
     dated = combining_data(df, self.key, vmc.datadatecol, **self.p)
     dated = utl.data_to_type(dated, date_col=vmc.datadatecol)
     # Rules run between the two combine passes, on date-typed data.
     ruled = utl.apply_rules(dated, self.vm_rules, utl.PRE, **self.p)
     combined = combining_data(ruled, self.key, vmc.datafloatcol, **self.p)
     return utl.data_to_type(combined, vmc.datafloatcol, vmc.datadatecol)
예제 #2
0
 def strip_dict(tdf, col, data_dict):
     """Remove every 'Strip' translation value from ``data_dict[col]``.

     :param tdf: translation dataframe; rows whose ``dctc.DICT_COL_FNC``
         equals 'Strip' supply the values to remove
     :param col: name of the column in ``data_dict`` to clean
     :param data_dict: dataframe whose ``col`` is coerced to string and
         stripped
     :returns: ``data_dict`` with the values removed from ``col``
     """
     typed = utl.data_to_type(tdf.copy(), str_col=[dctc.DICT_COL_FNC])
     strip_rows = typed[typed[dctc.DICT_COL_FNC] == 'Strip']
     data_dict = utl.data_to_type(data_dict, str_col=[col])
     # Each distinct value is removed in turn from the target column.
     for token in strip_rows[dctc.DICT_COL_VALUE].unique():
         data_dict[col] = data_dict[col].str.replace(token, '')
     return data_dict
예제 #3
0
 def select_translation(tdf, col, data_dict, fnc_type='Select'):
     """Apply translations that are conditional on a second column.

     Rows of ``tdf`` whose function column contains ``fnc_type`` act as
     rules.  The text after '::' in the function column names the column
     to test, ``dctc.DICT_COL_SEL`` holds the value that column must equal
     (compared as text) and ``dctc.DICT_COL_NVALUE`` is the new value.

     * 'Select': replace ``col`` where it also equals the rule's value.
     * 'Set': replace ``col`` wherever the tested column matches.
     * 'Append': add the new value to the end of ``col`` unless ``col``
       already ends with it.

     :returns: ``data_dict`` (modified in place); returned untouched when
         ``tdf`` has no ``dctc.DICT_COL_SEL`` column
     """
     if dctc.DICT_COL_SEL not in tdf.columns:
         return data_dict
     rules = utl.data_to_type(tdf.copy(), str_col=[dctc.DICT_COL_FNC])
     is_rule = rules[dctc.DICT_COL_FNC].str.contains(fnc_type, na=False)
     rules = rules[is_rule].copy()
     # Keep only the column name that follows the '::' separator.
     rules[dctc.DICT_COL_FNC] = (
         rules[dctc.DICT_COL_FNC].str.split('::').str[1])
     rule_cols = [
         dctc.DICT_COL_FNC, dctc.DICT_COL_SEL, dctc.DICT_COL_VALUE,
         dctc.DICT_COL_NVALUE
     ]
     for rule in rules[rule_cols].to_dict(orient='index').values():
         test_col = rule[dctc.DICT_COL_FNC]
         if test_col not in data_dict.columns:
             continue
         test_val = rule[dctc.DICT_COL_SEL]
         old_val = rule[dctc.DICT_COL_VALUE]
         new_val = rule[dctc.DICT_COL_NVALUE]
         if fnc_type == 'Select':
             hit = ((data_dict[test_col].astype('U') == test_val) &
                    (data_dict[col] == old_val))
             data_dict.loc[hit, col] = new_val
         elif fnc_type == 'Set':
             hit = data_dict[test_col].astype('U') == test_val
             data_dict.loc[hit, col] = new_val
         elif fnc_type == 'Append':
             # Skip rows that already end with the value being appended.
             hit = ((data_dict[test_col].astype('U') == test_val) &
                    (data_dict[col].str[-len(new_val):] != new_val))
             data_dict.loc[hit, col] = (data_dict.loc[hit, col] + new_val)
     return data_dict
예제 #4
0
 def clean_df(self, df):
     """Reshape the raw response frame into one row per date and id.

     The ``jsonmet`` and ``jsonseg`` columns are dropped and ``colcid``
     becomes the index.  Every remaining column is passed through
     ``self.clean_data``, expanded with ``pd.Series``, unstacked, and
     outer-merged into the result on ``[coldate, colcid]``.  Finally
     ``colspend`` is divided by 1,000,000 and ``coldate`` is mapped
     through ``self.dates``.

     :param df: raw dataframe; returned unchanged when empty
     :returns: the reshaped dataframe
     """
     if df.empty:
         return df
     df = df.drop([jsonmet, jsonseg], axis=1).set_index(colcid)
     ndf = pd.DataFrame(columns=[coldate, colcid])
     ndf = utl.data_to_type(ndf, str_col=[colcid], int_col=[coldate])
     for col in df.columns:
         tdf = df[col].apply(self.clean_data).apply(pd.Series)
         tdf = tdf.unstack().reset_index()
         tdf = tdf.rename(columns={0: col, 'level_0': coldate})
         tdf = utl.data_to_type(tdf, str_col=[colcid], int_col=[coldate])
         ndf = pd.merge(ndf, tdf, on=[coldate, colcid], how='outer')
     df = ndf
     df[colspend] /= 1000000
     # Assign the result back: an inplace replace on a column selection is
     # not guaranteed to propagate to the parent dataframe.
     df[coldate] = df[coldate].replace(self.dates)
     return df
예제 #5
0
 def read(self):
     """Load the backing csv into ``self.df``, creating it when absent.

     A missing file is first written as an empty csv that contains only
     the ``self.columns`` header.  The file is then read via
     ``utl.import_read_csv`` and ``self.key`` is coerced to string.
     """
     file_missing = not os.path.isfile(self.full_file_path)
     if file_missing:
         logging.info('Creating {}'.format(self.filename))
         header_only = pd.DataFrame(columns=self.columns, index=None)
         header_only.to_csv(self.full_file_path, index=False,
                            encoding='utf-8')
     self.df = utl.import_read_csv(self.filename, self.csvpath)
     self.df = utl.data_to_type(self.df, str_col=[self.key])
예제 #6
0
def import_plan_data(key, df, plan_omit_list, **kwargs):
    """Build the plan dictionary dataframe from the plan data.

    Rows whose ``vmc.vendorkey`` is in ``plan_omit_list`` are removed, the
    frame is reduced to the ``kwargs[vmc.fullplacename]`` columns, a full
    placement name is created in ``dctc.FPN`` and duplicates are dropped.
    The result is left-merged onto the dictionary loaded from
    ``kwargs[vmc.filenamedict]`` on the columns both frames share.

    :param key: vendor key passed through to ``full_placement_creation``
    :param df: plan dataframe; ``None`` or empty is replaced by an empty
        frame with the required columns
    :param plan_omit_list: vendor keys to exclude
    :param kwargs: vendor matrix parameters for this key
    :returns: the dictionary dataframe after ``apply_functions`` and date
        coercion
    """
    place_cols = kwargs[vmc.fullplacename]
    if df is None or df.empty:
        df = pd.DataFrame(columns=place_cols + [vmc.vendorkey])
    not_omitted = ~df[vmc.vendorkey].isin(plan_omit_list)
    plan = df.loc[not_omitted]
    plan = plan.loc[:, place_cols]
    plan = full_placement_creation(plan, key, dctc.FPN, place_cols)
    plan = plan.drop_duplicates()
    dic = dct.Dict(kwargs[vmc.filenamedict])
    er.ErrorReport(pd.DataFrame(plan[dctc.FPN]), dic, None,
                   kwargs[vmc.filenameerror])
    # Merge on whatever columns the dictionary and the plan have in common.
    shared_cols = list(set(dic.data_dict.columns).intersection(plan.columns))
    dic.data_dict = utl.data_to_type(dic.data_dict, str_col=shared_cols)
    dic.data_dict = dic.data_dict.merge(plan, on=shared_cols, how='left')
    dic.apply_functions()
    dic.data_dict = utl.data_to_type(dic.data_dict, date_col=vmc.datadatecol)
    return dic.data_dict
예제 #7
0
 def get_new_values(self, keys_list):
     """Return the keys in ``keys_list`` that ``self.df`` does not have.

     :param keys_list: dataframe containing a ``self.key`` column; all of
         its columns are coerced to string
     :returns: single-column dataframe of the unseen ``self.key`` values
     """
     candidates = utl.data_to_type(keys_list, str_col=keys_list.columns)
     known = pd.DataFrame(self.df[self.key])
     # The merge indicator marks rows that exist only on the left side.
     flagged = candidates.merge(known, on=self.key, how='left',
                                indicator=True)
     unseen = flagged[flagged['_merge'] == 'left_only']
     return pd.DataFrame(unseen[self.key])
예제 #8
0
 def data_to_df(r):
     """Convert a metrics response into a dataframe with one row per date.

     :param r: response object whose ``json()`` payload has a ``metrics``
         list; the first entry supplies ``columns`` (the dates) and
         ``dataset`` (items with ``seriesName`` and ``set``)
     :returns: dataframe with a ``Date`` column of ``datetime.date`` values
         and one column per series
     """
     # Decode the payload once instead of re-parsing it on every access.
     metric = r.json()['metrics'][0]
     df = pd.DataFrame()
     df['Date'] = metric['columns']
     for series in metric['dataset']:
         df[series['seriesName']] = series['set']
     df = utl.data_to_type(df, date_col=['Date'])
     df['Date'] = df['Date'].dt.date
     return df
예제 #9
0
 def add_key_values(self, data_dict):
     """Add keys found in ``data_dict`` but not in ``self.df``, then save.

     The distinct non-null ``self.key`` values of ``data_dict`` are reduced
     to the new ones, run through ``self.auto_split`` and outer-merged
     into ``self.df``.  Rows without a key are dropped and the result is
     passed to ``self.write``.

     :param data_dict: dataframe containing a ``self.key`` column
     """
     new_keys = pd.DataFrame(data_dict[self.key]).drop_duplicates()
     new_keys = new_keys.dropna(subset=[self.key])
     new_keys = self.get_new_values(new_keys)
     new_keys = self.auto_split(new_keys)
     # Align dtypes with the incoming columns before merging.
     self.df = utl.data_to_type(self.df, str_col=new_keys.columns)
     merged = self.df.merge(new_keys, how='outer').reset_index(drop=True)
     self.df = merged.dropna(subset=[self.key])
     self.write(self.df)
예제 #10
0
def net_plan_comp(df, p_col=dctc.PFPN, n_cost=vmc.cost, p_cost=dctc.PNC):
    """Attach the net-cost versus planned-cost difference to ``df``.

    Rows whose ``dctc.UNC`` flag is not True are summed per ``p_col``.
    Groups with a positive planned total are kept, their difference
    (``n_cost`` minus ``p_cost``) is stored in ``DIF_NC_PNC``, the
    aggregate columns are renamed to ``DIF_COL`` and the result is
    left-merged back onto ``df``.

    :param df: dataframe to compare
    :param p_col: column to group by
    :param n_cost: net cost column
    :param p_cost: planned cost column; coerced to float, nulls become 0
    :returns: ``df`` with the comparison columns merged in
    """
    df = utl.data_to_type(df, float_col=[p_cost])
    df[p_cost] = df[p_cost].fillna(0)
    nc_pnc = df[df[dctc.UNC] != True]
    # Select with a list: tuple selection on a groupby was deprecated and
    # then removed in pandas 2.0.
    nc_pnc = nc_pnc.groupby(p_col)[[p_cost, n_cost]].sum()
    # Guard before the first use of the column; it previously sat after
    # the filter below, where it could never take effect.
    if p_cost not in nc_pnc.columns:
        nc_pnc[p_cost] = 0
    nc_pnc = nc_pnc[nc_pnc[p_cost] > 0]
    nc_pnc[DIF_NC_PNC] = nc_pnc[n_cost] - nc_pnc[p_cost]
    nc_pnc = nc_pnc.reset_index()
    nc_pnc.columns = [p_col] + DIF_COL
    df = df.merge(nc_pnc, on=p_col, how='left')
    return df
예제 #11
0
def agency_fees_calculation(df):
    """Add the ``AGENCY_FEES`` column (fee rate times net cost).

    When the threshold file has data, the fee rate in ``dctc.AGF`` is
    first scaled by ``(total NCF - threshold) / total NCF``, where the
    threshold is the first ``AGENCY_THRESH`` value in the file.

    :param df: dataframe with ``NCF``; returned unchanged, with a warning,
        when it has no ``dctc.AGF`` column
    :returns: ``df`` with ``AGENCY_FEES`` added
    """
    logging.info('Calculating Agency Fees')
    if dctc.AGF not in df.columns:
        logging.warning('Agency Fee Rates not in dict.  '
                        'Update dict and run again to calculate agency fees.')
        return df
    threshold = utl.import_read_csv(agency_fee_file, utl.config_path)
    df = utl.data_to_type(df, float_col=[NCF, dctc.AGF])
    if not df.empty and not threshold.empty:
        thresh_val = (
            threshold[AGENCY_THRESH].fillna(0).astype(float).values[0])
        total_ncf = df[NCF].sum()
        # A zero total would turn every fee rate into inf/NaN, so the
        # rates are left unscaled in that case.
        if total_ncf != 0:
            rate_scale = (total_ncf - thresh_val) / total_ncf
            df[dctc.AGF] = df[dctc.AGF] * rate_scale
    df[AGENCY_FEES] = df[dctc.AGF] * df[NCF]
    return df
예제 #12
0
 def apply_to_dict(self, data_dict):
     """Merge this relation's stored values onto ``data_dict``.

     New keys are first added to the stored file.  After a left merge on
     ``self.key``, each dependent column that came through with a '_y'
     suffix overwrites the dictionary's own value and both suffixed
     copies are removed.

     :param data_dict: dictionary dataframe; returned unchanged when it
         has no ``self.key`` column
     :returns: the merged and reordered dataframe
     """
     if self.key not in data_dict.columns:
         return data_dict
     self.read()
     self.add_key_values(data_dict)
     data_dict = utl.data_to_type(data_dict, str_col=[self.key])
     data_dict = data_dict.merge(self.df, on=self.key, how='left')
     for dependent in self.dependents:
         left_name = dependent + '_x'
         right_name = dependent + '_y'
         if right_name not in data_dict.columns:
             continue
         # The value from the stored file wins over the dictionary's own.
         data_dict[dependent] = data_dict[right_name]
         data_dict = data_dict.drop([left_name, right_name], axis=1)
     # Called for its side effects; the return value is not used here.
     self.rename_y_columns(data_dict)
     return self.reorder_columns(data_dict)
예제 #13
0
def total_cost_calculation(df):
    """Add ``TOTAL_COST``: net cost plus agency fees plus optional costs.

    The optional cost columns (ad serving, DCM service fee, reporting and
    verification) are added only when they are present in ``df``.

    :param df: dataframe; returned unchanged, with a warning, when it has
        no ``AGENCY_FEES`` column
    :returns: ``df`` with ``TOTAL_COST`` added
    """
    logging.info('Calculating Total Cost')
    if AGENCY_FEES not in df.columns:
        logging.warning('Agency Fees not in dataframe.  '
                        'Update dict and run again to calculate total cost.')
        return df
    optional_costs = [
        vmc.AD_COST, vmc.dcm_service_fee, vmc.REP_COST, vmc.VER_COST
    ]
    df = utl.data_to_type(df, float_col=[NCF, AGENCY_FEES] + optional_costs)
    df[TOTAL_COST] = df[NCF] + df[AGENCY_FEES]
    for cost_col in optional_costs:
        if cost_col not in df.columns:
            continue
        df[TOTAL_COST] += df[cost_col]
    return df
예제 #14
0
 def vm_loop(self):
     """Import every vendor key and return the combined dataframe.

     Each key in ``self.vl`` is loaded with ``self.vendor_get`` and stacked
     onto ``self.df``.  The planned full placement name is then created,
     an empty error-report directory is removed, and the float and date
     columns are coerced.

     :returns: the combined dataframe, also stored in ``self.df``
     """
     logging.info('Initializing Vendor Matrix Loop')
     self.df = pd.DataFrame(columns=[vmc.date, dctc.FPN, dctc.PN, dctc.BM])
     self.sort_vendor_list()
     for vk in self.vl:
         self.tdf = self.vendor_get(vk)
         # DataFrame.append was removed in pandas 2.0; concat with the
         # same options is the equivalent call.
         self.df = pd.concat([self.df, self.tdf], ignore_index=True,
                             sort=True)
     self.df = full_placement_creation(self.df, plan_key, dctc.PFPN,
                                       self.vm[vmc.fullplacename][plan_key])
     # Confirm the directory exists before listing it: os.listdir raises
     # on a missing path.
     if os.path.isdir(er.csvpath) and not os.listdir(er.csvpath):
         logging.info('All placements defined.  Deleting Error report'
                      ' directory.')
         os.rmdir(er.csvpath)
     self.df = utl.data_to_type(self.df, vmc.datafloatcol, vmc.datadatecol)
     return self.df
예제 #15
0
 def nested_dicts_to_cols(self, nd_col):
     """Expand the nested dictionaries in ``nd_col`` into flat columns.

     Every cell is converted with ``self.convert_dictionary`` and expanded
     with ``pd.Series``.  Each expanded column that itself contains an
     'action_type' field is passed to ``self.clean_nested_df``; the
     accumulated result is prepended to ``self.df``.

     :param nd_col: name of the column holding the nested dictionaries
     """
     self.df[nd_col] = self.df[nd_col].apply(self.convert_dictionary)
     dict_df = self.df[nd_col].apply(pd.Series).fillna(0)
     reserved = ['action_type', 'value']
     column_list = [
         name for name in dict_df.columns.values.tolist()
         if name not in reserved
     ]
     # 'action_type' is processed last when it is present.
     if 'action_type' in dict_df.columns:
         column_list.append('action_type')
     clean_df = pd.DataFrame()
     for col in column_list:
         dirty_df = dict_df[col].apply(pd.Series).fillna(0)
         if 'action_type' not in dirty_df.columns:
             continue
         dirty_df = utl.data_to_type(dirty_df, str_col=['action_type'])
         clean_df = self.clean_nested_df(dirty_df, clean_df)
     self.df = pd.concat([clean_df, self.df], axis=1)  # type: pd.DataFrame
     # NOTE(review): this drops ``nested_dict_col`` (a name from outside
     # this method), not ``nd_col`` - confirm that is intended.
     self.df = self.df.drop(nested_dict_col, axis=1)  # type: pd.DataFrame
예제 #16
0
 def vm_parse(self):
     """Read the vendor matrix and expose it as per-column dictionaries.

     After this runs, ``self.vm_df`` holds the frame returned by
     ``self.read``, ``self.vl`` lists the vendor keys, and ``self.vm`` maps
     column name to ``{vendor key: value}``.  Columns in
     ``vmc.barsplitcol`` have their values split on '|' into lists.
     """
     self.vm_df = pd.DataFrame(columns=vmc.datacol)
     self.vm_df = self.read()
     self.vm = self.vm_df.copy()
     self.plan_net_check()
     # Columns whose name starts with '|' are removed.
     piped_cols = []
     for name in self.vm.columns.values.tolist():
         if name[0] == '|':
             piped_cols.append(name)
     self.vm = utl.col_removal(self.vm, 'vm', piped_cols)
     self.vm = utl.data_to_type(self.vm, [], vmc.datecol, vmc.barsplitcol)
     self.vl = self.vm[vmc.vendorkey].tolist()
     self.vm = self.vm.set_index(vmc.vendorkey).to_dict()
     for col in vmc.barsplitcol:
         split_values = {}
         for vendor, joined in self.vm[col].items():
             split_values[vendor] = joined.split('|')
         self.vm[col] = split_values
예제 #17
0
def full_placement_creation(df, key, full_col, full_place_cols):
    """Build ``full_col`` by joining the placement columns with '_'.

    A column name prefixed with '::' has underscores removed from its
    values before joining.  Columns missing from ``df`` are skipped with
    a warning.

    :param df: dataframe to add the full placement name to
    :param key: vendor key, used only in the warning message
    :param full_col: name of the column to create
    :param full_place_cols: ordered column names, optionally '::'-prefixed
    :returns: ``df`` with ``full_col`` populated
    """
    logging.debug('Creating Full Placement Name')
    df[full_col] = ''
    df = utl.data_to_type(
        df, str_col=[x[2:] if x[:2] == '::' else x for x in full_place_cols])
    for idx, col in enumerate(full_place_cols):
        remove_underscores = col[:2] == '::'
        if remove_underscores:
            col = col[2:]
        # Check for the column before touching it, so a missing
        # '::'-prefixed column is skipped rather than raising KeyError.
        if col not in df:
            logging.warning('{} was not in {}.  It was not included in '
                            'Full Placement Name.  For reference column names'
                            ' are as follows: \n {}'.format(
                                col, key, df.columns.values.tolist()))
            continue
        if remove_underscores:
            df[col] = df[col].str.replace('_', '', regex=True)
        if idx == 0:
            df[full_col] = df[col]
        else:
            df[full_col] = (df[full_col] + '_' + df[col])
    return df
예제 #18
0
 def clean_nested_df(dirty_df, clean_df):
     """Pivot one expanded action frame and fold it into ``clean_df``.

     Every column except 'action_type' is coerced to float and pivoted so
     that each action type becomes a column.  Multi-level column names are
     flattened to 'value name - action type', or to the action type alone
     when the value column is named 'value'.  Columns that share a name
     after concatenation are summed together.

     :param dirty_df: frame with an 'action_type' column plus value columns
     :param clean_df: accumulator of previously cleaned columns
     :returns: the updated accumulator
     """
     values = [x for x in dirty_df.columns if x != 'action_type']
     dirty_df = utl.data_to_type(dirty_df, float_col=values)
     dirty_df = pd.pivot_table(dirty_df,
                               columns='action_type',
                               values=values,
                               index=dirty_df.index,
                               aggfunc='sum',
                               fill_value=0)
     if isinstance(dirty_df.columns, pd.MultiIndex):
         dirty_df.columns = [
             ' - '.join([str(y) for y in x]) if x[0] != 'value' else x[1]
             for x in dirty_df.columns
         ]
     # 0.0 appears as a column name where a missing action was filled.
     unwanted = [x for x in [0.0, 'action_type'] if x in dirty_df.columns]
     if unwanted:
         dirty_df = dirty_df.drop(unwanted, axis=1)
     dirty_df = dirty_df.apply(pd.to_numeric)
     clean_df = pd.concat([clean_df, dirty_df], axis=1)
     clean_df = clean_df.groupby(clean_df.columns,
                                 axis=1).sum()  # type: pd.DataFrame
     return clean_df
예제 #19
0
def combining_data(df, key, columns, **kwargs):
    """Fill each standard column from the source columns configured for it.

    ``kwargs[col]`` is the list of source column names for ``col``; the
    list ``['nan']`` means nothing is configured.

    :param df: dataframe to modify
    :param key: vendor key, used only in the warning message
    :param columns: standard column names to populate
    :param kwargs: mapping of standard column name to source column list
    :returns: the dataframe with the standard columns populated
    """
    logging.debug('Combining Data.')
    # Only columns that have at least one configured source are combined.
    combine_cols = [x for x in columns if kwargs[x] != ['nan']]
    for col in combine_cols:
        # An existing column that is not listed as its own source is
        # reset, so its current values are not carried into the result.
        if col in df.columns and col not in kwargs[col]:
            df[col] = 0
        for item in kwargs[col]:
            # A column listed as its own source needs no work.
            if col == item:
                continue
            if item not in df:
                logging.warning('{} is not in {}.  It was not '
                                'put in {}'.format(item, key, col))
                continue
            if col not in df.columns:
                df[col] = 0
            if col in vmc.datafloatcol:
                # Float columns accumulate across all of their sources.
                df = utl.data_to_type(df, float_col=[col, item])
                df[col] += df[item]
            else:
                # Other columns are overwritten, so the last source wins.
                df[col] = df[item]
    # Unconfigured columns are zeroed when already present; vmc.date is
    # always created.
    for col in [x for x in columns if x not in combine_cols]:
        if col in df.columns or col == vmc.date:
            df[col] = 0
    return df
예제 #20
0
def _normalize_merge_key(series):
    """Return ``series`` as text with a trailing '.0' suffix removed."""
    if series.dtype == 'float64':
        series = series.fillna(0).astype('int')
    # Remove only a trailing '.0'.  str.strip('.0') would strip every
    # leading/trailing '.' and '0' character and turn '100' into '1'.
    return series.astype('U').str.replace(r'\.0$', '', regex=True)


def df_single_transform(df, transform):
    """Apply one '::'-delimited transform instruction to ``df``.

    The first element of ``transform`` selects the operation and the rest
    are its arguments:

    * ``MixedDateColumn::mixed_col::date_col`` - parse ``mixed_col`` as
      dates into ``date_col``, forward-fill it and keep only the rows
      that did not parse as a date.
    * ``Pivot::pivot_col::val1|val2`` - pivot on ``pivot_col``, summing
      the remaining columns, with all other columns as the index.
    * ``Merge::file::left_col::right_col`` - merge with the csv ``file``
      through ``er.ErrorReport`` after normalising both key columns to
      text.
    * ``DateSplit::start::end[::exempt1|exempt2]`` - divide the numeric
      columns evenly over the days from start to end inclusive and emit
      one row per day.
    * ``Stack::header_col::hold_col`` - stack groups of columns that
      share a header, recording the header in ``header_col``.
    * ``Melt::header::col1|col2`` - melt the listed columns into
      ``<header>-variable`` and ``<header>-value``.
    * ``RawTranslate`` - apply the translation config to ``df``.
    * ``AddColumn::name::value`` - add a constant column.

    :param df: dataframe to transform
    :param transform: instruction string; a value whose ``str()`` is
        'nan' returns ``df`` unchanged
    :returns: the transformed dataframe
    """
    if str(transform) == 'nan':
        return df
    transform = transform.split('::')
    transform_type = transform[0]
    if transform_type == 'MixedDateColumn':
        mixed_col = transform[1]
        date_col = transform[2]
        df[date_col] = df[mixed_col]
        df = utl.data_to_type(df, date_col=[date_col])
        # 'temp' keeps the un-filled parse so date rows can be dropped.
        df['temp'] = df[date_col]
        df[date_col] = df[date_col].ffill()
        df = df[df['temp'].isnull()].reset_index(drop=True)
        df = df.drop('temp', axis=1)
    if transform_type == 'Pivot':
        pivot_col = transform[1]
        val_col = transform[2].split('|')
        df = df.fillna(0)
        index_cols = [x for x in df.columns if x not in val_col + [pivot_col]]
        df = pd.pivot_table(df,
                            index=index_cols,
                            columns=[pivot_col],
                            aggfunc='sum')
        if len(val_col) != 1:
            df.columns = df.columns.map('_'.join)
        if isinstance(df.columns, pd.MultiIndex):
            df.columns = [' - '.join([str(y) for y in x]) for x in df.columns]
        df = df.reset_index()
    if transform_type == 'Merge':
        merge_file = transform[1]
        left_merge = transform[2]
        right_merge = transform[3]
        merge_df = pd.read_csv(merge_file)
        # Normalise each side explicitly.  Keying a dict by column name
        # dropped ``df`` whenever both sides used the same key name.
        df[left_merge] = _normalize_merge_key(df[left_merge])
        merge_df[right_merge] = _normalize_merge_key(merge_df[right_merge])
        filename = 'Merge-{}-{}.csv'.format(left_merge, right_merge)
        err = er.ErrorReport(df,
                             merge_df,
                             None,
                             filename,
                             merge_col=[left_merge, right_merge])
        df = err.merge_df
        df = df.drop('_merge', axis=1)
    if transform_type == 'DateSplit':
        start_date = transform[1]
        end_date = transform[2]
        if len(transform) == 4:
            exempt_col = transform[3].split('|')
        else:
            exempt_col = []
        df = utl.data_to_type(df, date_col=[end_date, start_date])
        # Inclusive day count for each row.
        df['days'] = (df[end_date] - df[start_date]).dt.days + 1
        n_cols = [
            x for x in df.columns
            if df[x].dtype in ['int64', 'float64'] and x not in exempt_col +
            ['days']
        ]
        df[n_cols] = df[n_cols].div(df['days'], axis=0)
        # Repeat each row once per day, then number the days from start.
        df = df.loc[df.index.repeat(df['days'])]
        df[start_date] = (df.groupby(level=0)[start_date].transform(
            lambda x: pd.date_range(start=x.iat[0], periods=len(x))))
        df = df.drop('days', axis=1)
        df = df.reset_index(drop=True)  # type: pd.DataFrame
    if transform_type == 'Stack':
        header_col_name = transform[1]
        hold_col_name = transform[2]
        # A column whose name contains 'Unnamed' takes the original name
        # of the column immediately to its left.
        df.columns = [
            df.columns[idx - 1] if 'Unnamed' in x else x
            for idx, x in enumerate(df.columns)
        ]
        hdf = pd.DataFrame(df[hold_col_name])
        ndf = pd.DataFrame()
        for x in set(y for y in df.columns if y != hold_col_name):
            tdf = df[x]
            tdf.columns = tdf.loc[0]
            tdf = tdf.iloc[1:]
            tdf[header_col_name] = x
            # DataFrame.append was removed in pandas 2.0.
            ndf = pd.concat([ndf, tdf], sort=False)
        df = pd.concat([ndf, hdf], axis=1, join='inner')
        df = df.reset_index(drop=True)  # type: pd.DataFrame
    if transform_type == 'Melt':
        header_col_name = transform[1]
        variable_cols = transform[2].split('|')
        df = df.melt(id_vars=[x for x in df.columns if x not in variable_cols],
                     value_vars=[x for x in variable_cols if x in df.columns],
                     var_name='{}-variable'.format(header_col_name),
                     value_name='{}-value'.format(header_col_name))
        df = df.reset_index(drop=True)
    if transform_type == 'RawTranslate':
        tc = dct.DictTranslationConfig()
        tc.read(dctc.filename_tran_config)
        df = tc.apply_translation_to_dict(df)
    if transform_type == 'AddColumn':
        col_name = transform[1]
        col_val = transform[2]
        df[col_name] = col_val
    return df
예제 #21
0
 def clean(self):
     """Coerce column types, de-duplicate and re-index ``self.data_dict``.

     Duplicates are dropped on ``dctc.FPN`` only when that column exists.
     """
     cleaned = utl.data_to_type(self.data_dict, dctc.floatcol,
                                dctc.datecol, dctc.strcol)
     if dctc.FPN in cleaned.columns:
         cleaned = cleaned.drop_duplicates(dctc.FPN)
     self.data_dict = cleaned.reset_index(drop=True)
예제 #22
0
 def merge_df_cleaning(df, first_row, last_row, date_col,
                       start_date, end_date):
     """Trim rows, coerce the date columns and filter on the date range.

     :param df: dataframe to clean
     :param first_row: passed to ``utl.first_last_adj``
     :param last_row: passed to ``utl.first_last_adj``
     :param date_col: list of date column names; the first one is the
         column passed to ``utl.date_removal``
     :param start_date: passed to ``utl.date_removal``
     :param end_date: passed to ``utl.date_removal``
     :returns: the cleaned dataframe
     """
     trimmed = utl.first_last_adj(df, first_row, last_row)
     typed = utl.data_to_type(trimmed, date_col=date_col)
     return utl.date_removal(typed, date_col[0], start_date, end_date)