def read():
    """Load the vendor matrix csv, creating a blank template if absent.

    When the matrix file does not exist yet, an empty DataFrame with the
    expected vendor-matrix columns is written out so the user can populate
    it. The csv (new or pre-existing) is then read and returned.
    """
    if not os.path.isfile(os.path.join(csv_path, csv_file)):
        logging.info('Creating Vendor Matrix. Populate it and run again')
        template = pd.DataFrame(columns=[vmc.vendorkey] + vmc.vmkeys,
                                index=None)
        template.to_csv(csv_full_file, index=False, encoding='utf-8')
    return utl.import_read_csv(csv_file, csv_path)
def read_raw_df(self, configfile):
    """Load the constant dictionary config csv into ``self.df``.

    Returns None (leaving ``self.df`` unset) when the config file cannot
    be read; otherwise validates the dictionary columns via
    ``check_for_dict_col``.
    """
    try:
        self.df = utl.import_read_csv(configfile, self.csvpath)
    except IOError:
        # A missing config is expected and non-fatal.
        logging.debug('No Constant Dictionary config')
        return None
    self.check_for_dict_col(configfile)
def read(self):
    """Read the backing csv into ``self.df``, creating it first if missing.

    A brand-new file is written with the configured columns so later runs
    (and the immediate read below) always have a csv to load. The key
    column is coerced to string for consistent lookups.
    """
    if not os.path.isfile(self.full_file_path):
        logging.info('Creating {}'.format(self.filename))
        empty_df = pd.DataFrame(columns=self.columns, index=None)
        empty_df.to_csv(self.full_file_path, index=False, encoding='utf-8')
    loaded = utl.import_read_csv(self.filename, self.csvpath)
    self.df = utl.data_to_type(loaded, str_col=[self.key])
def read(self, configfile):
    """Load the translational dictionary config csv into ``self.df``.

    Returns None (leaving ``self.df`` unset) when the config file cannot
    be read; otherwise normalizes the loaded frame via ``clean_df``.
    """
    try:
        self.df = utl.import_read_csv(configfile, self.csvpath)
    except IOError:
        # A missing config is expected and non-fatal.
        logging.debug('No Translational Dictionary config')
        return None
    self.clean_df()
def get_column_names_from_raw_files(self):
    """Collect raw-file column names and flag missing active-metric columns.

    For each data source in the vendor matrix, reads just enough rows of
    its raw file (when present) to discover the header, records the
    columns found, and notes any column referenced by an active metric
    that is absent from the file. The summary is logged and stored in the
    analysis dict under ``self.raw_columns``.
    """
    data_sources = self.matrix.get_all_data_sources()
    frames = []
    for source in data_sources:
        file_name = source.p[vmc.filename]
        first_row = source.p[vmc.firstrow]
        missing_cols = []
        if os.path.exists(file_name):
            # Only a handful of rows are needed to resolve the header.
            tdf = utl.import_read_csv(file_name, nrows=first_row + 5)
            tdf = utl.first_last_adj(tdf, first_row, 0)
            cols = list(tdf.columns)
            active_metrics = source.get_active_metrics()
            for k, v in active_metrics.items():
                for c in v:
                    if c not in cols:
                        missing_cols.append({k: c})
        else:
            cols = []
        data_dict = {
            vmc.vendorkey: [source.key],
            self.raw_columns: [cols],
            'missing': [missing_cols]
        }
        frames.append(pd.DataFrame(data_dict))
    # DataFrame.append was removed in pandas 2.0 and was O(n^2) in a loop;
    # build the frame once with concat instead.
    if frames:
        df = pd.concat(frames, ignore_index=True, sort=False)
    else:
        df = pd.DataFrame()
    update_msg = 'Columns and missing columns by key as follows:'
    logging.info('{}\n{}'.format(update_msg, df.to_string()))
    self.add_to_analysis_dict(key_col=self.raw_columns,
                              message=update_msg, data=df.to_dict())
def check_plan_error(self, df):
    """Report placements listed in the plan error file but undefined in df.

    Reads the planned-data error file; when it is empty every full
    placement combination is defined and True is returned. Otherwise
    ``df`` is narrowed to the undefined placement keys, each value is
    quoted for unambiguous logging, and the result is recorded in the
    analysis dict under ``self.unknown_col`` (implicitly returning None).
    """
    plan_names = self.matrix.vendor_set(vm.plan_key)[vmc.fullplacename]
    er = self.matrix.vendor_set(vm.plan_key)[vmc.filenameerror]
    edf = utl.import_read_csv(er, utl.error_path)
    if edf.empty:
        plan_error_msg = ('No Planned error - all {} '
                          'combinations are defined.'.format(plan_names))
        logging.info(plan_error_msg)
        self.add_to_analysis_dict(key_col=self.unknown_col,
                                  message=plan_error_msg)
        return True
    # Keep only rows whose full placement appears in the error file, then
    # rebuild the combined placement key for an exact match against it.
    df = df[df[dctc.PFPN].isin(
        edf[vmc.fullplacename].values)][plan_names +
                                        [vmc.vendorkey]].drop_duplicates()
    df = vm.full_placement_creation(df, None, dctc.FPN, plan_names)
    df = df[df[dctc.FPN].isin(edf[dctc.FPN].values)]
    df = utl.col_removal(df, None, [dctc.FPN])
    # Quote each value for the log output; NaNs propagate through the
    # string concat unchanged and are dropped just after.
    for col in df.columns:
        df[col] = "'" + df[col] + "'"
    df = df.dropna()
    df_dict = '\n'.join([
        '{}{}'.format(k, v) for k, v in df.to_dict(orient='index').items()
    ])
    undefined_msg = 'Undefined placements have the following keys:'
    logging.info('{}\n{}'.format(undefined_msg, df_dict))
    self.add_to_analysis_dict(key_col=self.unknown_col,
                              message=undefined_msg, data=df.to_dict())
def read(self):
    """Load the dictionary csv, building a fresh one when none exists.

    Falls back to ``create_new_dictionary`` both when the file is absent
    and when the import returns a falsy non-DataFrame value, then cleans
    and de-duplicates the loaded dictionary.
    """
    if not os.path.isfile(self.dict_path_filename):
        self.create_new_dictionary()
    self.data_dict = utl.import_read_csv(self.filename, self.dict_path)
    # import_read_csv may hand back a falsy non-DataFrame on failure;
    # regenerate the dictionary in that case.
    bad_load = (not isinstance(self.data_dict, pd.DataFrame)
                and not self.data_dict)
    if bad_load:
        self.data_dict = self.create_new_dictionary()
    self.clean()
    self.data_dict = self.data_dict.drop_duplicates()
def __init__(self, config_file='config/cap_config.csv'):
    """Load the cap config csv and expose it as a row-indexed dict.

    Also defines the column-name constants used when reading cap files.
    """
    # Column-name constants for the cap config schema.
    self.file_name = 'file_name'
    self.file_dim = 'file_dim'
    self.file_metric = 'file_metric'
    self.proc_dim = 'processor_dim'
    self.proc_metric = 'processor_metric'
    self.temp_metric = None
    config_df = utl.import_read_csv(config_file)
    self.config = config_df.to_dict(orient='index')
def read(self, configfile):
    """Load the relational dictionary config into lookup structures.

    Populates ``self.key_list`` with the relational keys and ``self.rc``
    with a per-column mapping keyed by relational key; the DEP column is
    expanded from its pipe-delimited form into lists.
    """
    self.df = utl.import_read_csv(configfile, self.csvpath)
    if self.df.empty:
        logging.debug('No Relational Dictionary config')
        return None
    self.key_list = self.df[dctc.RK].tolist()
    self.rc = self.df.set_index(dctc.RK).to_dict()
    # DEP values are pipe-delimited; split each into a list of parts.
    dep_map = {}
    for key, value in self.rc[dctc.DEP].items():
        dep_map[key] = list(str(value).split('|'))
    self.rc[dctc.DEP] = dep_map
def get_raw_df(self):
    """Read this source's raw file and run the standard cleaning pipeline.

    Returns the raw DataFrame untouched when the import yields None or an
    empty frame; otherwise applies header fixes, first/last row trimming,
    configured transforms, and full placement creation.
    """
    raw = utl.import_read_csv(self.p[vmc.filename])
    if raw is None or raw.empty:
        return raw
    raw = utl.add_header(raw, self.p[vmc.header], self.p[vmc.firstrow])
    raw = utl.first_last_adj(raw, self.p[vmc.firstrow],
                             self.p[vmc.lastrow])
    raw = df_transform(raw, self.p[vmc.transform])
    raw = full_placement_creation(raw, self.key, dctc.FPN,
                                  self.p[vmc.fullplacename])
    return raw
def read(self, configfile):
    """Load the constant dictionary config and build its lookup tables.

    Returns None when the config file cannot be read; otherwise validates
    and filters the frame, then derives the dictionary column names and
    per-column constants.
    """
    try:
        self.df = utl.import_read_csv(configfile, self.csvpath)
    except IOError:
        # A missing config is expected and non-fatal.
        logging.debug('No Constant Dictionary config')
        return None
    self.check_for_dict_col(configfile)
    self.filter_df()
    indexed = self.df.set_index(dctc.DICT_COL_NAME)
    self.dict_constants = indexed.to_dict()
    self.dict_col_names = self.df[dctc.DICT_COL_NAME].tolist()
def get_file_as_df(temp_path=None):
    """Combine the downloaded Excel workbook in ``temp_path`` into a DataFrame.

    Reads the three expected sheets from the first file in ``temp_path``,
    round-trips the combined data through csv so it matches the standard
    csv-import pipeline, then removes the temp directory.

    :param temp_path: directory containing the downloaded workbook
    :return: the combined DataFrame
    """
    files = os.listdir(temp_path)
    file_path = os.path.join(temp_path, files[0])
    sheet_names = ['Daily Spend', 'Daily Impressions', 'Top Sites']
    df = pd.concat(pd.read_excel(file_path, sheet_name=sheet_names,
                                 parse_dates=True), ignore_index=True)
    temp_file = os.path.join(temp_path, 'output.csv')
    # Bug fix: the csv was written to a hard-coded 'tmp/output.csv' but read
    # back from temp_path/'output.csv', which only worked when temp_path was
    # 'tmp'. Write and read the same joined path. (Also removed a dead
    # bare `pd.DataFrame()` statement.)
    df.to_csv(temp_file, encoding='utf-8')
    time.sleep(5)
    df = utl.import_read_csv(temp_file)
    shutil.rmtree(temp_path)
    return df
def merge_df(self, api_df, filename, date_col, start_date, end_date,
             first_row, last_row, api_merge):
    """Merge an api-downloaded DataFrame with the existing raw file.

    The stored raw file is trimmed so it ends ``api_merge`` days before
    ``end_date`` and the api data covers [start_date, end_date]; the two
    are then stacked and dummy header/footer rows are restored.

    :param api_df: freshly downloaded data
    :param filename: raw file name under ``utl.raw_path``
    :param date_col: date column used for window trimming
    :param start_date: start of the api data window
    :param end_date: end of the api data window
    :param first_row: header offset of the raw file
    :param last_row: footer offset of the raw file
    :param api_merge: overlap in days removed from the stored file
    :return: the merged DataFrame (or ``api_df`` when no raw file exists)
    """
    if not os.path.isfile(os.path.join(utl.raw_path, filename)):
        return api_df
    df = utl.import_read_csv(filename, utl.raw_path)
    df = self.merge_df_cleaning(df, first_row, last_row, date_col, pd.NaT,
                                end_date - dt.timedelta(days=api_merge))
    api_df = self.merge_df_cleaning(api_df, first_row, last_row, date_col,
                                    start_date, end_date)
    # DataFrame.append was removed in pandas 2.0; concat is equivalent here.
    df = pd.concat([df, api_df], ignore_index=True).reset_index(drop=True)
    df = utl.add_dummy_header(df, first_row)
    df = utl.add_dummy_header(df, last_row, location='foot')
    return df
def agency_fees_calculation(df):
    """Compute agency fees (rate * net cost) scaled by the fee threshold.

    Requires the agency-fee-rate column in ``df``; otherwise the frame is
    returned unchanged with a warning. The threshold file's first value is
    used to scale the rates before multiplying against net cost.
    """
    logging.info('Calculating Agency Fees')
    if dctc.AGF not in df.columns:
        logging.warning('Agency Fee Rates not in dict. '
                        'Update dict and run again to calculate agency fees.')
        return df
    threshold = utl.import_read_csv(agency_fee_file, utl.config_path)
    df = utl.data_to_type(df, float_col=[NCF, dctc.AGF])
    # Guard clause instead of the positive-conjunction check.
    if df.empty or threshold.empty:
        return df
    thresh_val = threshold[AGENCY_THRESH].fillna(0).astype(float).values[0]
    scale = (df[NCF].sum() - thresh_val) / df[NCF].sum()
    df[dctc.AGF] = df[dctc.AGF] * scale
    df[AGENCY_FEES] = df[dctc.AGF] * df[NCF]
    return df
def check_plan_error(self, df):
    """Log placement keys from ``df`` that match the plan error file.

    Returns True early when the planned-data error file is empty;
    otherwise splits the error file's full placement names into the plan
    columns, filters ``df`` column by column, and logs the quoted
    undefined keys (implicitly returning None).
    """
    plan_names = self.matrix.vendor_set(vm.plan_key)[vmc.fullplacename]
    error_file = self.matrix.vendor_set(vm.plan_key)[vmc.filenameerror]
    edf = utl.import_read_csv(error_file, utl.error_path)
    if edf.empty:
        logging.info('No Planned error.')
        return True
    # Expand the underscore-joined placement names into one column each.
    split_vals = edf[vmc.fullplacename].str.split('_').values.tolist()
    edf[plan_names] = pd.DataFrame(split_vals, columns=plan_names)
    for col in plan_names:
        df = df[df[col].isin(edf[col].values)]
    df = df[plan_names + [vmc.vendorkey]].drop_duplicates()
    # Quote values so the logged keys are unambiguous.
    for col in df.columns:
        df[col] = "'" + df[col] + "'"
    logging.info('Undefined placements have the following keys: \n'
                 '{}'.format(df))
def get_file_as_df(temp_path=None):
    """Poll ``temp_path`` for a downloaded csv, load it, and clean up.

    Retries up to 100 times with 5s pauses. The newest csv found is read
    and deleted; the temp directory is removed before returning. Returns
    an empty DataFrame when no csv ever appears.
    """
    df = pd.DataFrame()
    for attempt in range(100):
        logging.info('Checking for file. Attempt {}.'.format(attempt + 1))
        csv_files = [f for f in os.listdir(temp_path) if f[-4:] == '.csv']
        if csv_files:
            logging.info('File downloaded.')
            temp_file = os.path.join(temp_path, csv_files[-1])
            # Brief pause so the download has fully flushed to disk.
            time.sleep(5)
            df = utl.import_read_csv(temp_file)
            os.remove(temp_file)
            break
        time.sleep(5)
    shutil.rmtree(temp_path)
    return df
def get_raw_data(self):
    """Download the latest report csv into ``self.df``, retrying on limits.

    Polls the report endpoint up to 100 times, sleeping 60s after each
    rate-limited response, then loads the report from its Google Cloud
    Storage url. Exits cleanly when no report url is available.
    """
    full_url = self.create_url()
    response = None
    for x in range(1, 101):
        self.r = self.client.get(full_url)
        response = self.r.json()
        if 'metadata' in response.keys():
            break
        else:
            logging.warning('Rate limit exceeded. Pausing. '
                            'Response: {}'.format(response))
            time.sleep(60)
    # Bug fix: when every attempt was rate limited the original fell
    # through and raised a bare KeyError on 'metadata'; fail explicitly
    # instead, matching the missing-report path below.
    if not response or 'metadata' not in response:
        logging.warning('No metadata after 100 attempts. '
                        'Response: {}'.format(response))
        sys.exit(0)
    report_url = response['metadata']['googleCloudStoragePathForLatestReport']
    if report_url:
        self.df = utl.import_read_csv(report_url, file_check=False,
                                      error_bad=False)
    else:
        logging.warning('Report does not exist. Create it.')
        sys.exit(0)
def get_raw_data(self):
    """Poll for report urls, then download the first one into ``self.df``.

    Retries the request up to 100 times with 60s pauses until the response
    contains urls; exits cleanly when no report url is returned.
    """
    header = self.create_header()
    response = None
    for attempt in range(1, 101):
        self.r = self.make_request('get', header=header)
        response = self.r.json()
        # get() is already falsy for a missing or empty 'urls' key, so the
        # original's extra `and response['urls']` check is redundant.
        if response.get('urls'):
            break
        logging.warning('Waiting for Request. '
                        'Response: {}'.format(self.r.json()))
        time.sleep(60)
    report_url = response['urls']
    if report_url:
        logging.info('Found report url, downloading.')
        self.df = utl.import_read_csv(report_url[0], file_check=False,
                                      error_bad=False)
    else:
        logging.warning('Report does not exist. Create it.')
        sys.exit(0)
def vm_update(old_path=utl.config_path, old_file='OldVendorMatrix.csv'):
    """Migrate an old vendor matrix csv to the current column layout.

    Backs up the current matrix to ``old_path``/``old_file``, re-reads it,
    maps legacy columns (FIRSTROWADJ, AUTO DICTIONARY, missing auto
    placement) onto the current schema, reorders columns, reruns the rule
    metric checks, and writes the result back to the matrix file.

    :param old_path: directory for the backup copy
    :param old_file: name of the backup copy
    """
    logging.info('Updating Vendor Matrix')
    shutil.copyfile(csv_full_file, os.path.join(old_path, old_file))
    ovm = utl.import_read_csv(filename=old_file, path=old_path)
    rules = [col for col in ovm.columns if 'RULE_' in col]
    rule_metrics = [col for col in ovm.columns if '_METRIC' in col]
    nvm = pd.DataFrame(columns=[vmc.vendorkey] + vmc.vmkeys)
    # DataFrame.append was removed in pandas 2.0; concat is equivalent.
    vm = pd.concat([nvm, ovm], sort=True)
    if 'FIRSTROWADJ' in vm.columns:
        # Legacy boolean flag folded into the first-row offset.
        vm[vmc.firstrow] = np.where(vm['FIRSTROWADJ'], vm[vmc.firstrow] + 1,
                                    vm[vmc.firstrow])
    if vmc.autodicplace not in ovm.columns:
        vm[vmc.autodicplace] = vmc.fullplacename
    vm = utl.col_removal(vm, 'vm',
                         ['FIRSTROWADJ', 'LASTROWADJ', 'AUTO DICTIONARY'],
                         warn=False)
    vm = vm.reindex([vmc.vendorkey] + vmc.vmkeys + rules, axis=1)
    for col in rule_metrics:
        vm = vm_update_rule_check(vm, col)
    vm = vm.fillna('')
    vm = vm.replace('nan', '')
    vm.to_csv(csv_full_file, index=False, encoding='utf-8')
def load_df_from_file(self):
    """Read ``self.file`` into ``self.df`` via the shared csv importer."""
    self.df = utl.import_read_csv(self.file)
def read(self):
    """Return the csv at ``self.file_path``/``self.file_name`` as a DataFrame."""
    return utl.import_read_csv(self.file_name, self.file_path)
def get_cap_file(self, c):
    """Read the cap file named in config row ``c`` with renamed columns.

    :param c: one row of the cap config dict (keyed by column constants)
    :return: the cap DataFrame with processor column names applied
    """
    rename_map = self.col_dict(c)
    cap_df = utl.import_read_csv(c[self.file_name])
    return cap_df.rename(columns=rename_map)