# imports needed by the functions below; load_file is the project's own
# loader module used throughout this repo. the ZC zip-code/timezone helper
# class used in local_hour_creator() must also be imported from its own
# project module (that import is not shown here).
import glob
import os

import pandas as pd

import load_file as lf


def reshape_files(data_dir=r'C:\PythonBC\RootData', f_ext='.csv',
                  big_zip=False, all_cols=True, sub_cols=['geo_zip'],
                  **kwargs):
    '''
    'reshapes' the data files. the original format of the data files
    provided by root gives us one day's worth of data per file, with all
    variables present. this function rearranges the format of the files so
    that each file contains data from all days, but only one column. files
    are saved via the parquet method (non-human-readable) as <column>.gzip
    for each column. running this file will 'reshape' all files in data_dir
    matching *.f_ext.

    input parameters:
        data_dir:  location of files
        f_ext:     filename extension ('.csv' or '.zip') of files to be
                   loaded. ignored if big_zip == True.
        big_zip:   True or False. if False, performs the reshaping on all
                   files in data_dir matching *f_ext. if True, performs the
                   reshaping on bigzipname (hard-coded).
        all_cols:  set True to load all columns from each file. set False
                   if you only want a subset.
        sub_cols:  only used if all_cols == False. a list of strings of
                   column names to be loaded. must be comprised of elements
                   of the mycols list (defined below).
                   note: sub_cols overrides the hard-coded <unwanted> list.
    '''
    # define the file name of the large zip file. assumed structure is a
    # .zip file with many .csv files inside
    bigzipname = 'root_ad_auction_dataset_all_data.zip'

    ### determine which columns to load
    # list of columns in the file
    mycols = ['', 'auction_id', 'inventory_source', 'app_bundle', 'category',
              'inventory_interstitial', 'geo_zip', 'platform_bandwidth',
              'platform_carrier', 'platform_os', 'platform_device_make',
              'platform_device_model', 'platform_device_screen_size',
              'rewarded', 'bid_floor', 'bid_timestamp_utc', 'hour', 'day',
              'day_of_week', 'month', 'year', 'segments', 'creative_type',
              'creative_size', 'spend', 'clicks', 'installs']
    # always drop these columns when loading everything
    unwanted = ['auction_id', 'platform_os', 'day', 'month', 'year', 'hour']

    if all_cols:
        # drop the unwanted columns from mycols
        mycols = [ele for ele in mycols if ele not in unwanted]
    else:
        # keep only the user-requested columns; per the docstring, sub_cols
        # overrides the hard-coded <unwanted> list
        mycols = [ele for ele in mycols if ele in sub_cols]

    ### determine which files to load
    # get a list of all the csv files in the data directory
    myfiles = glob.glob(os.path.join(data_dir, '*' + f_ext))
    # sort <myfiles>
    myfiles = sorted(myfiles)
    # remove the big zip file from <myfiles>. glob returns full paths, so
    # the file name must be joined with data_dir before testing membership
    bigzippath = os.path.join(data_dir, bigzipname)
    if bigzippath in myfiles:
        myfiles.remove(bigzippath)

    ### load the columns and files
    if big_zip:
        # load the files from the mega zip file
        print(f"this code not written yet. let's pretend this opened {bigzipname}")
    else:
        # load files from individual csv/zip files, one column at a time
        if all_cols:
            # special case: drop the leading '' (unnamed index) column
            mycols = mycols[1:]
        for col in mycols:
            print(f'loading {col}...', end='')
            # lazily load this one column from every daily file
            df_from_each_file = (lf.load_data(data_dir=data_dir, fname=f,
                                              all_cols=False, sub_cols=[col],
                                              **kwargs)
                                 for f in myfiles)
            # stack all days into a single one-column dataframe
            df = pd.concat(df_from_each_file, ignore_index=True)
            print('done')
            myfname = col + '.gzip'
            # save to disk using the parquet method
            lf.temp_save(df, os.path.join(data_dir, myfname))
            print(f' {myfname} saved')

    print('all done!')
    return
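# Usage sketch (not part of the original module; the data_dir below is just
# the function's default and the column choices are illustrative). With
# load_file.py importable as lf, reshaping every daily CSV, or only a couple
# of columns, would look like:
#
#     # one <column>.gzip parquet file per column, all days combined
#     reshape_files(data_dir=r'C:\PythonBC\RootData', f_ext='.csv')
#
#     # only the geo_zip and spend columns
#     reshape_files(data_dir=r'C:\PythonBC\RootData', f_ext='.csv',
#                   all_cols=False, sub_cols=['geo_zip', 'spend'])

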
def local_hour_creator(data_dir=r'C:\PythonBC\RootData', f_ext='.csv',
                       big_zip=False, **kwargs):
    '''
    - calculates the bid_timestamp_local for all CSV files and saves to
      gzip.
    - also calculates the local hour, local day, and local day_of_week and
      saves to gzip.
    - converting to local time is a computationally slow/expensive process,
      so we load one file at a time and save those results to a temporary
      file. once all times in all files have been calculated, we merge the
      results into the combined files below.

    saves three files:
        bid_timestamp_local.gzip: local timestamps
        local_ordinals.gzip:      hour, day and day_of_week (int8)
        tz.gzip:                  time zone of each row (category)
    '''
    # define the file name of the large zip file. assumed structure is a
    # .zip file with many .csv files inside
    bigzipname = 'root_ad_auction_dataset_all_data.zip'

    # initialize zipcode class
    zc = ZC()

    # get sorted list of files to loop over
    myfiles = sorted(glob.glob(os.path.join(data_dir, '*' + f_ext)))
    # remove the big zip file from <myfiles> (glob returns full paths)
    bigzippath = os.path.join(data_dir, bigzipname)
    if bigzippath in myfiles:
        myfiles.remove(bigzippath)
    if os.path.join(data_dir, '2019-04-00.csv') in myfiles:
        # this file is empty
        myfiles.remove(os.path.join(data_dir, '2019-04-00.csv'))

    if big_zip:
        # load the files from the mega zip file
        print(f"this code not written yet. let's pretend this opened {bigzipname}")
    else:
        # load files from individual csv/zip files
        flist_TS_local = []
        flist_locals = []
        flist_tz = []
        for f in myfiles:
            print(f'loading {os.path.basename(f)} ... ', end='')
            # load UTC timestamp and zip code info from CSV files
            df_TS_utc = lf.load_data(data_dir=data_dir, fname=f,
                                     all_cols=False,
                                     sub_cols=['bid_timestamp_utc'],
                                     **kwargs)
            df_geozip = lf.load_data(data_dir=data_dir, fname=f,
                                     all_cols=False, sub_cols=['geo_zip'],
                                     **kwargs)
            df = pd.concat([df_TS_utc, df_geozip], axis=1)
            #print(df.head())

            # compute local timestamp
            df['tz'] = zc.zip_to_tz_2(df.geo_zip)
            #print(df.tz.head())
            df_TS_local = zc.shift_tz_wrap(df, style='careful')

            # compute local hour, day and day of the week
            df_locals = pd.DataFrame({
                'hour': zc.local_hour(df_TS_local),
                'day': zc.local_day(df_TS_local),
                'day_of_week': zc.local_weekday(df_TS_local)
            })
            df_locals = df_locals.astype('int8')

            # drop the bid_timestamp_utc and geo_zip columns
            df_TS_local = df_TS_local.drop(['bid_timestamp_utc', 'geo_zip'],
                                           axis=1)
            # save the tz column as a separate df
            df_tz = pd.DataFrame(df['tz'])
            df_TS_local = df_TS_local.drop(['tz'], axis=1)

            # save things to disk (temporarily) to save RAM
            day_stem = os.path.basename(f).split('.')[0]
            fname_TS_local = os.path.join(data_dir,
                                          'TS_local_' + day_stem + '.gzip')
            fname_locals = os.path.join(data_dir,
                                        'locals_' + day_stem + '.gzip')
            fname_tz = os.path.join(data_dir, 'tz_' + day_stem + '.gzip')
            # remember the file names we use for later
            flist_TS_local.append(fname_TS_local)
            flist_locals.append(fname_locals)
            flist_tz.append(fname_tz)
            # save to disk using the parquet method (the fnames above are
            # already full paths, so no extra os.path.join is needed)
            lf.temp_save(df_TS_local, fname_TS_local)
            lf.temp_save(df_locals, fname_locals)
            lf.temp_save(df_tz, fname_tz)
            print(' done')

        # now, go through the saved files and combine them into a single
        # large file. we can load all the parquet files at once without
        # issue.
        print('saving summed gzip files ... ', end='')

        # save bid_timestamp_local
        df_from_each_file = (lf.temp_load(fname=f) for f in flist_TS_local)
        df_TS_local = pd.concat(df_from_each_file, ignore_index=True)
        lf.temp_save(df_TS_local,
                     fname=os.path.join(data_dir, 'bid_timestamp_local.gzip'))
        print('bid_timestamp_local.gzip ... ', end='')

        # save local_ordinals (hour, day, day_of_week)
        df_from_each_file2 = (lf.temp_load(fname=f) for f in flist_locals)
        df_locals = pd.concat(df_from_each_file2, ignore_index=True)
        #df_locals = df_locals.astype('category')
        df_locals = df_locals.astype('int8')
        lf.temp_save(df_locals,
                     fname=os.path.join(data_dir, 'local_ordinals.gzip'))
        print('local_ordinals.gzip ... ', end='')

        # save time zones
        df_from_each_file3 = (lf.temp_load(fname=f) for f in flist_tz)
        df_tz = pd.concat(df_from_each_file3, ignore_index=True)
        df_tz = df_tz.astype('category')
        lf.temp_save(df_tz, fname=os.path.join(data_dir, 'tz.gzip'))
        print('tz.gzip')

        # remove daily gzips from disk when done
        for f in flist_TS_local:
            os.remove(f)
        for f in flist_locals:
            os.remove(f)
        for f in flist_tz:
            os.remove(f)
        print('temp gzip files deleted')

    print('all done!')
    return
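# Usage sketch (not part of the original module; the data_dir and the
# spend.gzip file are only illustrative, and spend.gzip presumes
# reshape_files() has already been run). Once local_hour_creator() finishes,
# its combined outputs can be reloaded with lf.temp_load and should line up
# row-for-row with any single-column file written by reshape_files(), since
# both functions loop over the daily files in the same sorted order:
#
#     local_hour_creator(data_dir=r'C:\PythonBC\RootData', f_ext='.csv')
#     df_local = lf.temp_load(fname=r'C:\PythonBC\RootData\local_ordinals.gzip')
#     df_spend = lf.temp_load(fname=r'C:\PythonBC\RootData\spend.gzip')
#     df = pd.concat([df_local, df_spend], axis=1)

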
"""
Created on Sat May 25 20:21:42 2019

@author: Bryan
"""
'''****************************************************************
|For those of you unfamiliar with loading other python files as|
|python modules simply place the file containing the code you  |
|wish to use as a python module in the same directory as your  |
|main python code, then import using import file name. As you  |
|can see you do not need to include the file extension. I have |
|included an example of this using Greg's load_file.py code.   |
|This allows you to group useful sets of functions together in |
|a module ultimately making your code more readable.           |
****************************************************************'''

import load_file as lf
import matplotlib.pyplot as plt

fname = '2019-04-12.csv'
fname_zip = '2019-04-27.zip'
data_dir = r'D:\Root.Ad.Auction\Data\1'

df = lf.load_data(fname=fname, data_dir=data_dir, Verbose=False)

#df.plot('hour','spend')
x = df['hour']
y = df['spend']
plt.scatter(x, y)
#plt.xlim(8,12)
plt.show()
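# Hedged extra example (not in the original script): fname_zip is defined
# above but never used. Assuming load_data accepts a zipped daily file the
# same way it accepts a plain CSV (as the reshape_files docstring suggests
# for the '.zip' extension), the same scatter plot from the zipped day would
# look like:
#
#     df_zip = lf.load_data(fname=fname_zip, data_dir=data_dir, Verbose=False)
#     plt.scatter(df_zip['hour'], df_zip['spend'])
#     plt.show()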
             cm[i, j],
             horizontalalignment="center",
             color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


#%%
directory = 'C:/Users/Tebe/Documents/Root Ad Data/csvs'
fname = '2019-04-20.csv'
print(fname)

# Load files from several days
df = lf.load_data(fname=fname, data_dir=directory)
for i in np.arange(1, 2):
    fn = fname.split('0.')[0] + str(i) + '.csv'
    print(fn)
    df = df.append(lf.load_data(fname=fn, data_dir=directory))

#%%
imp = SimpleImputer(missing_values=np.nan, strategy='constant',
                    fill_value='UNKNOWN')
imp.fit(df[['platform_device_screen_size']])
df['platform_device_screen_size'] = imp.transform(
    df[['platform_device_screen_size']])

imp = SimpleImputer(missing_values=np.nan, strategy='constant',
                    fill_value='-1')
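# Hedged sketch (not from the original script): the second imputer created
# above would normally be applied with the same fit/transform pattern as the
# screen-size block; the column chosen here ('platform_carrier') is purely
# hypothetical and not taken from the original code:
#
#     imp.fit(df[['platform_carrier']])
#     df['platform_carrier'] = imp.transform(df[['platform_carrier']])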
        if reverse:
            tokens = np.flip(tokens, axis=1)
            truncating = 'pre'
        else:
            truncating = 'post'

        if padding:
            tokens = pad_sequences(tokens, maxlen=self.max_tokens,
                                   padding='pre', truncating=truncating)
        return tokens


x = load_file.load_data(english=True)
y = load_file.load_data(english=False)

num_words = 20000
mark_start = 'aaaa '
mark_end = ' zzzz'
#print("aa")

tokenizer_source = TokenizerWrap(texts=x, padding='pre', reverse=True,
                                 num_words=num_words)
tokenizer_dest = TokenizerWrap(texts=y, padding='post', reverse=False,
                               num_words=num_words)
#idx=3
tokens_src = tokenizer_source.tokens_padded
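# Hedged illustration (not part of the original file; the import path is an
# assumption, since the original file's imports are not shown here): what
# pad_sequences does in the padding branch above, on a toy ragged batch.
# With padding='pre', shorter sequences get zeros prepended up to maxlen:
#
#     from tensorflow.keras.preprocessing.sequence import pad_sequences
#     toy = [[5, 8, 2], [7, 1]]
#     pad_sequences(toy, maxlen=4, padding='pre', truncating='post')
#     # -> array([[0, 5, 8, 2],
#     #           [0, 0, 7, 1]])
#
# With reverse=True the source sequences are flipped before padding, so the
# padded result reads the sentence backwards (a common seq2seq trick).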
# -*- coding: utf-8 -*-
"""
Created on Sat May 25 20:21:42 2019

@author: Bryan
"""
'''****************************************************************
|For those of you unfamiliar with loading other python files as|
|python modules simply place the file containing the code you  |
|wish to use as a python module in the same directory as your  |
|main python code, then import using import file name. As you  |
|can see you do not need to include the file extension. I have |
|included an example of this using Greg's load_file.py code.   |
|This allows you to group useful sets of functions together in |
|a module ultimately making your code more readable.           |
****************************************************************'''

import load_file as lf

fname = '2019-04-27.csv'
fname_zip = '2019-04-27.zip'
data_dir = r'D:\Root.Ad.Auction\Data\1'

df = lf.load_data(fname=fname, data_dir=data_dir, Verbose=True)