Example #1
# imports assumed by this snippet; the zipcode helper class ZC comes from a
# project-local module whose import path is not shown in the source
import glob
import os

import pandas as pd

import load_file as lf  # project-local (see Example #3)


def local_hour_creator(data_dir=r'C:\PythonBC\RootData',
                       f_ext='.csv',
                       big_zip=False,
                       **kwargs):
    '''
    - calculates the bid_timestamp_local for all CSV files and saves it to
        gzip
    - also calculates the local hour, local day, and local day_of_the_week
        and saves them to gzip
    - converting to local time is a computationally slow/expensive process,
        so we load one file at a time and save those results to a temporary
        file. once all times in all files have been calculated, we merge the
        results into a small set of combined files.
    saves three files:
        local_ordinals.gzip: hour, day and day_of_week (int8)
        bid_timestamp_local.gzip: timestamps
        tz.gzip: time zone of each row (category)
    '''
    # define the file name of the large zip file. assumed structure is a .zip
    # file with many .csv files inside
    bigzipname = 'root_ad_auction_dataset_all_data.zip'

    # initialize zipcode class
    zc = ZC()

    #get sorted list of files to loop over
    myfiles = sorted(glob.glob(os.path.join(data_dir, '*' + f_ext)))

    # remove the big zip file from <myfiles>; glob returns full paths, so
    # join with the directory before testing membership
    if os.path.join(data_dir, bigzipname) in myfiles:
        myfiles.remove(os.path.join(data_dir, bigzipname))
    if os.path.join(data_dir,
                    '2019-04-00.csv') in myfiles:  # this file is empty
        myfiles.remove(os.path.join(data_dir, '2019-04-00.csv'))

    if big_zip:  # load the files from the mega zip file
        print(
            f"this code not written yet. let's pretend this opened {bigzipname}"
        )
    else:  # load files from individual csv/zip files
        flist_TS_local = []
        flist_locals = []
        flist_tz = []
        for f in myfiles:
            print(f'loading {os.path.basename(f)} ... ', end='')
            # load UTC timestamp and zip code info from CSV files
            df_TS_utc = lf.load_data(data_dir=data_dir,
                                     fname=f,
                                     all_cols=False,
                                     sub_cols=['bid_timestamp_utc'],
                                     **kwargs)
            df_geozip = lf.load_data(data_dir=data_dir,
                                     fname=f,
                                     all_cols=False,
                                     sub_cols=['geo_zip'],
                                     **kwargs)
            df = pd.concat([df_TS_utc, df_geozip], axis=1)
            #print(df.head())

            # compute local timestamp
            df['tz'] = zc.zip_to_tz_2(df.geo_zip)
            #print(df.tz.head())
            df_TS_local = zc.shift_tz_wrap(df, style='careful')

            # compute local hour, day and day of the week
            df_locals = pd.DataFrame({
                'hour': zc.local_hour(df_TS_local),
                'day': zc.local_day(df_TS_local),
                'day_of_week': zc.local_weekday(df_TS_local)
            })
            df_locals = df_locals.astype('int8')

            # drop the bid_timestamp_utc and geo_zip columns
            df_TS_local = df_TS_local.drop(['bid_timestamp_utc', 'geo_zip'],
                                           axis=1)
            df_tz = pd.DataFrame(df['tz'])  # save the tz column as a separate df
            df_TS_local = df_TS_local.drop(['tz'], axis=1)

            #save things to disk (temporarily) to save RAM
            fname_TS_local = os.path.join(
                data_dir,
                'TS_local_' + os.path.basename(f).split('.')[0] + '.gzip')
            fname_locals = os.path.join(
                data_dir,
                'locals_' + os.path.basename(f).split('.')[0] + '.gzip')
            fname_tz = os.path.join(
                data_dir, 'tz_' + os.path.basename(f).split('.')[0] + '.gzip')

            # remember the file names we use for later
            flist_TS_local.append(fname_TS_local)
            flist_locals.append(fname_locals)
            flist_tz.append(fname_tz)

            # save to disk using the parquet method; the fname_* variables
            # already include data_dir, so pass them straight through
            lf.temp_save(df_TS_local, fname_TS_local)
            lf.temp_save(df_locals, fname_locals)
            lf.temp_save(df_tz, fname_tz)
            print(' done')
        # now, go through the saved files and combine them into a single
        # large file; we can load all the parquet files at once without issue
        print('saving combined gzip files ... ', end='')

        # save bid_timestamp_local
        df_from_each_file = (lf.temp_load(fname=f) for f in flist_TS_local)
        df_TS_local = pd.concat(df_from_each_file, ignore_index=True)
        lf.temp_save(df_TS_local,
                     fname=os.path.join(data_dir, 'bid_timestamp_local.gzip'))
        print('bid_timestamp_local.gzip ... ', end='')

        #save local_ordinals (hour, day, day_of_week)
        df_from_each_file2 = (lf.temp_load(fname=f) for f in flist_locals)
        df_locals = pd.concat(df_from_each_file2, ignore_index=True)
        #df_locals = df_locals.astype('category')
        df_locals = df_locals.astype('int8')
        lf.temp_save(df_locals,
                     fname=os.path.join(data_dir, 'local_ordinals.gzip'))
        print('local_ordinals.gzip ... ', end='')

        #save time zones
        df_from_each_file3 = (lf.temp_load(fname=f) for f in flist_tz)
        df_tz = pd.concat(df_from_each_file3, ignore_index=True)
        df_tz = df_tz.astype('category')
        lf.temp_save(df_tz, fname=os.path.join(data_dir, 'tz.gzip'))
        print('tz.gzip')

        # remove daily gzips from disk when done
        for f in flist_TS_local + flist_locals + flist_tz:
            os.remove(f)
        print('temp gzip files deleted')
        print('all done!')
    return
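
# Hedged sketch: judging by the '.gzip' file names and the "parquet method"
# comment, lf.temp_save / lf.temp_load are presumably thin wrappers around
# pandas' parquet I/O, roughly equivalent to:
#     def temp_save(df, fname): df.to_parquet(fname, compression='gzip')
#     def temp_load(fname): return pd.read_parquet(fname)
# Minimal usage of the function above, with the source's default directory:
local_hour_creator(data_dir=r'C:\PythonBC\RootData', f_ext='.csv')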
Example #2
'''
# commented out in the source; df1, data_dir and ZC are defined earlier in
# the original script
df2 = lf.temp_load(os.path.join(data_dir, 'clicks.gzip'))
df3 = lf.temp_load(os.path.join(data_dir, 'bid_timestamp_utc.gzip'))
frames = [df1, df2, df3]
df = pd.concat(frames, axis=1)
df = df.dropna(subset=['geo_zip'])  # drop rows with NaN zip codes
zc = ZC()
df['tz'] = zc.zip_to_tz_2(df.geo_zip)
df['tz'] = df.tz.astype('category')

df['bid_timestamp_local'] = zc.shift_tz_wrap(df)  # compute local time
df['hour'] = zc.local_hour(df)  # compute local hour
'''

#%% track clicks vs date
# data_dir and the plotting helper module mp are defined earlier in the
# source script (data_dir defaults to r'C:\PythonBC\RootData' in Example #1)
fname = 'clicks.gzip'
df_clicks = lf.temp_load(os.path.join(data_dir, fname))

fname = 'local_ordinals.gzip'
df_localords = lf.temp_load(os.path.join(data_dir, fname))

fname = 'installs.gzip'
df_installs = lf.temp_load(os.path.join(data_dir, fname))

fname = 'state.gzip'
df_state = lf.temp_load(os.path.join(data_dir, fname))

frames = [df_localords, df_clicks, df_installs, df_state]
df = pd.concat(frames, axis=1)

ax = mp.make_countplot(df, col='day', count='clicks', order=False)
ax = mp.make_countplot(df, col='hour', count='clicks', order=False)
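
# mp.make_countplot is project-local and not shown here; a hedged, roughly
# equivalent sketch using seaborn, assuming count='clicks' means "count the
# rows in which a click occurred":
import seaborn as sns
ax = sns.countplot(data=df[df.clicks == 1], x='hour')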
Example #3
import glob
import os
import sys

import matplotlib.pyplot as plt
import pandas as pd
from joblib import dump, load
from sklearn.decomposition import PCA
from sklearn.metrics import recall_score
from tqdm import tqdm

sys.path.append(os.getcwd())
import load_file as lf  # project-local modules
import model_metrics as mm

#%%
# load ALL of the gzip data into one large dataframe; each .gzip holds a
# distinct set of columns with rows in the same order (see Example #1), so
# we concatenate along axis=1
data_directory = r'D:\Root Data\csvs'
df = pd.DataFrame()
for f in tqdm(glob.glob(os.path.join(data_directory, '*.gzip'))):
    df = pd.concat([df, lf.temp_load(fname=f)], axis=1)
df = df.sample(frac=1)  # shuffle the rows
df = df.drop(['app_bundle', 'bid_timestamp_utc', 'tz', 'spend', 'installs',
              'bid_timestamp_local'], axis=1)
df['inventory_interstitial'] = df['inventory_interstitial'].astype(int)
df['rewarded'] = df['rewarded'].astype(int)
df['clicks'] = df['clicks'].astype(int)
#%%
## process data in minibatches (commented out in the source; 'ce' refers to
## the category_encoders package)
#chunksize = 100000
#model = SGDClassifier(loss='log', penalty='elasticnet', n_jobs=-1)
#hash_encoder = ce.HashingEncoder(n_components=2000)
#loo = ce.LeaveOneOutEncoder()
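
# Hedged sketch of the minibatch idea outlined above, using SGDClassifier's
# partial_fit. X_encoded / y below are illustrative: a fully numeric feature
# matrix (in practice the output of the hashing or leave-one-out encoders)
# and the clicks labels. The full class list must be supplied to partial_fit.
# Note: recent scikit-learn versions spell the logistic loss 'log_loss'.
import numpy as np
from sklearn.linear_model import SGDClassifier

model = SGDClassifier(loss='log', penalty='elasticnet', n_jobs=-1)
X_encoded = df.select_dtypes('number').drop('clicks', axis=1).to_numpy()
y = df['clicks'].to_numpy()
chunksize = 100000
for start in range(0, len(X_encoded), chunksize):
    model.partial_fit(X_encoded[start:start + chunksize],
                      y[start:start + chunksize],
                      classes=np.array([0, 1]))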
Example #4
'''
# commented out in the source; df_28 (the 28th's data) and the fitted
# classifier clf are defined earlier in the original script
bandwidth_28 = pd.get_dummies(df_28['platform_bandwidth'])
#device_make_28 = pd.get_dummies(df_28['platform_device_make'])
X_28 = df_28['hour']
X_28 = pd.concat([X_28, bandwidth_28], axis=1)
y_28 = df_28.clicks

print(f'score (train on 27th, test on 28th) = {clf.score(X_28, y_28)}')

from sklearn.model_selection import cross_val_score
cross_val_score(clf, X_28, y_28, scoring='recall_macro', cv=5)
'''

#%% determine number of categories

df_cats = lf.temp_load(os.path.join(data_dir, 'category.gzip'))
cats = df_cats.category.unique().tolist()
cats = [x for x in cats if str(x) != 'nan']  # remove nans
allcats = []
for i in cats:
    #allcats.append( i.split(',')) # comma-separated
    allcats.append(i.split(' '))  # space-separated
allcats = [item for sublist in allcats
           for item in sublist]  # flatten list of lists
allcats = list(dict.fromkeys(allcats))  # remove duplicates
print(f'there are {len(cats)} unique bundles of categories in this dataset')
print(f'there are {len(allcats)} unique IDF categories in this dataset')

#for c in allcats:
#	print(c)
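
# Hedged follow-on sketch: one way to turn the space-separated category
# bundles above into indicator columns is sklearn's MultiLabelBinarizer
# (the variable names here are illustrative):
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
cat_onehot = mlb.fit_transform(df_cats.category.dropna().str.split(' '))
print(f'encoded matrix shape: {cat_onehot.shape}')  # rows x len(allcats)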