Пример #1
0
def GCStoLocal(filename):
    """Download a pickled object from the bucket and return it unpickled.

    The blob ``tps-apr-2021-label/{filename}`` is fetched into
    ``/kaggle/working`` through ``sshColab.download_to_colab`` and the
    resulting local file is loaded with ``pickle``.

    Args:
        filename: Name of the blob inside ``tps-apr-2021-label/``; the same
            name is used for the local copy.

    Returns:
        The object stored in the pickle file.
    """
    sshColab.download_to_colab(
        project,
        bucket_name,
        destination_directory='/kaggle/working',
        remote_blob_path=f'tps-apr-2021-label/{filename}',
        local_file_name=f'{filename}')
    # NOTE(review): unpickling can execute arbitrary code; only load blobs
    # that come from a trusted bucket.
    # The context manager closes the handle deterministically instead of
    # leaving it to the garbage collector.
    with open(f'/kaggle/working/{filename}', 'rb') as fh:
        return pickle.load(fh)
# TODO data.loc[data['Age'].notnull(), 'Age_rg'] = rank_gauss(data.loc[data['Age'].notnull(), 'Age'].values)
# TODO data.loc[data['Fare'].notnull(), 'Fare_rg'] = rank_gauss(data.loc[data['Fare'].notnull(), 'Fare'].values)

#DONE!
# Pickle `data` into the working directory, then upload that file.
os.chdir('/kaggle/working')
# Close the file before uploading so the pickle is fully flushed to disk
# by the time upload_to_gcs reads it.
with open('1parsed_data.pkl', 'wb') as fh:
    pickle.dump(data, fh)
sshColab.upload_to_gcs(project, bucket_name,
                       'tps-apr-2021-label/1parsed_data.pkl',
                       '/kaggle/working/1parsed_data.pkl')

#%%
# ------------------------------------ Age ----------------------------------- #

# Fetch the pickled data saved by the parsing step and load it.
sshColab.download_to_colab(
    project,
    bucket_name,
    destination_directory='/kaggle/working',
    remote_blob_path='tps-apr-2021-label/1parsed_data.pkl',
    local_file_name='1parsed_data.pkl')
# Context manager so the file handle is closed right after loading.
with open('/kaggle/working/1parsed_data.pkl', 'rb') as fh:
    parsed_data = pickle.load(fh)
# Work on a copy; -1 is used as a sentinel for every missing value.
data = parsed_data.copy()
data = data.fillna(-1)

the_col = 'Age'
# Split row labels by whether the target column carries the sentinel.
idx_present = data.loc[data[the_col] != -1, :].index.tolist()
idx_missing = data.loc[data[the_col] == -1, :].index.tolist()
# Restore NaN in the target column only; other columns keep the sentinel.
data[the_col] = data[the_col].replace(-1, np.nan)
# Rows with a known target value: features and target.
x_train_full = data.loc[idx_present, :].drop(the_col, axis=1)
y_train_full = data.loc[idx_present, the_col]
# Rows whose target value is missing: features only.
x_test_full = data.loc[idx_missing, :].drop(the_col, axis=1)

cat_features = [
Пример #3
0
# Read the raw competition CSV files.
train_df = pd.read_csv(
    '/kaggle/input/tabular-playground-series-apr-2021/train.csv')
test_df = pd.read_csv(
    '/kaggle/input/tabular-playground-series-apr-2021/test.csv')

# Keep the label and id columns aside, then drop them from the frames.
train_label = train_df['Survived']
train_id = train_df['PassengerId']
test_id = test_df['PassengerId']
del train_df['Survived'], train_df['PassengerId']
del test_df['PassengerId']

train_rows = train_df.shape[0]

# Download and unpickle '1clean_data.pkl'.
sshColab.download_to_colab(
    project,
    bucket_name,
    destination_directory='/kaggle/working',
    remote_blob_path='tps-apr-2021-label/1clean_data.pkl',
    local_file_name='1clean_data.pkl')
# Context managers close the pickle files instead of leaking the handles.
with open('/kaggle/working/1clean_data.pkl', 'rb') as fh:
    data = pickle.load(fh)

# Download and unpickle '2missing_code_map.pkl'.
sshColab.download_to_colab(
    project,
    bucket_name,
    destination_directory='/kaggle/working',
    remote_blob_path='tps-apr-2021-label/2missing_code_map.pkl',
    local_file_name='2missing_code_map.pkl')
with open('/kaggle/working/2missing_code_map.pkl', 'rb') as fh:
    missing_code_map = pickle.load(fh)

sshColab.download_to_colab(
    project,
Пример #4
0
# Number of rows in train_df (assumes a pandas DataFrame, where len() equals shape[0]).
train_rows = len(train_df)

def timer(start_time=None):
    """Start a stopwatch or report the time elapsed since it was started.

    Called without an argument (or with a falsy one), returns the current
    ``datetime``. Called with a truthy ``start_time``, prints the elapsed
    hours, minutes and seconds since that moment and returns ``None``.
    """
    if start_time:
        elapsed = (datetime.now() - start_time).total_seconds()
        hours, remainder = divmod(elapsed, 3600)
        minutes, seconds = divmod(remainder, 60)
        report = 'Time taken : %i hours %i minutes and %s seconds.' % (
            hours, minutes, round(seconds, 2))
        print(report)
        return None
    return datetime.now()


# Download and unpickle '11dataframe_xgboost_based_trim.pkl' into `df`.
file_to_load = '11dataframe_xgboost_based_trim.pkl'
sshColab.download_to_colab(project, bucket_name,
    destination_directory='/kaggle/working',
    remote_blob_path=f'tps-apr-2021-label/{file_to_load}',
    local_file_name=file_to_load)
# Context managers close the pickle files instead of leaking the handles.
with open(f'/kaggle/working/{file_to_load}', 'rb') as fh:
    df = pickle.load(fh)

# Download and unpickle the (cat_cols, num_cols) pair.
file_to_load = '11cols_tuple.pkl'
sshColab.download_to_colab(project, bucket_name,
    destination_directory='/kaggle/working',
    remote_blob_path=f'tps-apr-2021-label/{file_to_load}',
    local_file_name=file_to_load)
with open(f'/kaggle/working/{file_to_load}', 'rb') as fh:
    cat_cols, num_cols = pickle.load(fh)

def feature_distribution():
    plt.figure(figsize=(16, 32))
    for i, col in enumerate(df.columns.tolist()):
        ax = plt.subplot(10, 2, i + 1)
        ax.set_title(col)