def get_gcs_path_of_kaggle_data(data_name, is_private=False):
    if is_private:
        # Step 1: Get the credential from the Cloud SDK
        from kaggle_secrets import UserSecretsClient
        user_secrets = UserSecretsClient()
        user_credential = user_secrets.get_gcloud_credential()
        # Step 2: Set the credentials
        user_secrets.set_tensorflow_credential(user_credential)
    # Step 3: Use a familiar call to get the GCS path of the dataset
    from kaggle_datasets import KaggleDatasets
    GCS_DS_PATH = KaggleDatasets().get_gcs_path(data_name)
    return GCS_DS_PATH
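# A minimal usage sketch of the helper above; both dataset names are
# hypothetical placeholders, not datasets referenced in this snippet.
public_path = get_gcs_path_of_kaggle_data('some-public-dataset')
private_path = get_gcs_path_of_kaggle_data('my-private-dataset', is_private=True)
print(public_path, private_path)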
import re

import numpy as np
import pandas as pd
import tensorflow as tf
from kaggle_datasets import KaggleDatasets

# BASE_DIR is not defined in the original snippet; assumed from TEST_DIR below.
BASE_DIR = 'E:/KaggleChallenges/cassava-leaf-disease-classification/'
TEST_DIR = 'E:/KaggleChallenges/cassava-leaf-disease-classification/test_images'
sub = pd.read_csv(f'{BASE_DIR}sample_submission.csv')

try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Device:', tpu.master())
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:  # no TPU found; fall back to the default strategy
    strategy = tf.distribute.get_strategy()
print('Number of replicas:', strategy.num_replicas_in_sync)

AUTOTUNE = tf.data.experimental.AUTOTUNE  # a constant, not a callable

GCS_PATH = KaggleDatasets().get_gcs_path('cassava-leaf-disease-classification')
GCS_PATH_AUG = KaggleDatasets().get_gcs_path('cassava-aug')

BATCH_SIZE = 16 * strategy.num_replicas_in_sync
IMAGE_SIZE = [512, 512]
CLASSES = ['0', '1', '2', '3', '4']
EPOCHS = 25

TRAINING_FILENAMES = np.array(
    tf.io.gfile.glob(GCS_PATH + '/train_tfrecords/ld_train*.tfrec'))
TEST_FILENAMES = np.array(
    tf.io.gfile.glob(GCS_PATH + '/test_tfrecords/ld_test*.tfrec'))
AUG_FILENAME = np.array(tf.io.gfile.glob(GCS_PATH_AUG + '/cassva_aug_*.tfrec'))

def count_data_items(filenames):
    # The record count is encoded in each shard name, e.g. ld_train00-1067.tfrec.
    n = [int(re.compile(r'-([0-9]*)\.').search(filename).group(1))
         for filename in filenames]
    return np.sum(n)
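# Hedged usage sketch of count_data_items: derive dataset sizes and a
# steps-per-epoch value. These variable names are assumptions, not
# definitions from the original snippet.
NUM_TRAINING_IMAGES = count_data_items(TRAINING_FILENAMES)
NUM_TEST_IMAGES = count_data_items(TEST_FILENAMES)
STEPS_PER_EPOCH = NUM_TRAINING_IMAGES // BATCH_SIZE
print(f'{NUM_TRAINING_IMAGES} training images, {NUM_TEST_IMAGES} test images, '
      f'{STEPS_PER_EPOCH} steps per epoch')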
def get_gcs_path(comp_name):
    return KaggleDatasets().get_gcs_path(comp_name)
from datetime import datetime, timedelta

import tensorflow as tf
from kaggle_datasets import KaggleDatasets

try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy()
print('Replicas:', strategy.num_replicas_in_sync)

GCS_DS_PATH = KaggleDatasets().get_gcs_path('flower-classification-with-tpus')
MORE_IMAGES_GCS_DS_PATH = KaggleDatasets().get_gcs_path(
    'tf-flower-photo-tfrec')
print(GCS_DS_PATH, '\n', MORE_IMAGES_GCS_DS_PATH)

#!ls -l /kaggle/input/tf-flower-photo-tfrec/*/tfrecords-jpeg-224x224/*.tfrec
#!ls -l /kaggle/input/tf-flower-photo-tfrec/imagenet/tfrecords-jpeg-224x224/*.tfrec
#!ls -l /kaggle/input/tf-flower-photo-tfrec/inaturalist/tfrecords-jpeg-224x224/*.tfrec
#!ls -l /kaggle/input/tf-flower-photo-tfrec/openimage/tfrecords-jpeg-224x224/*.tfrec
#!ls -l /kaggle/input/tf-flower-photo-tfrec/oxford_102/tfrecords-jpeg-224x224/*.tfrec
#!ls -l /kaggle/input/tf-flower-photo-tfrec/tf_flowers/tfrecords-jpeg-224x224/*.tfrec

start_time = datetime.now()
print('Time now is', start_time)
end_training_by_tdelta = timedelta(seconds=8400)
this_run_file_prefix = start_time.strftime('%Y%m%d_%H%M_')
print(this_run_file_prefix)
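# end_training_by_tdelta above suggests a wall-clock training budget (8400 s,
# leaving headroom under the Kaggle session limit). A hedged sketch of how
# such a budget could be enforced with a Keras callback; StopByWallClock is
# an assumed helper, not part of the original code.
class StopByWallClock(tf.keras.callbacks.Callback):
    """Stop training once the wall-clock budget is exhausted."""

    def __init__(self, started_at, budget):
        super().__init__()
        self.started_at = started_at
        self.budget = budget

    def on_epoch_end(self, epoch, logs=None):
        # Compare elapsed wall-clock time against the budget after each epoch.
        if datetime.now() - self.started_at > self.budget:
            print(f'Wall-clock budget exceeded after epoch {epoch}; stopping.')
            self.model.stop_training = True

# Usage: model.fit(..., callbacks=[StopByWallClock(start_time, end_training_by_tdelta)])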
def test_no_token_fails(self):
    env = EnvironmentVarGuard()
    env.unset(_KAGGLE_USER_SECRETS_TOKEN_ENV_VAR_NAME)
    with env:
        with self.assertRaises(CredentialError):
            client = KaggleDatasets()
def call_get_gcs_path():
    client = KaggleDatasets()
    with self.assertRaises(BackendError):
        gcs_path = client.get_gcs_path()
if tpu:
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()
print("REPLICAS: ", strategy.num_replicas_in_sync)

# In[10]:

AUTO = tf.data.experimental.AUTOTUNE

# Data access
if not localEnvironment and not IN_COLAB:
    GCS_DS_PATH = KaggleDatasets().get_gcs_path(
        "jigsaw-multilingual-toxic-comment-classification")

# Configuration
NEXAMPLESPEREPOCH = 240000
EPOCHS = 10
if strategy.num_replicas_in_sync == 1:
    BATCH_SIZE = 4 * strategy.num_replicas_in_sync
else:
    BATCH_SIZE = 16 * strategy.num_replicas_in_sync
MAX_LEN = 192
# MAX_LEN = 512
MODEL = 'jplu/tf-xlm-roberta-large'
if localEnvironment:
    MODEL = 'jplu/tf-xlm-roberta-base'
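# MODEL above names a Hugging Face checkpoint. A hedged sketch of the typical
# next step, loading the matching tokenizer with the transformers library;
# this call is not shown in the original snippet.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL)
enc = tokenizer('an example comment', max_length=MAX_LEN,
                padding='max_length', truncation=True)
print(len(enc['input_ids']))  # == MAX_LEN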
if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()
print("REPLICAS: ", strategy.num_replicas_in_sync)

SEQUENCE_LENGTH = 128
# Note that private datasets cannot be copied - you'll have to share any pretrained
# models you want to use with other competitors!
GCS_PATH = KaggleDatasets().get_gcs_path('jigsaw-multilingual-toxic-comment-classification')
BERT_GCS_PATH = KaggleDatasets().get_gcs_path('bert-multi')
BERT_GCS_PATH_SAVEDMODEL = BERT_GCS_PATH + "/bert_multi_from_tfhub"
TEST_DATASET_SIZE = 63812

def multilingual_bert_model(max_seq_length=SEQUENCE_LENGTH, trainable_bert=True):
    """Build and return a multilingual BERT model and tokenizer."""
    input_word_ids = tf.keras.layers.Input(
        shape=(max_seq_length,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.layers.Input(
        shape=(max_seq_length,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.layers.Input(
        shape=(max_seq_length,), dtype=tf.int32, name="all_segment_id")
    # Load a SavedModel on TPU from GCS. This model is available online at
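# The body of multilingual_bert_model is cut off above. A hedged sketch of how
# such a builder typically continues: wrap the GCS-hosted SavedModel with
# tensorflow_hub and attach a small sigmoid head. finish_bert_model and the
# layer sizes are assumptions, not the original implementation.
import tensorflow as tf
import tensorflow_hub as hub

def finish_bert_model(input_word_ids, input_mask, segment_ids,
                      saved_model_path, trainable_bert=True):
    # Load the SavedModel from GCS and expose it as a Keras layer.
    bert = hub.KerasLayer(tf.saved_model.load(saved_model_path),
                          trainable=trainable_bert)
    pooled_output, _ = bert([input_word_ids, input_mask, segment_ids])
    # One sigmoid unit for the binary toxicity label.
    output = tf.keras.layers.Dense(32, activation='relu')(pooled_output)
    output = tf.keras.layers.Dense(1, activation='sigmoid', name='labels')(output)
    return tf.keras.Model(
        inputs={'input_word_ids': input_word_ids,
                'input_mask': input_mask,
                'all_segment_id': segment_ids},
        outputs=output)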
if DEVICE != "TPU": print("Using default strategy for CPU and single GPU") strategy = tf.distribute.get_strategy() if DEVICE == "GPU": print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU'))) AUTO = tf.data.experimental.AUTOTUNE REPLICAS = strategy.num_replicas_in_sync print(f'REPLICAS: {REPLICAS}') GCS_PATH = [None] * FOLDS GCS_PATH2 = [None] * FOLDS for i, k in enumerate(IMG_SIZES): GCS_PATH[i] = KaggleDatasets().get_gcs_path('melanoma-%ix%i' % (k, k)) GCS_PATH2[i] = KaggleDatasets().get_gcs_path('isic2019-%ix%i' % (k, k)) files_train = np.sort(np.array(tf.io.gfile.glob(GCS_PATH[0] + '/train*.tfrec'))) files_test = np.sort(np.array(tf.io.gfile.glob(GCS_PATH[0] + '/test*.tfrec'))) ROT_ = 180.0 SHR_ = 2.0 HZOOM_ = 8.0 WZOOM_ = 8.0 HSHIFT_ = 8.0 WSHIFT_ = 8.0 def get_mat(rotation, shear, height_zoom, width_zoom, height_shift, width_shift):
def call_get_gcs_path():
    client = KaggleDatasets()
    gcs_path = client.get_gcs_path()
if tpu_resolver is None:
    if DEBUG:
        BATCH_SIZE = 32 * 2
    else:
        BATCH_SIZE = 32 * 32
elif strategy is not None:
    BATCH_SIZE = 32 * strategy.num_replicas_in_sync

if tpu_resolver is None:
    DATA_PATH = "/kaggle/input/jigsaw-multilingual-toxic-comment-classification/"
    BERT_BASE_DIR = "/kaggle/input/bert-pretrained-models" + \
        '/multi_cased_L-12_H-768_A-12' + '/multi_cased_L-12_H-768_A-12'
else:
    from kaggle_datasets import KaggleDatasets
    GCS_DS_PATH = KaggleDatasets().get_gcs_path(
        'jigsaw-multilingual-toxic-comment-classification')
    GCS_BERT_PRETRAINED = KaggleDatasets().get_gcs_path('bert-pretrained-models') + \
        '/multi_cased_L-12_H-768_A-12' + '/multi_cased_L-12_H-768_A-12'
    DATA_PATH = GCS_DS_PATH + '/'
    BERT_BASE_DIR = GCS_BERT_PRETRAINED

def pickle_data(max_seq_length=128, bert_base_dir=BERT_BASE_DIR, output="features.pkl"):
    # --vocab_file="$BERT_BASE_DIR/vocab.txt" \
    # --init_checkpoint="$BERT_BASE_DIR/bert_model.ckpt" \
    # --bert_config_file="$BERT_BASE_DIR/bert_config.json" \
    load_data("pickle", "/tmp/input.txt",
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import tensorflow as tf
import tensorflow.keras.backend as K
import efficientnet.tfkeras as efn
import PIL
from kaggle_datasets import KaggleDatasets
from tqdm import tqdm

BASEPATH = "../input/siim-isic-melanoma-classification"
df_train = pd.read_csv(os.path.join(BASEPATH, 'train.csv'))
df_test = pd.read_csv(os.path.join(BASEPATH, 'test.csv'))
df_sub = pd.read_csv(os.path.join(BASEPATH, 'sample_submission.csv'))

GCS_PATH = KaggleDatasets().get_gcs_path('melanoma-384x384')
files_train = tf.io.gfile.glob(GCS_PATH + '/train*.tfrec')
files_test = np.sort(np.array(tf.io.gfile.glob(GCS_PATH + '/test*.tfrec')))

GCS_PATH2 = KaggleDatasets().get_gcs_path('isic2019-384x384')
files_train += tf.io.gfile.glob(
    [GCS_PATH2 + '/train%.2i*.tfrec' % (x * 2 + 1) for x in range(15)])  # 2019
files_train += tf.io.gfile.glob(
    [GCS_PATH2 + '/train%.2i*.tfrec' % (x * 2) for x in range(15)])  # 2018
files_train = np.sort(np.array(files_train))

DEVICE = "TPU"
bs = 16
CFG = dict(
    net_count=7,
    batch_size=bs,
    slices = paths if labels is None else (paths, labels)
    dset = tf.data.Dataset.from_tensor_slices(slices)
    dset = dset.map(decode_fn, num_parallel_calls=AUTO)
    dset = dset.cache(cache_dir) if cache else dset
    dset = dset.map(augment_fn, num_parallel_calls=AUTO) if augment else dset
    dset = dset.repeat() if repeat else dset
    dset = dset.shuffle(shuffle) if shuffle else dset
    dset = dset.batch(bsize).prefetch(AUTO)
    return dset

strategy = auto_select_accelerator()
BATCH_SIZE = strategy.num_replicas_in_sync * 16  # was 16
GCS_DS_PATH = KaggleDatasets().get_gcs_path("hpa-768768")
GCS_DS_PATH_EXT_DATA = KaggleDatasets().get_gcs_path(
    "hpa-public-768-excl-0-16")

load_dir = f"/kaggle/input/hpa-768768/"
# os.path.join adds the separator that the original string concatenation dropped.
df = pd.read_csv(os.path.join(os.getcwd(), 'train_data/df_green.csv'))
df["ID"] = df["ID"].str.replace('_green', '')
label_cols = df.columns[2:21]
paths = GCS_DS_PATH + '/' + df['ID']
labels = df[label_cols].values

df_ext = pd.read_csv('train_data/hpa_public_excl_0_16_768.csv', index_col=0)
df_ext = df_ext.drop(['Cellline'], axis=1)
df_ext["Labels_list"] = df_ext["Label"].str.split("|").apply(
    lambda x: [int(i) for i in x])
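# The def line of the pipeline builder above is cut off. Assuming it follows
# the common signature build_dataset(paths, labels=None, bsize=32, cache=True,
# augment=True, repeat=True, shuffle=1024, ...) -- a hypothetical
# reconstruction, not shown in this snippet -- a usage sketch:
train_dset = build_dataset(paths, labels, bsize=BATCH_SIZE)
valid_dset = build_dataset(paths, labels, bsize=BATCH_SIZE,
                           augment=False, repeat=False, shuffle=0)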
    rot=180.0,
    shr=2.0,
    hzoom=8.0,
    wzoom=8.0,
    hshift=8.0,
    wshift=8.0,
    optimizer='adam',
    label_smooth_fac=0.05,
    tta_steps=25)

BASEPATH = "../input/siim-isic-melanoma-classification"
df_train = pd.read_csv(os.path.join(BASEPATH, 'train.csv'))
df_test = pd.read_csv(os.path.join(BASEPATH, 'test.csv'))
df_sub = pd.read_csv(os.path.join(BASEPATH, 'sample_submission.csv'))

GCS_PATH = KaggleDatasets().get_gcs_path(
    'melanoma-%ix%i' % (args["image_size"], args["image_size"]))
files_train = np.sort(np.array(tf.io.gfile.glob(GCS_PATH + "/train*.tfrec")))
files_test = np.sort(np.array(tf.io.gfile.glob(GCS_PATH + "/test*.tfrec")))

if DEVICE == "TPU":
    print("connecting to TPU...")
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        print("Running on TPU", tpu.master())
    except ValueError:
        print("Could not connect to TPU")
        tpu = None

if tpu:
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy()
print("REPLICAS: ", strategy.num_replicas_in_sync)

# %% [markdown]
# # Competition data access
#
# In order to train a model on a TPU, we need to store the dataset in Google Cloud Storage (GCS).
#
# The code cell below retrieves the GCS path of the flowers dataset, which Kaggle mirrors to a bucket co-located with the TPU.

# %% [code]
GCS_DS_PATH = KaggleDatasets().get_gcs_path('tpu-getting-started')

# %% [markdown]
# # Configuration

# %% [code]
IMAGE_SIZE = [512, 512]
EPOCHS = 12
BATCH_SIZE = 16 * strategy.num_replicas_in_sync

GCS_PATH = GCS_DS_PATH + '/tfrecords-jpeg-512x512'
TRAINING_FILENAMES = tf.io.gfile.glob(GCS_PATH + '/train/*.tfrec')
VALIDATION_FILENAMES = tf.io.gfile.glob(GCS_PATH + '/val/*.tfrec')
TEST_FILENAMES = tf.io.gfile.glob(GCS_PATH + '/test/*.tfrec')
# %% [code]
# Generate the appropriate configuration for the available hardware
AUTO = tf.data.experimental.AUTOTUNE
# Detect hardware, return appropriate distribution strategy
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set. On Kaggle this is always the case.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
    # Called without an argument, get_gcs_path() resolves the attached competition data.
    GCS_DS_PATH = KaggleDatasets().get_gcs_path()
    print(GCS_DS_PATH)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

# %% [code]
# paths and labels are module-level globals.
# Hyperparameters, tuned to the data and the distribution strategy.
BATCH_SIZE = 1 * strategy.num_replicas_in_sync
img_size = 768
EPOCHS = 25
lr_if_without_scheduler = 0.0003
nb_classes = 3
print('BATCH_SIZE is:', BATCH_SIZE)
def call_get_gcs_path():
    client = KaggleDatasets()
    gcs_path = client.get_gcs_path()
    self.assertEqual(gcs_path, _AUTOML_GCS_BUCKET)
strategy = tf.distribute.get_strategy()
AUTO = tf.data.experimental.AUTOTUNE
BATCH_SIZE = 16 * strategy.num_replicas_in_sync  # batch size scales automatically with CPU/TPU replicas
IMAGE_SIZE = [512, 512]
WIDTH = IMAGE_SIZE[0]
HEIGHT = IMAGE_SIZE[1]
CHANNELS = 3

# Load the official competition dataset
try:
    # Running in a Kaggle kernel
    from kaggle_datasets import KaggleDatasets
    BASE = KaggleDatasets().get_gcs_path('flower-classification-with-tpus')
except ModuleNotFoundError:
    # Running on my Mac
    BASE = "/Users/astzls/Downloads/flower"

PATH_SELECT = {  # Select the path according to the image size
    192: BASE + '/tfrecords-jpeg-192x192',
    224: BASE + '/tfrecords-jpeg-224x224',
    331: BASE + '/tfrecords-jpeg-331x331',
    512: BASE + '/tfrecords-jpeg-512x512'
}
IMAGE_PATH = PATH_SELECT[IMAGE_SIZE[0]]

VALIDATION_FILENAMES = tf.io.gfile.glob(IMAGE_PATH + '/val/*.tfrec')

# Count the number of images
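# The counting code is cut off after the comment above. A hedged sketch of the
# helper these flower-TPU notebooks typically use: it parses the record count
# embedded in each shard filename (e.g. 'flowers00-230.tfrec' -> 230). This is
# an assumption about the missing code, not the original implementation.
import re

import numpy as np

def count_data_items(filenames):
    n = [int(re.compile(r'-([0-9]*)\.').search(filename).group(1))
         for filename in filenames]
    return int(np.sum(n))

print('Validation images:', count_data_items(VALIDATION_FILENAMES))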
AUTO = tf.data.experimental.AUTOTUNE

# Hyperparameter definitions
# strategy comes from tpu_settings.py
BATCH_SIZE = 8 * strategy.num_replicas_in_sync  # batch size scales automatically with CPU/TPU replicas
LEARNING_RATE = 9.888029308058321e-05
EPOCHS = 8  # number of training epochs
IMAGE_SIZE = [512, 512]  # change the image size here by hand before training
WIDTH = IMAGE_SIZE[0]
HEIGHT = IMAGE_SIZE[1]
CHANNELS = 3

# Load the official competition dataset
try:
    # Running in a Kaggle kernel
    from kaggle_datasets import KaggleDatasets
    BASE = KaggleDatasets().get_gcs_path('flower-classification-with-tpus')
except ModuleNotFoundError:
    # Local training
    BASE = "replace with your own path"

IMAGE_PATH = BASE + '/tfrecords-jpeg-512x512'
# tf.io library functions are used here; there are many ways to list the
# files, and os + re would work just as well.
TRAINING_FILENAMES = tf.io.gfile.glob(IMAGE_PATH + '/train/*.tfrec')
VALIDATION_FILENAMES = tf.io.gfile.glob(IMAGE_PATH + '/val/*.tfrec')
TEST_FILENAMES = tf.io.gfile.glob(IMAGE_PATH + '/test/*.tfrec')

# Oxford 102 flower data for the TPU competition (original: 7310 images, 2718 after cleaning).
# Using it improves accuracy by 2%.
try:
    # Running in a Kaggle kernel
    from kaggle_datasets import KaggleDatasets