Example #1
    def __init_dataset__(self, type, emb_loss_names):
        init_as_triplet = self.triplet in emb_loss_names or type == self.triplet
        if self.train_ds is not None and init_as_triplet == self.is_triplet_dataset and not self.is_distill_ds:
            return

        dataset_params = {
            "data_path": self.data_path,
            "batch_size": self.batch_size,
            "random_status": self.random_status,
            "image_per_class": self.image_per_class,
            "teacher_model_interf": self.teacher_model_interf,
        }
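        # Select the dataset pipeline (triplet vs. softmax / distill) based on the requested loss type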
        if init_as_triplet:
            print(">>>> Init triplet dataset...")
            if self.data_path.endswith(".tfrecord"):
                print(
                    ">>>> Combining tfrecord dataset with triplet is NOT recommended."
                )
                self.train_ds, self.steps_per_epoch = data.prepare_distill_dataset_tfrecord(
                    **dataset_params)
            else:
                aa = data.Triplet_dataset(**dataset_params)
                self.train_ds, self.steps_per_epoch = aa.ds, aa.steps_per_epoch
            self.is_triplet_dataset = True
        else:
            print(">>>> Init softmax dataset...")
            if self.data_path.endswith(".tfrecord"):
                self.train_ds, self.steps_per_epoch = data.prepare_distill_dataset_tfrecord(
                    **dataset_params)
            else:
                self.train_ds, self.steps_per_epoch = data.prepare_dataset(
                    **dataset_params)
            self.is_triplet_dataset = False

        if tf.distribute.has_strategy():
            self.train_ds = self.train_ds.with_options(self.data_options)

        label_spec = self.train_ds.element_spec[-1]
        if isinstance(label_spec, tuple):
            # dataset with embedding values
            self.is_distill_ds = True
            self.teacher_emb_size = label_spec[0].shape[-1]
            self.classes = label_spec[1].shape[-1]
            if type == self.distill:
                # Loss is distill type: [label * n, embedding]
                self.train_ds = self.train_ds.map(
                    lambda xx, yy: (xx, yy[1:] * len(emb_loss_names) + yy[:1]))
            elif (self.distill in emb_loss_names and len(emb_loss_names) != 1
                  ) or (self.distill not in emb_loss_names
                        and len(emb_loss_names) != 0):
                # Will attach distill loss as embedding loss, and there are other embedding losses: [embedding, label * n]
                label_data_len = len(
                    emb_loss_names) if self.distill in emb_loss_names else len(
                        emb_loss_names) + 1
                self.train_ds = self.train_ds.map(
                    lambda xx, yy: (xx, yy[:1] + yy[1:] * label_data_len))
        else:
            self.is_distill_ds = False
            self.classes = label_spec.shape[-1]
Example #2
 def __init_dataset_softmax__(self):
     if self.train_ds is None or self.is_triplet_dataset:
         print(">>>> Init softmax dataset...")
         self.train_ds = data.prepare_dataset(
             self.data_path,
             batch_size=self.batch_size,
             random_status=self.random_status,
             random_crop=(100, 100, 3))
         self.classes = self.train_ds.element_spec[-1].shape[-1]
         self.is_triplet_dataset = False
Example #3
 def __init_dataset_softmax__(self):
     if self.train_ds is None or self.is_triplet_dataset:
         print(">>>> Init softmax dataset...")
         self.train_ds = data.prepare_dataset(
             self.data_path,
             batch_size=self.batch_size,
             random_status=self.random_status,
             random_crop=(100, 100, 3),
             cache=self.dataset_cache)
         label_spec = self.train_ds.element_spec[-1]
         if isinstance(label_spec, tuple):
             # dataset with embedding values
             self.is_distiller = True
             self.classes = label_spec[0].shape[-1]
         else:
             self.is_distiller = False
             self.classes = label_spec.shape[-1]
         self.is_triplet_dataset = False
Example #4
 def __init_dataset__(self, type):
     if type == self.triplet:
         if self.train_ds is None or not self.is_triplet_dataset:
             print(">>>> Init triplet dataset...")
             # batch_size = int(self.batch_size / 4 * 1.5)
             batch_size = self.batch_size // 4
             tt = data.Triplet_dataset(self.data_path,
                                       batch_size=batch_size,
                                       random_status=self.random_status)
             self.train_ds, self.steps_per_epoch = tt.train_dataset, tt.steps_per_epoch
             self.is_triplet_dataset = True
     else:
         if self.train_ds is None or self.is_triplet_dataset:
             print(">>>> Init softmax dataset...")
             self.train_ds, self.steps_per_epoch, self.classes = data.prepare_dataset(
                 self.data_path,
                 batch_size=self.batch_size,
                 random_status=self.random_status)
             self.is_triplet_dataset = False
Example #5
 def prepare_forecast_data(self, data_path='./data/forecast/'):
     # Download and store data for all sites
     start_date, end_date = data.get_data_start_and_end_date('forecast')
     # Clean the old files from the directory
     data.clean_directory(data_path)
     # Download data for model training
     for site in self.site_ids:
         data.download_site_data(site, start_date, end_date, data_path)
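     # Assemble the downloaded site data into a single forecast dataframe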
     forecast_df = data.prepare_dataset(data_path)
     #         print(forecast_df.head())
     # Forecast dataset fields need to be rearranged
     forecast_df = forecast_df[['NO', 'PM10', 'PM2.5', 'CO']]
     #         print(forecast_df.head())
     #         print( forecast_df.shape )
     forecast_period = (datetime.strptime(
         np.datetime_as_string(forecast_df.index[-1:].values[0], unit='s'),
         '%Y-%m-%dT%H:%M:%S') +
                        timedelta(minutes=30)).strftime("%d-%b-%Y %H:%M:%S")
     #         print('Forecast period: {}'.format(forecast_period))
     forecast_dataset = data.split_forecast_dataset(forecast_df.values)
     return forecast_period, forecast_dataset
Example #6
def load_data(features_dict):
    dataset = f'movielens/{FLAGS.dataset}-ratings'
    ratings = tfds.load(dataset, split='train', data_dir=FLAGS.data_dir)

    # Drop neutral (3.0) ratings to prepare for binarization
    ratings = ratings.filter(lambda x: x['user_rating'] != 3.0)

    ratings = prepare_dataset(ratings, features_dict)

    # Cache for efficiency
    ratings = ratings.cache(tempfile.NamedTemporaryFile().name)

    features = features_by_type(features_dict)
    categorical_features = features['string'] + features['integer']
    vocabularies = get_vocabularies(ratings, categorical_features)

    train, test = train_test_split(ratings, train_size=0.8, seed=FLAGS.seed)

    train_size = len(train)
    train = train.shuffle(train_size).batch(FLAGS.train_batch_size)
    test = test.batch(FLAGS.eval_batch_size)

    return train, test, vocabularies
Example #7
 def __init_dataset__(self, type):
     if type == self.triplet:
         if self.train_ds is None or not self.is_triplet_dataset:
             print(">>>> Init triplet dataset...")
             # batch_size = int(self.batch_size / 4 * 1.5)
             batch_size = self.batch_size // 4
             tt = data.Triplet_dataset(self.data_path,
                                       batch_size=batch_size,
                                       random_status=self.random_status,
                                       random_crop=(100, 100, 3))
             self.train_ds = tt.train_dataset
             self.classes = self.train_ds.element_spec[-1].shape[-1]
             self.is_triplet_dataset = True
     else:
         if self.train_ds is None or self.is_triplet_dataset:
             print(">>>> Init softmax dataset...")
             self.train_ds = data.prepare_dataset(
                 self.data_path,
                 batch_size=self.batch_size,
                 random_status=self.random_status,
                 random_crop=(100, 100, 3))
             self.classes = self.train_ds.element_spec[-1].shape[-1]
             self.is_triplet_dataset = False
Example #8
from utils.evaluation import accuracy

df = pd.read_csv(anno_file, header=None)
filenames = df.pop(0)
filenames = DATA_DIR + filenames.astype(str)
DATASET_SIZE = len(filenames)
train_size = int((1 - val_split) * DATASET_SIZE)
val_size = int(val_split * DATASET_SIZE)
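# Build a tf.data pipeline of (image filename, annotation) pairs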
dataset = tf.data.Dataset.from_tensor_slices((filenames.values, df.values))

dataset = dataset.shuffle(1000)

train_dataset = dataset.take(train_size)
# Skip the training portion so validation samples do not overlap the training set
val_dataset = dataset.skip(train_size).take(val_size)

train_dataset = prepare_dataset(train_dataset, BATCH_SIZE, train=True)
val_dataset = prepare_dataset(val_dataset, BATCH_SIZE)

hourglass = StackedHourglass(n_stacks=n_stacks,
                             n_blocks=n_blocks,
                             n_keypoints=n_keypoints)

hourglass.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=base_lr,
                                                        centered=True),
                  loss=hg_loss,
                  metrics=[accuracy])

hourglass.build(input_shape=(BATCH_SIZE, *IMAGE_SIZE, 3))
hourglass.summary()

steps_per_epoch = train_size // BATCH_SIZE
Example #9
    25: 'voice',
}

columns.update(morph_features)
labels += morph_features.values()

if params['lemmatization'] and params['char_embeddings']:
    columns.update({2: 'lemma'})

data = (name, columns)

embeddings_path = 'wordvec/' + name + '.vec'
characters_path = 'characters/' + name + '.chars'

# Prepare dataset
pickle_dump = prepare_dataset(embeddings_path, data, characters_path)

# Load embeddings and dataset
embeddings, _, dataset = load_dataset(pickle_dump)

mappings = dataset['mappings']

model = MultitaskLSTM(name, embeddings, (
    dataset,
    labels,
), params=params)

if not os.path.isdir(dump_path):
    os.mkdir(dump_path)

model.train(args.nb_epochs)
Example #10
update_tag_scheme(train_sentences, parameters['tag_scheme'])
update_tag_scheme(dev_sentences, parameters['tag_scheme'])
update_tag_scheme(test_sentences, parameters['tag_scheme'])

print(train_sentences[0])
print(dev_sentences[0])
print(test_sentences[0])

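# Build word, character, and tag mappings from the training sentences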
dico_words, word_to_id, id_to_word = to_word_mapping(train_sentences,
                                                     parameters['lower'])
dico_chars, char_to_id, id_to_char = to_char_mapping(train_sentences)
dico_tags, tag_to_id, id_to_tag = to_tag_mapping(train_sentences)

# ##### Preparing final dataset

train_data = prepare_dataset(train_sentences, word_to_id, char_to_id,
                             tag_to_id, parameters['lower'])
dev_data = prepare_dataset(dev_sentences, word_to_id, char_to_id, tag_to_id,
                           parameters['lower'])
test_data = prepare_dataset(test_sentences, word_to_id, char_to_id, tag_to_id,
                            parameters['lower'])
print("{} / {} / {} sentences in train / dev / test.".format(
    len(train_data), len(dev_data), len(test_data)))

# ##### Load Word Embeddings

all_word_embeds = {}
for i, line in enumerate(
        codecs.open(parameters['embedding_path'], 'r', 'utf-8')):
    s = line.strip().split()
    if len(s) == parameters['word_dim'] + 1:
        all_word_embeds[s[0]] = np.array([float(x) for x in s[1:]])
dpi = 96
size = 700

train_transformations = transforms.Compose([
    #transforms.RandomHorizontalFlip(),
    #transforms.RandomCrop(32,padding=4),
    transforms.RandomResizedCrop(2000),  # randomly crop a (size, size) patch from the original image
    transforms.ToTensor(),
    #transforms.Normalize((0.5,0.5,0.5), (0.5,0.5,0.5))
])

data_dir = './image/site1/sample'
train_dir = './data/train/'
test_dir = './data/test/'
rate = 0.1  # fraction of images to sample, e.g. 10 out of 100 gives 0.1
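# Split the raw images under data_dir into train_dir / test_dir according to rate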
prepare_dataset(data_dir, train_dir, test_dir, rate)
train_dataset = dset.ImageFolder(root=train_dir,
                                 transform=train_transformations)
test_dataset = dset.ImageFolder(root=test_dir, transform=train_transformations)
print(train_dataset.class_to_idx)
print(len(train_dataset))
print(train_dataset[1000][0].size())
print(train_dataset[900][1])  # yields class index 4, i.e. 'FALSE'
print(train_dataset.classes[train_dataset[900][1]])
"""# plot one example
plt.subplot(1, 2, 1)
img = transforms.ToPILImage()(dataset[0][0])
plt.imshow(img)
plt.title('Class:'+dataset.classes[0])
plt.subplot(1, 2, 2)
img2 = transforms.ToPILImage()(dataset[201][0])