def __init_dataset__(self, type, emb_loss_names):
    init_as_triplet = self.triplet in emb_loss_names or type == self.triplet
    # Reuse the current dataset if it is already the right type and is not a distill dataset.
    if self.train_ds is not None and init_as_triplet == self.is_triplet_dataset and not self.is_distill_ds:
        return

    dataset_params = {
        "data_path": self.data_path,
        "batch_size": self.batch_size,
        "random_status": self.random_status,
        "image_per_class": self.image_per_class,
        "teacher_model_interf": self.teacher_model_interf,
    }
    if init_as_triplet:
        print(">>>> Init triplet dataset...")
        if self.data_path.endswith(".tfrecord"):
            print(">>>> Combining tfrecord dataset with triplet is NOT recommended.")
            self.train_ds, self.steps_per_epoch = data.prepare_distill_dataset_tfrecord(**dataset_params)
        else:
            aa = data.Triplet_dataset(**dataset_params)
            self.train_ds, self.steps_per_epoch = aa.ds, aa.steps_per_epoch
        self.is_triplet_dataset = True
    else:
        print(">>>> Init softmax dataset...")
        if self.data_path.endswith(".tfrecord"):
            self.train_ds, self.steps_per_epoch = data.prepare_distill_dataset_tfrecord(**dataset_params)
        else:
            self.train_ds, self.steps_per_epoch = data.prepare_dataset(**dataset_params)
        self.is_triplet_dataset = False

    if tf.distribute.has_strategy():
        self.train_ds = self.train_ds.with_options(self.data_options)

    label_spec = self.train_ds.element_spec[-1]
    if isinstance(label_spec, tuple):
        # Dataset with embedding values.
        self.is_distill_ds = True
        self.teacher_emb_size = label_spec[0].shape[-1]
        self.classes = label_spec[1].shape[-1]
        if type == self.distill:
            # Loss is distill type: [label * n, embedding]
            self.train_ds = self.train_ds.map(lambda xx, yy: (xx, yy[1:] * len(emb_loss_names) + yy[:1]))
        elif (self.distill in emb_loss_names and len(emb_loss_names) != 1) or (
            self.distill not in emb_loss_names and len(emb_loss_names) != 0
        ):
            # Will attach distill loss as embedding loss, and there are other embedding losses: [embedding, label * n]
            label_data_len = len(emb_loss_names) if self.distill in emb_loss_names else len(emb_loss_names) + 1
            self.train_ds = self.train_ds.map(lambda xx, yy: (xx, yy[:1] + yy[1:] * label_data_len))
    else:
        self.is_distill_ds = False
        self.classes = label_spec.shape[-1]
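# --- Hedged sketch (not from the original source): the two map() lambdas above rearrange the
# (embedding, label) element with plain Python tuple slicing and repetition, so the remapping can
# be checked standalone with dummy values. "emb_loss_names" here is a made-up list.
yy = ("emb", "label")                            # stands for (teacher_embedding, one_hot_label)
emb_loss_names = ["distill", "triplet"]          # hypothetical loss list with distill attached

# type == distill branch: labels repeated once per embedding loss, embedding appended last
print(yy[1:] * len(emb_loss_names) + yy[:1])     # ('label', 'label', 'emb')

# distill attached as an embedding loss: embedding first, labels repeated after it
label_data_len = len(emb_loss_names)             # distill is in emb_loss_names in this sketch
print(yy[:1] + yy[1:] * label_data_len)          # ('emb', 'label', 'label')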
def __init_dataset_softmax__(self):
    if self.train_ds is None or self.is_triplet_dataset:
        print(">>>> Init softmax dataset...")
        self.train_ds = data.prepare_dataset(
            self.data_path,
            batch_size=self.batch_size,
            random_status=self.random_status,
            random_crop=(100, 100, 3),
        )
        self.classes = self.train_ds.element_spec[-1].shape[-1]
        self.is_triplet_dataset = False
def __init_dataset_softmax__(self):
    if self.train_ds is None or self.is_triplet_dataset:
        print(">>>> Init softmax dataset...")
        self.train_ds = data.prepare_dataset(
            self.data_path,
            batch_size=self.batch_size,
            random_status=self.random_status,
            random_crop=(100, 100, 3),
            cache=self.dataset_cache,
        )
        label_spec = self.train_ds.element_spec[-1]
        if isinstance(label_spec, tuple):
            # Dataset with embedding values.
            self.is_distiller = True
            self.classes = label_spec[0].shape[-1]
        else:
            self.is_distiller = False
            self.classes = label_spec.shape[-1]
        self.is_triplet_dataset = False
def __init_dataset__(self, type):
    if type == self.triplet:
        if self.train_ds is None or not self.is_triplet_dataset:
            print(">>>> Init triplet dataset...")
            # batch_size = int(self.batch_size / 4 * 1.5)
            batch_size = self.batch_size // 4
            tt = data.Triplet_dataset(self.data_path, batch_size=batch_size, random_status=self.random_status)
            self.train_ds, self.steps_per_epoch = tt.train_dataset, tt.steps_per_epoch
            self.is_triplet_dataset = True
    else:
        if self.train_ds is None or self.is_triplet_dataset:
            print(">>>> Init softmax dataset...")
            self.train_ds, self.steps_per_epoch, self.classes = data.prepare_dataset(
                self.data_path, batch_size=self.batch_size, random_status=self.random_status
            )
            self.is_triplet_dataset = False
def prepare_forecast_data(self, data_path='./data/forecast/'):
    # Download and store all sites' data
    start_date, end_date = data.get_data_start_and_end_date('forecast')

    # Clean the old files from the directory
    data.clean_directory(data_path)

    # Download the data for each site
    for site in self.site_ids:
        data.download_site_data(site, start_date, end_date, data_path)

    forecast_df = data.prepare_dataset(data_path)
    # print(forecast_df.head())

    # Forecast dataset fields need to be rearranged
    forecast_df = forecast_df[['NO', 'PM10', 'PM2.5', 'CO']]
    # print(forecast_df.head())
    # print(forecast_df.shape)

    forecast_period = (datetime.strptime(
        np.datetime_as_string(forecast_df.index[-1:].values[0], unit='s'),
        '%Y-%m-%dT%H:%M:%S') + timedelta(minutes=30)).strftime("%d-%b-%Y %H:%M:%S")
    # print('Forecast period: {}'.format(forecast_period))

    forecast_dataset = data.split_forecast_dataset(forecast_df.values)
    return forecast_period, forecast_dataset
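# --- Hedged sketch (synthetic timestamp, not the forecast data): the forecast_period expression
# above converts the last numpy.datetime64 index value to a datetime and adds 30 minutes; the same
# arithmetic can be checked in isolation like this.
from datetime import datetime, timedelta
import numpy as np

last = np.datetime64("2021-06-01T10:30:00")       # made-up "last index" value
forecast_period = (datetime.strptime(np.datetime_as_string(last, unit='s'),
                                     '%Y-%m-%dT%H:%M:%S')
                   + timedelta(minutes=30)).strftime("%d-%b-%Y %H:%M:%S")
print(forecast_period)                            # 01-Jun-2021 11:00:00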
def load_data(features_dict):
    dataset = f'movielens/{FLAGS.dataset}-ratings'
    ratings = tfds.load(dataset, split='train', data_dir=FLAGS.data_dir)

    # Prepare for binarization: drop neutral ratings. filter() returns a new dataset,
    # so its result must be assigned back.
    ratings = ratings.filter(lambda x: x['user_rating'] != 3.0)
    ratings = prepare_dataset(ratings, features_dict)

    # Cache for efficiency
    ratings = ratings.cache(tempfile.NamedTemporaryFile().name)

    features = features_by_type(features_dict)
    categorical_features = features['string'] + features['integer']
    vocabularies = get_vocabularies(ratings, categorical_features)

    train, test = train_test_split(ratings, train_size=0.8, seed=FLAGS.seed)
    train_size = len(train)
    train = train.shuffle(train_size).batch(FLAGS.train_batch_size)
    test = test.batch(FLAGS.eval_batch_size)
    return train, test, vocabularies
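# --- Hedged sketch (toy data, not the MovieLens pipeline): tf.data.Dataset.filter returns a new
# dataset rather than filtering in place, which is why the filter call above must be reassigned.
import tensorflow as tf

toy = tf.data.Dataset.from_tensor_slices({"user_rating": [1.0, 3.0, 5.0]})
toy = toy.filter(lambda x: x["user_rating"] != 3.0)   # drop the neutral 3.0 rating
print([e["user_rating"].numpy() for e in toy])        # [1.0, 5.0]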
def __init_dataset__(self, type):
    if type == self.triplet:
        if self.train_ds is None or not self.is_triplet_dataset:
            print(">>>> Init triplet dataset...")
            # batch_size = int(self.batch_size / 4 * 1.5)
            batch_size = self.batch_size // 4
            tt = data.Triplet_dataset(self.data_path, batch_size=batch_size,
                                      random_status=self.random_status, random_crop=(100, 100, 3))
            self.train_ds = tt.train_dataset
            self.classes = self.train_ds.element_spec[-1].shape[-1]
            self.is_triplet_dataset = True
    else:
        if self.train_ds is None or self.is_triplet_dataset:
            print(">>>> Init softmax dataset...")
            self.train_ds = data.prepare_dataset(
                self.data_path, batch_size=self.batch_size,
                random_status=self.random_status, random_crop=(100, 100, 3))
            self.classes = self.train_ds.element_spec[-1].shape[-1]
            self.is_triplet_dataset = False
from utils.evaluation import accuracy
import pandas as pd
import tensorflow as tf

df = pd.read_csv(anno_file, header=None)
filenames = df.pop(0)
filenames = DATA_DIR + filenames.astype(str)

DATASET_SIZE = len(filenames)
train_size = int((1 - val_split) * DATASET_SIZE)
val_size = int(val_split * DATASET_SIZE)

dataset = tf.data.Dataset.from_tensor_slices((filenames.values, df.values))
dataset = dataset.shuffle(1000)
train_dataset = dataset.take(train_size)
# Skip the training samples so the validation split does not overlap the training split.
val_dataset = dataset.skip(train_size).take(val_size)

train_dataset = prepare_dataset(train_dataset, BATCH_SIZE, train=True)
val_dataset = prepare_dataset(val_dataset, BATCH_SIZE)

hourglass = StackedHourglass(n_stacks=n_stacks, n_blocks=n_blocks, n_keypoints=n_keypoints)
hourglass.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=base_lr, centered=True),
                  loss=hg_loss,
                  metrics=[accuracy])
hourglass.build(input_shape=(BATCH_SIZE, *IMAGE_SIZE, 3))
hourglass.summary()

steps_per_epoch = train_size // BATCH_SIZE
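# --- Hedged sketch (synthetic range data): the take()/skip() split used above, shown on a tiny
# dataset so the non-overlapping behaviour is easy to verify.
import tensorflow as tf

ds = tf.data.Dataset.range(10)
train = ds.take(8)                                # first 8 elements
val = ds.skip(8)                                  # remaining 2; without skip() they would overlap train
print(list(train.as_numpy_iterator()))            # [0, 1, 2, 3, 4, 5, 6, 7]
print(list(val.as_numpy_iterator()))              # [8, 9]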
    25: 'voice',
}
columns.update(morph_features)
labels += morph_features.values()

if params['lemmatization'] and params['char_embeddings']:
    columns.update({2: 'lemma'})

data = (name, columns)
embeddings_path = 'wordvec/' + name + '.vec'
characters_path = 'characters/' + name + '.chars'

# Prepare dataset
pickle_dump = prepare_dataset(embeddings_path, data, characters_path)

# Load embeddings and dataset
embeddings, _, dataset = load_dataset(pickle_dump)
mappings = dataset['mappings']

model = MultitaskLSTM(name, embeddings, (dataset, labels,), params=params)

if not os.path.isdir(dump_path):
    os.mkdir(dump_path)

model.train(args.nb_epochs)
update_tag_scheme(train_sentences, parameters['tag_scheme'])
update_tag_scheme(dev_sentences, parameters['tag_scheme'])
update_tag_scheme(test_sentences, parameters['tag_scheme'])

print(train_sentences[0])
print(dev_sentences[0])
print(test_sentences[0])

dico_words, word_to_id, id_to_word = to_word_mapping(train_sentences, parameters['lower'])
dico_chars, char_to_id, id_to_char = to_char_mapping(train_sentences)
dico_tags, tag_to_id, id_to_tag = to_tag_mapping(train_sentences)

# ##### Preparing final dataset
train_data = prepare_dataset(train_sentences, word_to_id, char_to_id, tag_to_id, parameters['lower'])
dev_data = prepare_dataset(dev_sentences, word_to_id, char_to_id, tag_to_id, parameters['lower'])
test_data = prepare_dataset(test_sentences, word_to_id, char_to_id, tag_to_id, parameters['lower'])

print("{} / {} / {} sentences in train / dev / test.".format(
    len(train_data), len(dev_data), len(test_data)))

# ##### Load Word Embeddings
all_word_embeds = {}
for i, line in enumerate(codecs.open(parameters['embedding_path'], 'r', 'utf-8')):
    s = line.strip().split()
    if len(s) == parameters['word_dim'] + 1:
        all_word_embeds[s[0]] = np.array([float(v) for v in s[1:]])
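# --- Hedged sketch (made-up line and dimension): each embedding line above is expected to be
# "token v1 ... v_word_dim", so a valid line splits into word_dim + 1 whitespace-separated fields.
import numpy as np

word_dim = 3
line = "cat 0.1 0.2 0.3"
s = line.strip().split()
if len(s) == word_dim + 1:                        # token plus word_dim float values
    vec = np.array([float(v) for v in s[1:]])
    print(vec)                                    # [0.1 0.2 0.3]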
dpi = 96
size = 700

train_transformations = transforms.Compose([
    # transforms.RandomHorizontalFlip(),
    # transforms.RandomCrop(32, padding=4),
    transforms.RandomResizedCrop(2000),  # randomly crop a (size, size) patch from the original image
    transforms.ToTensor(),
    # transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

data_dir = './image/site1/sample'
train_dir = './data/train/'
test_dir = './data/test/'
rate = 0.1  # fraction of images to sample, e.g. 10 out of 100 gives 0.1

prepare_dataset(data_dir, train_dir, test_dir, rate)

train_dataset = dset.ImageFolder(root=train_dir, transform=train_transformations)
test_dataset = dset.ImageFolder(root=test_dir, transform=train_transformations)

print(train_dataset.class_to_idx)
print(len(train_dataset))
print(train_dataset[1000][0].size())
print(train_dataset[900][1])  # gives class index 4, i.e. 'FALSE'
print(train_dataset.classes[train_dataset[900][1]])

"""# plot one example
plt.subplot(1, 2, 1)
img = transforms.ToPILImage()(dataset[0][0])
plt.imshow(img)
plt.title('Class:' + dataset.classes[0])
plt.subplot(1, 2, 2)
img2 = transforms.ToPILImage()(dataset[201][0])