def main(args):
    steps_per_epoch = NUM_IMAGES // args.batch_size
    training_steps = (NUM_IMAGES * args.num_epochs) // args.batch_size
    train_imsize = nfnet_params[args.variant]["train_imsize"]
    test_imsize = nfnet_params[args.variant]["test_imsize"]
    aug_base_name = "cutmix_mixup_randaugment"
    augment_name = f"{aug_base_name}_{nfnet_params[args.variant]['RA_level']}"
    max_lr = 0.1 * args.batch_size / 256
    eval_preproc = "resize_crop_32"

    model = NFNet(
        num_classes=1000,
        variant=args.variant,
        label_smoothing=args.label_smoothing,
        ema_decay=args.ema_decay,
    )
    model.build((1, 224, 224, 3))

    lr_decayed_fn = tf.keras.experimental.CosineDecay(
        initial_learning_rate=max_lr,
        decay_steps=training_steps - 5 * steps_per_epoch,
    )
    lr_schedule = WarmUp(
        initial_learning_rate=max_lr,
        decay_schedule_fn=lr_decayed_fn,
        warmup_steps=5 * steps_per_epoch,
    )
    optimizer = tfa.optimizers.SGDW(learning_rate=lr_schedule, weight_decay=2e-5, momentum=0.9)

    model.compile(
        optimizer=optimizer,
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[
            tf.keras.metrics.SparseCategoricalAccuracy(name="top_1_acc"),
            tf.keras.metrics.SparseTopKCategoricalAccuracy(k=5, name="top_5_acc"),
        ],
    )

    ds_train = load(
        Split(2),
        is_training=True,
        batch_dims=(args.batch_size, ),
        # dtype=tf.bfloat16,
        image_size=(train_imsize, train_imsize),
        augment_name=augment_name,
    )
    # ds_valid = load(Split(3), is_training=False, batch_dims=(256, ), augment_name="cutmix")
    ds_test = load(
        Split(4),
        is_training=False,
        batch_dims=(25, ),
        # dtype=tf.bfloat16,
        image_size=(test_imsize, test_imsize),
        eval_preproc=eval_preproc,
    )

    model.fit(
        ds_train,
        validation_data=ds_test,
        epochs=args.num_epochs,
        steps_per_epoch=steps_per_epoch,
        callbacks=[tf.keras.callbacks.TensorBoard()],
    )
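# A minimal, hypothetical driver for the training entry point above. It is not
# part of the original source; the flag names simply mirror the attributes read
# from `args` (variant, batch_size, num_epochs, label_smoothing, ema_decay), and
# the default values are illustrative assumptions.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Train an NFNet variant")
    parser.add_argument("--variant", default="F0")
    parser.add_argument("--batch_size", type=int, default=256)
    parser.add_argument("--num_epochs", type=int, default=360)
    parser.add_argument("--label_smoothing", type=float, default=0.1)
    parser.add_argument("--ema_decay", type=float, default=0.99999)
    main(parser.parse_args())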
def test_read_large_dataset(self):
    dataset = Dataset(
        '/home/karthik/PycharmProjects/cmps242/project/yelp_dataset_challenge_academic_dataset',
        8, 1000, -1)
    dataset_stats = dataset.load()
    term_freq_prod_inv_doc_freq = dataset_stats.top_term_freq_prod_inv_doc_freq(50)
    for term, freq in term_freq_prod_inv_doc_freq.iteritems():
        print('term:%s tf-idf:%s idf:%s'
              % (term, str(freq), str(dataset_stats.inverse_doc_freq(term))))
def run_training(args):
    out_dir = pathlib.Path(args.directory)
    sentences = dataset.load(args.source)
    if args.epoch is not None:
        start = args.epoch + 1
        storage = load(out_dir, args.epoch)
        sentences = itertools.islice(sentences, start, None)
    else:
        start = 0
        storage = init(args)
        if (out_dir / meta_name).exists():
            if input('Overwrite? [y/N]: ').strip().lower() != 'y':
                exit(1)
        with (out_dir / meta_name).open('wb') as f:
            np.save(f, [storage])

    batchsize = 5000
    for i, sentence in enumerate(sentences, start):
        if i % batchsize == 0:
            print()
            serializers.save_npz(str(out_dir / model_name(i)), storage.model)
            serializers.save_npz(str(out_dir / optimizer_name(i)), storage.optimizer)
        else:
            print(util.progress('batch {}'.format(i // batchsize),
                                (i % batchsize) / batchsize, 100),
                  end='')
        train(storage.model, storage.optimizer,
              generate_data(sentence),
              generate_label(sentence),
              generate_attr(sentence, storage.mappings))
def main(args):
    steps_per_epoch = NUM_IMAGES // args.batch_size
    test_imsize = nfnet_params[args.variant]["test_imsize"]
    eval_preproc = "resize_crop_32"

    model = NFNet(
        num_classes=1000,
        variant=args.variant,
    )
    model.build((1, test_imsize, test_imsize, 3))
    model.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[
            tf.keras.metrics.SparseCategoricalAccuracy(name="top_1_acc"),
            tf.keras.metrics.SparseTopKCategoricalAccuracy(k=5, name="top_5_acc"),
        ],
    )

    ds_test = load(
        Split(4),
        is_training=False,
        batch_dims=(args.batch_size, ),
        # dtype=tf.bfloat16,
        image_size=(test_imsize, test_imsize),
        eval_preproc=eval_preproc,
    )

    model.load_weights(f"{args.variant}_NFNet/{args.variant}_NFNet")
    model.evaluate(ds_test, steps=steps_per_epoch)
def download_poster_links(save_every=20, pbar=True):
    ids = dataset.load('links.csv', usecols=['movieId', 'imdbId'], index_col='movieId')
    ids = _only_unfinished(ids)
    posters = pd.DataFrame()
    posters.index.name = 'movieId'
    iterator = enumerate(ids.iterrows())
    if pbar:
        iterator = tqdm(iterator, total=len(ids.index))
    for i, row in iterator:
        movieId, (imdbId,) = row
        try:
            link = movieposters.get_poster(id=imdbId)
        except (movieposters.MovieNotFound, movieposters.PosterNotFound):
            link = np.nan
        posters.at[movieId, 'poster'] = link
        if (i + 1) % save_every == 0:
            save_poster_links(posters)
            # once saved, not clearing the df will result in dupes saved later
            posters = posters.iloc[0:0]
    if (i + 1) % save_every != 0:  # didn't save the last batch
        save_poster_links(posters)
def main():
    with tf.Session() as session:
        network = network_mod.restore(session)
        dataset = dataset_mod.load()
        cost, accuracy = run_test(session, network, dataset)
        print(f"Test Cost: {cost:0.3f} | Test Acc: {100*accuracy:3.1f}")
def run_simulation(num_rounds: int, num_clients: int, fraction_fit: float):
    """Start a FL simulation."""
    # This will hold all the processes which we are going to create
    processes = []

    # Start the server
    server_process = Process(
        target=start_server, args=(num_rounds, num_clients, fraction_fit)
    )
    server_process.start()
    processes.append(server_process)

    # Optionally block the script here for a second or two so the server has time to start
    time.sleep(2)

    # Load the dataset partitions
    partitions = dataset.load(num_clients)

    # Start all the clients
    for partition in partitions:
        client_process = Process(target=start_client, args=(partition,))
        client_process.start()
        processes.append(client_process)

    # Block until all processes are finished
    for p in processes:
        p.join()
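# Hypothetical invocation of the simulation above; the concrete round/client
# counts are illustrative placeholders, not values taken from the original script.
if __name__ == "__main__":
    run_simulation(num_rounds=10, num_clients=5, fraction_fit=0.5)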
def test_iris(self):
    x, y = dataset.load('iris')
    self.assertEqual(x.shape, (150, 4))
    self.assertEqual(y.shape, (150, ))
    self.assertEqual(np.min(x), 0)
    self.assertEqual(np.max(x), 1)
    self.assertEqual(np.min(y), 0)
    self.assertEqual(np.max(y), 2)
def test_load(self):
    x, y = dataset.load()
    self.assertEqual(x.shape, (70000, 28, 28, 1))
    self.assertEqual(y.shape, (70000, ))
    self.assertEqual(np.min(x), 0)
    self.assertEqual(np.max(x), 1)
    self.assertEqual(np.min(y), 0)
    self.assertEqual(np.max(y), 9)
def get_title(movieId):
    global _TITLES_BY_ID  # noqa
    try:
        return _TITLES_BY_ID.at[movieId, 'title']  # noqa
    except NameError:
        _TITLES_BY_ID = dataset.load(
            'movies.csv', index_col='movieId', usecols=['movieId', 'title'])
        return _TITLES_BY_ID.at[movieId, 'title']
def get_model():
    if os.path.isfile(model_path):
        model = load_model(model_path)
    else:
        data = dataset.load()
        model = create_net(data)
    return model
def dump_dset(ps):
    ps.max_val = 10000
    ps.num_samples = 1000  # 100000
    ps.num_shards = 10
    fs = [f for f in qd.dump(ps)]
    ps.dim_batch = 100
    # iterate once over the reloaded, adapted dataset to count the batches
    for i, _ in enumerate(qd.load(ps, fs).map(adapter)):
        pass
    print(f'dumped {i + 1} batches of {ps.dim_batch} samples each')
    return fs
def submit(predicted, filename='submission.csv'):
    U.log('Converting predictions into submission file.')
    if U.on_kaggle():
        U.log('Running on Kaggle.')
        sample = pd.read_csv(
            '/kaggle/input/data-science-bowl-2019/sample_submission.csv')
    else:
        U.log('Running locally.')
        [sample] = load(Subset.Sample)
    sample['accuracy_group'] = predicted.astype(int)
    sample.to_csv(filename, index=False)
    return filename
def load_ev(config=setup_config(), args=args):
    dl_ev = torch.utils.data.DataLoader(
        dataset.load(
            name=args.dataset,
            root=config['dataset'][args.dataset]['root'],
            classes=config['dataset'][args.dataset]['classes']['eval'],
            transform=dataset.utils.make_transform(
                **config['transform_parameters'], is_train=False)),
        batch_size=args.sz_batch,
        shuffle=False,
        num_workers=args.nb_workers,
        pin_memory=True)
    return dl_ev
def start_game(mode):
    global game_mode, game_state, moves, field, winner, features, labels
    game_mode = mode
    if game_mode != training:
        game_state = game_inprocess
        if game_mode == pvai:
            features, labels = load()  # Reloading data
    else:
        game_state = game_intraining
    field = [[0, 0, 0], [0, 0, 0], [0, 0, 0]]
    moves = 0
    winner = ''
def load(config, options=None):
    label = None
    description = None
    dataset = None
    model = None
    training = None
    if options is None:
        options = {}
    if 'base' not in options or options['base'] is None:
        options['base'] = os.getcwd()
    if not options['base'].startswith('/'):
        options['base'] = os.path.join(os.getcwd(), options['base'])
    if 'label' in config:
        label = config['label']
    if 'description' in config:
        description = config['description']
    if 'model' in config:
        model = model_utils.load(config['model'], options)
    if 'training' in config:
        training = training_utils.load(config['training'], options)
    if 'dataset' in config:
        dataset = dataset_utils.load(config['dataset'], model, training, options)
    if 'weightsHdf5' in config:
        weights_hdf5 = config['weightsHdf5']
        if not weights_hdf5.startswith('/'):
            weights_hdf5 = os.path.join(options['base'], weights_hdf5)
    project = Project(
        label=label,
        description=description,
        weights_hdf5=weights_hdf5,
        dataset=dataset,
        model=model,
        training=training,
        options=options)
    if 'loadWeights' in options and options['loadWeights']:
        project.model.load_weights_hdf5(project.weights_hdf5)
    return project
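# An illustrative config for the project loader above, assembled only from the
# keys the function actually reads ('label', 'description', 'model', 'training',
# 'dataset', 'weightsHdf5'); the values are placeholders, not a real project.
example_config = {
    'label': 'demo-project',
    'description': 'Example project definition',
    'model': {},        # passed through to model_utils.load
    'training': {},     # passed through to training_utils.load
    'dataset': {},      # passed through to dataset_utils.load
    'weightsHdf5': 'weights/model.hdf5',
}
# project = load(example_config, options={'base': '.', 'loadWeights': False})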
def main(generators, image_file):
    input_image = load(image_file)
    input_image = tf.image.resize(input_image, (768, 1024))
    gen_image = recuperacion(generators, image_file)
    plt.subplot(1, 2, 1); plt.imshow(input_image)
    plt.subplot(1, 2, 2); plt.imshow(gen_image)
    plt.show()
    if not os.path.exists('imagenes_generadas'):
        os.mkdir('imagenes_generadas')  # Create the directory if it doesn't exist.
    gen_image = tf.cast(gen_image * 255, tf.uint8)
    gen_image = tf.image.encode_jpeg(gen_image)
    tf.io.write_file(f'imagenes_generadas/imagen_generada_{str(time.time())}.jpg', gen_image)
def create_random_submission(
    test_csv: str = "data/test.csv",
    output_file: str = "submission.csv",
):
    dataset = load(test_csv, preprocess=False)
    # fall back through the possible id columns
    try:
        ids = dataset["id"]
    except KeyError:
        try:
            ids = dataset["comment_id"]
        except KeyError:
            ids = dataset["comment_text"]
    all_predictions = [random.randint(0, 1) for _ in range(len(ids))]
    df = pd.DataFrame(columns=["id", "prediction"], data=zip(*[ids, all_predictions]))
    df.to_csv(output_file)
def run_test(args):
    out_dir = pathlib.Path(args.directory)
    sentences = dataset.load(args.source)
    storage = load(out_dir, args.epoch)
    y_sum = None
    zs_sum = None
    for i, sentence in enumerate(itertools.islice(sentences, 100)):
        y_mat, zs_mat = test(
            storage.model,
            generate_data(sentence),
            generate_label(sentence),
            generate_attr(sentence, storage.mappings)
        )
        if i == 0:
            y_sum = y_mat
            zs_sum = zs_mat
        else:
            y_sum += y_mat
            for z_sum, z_mat in zip(zs_sum, zs_mat):
                z_sum += z_mat
    prec, rec, f = statistic.f_measure(y_sum)
    print('== segmentation ==')
    print('precision:', prec)
    print('recall:', rec)
    print('F-measure:', f)
    for k, z_sum in zip(storage.mappings._fields, zs_sum):
        prec, rec, f = statistic.f_measure_micro_average(z_sum)
        print('== {} =='.format(k))
        print('precision:', prec)
        print('recall:', rec)
        print('F-measure:', f)
    print('expect:', '/'.join(info.surface_form for info in sentence))
    print('actual:', '/'.join(
        y for (y, zs) in generate(storage.model, generate_data(sentence))
    ))
def main():
    cifar = dataset.load(10000)
    X, Y = cifar.data, cifar.target
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

    logistic = linear_model.LogisticRegression(C=6000.0)
    rbm = BernoulliRBM(n_components=100, learning_rate=0.025, batch_size=10, n_iter=100, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

    # Training RBM-Logistic Pipeline
    classifier.fit(X_train, Y_train)

    # Training Logistic regression
    logistic_classifier = linear_model.LogisticRegression(C=100.0)
    logistic_classifier.fit(X_train, Y_train)

    Y_predicted_rbm = classifier.predict(X_test)
    Y_predicted_raw = logistic_classifier.predict(X_test)

    # Evaluate classifiers
    print()
    print("Logistic regression using RBM features:\n%s\n" % (
        metrics.classification_report(
            Y_test, Y_predicted_rbm, target_names=cifar.target_names)))
    print("Logistic regression using raw pixel features:\n%s\n" % (
        metrics.classification_report(
            Y_test, Y_predicted_raw, target_names=cifar.target_names)))
    print("Confusion matrix RBM features:\n%s" %
          metrics.confusion_matrix(Y_test, Y_predicted_rbm))
    print("Confusion matrix raw pixel features:\n%s" %
          metrics.confusion_matrix(Y_test, Y_predicted_raw))

    # Plot RBM features
    plot(rbm, 100)
def generate_samples(model_path, rows, cols, channels, sample_size):
    img_size = rows * cols

    # Load model
    json_file = open(model_path + ".json", 'r')
    json_model = json_file.read()
    json_file.close()
    model = model_from_json(json_model)
    model.load_weights(model_path + ".h5")

    # Load dataset
    (_, sketches) = dataset.load("sketches")
    sketches = sketches[0:sample_size]

    # Resize dataset
    sketches = sketches / 127.5 - 1.
    sketches = np.expand_dims(sketches, axis=3)
    sketches = sketches.reshape(sketches.shape[0], img_size)

    # Generate samples
    gen_imgs = model.predict(sketches)

    # Rescale images 0 - 1
    gen_imgs = 0.5 * gen_imgs + 0.5

    # Reshape images
    sketches = sketches.reshape((sample_size, rows, cols, channels))
    gen_imgs = gen_imgs.reshape((sample_size, rows, cols, channels))

    # Make directories
    if not os.path.exists("model_results/input"):
        os.makedirs("model_results/input")
    if not os.path.exists("model_results/output"):
        os.makedirs("model_results/output")

    # Save images
    for i in range(len(sketches)):
        img = image.array_to_img(sketches[i])
        img.save('model_results/input/' + str(i) + '.png')
    for i in range(len(gen_imgs)):
        img = image.array_to_img(gen_imgs[i])
        img.save('model_results/output/' + str(i) + '.png')
def train(name, resume):
    # paths
    log_path = "logs/{}.json".format(name)
    out_path = "snapshots/" + name + ".{epoch:06d}.h5"
    echo('log path', log_path)
    echo('out path', out_path)
    lib.log.info(log_path, {'_commandline': {'name': name, 'resume': resume}})

    # init
    echo('train', (name, resume))
    session = tf.Session('')
    K.set_session(session)
    K.set_learning_phase(1)

    # dataset
    echo('dataset loading...')
    (x_train, y_train), (x_test, y_test) = dataset.load()

    # model building
    echo('model building...')
    model = lib.model.build()
    model.summary()
    if resume:
        echo('Resume Learning from {}'.format(resume))
        model.load_weights(resume, by_name=True)

    # training
    echo('start learning...')
    callbacks = [
        lib.log.JsonLog(log_path),
        keras.callbacks.ModelCheckpoint(out_path, monitor='val_loss', save_weights_only=True)
    ]
    model.fit(x_train, y_train,
              batch_size=30,
              epochs=10,
              callbacks=callbacks,
              validation_data=(x_test, y_test))
def main():
    ds = dataset.load()
    dates = dataset.get_dates_array()

    import jordicolomer_autoregressive
    for m in range(1, 2):
        params = {'m': m, 'dates': dates}
        print 'jordicolomer_autoregressive', m, evaluate_all(ds, jordicolomer_autoregressive.predict, params), '\n'
    #exit(0)

    import jordicolomer_average
    for n in range(1, 11):
        params = {'n': n, 'dates': dates}
        print 'jordicolomer_average', n, evaluate_all(ds, jordicolomer_average.predict, params), '\n'

    import jordicolomer_averageweekly
    for n in range(1, 11):
        params = {'n': n, 'dates': dates}
        print 'jordicolomer_averageweekly', n, evaluate_all(ds, jordicolomer_averageweekly.predict, params), '\n'

    import jordicolomer_averageWeeklyWithTrend
    for m in range(1, 20):
        params = {'m': m, 'dates': dates}
        print 'jordicolomer_averageWeeklyWithTrend', m, evaluate_all(ds, jordicolomer_averageWeeklyWithTrend.predict, params), '\n'
def test(theta):
    print 'loading data...'
    _, _, dataTe = dataset.load(name='mnist.pkl.gz')

    print 'building the graph...'
    # fprop
    x = T.matrix('x', 'float32')
    F = models.create_mlp(x, theta)
    # zero-one loss
    y = T.ivector('y')
    ell = loss.create_zeroone(F, y)
    # all in one graph
    f_graph = function(
        inputs=[],
        outputs=ell,
        givens={x: dataTe[0], y: dataTe[1]}
    )

    print 'fire the graph...'
    er = f_graph()
    print 'error rate = %5.4f' % (er,)
def run(hidden, layer, dropout, learning_rate, iteration, save, train=None, test=None):
    if train:
        dataset_id = train.split('/')[-1].split('.')[0]
        pre_processing = PreProcessing(open(train, 'r'), dataset_id)
        dataset = process(pre_processing)
        encoder_embeddings = WordEmbedding(source=dataset.pairs)
        decoder_embeddings = WordEmbedding(source=dataset.pairs)
        encoder = EncoderRNN(encoder_embeddings, hidden, layer).to(settings.device)
        decoder = DecoderRNN(hidden, decoder_embeddings, dropout, layer).to(settings.device)
        model = Model(
            encoder=encoder,
            decoder=decoder,
            learning_rate=learning_rate,
        )
        model.summary()
        model.train(dataset, n_iter=iteration, save_every=save)
    if test:
        dataset = load(test)
        model = Model.load(test)
        while True:
            decoded_words = model.evaluate(str(input("> ")), dataset)
            print(' '.join(decoded_words))
def compute_qini(parameters):
    X_original, t_original, y_original = dataset.load(parameters['dataset_id'])
    X, t, y = dataset.shuffled(X_original, t_original, y_original,
                               seed=parameters['shuffle_seed'])
    ((X_train, t_train, y_train),
     (X_test, t_test, y_test)) = dataset.train_test_split(X, t, y, train_proportion=2 / 3)
    rfc = RandomForestClassifier(
        n_estimators=parameters['n_estimators'],
        criterion=parameters['criterion'],
        max_depth=parameters['max_depth'],
        min_samples_split=parameters['min_samples_split'],
        min_samples_leaf=parameters['min_samples_leaf'])
    rfc.fit(X_train, y_train, t_train)
    uplift_test = rfc.predict_uplift(X_test)
    return qini_q(y_test, uplift_test, t_test)
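# Illustrative parameter dictionary for compute_qini above; the keys match the
# ones the function reads, while every value (including the dataset id and the
# criterion name) is an assumed placeholder rather than a setting from the source.
example_parameters = {
    'dataset_id': 'example_dataset',   # assumed id understood by dataset.load
    'shuffle_seed': 0,
    'n_estimators': 100,
    'criterion': 'uplift_gini',        # assumed uplift splitting criterion name
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 100,
}
# q = compute_qini(example_parameters)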
def load_data(fn, data_path, split_date):
    ratings_ = dataset.load(fn, path=data_path, delim=',')
    ratings = dataset.parse_timestamp(ratings_)

    # rename ratings columns
    ratings = ratings.rename(
        columns={
            "userId": "user_id",
            "movieId": "item_id",
            "rating": "rating",
            "datetime": "datetime"
        })

    # MovieLens data stats
    print("ratings columns: {}".format(ratings.columns))
    print("No of rows in ratings df: {}".format(ratings.shape[0]))
    print("Min datetime: {}, max datetime: {}".format(
        ratings["datetime"].min(), ratings["datetime"].max()))

    split_time = pd.datetime.strptime(split_date, '%Y-%m-%d %H:%M:%S.%f')

    # split train/test folds
    train_df, test_df = dataset.split(ratings, split_time)
    print("Size of train dataset: {} & size of test dataset: {}".format(
        train_df.shape[0], test_df.shape[0]))
    print(ratings.head(5))
    return train_df, test_df
def recuperacion(generators, image_file):
    """
    Generates an image of a landscape recovered after a fire.
    The generator transforms the image patch by patch.

    Args:
    - generators: A list of generator models that take an image and produce
      its recovered version.
    - image_file: The path of the image to transform.

    Returns:
        The resulting image stored in a float32 `Tensor` with values in 0-1.
    """
    img_prueba = load(image_file)
    img_prueba = tf.image.resize(img_prueba, (768, 1024))
    resultado = np.zeros((768, 1024, 3))
    x, y, _ = img_prueba.shape
    fila = 0
    columna = 0
    intervalo_y = (y - 256) // 5
    intervalo_x = (x - 256) // 3
    while 256 + fila < x:
        while 256 + columna < y:
            #graficar(img_prueba/255, img_prueba[: , 0+columna:256+columna]/255 )
            part = img_prueba[fila:256 + fila, columna:256 + columna]
            part = (part * 2) - 1
            part_gen = generar_imagen(generators, part[None, ...])
            part = resultado[fila:256 + fila, columna:256 + columna]
            part[part == 0] = part_gen[part == 0]
            part[:] = np.mean([part, part_gen], axis=0)
            columna += intervalo_y
        columna = 0
        fila += intervalo_x
    resultado = (resultado + 1) / 2
    resultado = tf.cast(resultado, tf.float32)
    return resultado
cfg.read(sys.argv[1])
print 'train:', cfg.get('data', 'train')
print 'test:', cfg.get('data', 'test')
print 'batch:', cfg.get('cnn', 'batch')
print 'epochs:', cfg.get('cnn', 'epochs')
print 'embdims:', cfg.get('cnn', 'embdims')
print 'filters:', cfg.get('cnn', 'filters')
print 'filtlen:', cfg.get('cnn', 'filtlen')
print 'hidden:', cfg.get('cnn', 'hidden')
print 'dropout:', cfg.get('cnn', 'dropout')
print 'learnrt:', cfg.get('cnn', 'learnrt')

# learn alphabets from training examples
dataset = dataset.DatasetProvider(cfg.get('data', 'train'))

# now load training examples and labels
train_x1, train_x2, train_y = dataset.load(cfg.get('data', 'train'))
maxlen = max([len(seq) for seq in train_x1])

# now load test examples and labels
test_x1, test_x2, test_y = dataset.load(cfg.get('data', 'test'), maxlen=maxlen)

init_vectors = None
# TODO: what are we doing for index 0 (oov words)?
# use pre-trained word embeddings?
if cfg.has_option('data', 'embed'):
    print 'embeddings:', cfg.get('data', 'embed')
    word2vec = word2vec_model.Model(cfg.get('data', 'embed'))
    init_vectors = [word2vec.select_vectors(dataset.word2int)]

# turn x and y into numpy array among other things
classes = len(set(train_y))
train_x1 = pad_sequences(train_x1, maxlen=maxlen)
    pass

    def share_data(self, data, dtype):
        if data.dtype != np.dtype(dtype):
            data = data.astype(dtype)
        return tn.shared(data, borrow=borrow)

if __name__ == '__main__':
    data_file = 'mnist.pkl.gz'
    learning_rate = 0.005
    epochs = 10000
    batch_size = 500
    borrow = True

    data = dataset.load(data_file, True)
    train_set, valid_set, test_set = data
    m, n = train_set[0].shape
    k = np.max(train_set[1]) + 1
    print('data:', train_set[0].shape, train_set[1].shape, m, n, k)

    classifier = Softmaxclassifier(n_in=n, n_out=k)
    trainer = SoftmaxclassifierTrainer(
        train_set, m, n, k,
        valid_data=valid_set,
        classifier=classifier
    )
    del(data)
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, SimpleRNN, GRU

NFOLDS = 10
BATCH = 50
EPOCHS = 5
EMBDIMS = 300

if __name__ == "__main__":
    dataset = dataset.DatasetProvider()
    x, y = dataset.load()
    print 'x shape:', x.shape
    print 'y shape:', y.shape

    scores = []
    folds = sk.cross_validation.KFold(len(y), n_folds=NFOLDS, shuffle=True)
    for fold_num, (train_indices, test_indices) in enumerate(folds):
        train_x = x[train_indices]
        train_y = y[train_indices]
        test_x = x[test_indices]
        test_y = y[test_indices]
        model = k.models.Sequential()
        model.add(LSTM(128, input_length=205845, input_dim=300))
        # model.add(Dense(128, input_shape=(EMBDIMS,)))
#!/usr/bin/env python
# coding: utf-8

# Active Learning (Uncertainty Sampling)
# This code is available under the MIT License.
# (c)2013 Nakatani Shuyo / Cybozu Labs Inc.

import numpy
import dataset
from sklearn.linear_model import LogisticRegression

categories = ['crude', 'money-fx', 'trade', 'interest', 'ship', 'wheat', 'corn']
doclist, labels, voca, vocalist = dataset.load(categories)
print "document size : %d" % len(doclist)
print "vocabulary size : %d" % len(voca)

data = numpy.zeros((len(doclist), len(voca)))
for j, doc in enumerate(doclist):
    for i, c in doc.iteritems():
        data[j, i] = c

def activelearn(data, label, strategy, train):
    print strategy
    N, D = data.shape
    train = list(train)  # copy initial indexes of training
    pool = range(N)
    for x in train:
        pool.remove(x)
    predict = None
    precisions = []
def main(_):
    graph = tf.Graph()
    with graph.as_default():
        with graph.device(device_for_node_cpu):
            print('-' * 120)
            print('C2S task = {t}'.format(t=FLAGS.task))
            print(' data = {data}'.format(data=FLAGS.data))
            print(' max_epochs = {max_epochs}'.format(max_epochs=FLAGS.max_epochs))
            print(' batch_size = {batch_size}'.format(batch_size=FLAGS.batch_size))
            print(' learning_rate = {learning_rate}'.format(learning_rate=FLAGS.learning_rate))
            print(' decay = {decay}'.format(decay=FLAGS.decay))
            print(' beta1 = {beta1}'.format(beta1=FLAGS.beta1))
            print(' beta2 = {beta2}'.format(beta2=FLAGS.beta2))
            print(' epsilon = {epsilon}'.format(epsilon=FLAGS.epsilon))
            print(' pow = {pow}'.format(pow=FLAGS.pow))
            print(' regularization = {regularization}'.format(regularization=FLAGS.regularization))
            print(' max_gradient_norm = {max_gradient_norm}'.format(max_gradient_norm=FLAGS.max_gradient_norm))
            print(' use_inputs_prob_decay = {use_inputs_prob_decay}'.format(
                use_inputs_prob_decay=FLAGS.use_inputs_prob_decay))
            print('-' * 120)

            train_set, test_set, idx2word_history, word2idx_history, idx2word_target, word2idx_target = dataset.load(
                mode=FLAGS.task,
                text_data_fn=FLAGS.data
            )

            print('Input vocabulary size: ', len(idx2word_history))
            print('Output vocabulary size: ', len(idx2word_target))
            print('-' * 120)

            train(train_set, test_set, idx2word_history, word2idx_history, idx2word_target, word2idx_target)
def dft_data_load():
    return dataset.load(name='mnist.pkl.gz')
from dataset import load
from sklearn import cross_validation
import argparse
import tensorflow as tf

parser = argparse.ArgumentParser()
parser.add_argument("-t", "--train", action="store_true",
                    help="Train model with dataset otherwise load it.")
args = parser.parse_args()

# Load our dataset
X, y = load()

# Split dataset into train / test
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.25)

# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()

# Build fully connected DNN
with tf.Session() as session:
    # Run the initializer
    session.run(init)
    with tf.device('/gpu:0'):
        # Enable logging
        tf.logging.set_verbosity(tf.logging.INFO)
        # Create our classifier
        feature_columns = [tf.contrib.layers.real_valued_column("", dimension=1024)]
        classifier = tf.contrib.learn.DNNClassifier(feature_columns=feature_columns,
                                                    hidden_units=[400, 400, 400, 400, 400, 400],
                                                    n_classes=4,
                                                    model_dir="model")
        if args.train:
            print('Training...')
            classifier.fit(X_train, y_train, steps=2000)
            print('Done !')
    direct_model = inducer.learn(d.str, xtrain, ytrain)
    #print direct_model
    loo_pred = loo_model.predict(xtest)
    #print loo_pred
    if np.isnan(loo_pred).any():
        print 'loo is wrong'
        exit()
    direct_pred = direct_model.predict(xtest)
    #print direct_pred
    if np.isnan(direct_pred).any():
        print 'direct is wrong'
    return np.abs(loo_pred - direct_pred)

if __name__ == '__main__':
    d = dataset.load('../data/vehicle.mat')
    x = d.x
    y = d.y
    s = nx.complete_graph(d.str.n_vars)
    cv = LeaveOneOut(d.str.n_instances)
    print cv
    options = {}
    # cgn = ml_cgn_inducer.wcGJAN_bn_learner(d.str, x, y,options)
    # ml_model = ml_cgn_inducer.learn_parameters(d.str,x,y,cgn,options)
    bma_ind = bma_gct_inducer.bma_gct_inducer(options)
    #cliques,separators = bma_gct_inducer.wcGJAN_ct_learner(d.str, x, y,options)
    print "Learning main model"
    bma_model = bma_ind.learn(d.str, x, y)
    print "Main model induced"
    print bma_model
print 'train:', train_file
print 'test:', test_file
print 'batch:', cfg.get('cnn', 'batch')
print 'epochs:', cfg.get('cnn', 'epochs')
print 'embdims:', cfg.get('cnn', 'embdims')
print 'filters:', cfg.get('cnn', 'filters')
print 'filtlen:', cfg.get('cnn', 'filtlen')
print 'hidden:', cfg.get('cnn', 'hidden')
print 'dropout:', cfg.get('cnn', 'dropout')
print 'learnrt:', cfg.get('cnn', 'learnrt')

# learn alphabets from training examples
dataset = dataset.DatasetProvider(train_file)

# now load training examples and labels
train_left, train_larg, train_middle, \
    train_rarg, train_right, train_y = dataset.load(train_file)
left_maxlen = max([len(seq) for seq in train_left])
larg_maxlen = max([len(seq) for seq in train_larg])
middle_maxlen = max([len(seq) for seq in train_middle])
rarg_maxlen = max([len(seq) for seq in train_rarg])
right_maxlen = max([len(seq) for seq in train_right])

# now load test examples and labels
test_left, test_larg, test_middle, test_rarg, test_right, test_y = \
    dataset.load(test_file,
                 left_maxlen=left_maxlen,
                 larg_maxlen=larg_maxlen,
                 middle_maxlen=middle_maxlen,
                 rarg_maxlen=rarg_maxlen,
                 right_maxlen=right_maxlen)

# turn x and y into numpy array among other things
classes = len(set(train_y))
train_left = pad_sequences(train_left, maxlen=left_maxlen)
def train(itMax=100, szBatch=256, lr=0.01, vaFreq=10,
          init_theta=dft_init_theta, mo_create=models.create_mlp):
    print 'loading data...'
    dataTr, dataVa, _ = dataset.load(name='mnist.pkl.gz')

    print 'building graph...'
    # fprop: the MLP model
    x = T.matrix('x', 'float32')
    theta = init_theta()
    F = mo_create(x, theta)
    # fprop: the loss
    y = T.ivector('y')
    ell = loss.create_logistic(F, y)
    # bprop
    dtheta = T.grad(ell, wrt=theta)
    # the graph for training
    ibat = T.lscalar('ibat')
    fg_tr = function(
        inputs=[ibat],
        outputs=ell,
        updates=zip(theta, optim.update_gd(theta, dtheta)),
        givens={
            x: dataset.get_batch(ibat, dataTr[0], szBatch=szBatch),
            y: dataset.get_batch(ibat, dataTr[1], szBatch=szBatch)
        }
    )
    # the graph for validation
    ell_zo = loss.create_zeroone(F, y)
    fg_va = function(
        inputs=[],
        outputs=ell_zo,
        givens={
            x: dataVa[0],
            y: dataVa[1]
        }
    )

    print 'Fire the graph...'
    trLoss, er_va = [], []
    N = dataTr[0].get_value(borrow=True).shape[0]
    numBatch = (N + szBatch) / szBatch
    print '#batch = %d' % (numBatch,)
    for i in xrange(itMax):
        ibat = i % numBatch
        tmpLoss = fg_tr(ibat)
        print 'training: iteration %d, ibat = %d, loss = %6.5f' % (i, ibat, tmpLoss)
        trLoss.append(tmpLoss)
        if i % vaFreq == 0:
            tmp_er = fg_va()
            print 'validation: iteration %d, error rate = %6.5f' % (i, tmp_er)
            er_va.append(tmp_er)

    # plot
    import matplotlib.pyplot as plt
    plt.subplot(1, 2, 1)
    plt.plot(range(1, len(trLoss) + 1), trLoss, 'ro-')
    plt.subplot(1, 2, 2)
    plt.plot([i * vaFreq for i in range(len(er_va))], er_va, 'bx-')
    plt.show(block=True)

    # return the parameters
    return theta
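# A small end-to-end sketch combining the training loop above with the test()
# routine that appears elsewhere in this collection; the hyperparameter values
# are illustrative, not the original experiment settings.
if __name__ == '__main__':
    theta = train(itMax=2000, szBatch=256, lr=0.01, vaFreq=100)
    test(theta)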
print 'test:', cfg.get('data', 'test')
print 'batch:', cfg.get('lstm', 'batch')
print 'epochs:', cfg.get('lstm', 'epochs')
print 'embdims:', cfg.get('lstm', 'embdims')
print 'units:', cfg.get('lstm', 'units')
print 'dropout:', cfg.get('lstm', 'dropout')
print 'udropout:', cfg.get('lstm', 'udropout')
print 'wdropout:', cfg.get('lstm', 'wdropout')
print 'learnrt:', cfg.get('lstm', 'learnrt')

# learn alphabet from training data
dataset = \
    dataset.DatasetProvider([cfg.get('data', 'train'), cfg.get('data', 'test')])

# now load training examples and labels
train_x, train_y = dataset.load(cfg.get('data', 'train'))
# now load test examples and labels
test_x, test_y = dataset.load(cfg.get('data', 'test'))

# turn x and y into numpy array among other things
maxlen = max([len(seq) for seq in train_x + test_x])
train_x = pad_sequences(train_x, maxlen=maxlen)
train_y = pad_sequences(train_y, maxlen=maxlen)
test_x = pad_sequences(test_x, maxlen=maxlen)
test_y = pad_sequences(test_y, maxlen=maxlen)
train_y = np.array([to_categorical(seq, 3) for seq in train_y])
test_y = np.array([to_categorical(seq, 3) for seq in test_y])

print 'train_x shape:', train_x.shape
print 'train_y shape:', train_y.shape
test_file = os.path.join(base, cfg.get('data', 'test'))
print 'train:', train_file
print 'test:', test_file
print 'batch:', cfg.get('cnn', 'batch')
print 'epochs:', cfg.get('cnn', 'epochs')
print 'embdims:', cfg.get('cnn', 'embdims')
print 'filters:', cfg.get('cnn', 'filters')
print 'filtlen:', cfg.get('cnn', 'filtlen')
print 'hidden:', cfg.get('cnn', 'hidden')
print 'dropout:', cfg.get('cnn', 'dropout')
print 'learnrt:', cfg.get('cnn', 'learnrt')

# learn alphabet from training examples
dataset = dataset.DatasetProvider(train_file)

# now load training examples and labels
train_x, train_y = dataset.load(train_file)
maxlen = max([len(seq) for seq in train_x])

# now load test examples and labels
test_x, test_y = dataset.load(test_file, maxlen=maxlen)

init_vectors = None
# TODO: what are we doing for index 0 (oov words)?
# use pre-trained word embeddings?
if cfg.has_option('data', 'embed'):
    print 'embeddings:', cfg.get('data', 'embed')
    embed_file = os.path.join(base, cfg.get('data', 'embed'))
    word2vec = word2vec.Model(embed_file)
    init_vectors = [word2vec.select_vectors(dataset.word2int)]

# turn x and y into numpy array among other things
classes = len(set(train_y))
def train(dataset, gpu, num_layer=4, epoch=40, batch=64):
    nb_epochs = epoch
    batch_size = batch
    patience = 20
    lr = 0.001
    l2_coef = 0.0
    hid_units = 512

    adj, diff, feat, labels, num_nodes = load(dataset)

    feat = torch.FloatTensor(feat).cuda()
    diff = torch.FloatTensor(diff).cuda()
    adj = torch.FloatTensor(adj).cuda()
    labels = torch.LongTensor(labels).cuda()

    ft_size = feat[0].shape[1]
    max_nodes = feat[0].shape[0]

    model = Model(ft_size, hid_units, num_layer)
    optimiser = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=l2_coef)
    model.cuda()

    cnt_wait = 0
    best = 1e9
    itr = (adj.shape[0] // batch_size) + 1
    for epoch in range(nb_epochs):
        epoch_loss = 0.0
        train_idx = np.arange(adj.shape[0])
        np.random.shuffle(train_idx)

        for idx in range(0, len(train_idx), batch_size):
            model.train()
            optimiser.zero_grad()

            batch = train_idx[idx:idx + batch_size]
            mask = num_nodes[idx:idx + batch_size]

            lv1, gv1, lv2, gv2 = model(adj[batch], diff[batch], feat[batch], mask)

            lv1 = lv1.view(batch.shape[0] * max_nodes, -1)
            lv2 = lv2.view(batch.shape[0] * max_nodes, -1)

            batch = torch.LongTensor(
                np.repeat(np.arange(batch.shape[0]), max_nodes)).cuda()

            loss1 = local_global_loss_(lv1, gv2, batch, 'JSD', mask)
            loss2 = local_global_loss_(lv2, gv1, batch, 'JSD', mask)
            # loss3 = global_global_loss_(gv1, gv2, 'JSD')
            loss = loss1 + loss2  # + loss3

            epoch_loss += loss
            loss.backward()
            optimiser.step()

        epoch_loss /= itr

        # print('Epoch: {0}, Loss: {1:0.4f}'.format(epoch, epoch_loss))

        if epoch_loss < best:
            best = epoch_loss
            best_t = epoch
            cnt_wait = 0
            torch.save(model.state_dict(), f'{dataset}-{gpu}.pkl')
        else:
            cnt_wait += 1

        if cnt_wait == patience:
            break

    model.load_state_dict(torch.load(f'{dataset}-{gpu}.pkl'))

    features = feat.cuda()
    adj = adj.cuda()
    diff = diff.cuda()
    labels = labels.cuda()

    embeds = model.embed(features, adj, diff, num_nodes)

    x = embeds.cpu().numpy()
    y = labels.cpu().numpy()

    from sklearn.svm import LinearSVC
    from sklearn.metrics import accuracy_score
    params = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=None)
    accuracies = []
    for train_index, test_index in kf.split(x, y):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        classifier = GridSearchCV(LinearSVC(), params, cv=5, scoring='accuracy', verbose=0)
        classifier.fit(x_train, y_train)
        accuracies.append(accuracy_score(y_test, classifier.predict(x_test)))
    print(np.mean(accuracies), np.std(accuracies))
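# Hypothetical call into the graph-level training routine above; the dataset
# name and GPU id are placeholders, not values from the original source.
if __name__ == '__main__':
    train('MUTAG', gpu=0, num_layer=4, epoch=40, batch=64)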
    # normalize within r bin... generalize me
    h = np.apply_along_axis(lambda x: x / np.sum(x), 1, h)

    # set zero-content bins to 0.1 * minimum nonzero bin
    min_val = np.min(h[h > 0])
    h[(h == 0)] = min_val / 10

    return h, e

if __name__ == '__main__':
    import dataset
    import glob

    try:
        d = dataset.load('tl208')
    except IOError:
        d = dataset.Dataset('tl208', filenames=glob.glob('/home/mastbaum/snoplus/tl208/data/pdf/tl208/run0/av_tl208-0.root'))
        d.append(glob.glob('/home/mastbaum/snoplus/tl208/data/pdf/tl208/run1/av_tl208-*.root'))

    cut = dataset.Cut(e=(2.555, 2.718))
    d.apply_cuts([cut])
    events = d.cut[cut.as_tuple()]['events']

    h, e = make_pdf(events, ['r', 'pmt_t_res'], (10, 500,))

    import matplotlib.pyplot as plt
    import matplotlib.cm as cm
    from matplotlib.colors import LogNorm
def main():
    with tf.Session() as session:
        dataset = dataset_mod.load()
        train(session, dataset)
# settings file specified as command-line argument
cfg = ConfigParser.ConfigParser()
cfg.read(sys.argv[1])
print_config(cfg)

base = os.environ['DATA_ROOT']
train_file = os.path.join(base, cfg.get('data', 'train'))
test_file = os.path.join(base, cfg.get('data', 'test'))

# learn alphabet from training examples
dataset = dataset.DatasetProvider(train_file)
print 'input alphabet size:', len(dataset.input2int)
print 'output alphabet size:', len(dataset.output2int)

# now load training examples and labels
train_x, train_y = dataset.load(train_file)
maxlen_x = max([len(seq) for seq in train_x])
maxlen_y = max([len(seq) for seq in train_y])

# turn x and y into numpy array among other things
train_x = pad_sequences(train_x, maxlen=maxlen_x)
train_y = pad_sequences(train_y, maxlen=maxlen_y)
print train_y.shape
print train_y

# convert train_y into (num_examples, maxlen, alphabet_size)
# train_y = to_categorical(np.array(train_y), classes)

model = Sequential()
model.add(Embedding(input_dim=len(dataset.input2int),
                    output_dim=cfg.getint('cnn', 'embdims'),