def run_cv(fold_iterator, logger, params_dict, upsample=True):
    for traindirs, testdirs in fold_iterator:
        # TRAIN LOCAL PREDICTION MODEL
        # Generators
        logger.info('############ FOLD #############')
        logger.info('Training folders are {}'.format(traindirs))
        training_generator = DataLoader(data_dir, traindirs, 32,
                                        width_template=params_dict['width'],
                                        upsample=upsample)
        validation_generator = DataLoader(data_dir, testdirs, 32,
                                          width_template=params_dict['width'],
                                          type='val',
                                          upsample=upsample)
        # Design model
        model = create_model(params_dict['width'] + 1,
                             params_dict['h1'],
                             params_dict['h2'],
                             params_dict['h3'],
                             embed_size=params_dict['embed_size'],
                             drop_out_rate=params_dict['dropout_rate'],
                             use_batch_norm=params_dict['use_batchnorm'])
        # Reuse pre-trained weights if present; otherwise train from scratch.
        try:
            model.load_weights(os.path.join(checkpoint_dir, 'model22.h5'))
        except OSError:
            print('No pre-trained weights found, training from scratch')
            model.fit_generator(generator=training_generator,
                                validation_data=validation_generator,
                                use_multiprocessing=True,
                                epochs=params_dict['n_epochs'],
                                workers=4,
                                max_queue_size=20)
            model.save_weights(os.path.join(checkpoint_dir, 'model.h5'))
        metrics = model.evaluate_generator(generator=validation_generator,
                                           workers=4,
                                           max_queue_size=20)
        logger.info(metrics)
def starcraft_sp_test():
    # Create DataLoader instance to load and format data
    dataLoader = DataLoader()
    logging.info("Program started")
    logging.info("Loading starcraft data")
    # Read the SkillCraft dataset; the class index is the second column
    dataLoader.read(filename="data/SkillCraft1_Dataset.csv", classIndex=1,
                    numOfFeatures=15)
    # Normalize data values from 0 - 1
    # dataLoader.normalize()
    # Create new labels to fit into binary classification
    dataLoader.scaleToBinary(5)
    # Spectral Clustering
    # Binary
    clustering(dataLoader.x_train, dataLoader.y_train, writer_starcraft,
               'starcraft-binary', multiple=True, binary=True)
    # Multiclass
    # clustering(dataLoader.x_train, dataLoader.multi_y_train, writer_starcraft,
    #            'starcraft-multiclass', multiple=True, binary=False)
    # Write all the results
    writer_starcraft.save()
def load_data(filePath: str, label_txt_filePath: str, shuffle: bool = True,
              seq_length: int = 3000, batch_size: int = 64,
              training: bool = True):
    voc = Vocab()
    dataLoader = DataLoader()
    # All sequences
    dataLoader.sequences = dataLoader.read_fasta_file(fasta_file_path=filePath)
    # Training set
    dataLoader.train_seq = dataLoader.sequences[:900]
    # Test set
    dataLoader.test_seq = dataLoader.sequences[900:1000]
    # Labels, 0/1
    dataLoader.labels = dataLoader.read_label_txt(
        label_file_path=label_txt_filePath)
    # Vectorized representation of the training set
    dataLoader.train_vectorized_seq = voc.sequences_to_ids(
        dataLoader.train_seq)
    # Vectorized representation of the test set
    dataLoader.test_vectorized_seq = voc.sequences_to_ids(dataLoader.test_seq)
    # x_batch, y_batch = dataLoader.get_batch(shuffle=shuffle, seq_length=seq_length,
    #                                         batch_size=batch_size, training=training)
    return voc, dataLoader
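# A minimal usage sketch (added for illustration): the FASTA/label file paths
# below are hypothetical, and get_batch mirrors the commented-out call above.
if __name__ == '__main__':
    voc, loader = load_data('data/sequences.fasta', 'data/labels.txt')
    x_batch, y_batch = loader.get_batch(shuffle=True, seq_length=3000,
                                        batch_size=64, training=True)
    print("x_batch.shape={}, y_batch.shape={}".format(x_batch.shape,
                                                      y_batch.shape))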
def main():
    # Alternative input files (only the last uncommented assignment is used):
    # file_name = 'input/BioASQ-task6bPhaseB-testset3.json'
    # file_name = 'input/BioASQ-trainingDataset6b.json'
    # file_name = 'input/BioASQ-trainingDataset5b.json'
    file_name = 'input/phaseB_5b_05.json'
    save_model_file_name = 'weights_2'
    ranker = SVMRank(save_model_file_name)
    data = DataLoader(file_name)
    data.load_ner_entities()
    ans_file = 'output/factoid_list_%s.json' % data.name
    # Factoid questions: keep the top 5 ranked answers
    questions = data.get_questions_of_type(C.FACTOID_TYPE)
    for i, question in enumerate(tqdm(questions)):
        ranked_sentences = question.ranked_sentences()
        X, candidates = get_only_features(question, ranked_sentences)
        top_answers = ranker.classify_from_feed(X, candidates, i)
        question.exact_answer = [[answer] for answer in top_answers[:5]]
    # List questions: keep the top 10 ranked answers
    questions = data.get_questions_of_type(C.LIST_TYPE)
    for i, question in enumerate(tqdm(questions)):
        ranked_sentences = question.ranked_sentences()
        X, candidates = get_only_features(question, ranked_sentences)
        top_answers = ranker.classify_from_feed(X, candidates, i)
        question.exact_answer = [[answer] for answer in top_answers[:10]]
    data.save_factoid_list_answers(ans_file)
def get_data_loaders(data_root, speaker_id, test_shuffle=True):
    data_loaders = {}
    local_conditioning = hparams.cin_channels > 0
    for phase in ["train", "test"]:
        train = phase == "train"
        X = FileSourceDataset(
            RawAudioDataSource(data_root, speaker_id=speaker_id,
                               train=train,
                               test_size=hparams.test_size,
                               test_num_samples=hparams.test_num_samples,
                               random_state=hparams.random_state))
        if local_conditioning:
            Mel = FileSourceDataset(
                MelSpecDataSource(data_root, speaker_id=speaker_id,
                                  train=train,
                                  test_size=hparams.test_size,
                                  test_num_samples=hparams.test_num_samples,
                                  random_state=hparams.random_state))
            assert len(X) == len(Mel)
            print("Local conditioning enabled. Shape of a sample: {}.".format(
                Mel[0].shape))
        else:
            Mel = None
        print("[{}]: length of the dataset is {}".format(phase, len(X)))

        if train:
            lengths = np.array(X.file_data_source.lengths)
            # Prepare sampler
            sampler = PartialyRandomizedSimilarTimeLengthSampler(
                lengths, batch_size=hparams.batch_size)
            shuffle = False
        else:
            sampler = None
            shuffle = test_shuffle

        dataset = PyTorchDataset(X, Mel)
        data_loader = DataLoader(dataset,
                                 batch_size=hparams.batch_size,
                                 num_workers=hparams.num_workers,
                                 sampler=sampler,
                                 shuffle=shuffle,
                                 collate_fn=collate_fn,
                                 pin_memory=hparams.pin_memory)

        # Collect per-speaker sample counts
        speaker_ids = {}
        for idx, (x, c, g) in enumerate(dataset):
            if g is not None:
                try:
                    speaker_ids[g] += 1
                except KeyError:
                    speaker_ids[g] = 1
        if len(speaker_ids) > 0:
            print("Speaker stats:", speaker_ids)

        data_loaders[phase] = data_loader

    return data_loaders
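# A minimal consumption sketch, assuming hparams and collate_fn are configured
# as in the surrounding script; the data_root value is hypothetical.
if __name__ == "__main__":
    loaders = get_data_loaders("./data/ljspeech", speaker_id=None)
    for phase, loader in loaders.items():
        print(phase, "-", len(loader), "batches per epoch")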
def _build_data(self, data_dir='train_dir', num_classes=10, mode='train'):
    loader = DataLoader(data_dir=data_dir,
                        num_classes=num_classes,
                        mode=mode,
                        height=self.height,
                        width=self.width)
    dataset = tf.data.Dataset.from_generator(
        generator=loader.generator,
        output_types=(tf.float32, tf.int32),
        output_shapes=(tf.TensorShape([self.height, self.width, 3]),
                       tf.TensorShape([self.num_classes])))
    return dataset
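# A hedged usage sketch of the generator-backed dataset returned above,
# assuming a class instance exposing _build_data and a reusable iterator
# self.it (TF1-style, matching the from_generator / make_initializer idioms
# used elsewhere in this collection); the batch and buffer sizes are
# illustrative only.
#
#   dataset = self._build_data(data_dir='train_dir', num_classes=10, mode='train')
#   dataset = dataset.shuffle(100).batch(32).prefetch(20)
#   train_init = self.it.make_initializer(dataset)  # run train_init in a session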
def _reading_data():
    print(config.USER)
    # Step 2: load the data. Loading covers:
    #   - reading the raw file
    #   - preprocessing the data
    #   - any special data-cleaning steps
    trainFilepath = os.path.join(os.getcwd(), "data", config.FILENAME)
    trainDataLoader = DataLoader(trainFilepath)
    train_data = trainDataLoader.load_data(useSpark=False, interactive=False)
    train_data.save_data(os.getcwd())
def main():
    ranker = SVMRank()
    file_name = 'input/BioASQ-trainingDataset6b.json'
    data = DataLoader(file_name)
    data.load_ner_entities()
    questions = data.get_questions_of_type(C.FACTOID_TYPE)[:419]
    for i, question in enumerate(questions):
        ranked_sentences = question.ranked_sentences()
        X, y = get_features(question, ranked_sentences)
        ranker.feed(X, y, i)
    ranker.train_from_feed()
    ranker.save('weights_2')
def test_real_dataset(create_obj_func, src_name=None, trg_name=None,
                      show=False, block_figure_on_end=False):
    print('Running {} ...'.format(os.path.basename(__file__)))
    if src_name is None:
        if len(sys.argv) > 2:
            src_name = sys.argv[2]
        else:
            raise Exception('Source dataset not specified')
    if trg_name is None:
        if len(sys.argv) > 3:
            trg_name = sys.argv[3]
        else:
            raise Exception('Target dataset not specified')
    np.random.seed(random_seed())
    tf.set_random_seed(random_seed())
    tf.reset_default_graph()
    print("========== Test on real data ==========")
    users_params = dict()
    users_params = parse_arguments(users_params)
    data_format = 'mat'
    if 'format' in users_params:
        data_format, users_params = extract_param('format', data_format,
                                                  users_params)
    data_loader = DataLoader(src_domain=src_name,
                             trg_domain=trg_name,
                             data_path=data_dir(),
                             data_format=data_format,
                             cast_data=users_params['cast_data'])
    assert users_params['batch_size'] % data_loader.num_src_domain == 0
    print('users_params:', users_params)
    learner = create_obj_func(users_params)
    learner.dim_src = data_loader.data_shape
    learner.dim_trg = data_loader.data_shape
    learner.x_trg_test = data_loader.trg_test[0][0]
    learner.y_trg_test = data_loader.trg_test[0][1]
    learner._init(data_loader)
    learner._build_model()
    learner._fit_loop()
def test(self, data_dir='test_b', model_dir=None, output_dir=None,
         threshold=0.5):
    print("testing starts.")
    loader = DataLoader(data_dir=data_dir,
                        mode='test',
                        height=self.height,
                        width=self.width,
                        label_value=self.label_values)
    testset = tf.data.Dataset.from_generator(
        generator=loader.generator,
        output_types=(tf.string, tf.int32, tf.float32),
        output_shapes=(tf.TensorShape([]),
                       tf.TensorShape([2]),
                       tf.TensorShape([self.height, self.width, 3])))
    testset = testset.batch(1)
    testset = testset.prefetch(10)
    test_init = self.it.make_initializer(testset)
    saver = tf.train.Saver()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        saver.restore(sess, model_dir)
        sess.run(test_init)
        queue = multiprocessing.Queue(maxsize=30)
        writer_process = multiprocessing.Process(
            target=writer,
            args=[output_dir, self.label_values, queue, 'stop'])
        writer_process.start()
        print('writing predictions...')
        try:
            while True:
                img, path, size, output_image = sess.run(
                    [self.img, self.path, self.size, self.logits])
                queue.put(('continue', path, size, img, output_image))
        except tf.errors.OutOfRangeError:
            queue.put(('stop', None, None, None, None))
    print('testing finished.')
def prepare_data_loader_train_10_splits(texture_train_data_set_path,
                                        texture_train_label_set_path,
                                        texture_val_data_set_path,
                                        texture_val_label_set_path,
                                        texture_batch_size,
                                        num_workers, device):
    data_loader_list = []
    for i in range(10):
        idx = i + 1
        print("Split: {0}".format(idx))
        # Format into fresh local names so the path templates keep their {0}
        # placeholder for later splits (the original reassigned the arguments,
        # which consumed the placeholder on the first iteration).
        train_data_path = texture_train_data_set_path.format(idx)
        train_label_path = texture_train_label_set_path.format(idx)
        val_data_path = texture_val_data_set_path.format(idx)
        val_label_path = texture_val_label_set_path.format(idx)
        dL = DataLoader()
        texture_train_set, train_set_size = dL.get_tensor_set(
            train_data_path, train_label_path, device)
        texture_val_set, val_set_size = dL.get_tensor_set(
            val_data_path, val_label_path, device)
        print("Train set size: {0}".format(train_set_size))
        print("Val set size: {0}".format(val_set_size))
        texture_train_data_loader = torch.utils.data.DataLoader(
            texture_train_set,
            batch_size=texture_batch_size,
            shuffle=True,
            num_workers=num_workers)
        texture_val_data_loader = torch.utils.data.DataLoader(
            texture_val_set,
            num_workers=1,
            shuffle=False,
            pin_memory=True)
        data_loader_dict = {
            "train": texture_train_data_loader,
            "val": texture_val_data_loader
        }
        data_loader_list.append(data_loader_dict)
    return data_loader_list
def prepare_data_loader_test_10_splits(texture_test_data_set_path,
                                       texture_test_label_set_path, device):
    data_loader_list = []
    for i in range(10):
        idx = i + 1
        print("Split: {0}".format(idx))
        # As in the training helper above, format into fresh local names so
        # the templates keep their {0} placeholder across splits.
        test_data_path = texture_test_data_set_path.format(idx)
        test_label_path = texture_test_label_set_path.format(idx)
        dL = DataLoader()
        texture_test_set, test_set_size = dL.get_tensor_set(
            test_data_path, test_label_path, device)
        print("Test set size: {0}".format(test_set_size))
        test_data_loader = torch.utils.data.DataLoader(texture_test_set,
                                                       num_workers=1,
                                                       shuffle=False,
                                                       pin_memory=True)
        data_loader_list.append(test_data_loader)
    return data_loader_list
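# A minimal sketch combining the two 10-split helpers above; the path
# templates (with a {0} placeholder for the split index), batch size, and
# device are hypothetical.
if __name__ == '__main__':
    train_val_loaders = prepare_data_loader_train_10_splits(
        "data/train_data_split_{0}.pt", "data/train_label_split_{0}.pt",
        "data/val_data_split_{0}.pt", "data/val_label_split_{0}.pt",
        texture_batch_size=32, num_workers=4, device="cuda")
    test_loaders = prepare_data_loader_test_10_splits(
        "data/test_data_split_{0}.pt", "data/test_label_split_{0}.pt",
        device="cuda")
    for split_idx, loaders in enumerate(train_val_loaders, start=1):
        print("split", split_idx, ":", len(loaders["train"]), "train batches")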
def test(self, data_dir='test', model_dir=None, output_dir='result',
         batch_size=10):
    print("testing starts.")
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    # load test data
    loader = DataLoader(data_dir=data_dir,
                        num_classes=self.num_classes,
                        mode='test',
                        height=self.height,
                        width=self.width)
    testset = tf.data.Dataset.from_generator(
        generator=loader.generator,
        output_types=(tf.string, tf.float32),
        output_shapes=(tf.TensorShape([]),
                       tf.TensorShape([self.height, self.width, 3])))
    testset = testset.shuffle(100)
    testset = testset.batch(batch_size)
    testset = testset.prefetch(20)
    test_init = self.it.make_initializer(testset)
    saver = tf.train.Saver()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        saver.restore(sess, model_dir)
        sess.run(test_init)
        queue = multiprocessing.Queue(maxsize=30)
        writer_process = multiprocessing.Process(
            target=writer, args=[output_dir, batch_size, queue, 'stop'])
        writer_process.start()
        print('writing predictions...')
        try:
            while True:
                img_name, pre_label = sess.run(
                    [self.img_name, self.prediction_value])
                queue.put(('continue', img_name, pre_label))
        except tf.errors.OutOfRangeError:
            queue.put(('stop', None, None))
    print('testing finished.')
def integrated_benchmark(dataset_path):
    """
    Variables:
        Dataset size: number of columns
        Dataset distribution: column length distribution
        threshold
        query column
    """
    loader = DataLoader("")
    dataset = loader.load_dataset(dataset_path)
    bf_lists, lsh_list = init(dataset)

    print("""
    Benchmark 1
    Goal: Measure scalability of different methods
    Variable: the size of datasets. size: 400, 600, 800, 1000
    Fix: threshold = 0.6
         query column = median col
    Output: Runtime
            precision, recall, f1
    """)
    labels = ["bloom filter", "lsh", "lsh ensemble", "lsh + bloom filter"]
    time_for_each_size = np.empty((len(dataset), len(labels)), dtype=float)
    x_axis = np.empty(len(dataset), dtype=int)
    for i, cols in enumerate(dataset):
        candidate_index = len(cols) // 2  # median col
        brute_force_result = brute_force(candidate_index, cols, 0.6)
        print("brute_force finished\n")
        time = benchmark(cols, candidate_index, 0.6, bf_lists[i], lsh_list[i],
                         brute_force_result,
                         "Benchmark-1-cols-size-" + str(len(cols)))
        time_for_each_size[i] = time
        x_axis[i] = len(cols)
    fig, ax = plt.subplots()
    for i in range(len(labels)):
        ax.plot(x_axis, time_for_each_size[:, i], 'o-', label=labels[i])
    ax.legend()
    ax.set_title("Benchmark-1-cols-size")
    ax.set_xticks(x_axis)
    ax.set_xlabel("size")
    ax.set_ylabel("time(s)")
    fig.tight_layout()
    # plt.show()
    fig.savefig("./bench_results/Benchmark-1-cols-size")

    print("""
    Benchmark 2
    Goal: Measure the effect of threshold
    Variable: threshold: 0.1 0.3 0.5 0.7 0.9
    Fix: dataset size = median col
    Output: Runtime
            precision, recall, f1
    """)
    threshold_list = [0.1, 0.3, 0.5, 0.7, 0.9]
    time_for_each_threshold = np.empty((len(threshold_list), len(labels)),
                                       dtype=float)
    x_axis = np.empty(len(threshold_list), dtype=float)
    cols_index = len(dataset) // 2
    cols = dataset[cols_index]
    for i in range(len(threshold_list)):
        threshold = threshold_list[i]
        candidate_index = len(cols) // 2  # median col
        brute_force_result = brute_force(candidate_index, cols, threshold)
        print("brute_force finished\n")
        time = benchmark(
            cols, candidate_index, threshold, bf_lists[cols_index],
            lsh_list[cols_index], brute_force_result,
            "Benchmark-2-threshold-" + str(int(threshold * 100)) + "%")
        time_for_each_threshold[i] = time
        x_axis[i] = threshold
    fig, ax = plt.subplots()
    for i in range(len(labels)):
        ax.plot(x_axis, time_for_each_threshold[:, i], 'o-', label=labels[i])
    ax.legend()
    ax.set_title("Benchmark-2-threshold")
    ax.set_xticks(x_axis)
    ax.set_xlabel("threshold")
    ax.set_ylabel("time(s)")
    fig.tight_layout()
    # plt.show()
    fig.savefig("./bench_results/Benchmark-2-threshold")

    print("""
    Benchmark 3
    Goal: Measure the effect of query column
    Variable: query column = small col, median col, large col
    Fix: dataset size = median size cols
         threshold = 0.6
    Output: Runtime
            precision, recall, f1
    """)
    cols_index = len(dataset) // 2
    cols = dataset[cols_index]
    label = ["small-col", "median-col", "large-col"]
    for i, candidate_index in enumerate([0, len(cols) // 2, len(cols) - 1]):
        brute_force_result = brute_force(candidate_index, cols, 0.6)
        benchmark(cols, candidate_index, 0.6, bf_lists[cols_index],
                  lsh_list[cols_index], brute_force_result,
                  "Benchmark-3-candidate-" + label[i])
from dataLoader import DataLoader

loader = DataLoader()
loader.loadAll()

# Dump accepted papers and their subject areas as pipe-separated values.
with open("csv/subjectAreaDump.csv", 'w') as fileobj:
    for id, paper in loader.papers.items():  # was .iteritems() (Python 2)
        if paper.accepted:
            fileobj.write("%s|%d|%s" % (paper.primarySpecificSubjectArea, id,
                                        paper.title))
            for subj in paper.specificSubjectAreas:
                fileobj.write("|" + subj)
            fileobj.write("\n")
from pathlib import Path
from flask import Flask, render_template, make_response, jsonify, request, send_from_directory
import configurations
from analyzeResults import AnalyzeResults
from dataLoader import DataLoader
from hitCounter import HitCounter
import numpy as np
from vistDataset import VistDataset
import base64
import time

app = Flask(__name__)

data_loader = DataLoader(root_path=configurations.root_data)
hit_counter = HitCounter(root_path=configurations.root_data,
                         story_max_hits=configurations.max_story_submit)
vist_dataset = VistDataset(root_path=configurations.root_data,
                           hit_counter=hit_counter,
                           samples_num=configurations.samples)
analyze_results = AnalyzeResults(data_root=configurations.root_data,
                                 data_loader=data_loader,
                                 vist_dataset=vist_dataset)


@app.route('/api/images/<image_id>', methods=['GET'])
def serve_image(image_id):
    print("Requested image file: {}".format(image_id))
    image_path = data_loader._find_file(image_id)
    if image_path is None:
        # The handler is truncated in the source; a 404 response is the
        # natural fallback when the image cannot be found (assumed completion).
        return make_response(jsonify({"error": "image not found"}), 404)
# Called via register_forward_hook (like an event handler): records the
# selected intermediate layer's output while the ResNet forward pass runs.
def hook_feature(module, input, output):
    features_blobs.append(output.data.cpu().numpy())

# Name of the intermediate layer to capture, as declared in the __init__ of
# resnet_for_vis.
finalconv_name = 'layer3'
# print(model._modules)
# model._modules.layer3.register_forward_hook(hook_feature)
# model._modules['module'].layer3.register_forward_hook(hook_feature)
model._modules.get(finalconv_name).register_forward_hook(hook_feature)

# Data loader
loader = DataLoader(args_dataset, batch_size=args_batch_size)
dataloaders, dataset_sizes = loader.load_data()

labels_name = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse',
               'ship', 'truck')
colors = ['b', 'g', 'r', 'k', 'c', 'm', 'y', '#e24fff', '#524C90', '#845868']


def visulize(train_data, labels):
    print('t-SNE Embedding')  # the original message said 'PCA Embedding'
    tsne = TSNE(n_components=2)
    embed_feature = []
    batch_size = 10000
    slices = 0
    while slices + batch_size <= len(train_data):
        print('processing %d/%d' % (slices, len(train_data)))
        tsne.fit_transform(train_data[slices:slices + batch_size])
def train(traindirs, data_dir, upsample, params_dict, checkpoint_dir, logger,
          validation_gen=None):
    # Note: the parameter was originally named `checkpointdir` while the body
    # used `checkpoint_dir`; the name is unified here.
    if logger is not None:
        logger.info('Training folders are {}'.format(traindirs))
    else:
        print('Training folders are {}'.format(traindirs))
    training_generator = DataLoader(data_dir, traindirs, 32,
                                    width_template=params_dict['width'],
                                    upsample=upsample)
    earl = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
    # Design model
    model = create_model(params_dict['width'] + 1,
                         params_dict['h1'],
                         params_dict['h2'],
                         params_dict['h3'],
                         embed_size=params_dict['embed_size'],
                         drop_out_rate=params_dict['dropout_rate'],
                         use_batch_norm=params_dict['use_batchnorm'])
    # Train local Net
    if validation_gen is None:
        model.fit_generator(generator=training_generator,
                            use_multiprocessing=True,
                            epochs=params_dict['n_epochs'],
                            workers=4,
                            max_queue_size=20,
                            callbacks=[earl])
    else:
        model.fit_generator(generator=training_generator,
                            validation_data=validation_gen,
                            use_multiprocessing=True,
                            epochs=params_dict['n_epochs'],
                            workers=4,
                            max_queue_size=20)
    if logger is not None:
        logger.info('Local Net trained')
        logger.info('Stopped epoch {}'.format(earl.stopped_epoch))
    else:
        print('Local Net trained')
        print('Stopped epoch {}'.format(earl.stopped_epoch))
    # Train the temporal model
    for folder in traindirs:
        if logger is not None:
            logger.info('Getting temporal training set for {}'.format(folder))
        else:
            print('Getting temporal training set for {}'.format(folder))
        img_dir = os.path.join(data_dir, folder, 'Data')
        annotation_dir = os.path.join(data_dir, folder, 'Annotation')
        list_label_files = [
            os.path.join(annotation_dir, dI)
            for dI in os.listdir(annotation_dir)
            if (dI.endswith('txt') and not dI.startswith('.'))
        ]
        try:
            img_init = np.asarray(
                Image.open(os.path.join(img_dir, "{:04d}.png".format(1))))
        except FileNotFoundError:
            img_init = np.asarray(
                Image.open(os.path.join(img_dir, "{:05d}.png".format(1))))
        list_imgs = [
            os.path.join(img_dir, dI) for dI in os.listdir(img_dir)
            if (dI.endswith('png') and not dI.startswith('.'))
        ]
        n_obs = len(list_imgs)
        # Build lagged series: X0..X5 (resp. Y0..Y5) hold the x (resp. y)
        # coordinate series shifted by 0..5 frames.
        X0, X1, X2, X3, X4, X5 = [], [], [], [], [], []
        Y0, Y1, Y2, Y3, Y4, Y5 = [], [], [], [], [], []
        for label in list_label_files:
            print(label)
            # list_label_files already contains full paths, so read the file
            # directly (the original joined annotation_dir a second time).
            df = pd.read_csv(label,
                             header=None,
                             names=['id', 'x', 'y'],
                             sep=r'\s+')
            c1_interpolate = np.interp(np.arange(1, n_obs + 1), df.id.values,
                                       df.x.values)
            c2_interpolate = np.interp(np.arange(1, n_obs + 1), df.id.values,
                                       df.y.values)
            n = len(c1_interpolate)
            X0 = np.append(X0, c1_interpolate)
            X1 = np.append(X1, c1_interpolate[1:n])
            X2 = np.append(X2, c1_interpolate[2:n])
            X3 = np.append(X3, c1_interpolate[3:n])
            X4 = np.append(X4, c1_interpolate[4:n])
            X5 = np.append(X5, c1_interpolate[5:n])
            Y0 = np.append(Y0, c2_interpolate)
            Y1 = np.append(Y1, c2_interpolate[1:n])
            Y2 = np.append(Y2, c2_interpolate[2:n])
            Y3 = np.append(Y3, c2_interpolate[3:n])
            Y4 = np.append(Y4, c2_interpolate[4:n])
            Y5 = np.append(Y5, c2_interpolate[5:n])
        l = len(X5)
        fullX = np.transpose(
            np.vstack([X0[0:l], X1[0:l], X2[0:l], X3[0:l], X4[0:l]]))
        fullY = np.transpose(
            np.vstack([Y0[0:l], Y1[0:l], Y2[0:l], Y3[0:l], Y4[0:l]]))
        c1_label = X5
        c2_label = Y5
        est_c1 = RidgeCV()
        est_c2 = RidgeCV()
        scores_c1 = cross_validate(est_c1, fullX, c1_label, cv=5,
                                   scoring=('r2', 'neg_mean_squared_error'))
        scores_c2 = cross_validate(est_c2, fullY, c2_label, cv=5,
                                   scoring=('r2', 'neg_mean_squared_error'))
        if logger is not None:
            logger.info('c1')
            logger.info(scores_c1['test_neg_mean_squared_error'])
            logger.info('c2')
            logger.info(scores_c2['test_neg_mean_squared_error'])
        else:
            print('c1')
            print(scores_c1['test_neg_mean_squared_error'])
            print('c2')
            print(scores_c2['test_neg_mean_squared_error'])
        # Fit on the whole training set
        est_c1.fit(fullX, c1_label)
        est_c2.fit(fullY, c2_label)
    # Save the local Net and the temporal model
    if logger is not None:
        logger.info('Saving trained models to {}'.format(checkpoint_dir))
    else:
        print('Saving trained models to {}'.format(checkpoint_dir))
    model.save_weights(os.path.join(checkpoint_dir, 'model.h5'))
    dump(est_c1, os.path.join(checkpoint_dir, 'est_c1.joblib'))
    dump(est_c2, os.path.join(checkpoint_dir, 'est_c2.joblib'))
    return model, est_c1, est_c2
from dataLoader import DataLoader
from crfBrandDetector import CrfBrandDetector

if __name__ == '__main__':
    print('Preparing Data...')
    df = DataLoader().get_data()
    print('Building Model...')
    crf_model = CrfBrandDetector()
    print('Fitting...')
    x_train, x_test, y_train, y_test = crf_model.train_test_split(df)
    crf_model.fit(x_train, y_train)
    crf_model.report_classification(x_test, y_test)
    print('Accuracy: {}'.format(crf_model.evaluate(x_test, y_test)))
    pred = crf_model.predict(x_test)
    pred.to_csv('./pred.csv', index=False)
def run_global_cv(fold_iterator, data_dir, checkpoint_dir, logger,
                  params_dict, upsample=True):
    eucl_dist_per_fold = []
    pixel_dist_per_fold = []
    for traindirs, testdirs in fold_iterator:
        # TRAIN LOCAL PREDICTION MODEL
        # Generators
        logger.info('############ FOLD #############')
        logger.info('Training folders are {}'.format(traindirs))
        training_generator = DataLoader(data_dir, traindirs, 32,
                                        width_template=params_dict['width'],
                                        upsample=upsample)
        validation_generator = DataLoader(data_dir, testdirs, 32,
                                          width_template=params_dict['width'],
                                          type='val',
                                          upsample=upsample)
        model, est_c1, est_c2 = train(traindirs, data_dir, upsample,
                                      params_dict, checkpoint_dir, logger,
                                      validation_generator)
        # PREDICT WITH GLOBAL MATCHING + LOCAL MODEL ON TEST SET
        curr_fold_dist = []
        curr_fold_pix = []
        for k, testfolder in enumerate(testdirs):
            res_x, res_y = training_generator.resolution_df.loc[
                training_generator.resolution_df['scan'] == testfolder,
                ['res_x', 'res_y']].values[0]
            annotation_dir = os.path.join(data_dir, testfolder, 'Annotation')
            img_dir = os.path.join(data_dir, testfolder, 'Data')
            list_imgs = [
                os.path.join(img_dir, dI) for dI in os.listdir(img_dir)
                if (dI.endswith('png') and not dI.startswith('.'))
            ]
            list_label_files = [
                dI for dI in os.listdir(annotation_dir)
                if (dI.endswith('txt') and not dI.startswith('.'))
            ]
            print(list_label_files)
            try:
                img_init = np.asarray(
                    Image.open(os.path.join(img_dir, "{:04d}.png".format(1))))
            except FileNotFoundError:
                img_init = np.asarray(
                    Image.open(os.path.join(img_dir, "{:05d}.png".format(1))))
            img_init = prepare_input_img(img_init, res_x, res_y, upsample)
            for j, label_file in enumerate(list_label_files):
                print(label_file)
                img_current = img_init
                df = pd.read_csv(os.path.join(annotation_dir, label_file),
                                 header=None,
                                 names=['id', 'x', 'y'],
                                 sep=r'\s+')
                if upsample:
                    df['x_newres'] = df['x'] * res_x / 0.4
                    df['y_newres'] = df['y'] * res_y / 0.4
                else:
                    df['x_newres'] = df['x']
                    df['y_newres'] = df['y']
                c1_init, c2_init = df.loc[
                    df['id'] == 1, ['x_newres', 'y_newres']].values[0, :]
                a, b = np.nonzero(img_init[:, 20:(len(img_init) - 20)])
                if upsample:
                    list_centers = [[c1_init * 0.4 / res_x,
                                     c2_init * 0.4 / res_y]]
                else:
                    list_centers = [[c1_init, c2_init]]
                xax, yax = find_template_pixel(c1_init, c2_init,
                                               params_dict['width'],
                                               img_init.shape[1],
                                               img_init.shape[0])
                template_init = img_init[np.ravel(yax), np.ravel(xax)].reshape(
                    1, len(yax), len(xax))
                c1, c2 = c1_init, c2_init
                stop_temporal = False
                k = 0
                for i in range(2, len(list_imgs) + 1):
                    if i % 100 == 0:
                        print(i)
                    img_prev = img_current
                    try:
                        img_current = np.asarray(
                            Image.open(os.path.join(
                                img_dir, "{:04d}.png".format(i))))
                    except FileNotFoundError:
                        img_current = np.asarray(
                            Image.open(os.path.join(
                                img_dir, "{:05d}.png".format(i))))
                    img_current = prepare_input_img(img_current, res_x, res_y,
                                                    upsample)
                    if i > 5:
                        # use the last 5 predicted centers as temporal features
                        tmp = list_centers[-10:].reshape(-1, 2)
                        assert tmp.shape[0] == 5
                        c1, c2, stop_temporal, k = get_next_center(
                            k, stop_temporal, c1, c2, img_prev, img_current,
                            params_dict, model, template_init, c1_init,
                            c2_init, logger, est_c1, est_c2, tmp[:, 0],
                            tmp[:, 1])
                    else:
                        c1, c2, stop_temporal, k = get_next_center(
                            k, stop_temporal, c1, c2, img_prev, img_current,
                            params_dict, model, template_init, c1_init,
                            c2_init, logger)
                    # project back in init coords
                    if upsample:
                        c1_orig_coords = c1 * 0.4 / res_x
                        c2_orig_coords = c2 * 0.4 / res_y
                    else:
                        c1_orig_coords = c1
                        c2_orig_coords = c2
                    list_centers = np.append(
                        list_centers, [c1_orig_coords, c2_orig_coords])
                    if i in df.id.values:
                        true = df.loc[df['id'] == i, ['x', 'y']].values[0]
                        diff_x = np.abs(c1_orig_coords - true[0])
                        diff_y = np.abs(c2_orig_coords - true[1])
                        if upsample:
                            dist = np.sqrt(diff_x**2 + diff_y**2)
                            logger.info(
                                'ID {} : euclidean dist diff {}'.format(
                                    i, dist * 0.4))
                        else:
                            dist = np.sqrt((res_x * diff_x)**2 +
                                           (diff_y * res_y)**2)
                            logger.info(
                                'ID {} : euclidean dist diff {}'.format(
                                    i, dist))
                        if dist > 10:
                            # logger.info('Bad dist - maxNCC was {}'.format(maxNCC))
                            logger.info('True {},{}'.format(true[0], true[1]))
                            logger.info('Pred {},{}'.format(
                                c1_orig_coords, c2_orig_coords))
                idx = df.id.values.astype(int)
                list_centers = list_centers.reshape(-1, 2)
                df_preds = list_centers[idx - 1]
                df_true = df[['x', 'y']].values
                absolute_diff = np.mean(np.abs(df_preds - df_true))
                pix_dist = np.mean(
                    np.sqrt((df_preds[:, 0] - df_true[:, 0])**2 +
                            (df_preds[:, 1] - df_true[:, 1])**2))
                dist = compute_euclidean_distance(df_preds, df_true)
                curr_fold_dist.append(dist)
                curr_fold_pix.append(pix_dist)
                logger.info(
                    '======== Test Feature {} ======='.format(label_file))
                logger.info('Pixel distance is {}'.format(pix_dist))
                logger.info('Euclidean distance in mm {}'.format(dist))
                logger.info('Mean absolute difference in pixels {}'.format(
                    absolute_diff))
                pred_df = pd.DataFrame()
                pred_df['idx'] = range(1, len(list_centers) + 1)
                pred_df['c1'] = list_centers[:, 0]
                pred_df['c2'] = list_centers[:, 1]
                pred_df.to_csv(os.path.join(checkpoint_dir,
                                            '{}.txt'.format(label_file)),
                               header=False,
                               index=False)
        eucl_dist_per_fold = np.append(eucl_dist_per_fold,
                                       np.mean(curr_fold_dist))
        pixel_dist_per_fold = np.append(pixel_dist_per_fold,
                                        np.mean(curr_fold_pix))
        logger.info('EUCLIDEAN DISTANCE CURRENT FOLD {}'.format(
            eucl_dist_per_fold[-1]))
        logger.info('PIXEL DISTANCE CURRENT FOLD {}'.format(
            pixel_dist_per_fold[-1]))
    logger.info('================= END RESULTS =================')
    logger.info('Mean euclidean distance in mm {} (std {})'.format(
        np.mean(eucl_dist_per_fold), np.std(eucl_dist_per_fold)))
args = get_args()
setup_seed(args.seed)
device = args.device
checkpoint_dir = args.checkpoint_dir
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)

data = np.load(args.dataset_path)
model = net().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
dataLoader = DataLoader(data['X'], data['Y'],
                        train_val_split=[0.7, 0.15, 0.15],
                        batch_size=args.batch_size,
                        device=device)
loader_train, loader_val, loader_test = dataLoader.get_loader()

trainer = Trainer(model, optimizer)
loss_fn = torch.nn.functional.cross_entropy
trainer.train_with_val(loss_fn,
                       loader_train=loader_train,
                       loader_val=loader_val,
                       epochs=args.epochs,
                       # os.path.join avoids a missing '/' (the original
                       # concatenated checkpoint_dir + 'model.pth')
                       save_path=os.path.join(checkpoint_dir, 'model.pth'),
                       save_best_only=True,
                       monitor_on='acc')
trainer.test(loader_test, loss_fn, info='Test ')
def learn_test(expr):
    loader = DataLoader()
    # dataset options: electricity, traffic, BLE
    dataset = loader.loadData(dataset=expr)
    pastObserve = pastObserves[expr]
    o_columns = dataset.columns
    predCol = o_columns[-1]
    lenAll = len(dataset)
    lenx = int(lenAll * .75)
    test_orig = []
    mean_errors = []
    error_stds = []
    all_errors = []
    all_predictions = []
    values = dataset.values
    origData = values
    # normalize
    parameters = dataset.values.shape[1]
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled = scaler.fit_transform(values)
    reframed = series_to_supervised(scaled, pastObserve, 1)
    # drop columns we don't want to predict
    droppings = []
    for i in range(1, pastObserve + 1):
        x = [a for a in range(parameters * (i - 1), parameters * i - 1)]
        droppings.extend(x)
    reframed.drop(reframed.columns[droppings], axis=1, inplace=True)
    valuesTrans = reframed.values
    test = valuesTrans
    # split into input and outputs
    train_X_all, train_y_all = valuesTrans[:, :-1], valuesTrans[:, -1]
    test_X, test_y = test[:, :-1], test[:, -1]
    trainingModels = []
    for i in range(modelsNo):
        deepModel = create_model(parameters, pastObserve)
        trainingModels.append(deepModel)
    dy = 0
    sparsity = 3
    for model in trainingModels:
        # fit network
        partsLen = int(len(train_X_all) / sparsity) * sparsity
        a = np.arange(partsLen)
        a = a.reshape(sparsity, int(partsLen / sparsity))
        ixs = []
        # consider only part of the dataset, not all of it
        for t in range(sparsity):
            if t == dy:
                ixs.append(a[t])
                # ixs.append(a[t+1])  # for considering 40% sparsity
                # ixs.append(a[t+2])  # for considering 60% sparsity
        ixs = np.array(ixs)
        train_ixs = ixs.flatten()
        train_X, train_y = train_X_all[train_ixs], train_y_all[train_ixs]
        model.fit(train_X, train_y, epochs=20, batch_size=20, verbose=2)
        dy += 1
        # calculate predictions
        predictions = model.predict(test_X)
        predictions = predictions.reshape((len(predictions), 1))
        pads = np.zeros(len(test_y) * (parameters - 1))
        pads = pads.reshape(len(test_y), parameters - 1)
        inv_yhat = concatenate((pads, predictions), axis=1)
        inv_yhat = scaler.inverse_transform(inv_yhat)
        inv_yhat = inv_yhat[:, -1]
        inv_yhat = np.around(inv_yhat, decimals=2)
        # invert scaling for actual
        test_y = test_y.reshape((len(test_y), 1))
        inv_test = concatenate((test_X[:, pastObserve:], test_y), axis=1)
        test_orig = scaler.inverse_transform(inv_test)
        origY = test_orig[:, -1]
        meanErr, std, errors = report_errors(origY, inv_yhat, errorType[expr])
        mean_errors.append(meanErr)
        error_stds.append(std)
        all_errors.append(errors)
        all_predictions.append(inv_yhat)
        print(min(origY), max(origY))
        print(min(inv_yhat), max(inv_yhat))
        print('Test Mean Error: %.3f ' % meanErr)
    p_cols = []
    df = DataFrame(test_orig, columns=o_columns)
    for k in range(len(all_predictions)):
        colName = 'predict_' + str(k + 1)
        p_cols.append(colName)
        df[colName] = all_predictions[k]
    for k in range(len(all_predictions)):
        errName = 'error_' + str(k + 1)
        df[errName] = all_errors[k]
    print(errorType[expr])
    print(mean_errors)
    if not os.path.exists(models_output_folder):
        os.makedirs(models_output_folder)
    outDetails_filename = models_output_folder + 'predictions_details_%s.csv' % expr
    out_filename = models_output_folder + 'predictions_output_%s.csv' % expr
    df.to_csv(outDetails_filename, index=False)
    models_prediction_cols = p_cols
    models_prediction_cols.append(predCol)
    df_modelOutput = df[models_prediction_cols]
    df_modelOutput.to_csv(out_filename, index=False)
def train():
    parser = OptionParser()
    parser.add_option("--train_inputDir", dest="train_inputDir",
                      help="Input directory", metavar="DIRECTORY")
    parser.add_option("--train_inputFile", dest="train_inputFile",
                      help="Input file", metavar="FILE")
    parser.add_option("--train_type", dest="train_type",
                      help="Training type, 1|2|3|4.", metavar="VALUE",
                      default=2)
    parser.add_option("--particle_number", dest="train_number",
                      help="Number of positive samples to train.",
                      metavar="VALUE", default=-1)
    parser.add_option("--mrc_number", dest="mrc_number",
                      help="Number of mrc files to be trained.",
                      metavar="VALUE", default=-1)
    parser.add_option("--coordinate_symbol", dest="coordinate_symbol",
                      help="The symbol of the coordinate file, like '_manualPick'",
                      metavar="STRING")
    parser.add_option("--particle_size", dest="particle_size",
                      help="the size of the particle.", metavar="VALUE",
                      default=-1)
    parser.add_option("--validation_ratio", dest="validation_ratio",
                      help="the ratio.", metavar="VALUE", default=0.1)
    parser.add_option("--model_retrain", action="store_true",
                      dest="model_retrain",
                      help="train the model using the pre-trained model as parameters initialization.",
                      default=False)
    parser.add_option("--model_load_file", dest="model_load_file",
                      help="pre-trained model", metavar="FILE")
    parser.add_option("--model_save_dir", dest="model_save_dir",
                      help="save the model to this directory",
                      metavar="DIRECTORY", default="../trained_model")
    parser.add_option("--model_save_file", dest="model_save_file",
                      help="save the model to file", metavar="FILE")
    parser.add_option("--pos_list", dest="pos_list", help="", metavar="VALUE",
                      default="")
    parser.add_option("--neg_list", dest="neg_list", help="", metavar="VALUE",
                      default="")
    parser.add_option("--mixup", dest="mixup", help="", metavar="VALUE",
                      default="0")
    (opt, args) = parser.parse_args()

    model_input_size = [128, 64, 64, 1]
    num_class = 2
    batch_size = model_input_size[0]

    # define input parameters
    train_type = int(opt.train_type)
    train_inputDir = opt.train_inputDir
    train_inputFile = opt.train_inputFile
    protein_number = len(train_inputFile.split(';'))
    train_number = float(opt.train_number)
    mrc_number = int(opt.mrc_number)
    dropout_rate = 0.5
    coordinate_symbol = opt.coordinate_symbol
    debug_dir = '../train_output'  # output dir
    particle_size = int(opt.particle_size)
    validation_ratio = float(opt.validation_ratio)

    # define the save model
    model_retrain = opt.model_retrain
    model_load_file = opt.model_load_file
    model_save_dir = opt.model_save_dir
    model_save_file = os.path.join(model_save_dir, opt.model_save_file)
    pos_list = opt.pos_list
    neg_list = opt.neg_list
    mixup = int(opt.mixup)
    print("MIXUP=======", mixup)

    if not os.access(model_save_dir, os.F_OK):
        os.mkdir(model_save_dir)
    if not os.access(debug_dir, os.F_OK):
        os.mkdir(debug_dir)

    dataLoader = DataLoader()
    train_number = int(train_number)
    if train_type == 1:
        # load train data from mrc file dir
        train_data, train_label, eval_data, eval_label = dataLoader.load_trainData_From_mrcFileDir(
            train_inputDir, particle_size, model_input_size, validation_ratio,
            coordinate_symbol, mrc_number, train_number)
    elif train_type == 2:
        # load train data from numpy data struct
        train_data, train_label, eval_data, eval_label = dataLoader.load_trainData_From_ExtractedDataFile(
            train_inputDir, train_inputFile, model_input_size,
            validation_ratio, train_number)
    elif train_type == 3:
        # load train data from prepicked results
        train_data, train_label, eval_data, eval_label = dataLoader.load_trainData_From_PrePickedResults(
            train_inputDir, train_inputFile, particle_size, model_input_size,
            validation_ratio, train_number)
    elif train_type == 4:
        # load train data from relion .star file
        train_data, train_label, eval_data, eval_label = dataLoader.load_trainData_From_RelionStarFile(
            train_inputFile, particle_size, model_input_size,
            validation_ratio, train_number)
    elif train_type == 5:
        # load train data from class2d .star file
        train_data, train_label, eval_data, eval_label = dataLoader.load_trainData_From_Class2dStarFile(
            train_inputDir, train_inputFile, model_input_size,
            validation_ratio, train_number)
    elif train_type == 6:
        # load train data from auto_filter_class .star file: loaded later,
        # inside the partition-search loop below
        left = 0
        right = 50
        get_partition = lambda x, y: (x + y) / 2
    else:
        print("ERROR: invalid value of train_type:", train_type)

    # References train_data so a NameError fires if loading failed
    # (skipped for type 6, which loads inside the loop below).
    try:
        train_type == 6 or train_data
    except NameError:
        print("Error: in function load.loadInputTrainData.")
        return None
    else:
        print("Load training data successfully!")

    idx = 0
    good_enough = False
    while not good_enough:
        best_eval_error_rate = 100
        all_error = []
        finetune = False if train_type == 6 else False  # always False as written
        dropout_rate = 0.5 if train_type == 6 else dropout_rate
        deepModel = DeepModel(particle_size, model_input_size, num_class,
                              dropout_rate=dropout_rate, finetune=finetune)
        if train_type == 6:
            deepModel.learning_rate = deepModel.learning_rate / 10.0
            deepModel.decay_steps *= 2
        if good_enough:
            partition = partition + 1
        else:
            partition = get_partition(left, right)
        print("PARTITION --->>>", partition)
        partition = 9
        good_enough = True  # set to True to run the while loop just once!
        train_data, train_label, eval_data, eval_label = dataLoader.load_trainData_From_AutoClass2dStarFile(
            train_inputDir, train_inputFile, model_input_size,
            validation_ratio, train_number, partition, pos_list, neg_list)
        train_data, train_label = shuffle_in_unison_inplace(
            train_data, train_label)
        print("label_shape = ", np.array(train_label).shape)
        '''
        mix_data, mix_label = [], []
        if mixup:
            mixnum = len(train_data)
            for cnt in range(mixnum):
                L = np.random.beta(0.2, 0.2)
                i1, i2 = np.random.randint(mixnum, size=2)
                if train_data[i1].shape[1] == train_data[i2].shape[1]:
                    new_data = (1 - L) * train_data[i1] + L * train_data[i2]
                    new_label = (1 - L) * train_label[i1][1] + L * train_label[i2][1]
                    mix_data.append(new_data)
                    mix_label.append([1.0 - new_label, new_label])
            train_data = train_data + mix_data
            train_label = train_label + mix_label
        '''
        print("label_shape = ", np.array(train_label).shape)
        # eval_data, eval_label = shuffle_in_unison_inplace(eval_data, eval_label)

        # Group sample indices by image width so each batch has a uniform shape
        bs2train = {}
        bs2eval = {}
        for idx, t in enumerate(train_data):
            if t.shape[1] not in bs2train.keys():
                bs2train[t.shape[1]] = [idx]
            else:
                bs2train[t.shape[1]].append(idx)
        for idx, t in enumerate(eval_data):
            if t.shape[1] not in bs2eval.keys():
                bs2eval[t.shape[1]] = [idx]
            else:
                bs2eval[t.shape[1]].append(idx)

        train_size = len(train_data)
        eval_size = len(eval_data)
        print("train size=%d, eval_size=%d" % (train_size, eval_size))
        print("batch_size=%d" % batch_size)
        print("dropout=%.2f" % dropout_rate)
        if train_size < 1000:
            print("NOTE: not enough training data!\n<Failed>!")
            exit()

        saver = tf.train.Saver(tf.all_variables(), max_to_keep=30)
        start_time = time.time()
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.26)
        train_error = []
        valid_error = []
        eval_time = 0
        with tf.Session(config=tf.ConfigProto(
                gpu_options=gpu_options,
                log_device_placement=False)) as sess:
            tf.initialize_all_variables().run()
            if model_load_file:
                print(model_load_file)
                saver.restore(sess, model_load_file)
            max_epochs = 200
            best_eval_error_rate = 100
            toleration_patience = 10
            toleration_patience_flag = 0
            eval_frequency = train_size // batch_size
            print("total_step=%d" % (int(max_epochs * train_size) // batch_size))
            idx += 1
            # list() so the keys are indexable (was a Python 2 idiom)
            batch_type = list(bs2train.keys())
            batch_type_number = len(batch_type)
            po = {}
            for k in range(batch_type_number):
                po[k] = 0
            batch_type_idx = 0
            train_error_list = []
            print("===================================================================")
            eval_prediction = deepModel.evaluation(eval_data, sess,
                                                   label=eval_label)
            eval_error_rate = error_rate(eval_prediction, eval_label)
            eval_before_retrain = eval_error_rate
            print('valid error before training: %.6f%%' % eval_error_rate)
            print("===================================================================")
            for epoch in range(int(max_epochs)):
                start_time = time.time()
                for s in range(eval_frequency):
                    step = epoch * eval_frequency + s
                    # get the batch training data
                    offset = (step * batch_size) % (train_size - batch_size)
                    batch_type_idx = (batch_type_idx + 1) % batch_type_number
                    batch = batch_type[batch_type_idx]
                    if po[batch_type_idx] + batch_size >= len(bs2train[batch]):
                        po[batch_type_idx] = 0
                    p = po[batch_type_idx]
                    idxs = bs2train[batch][p:(p + batch_size)]
                    batch_data = []
                    batch_label = []
                    for ix in idxs:
                        batch_data.append(train_data[ix])
                        batch_label.append(train_label[ix])
                    po[batch_type_idx] = po[batch_type_idx] + batch_size
                    # online augmentation
                    # batch_data = DataLoader.preprocess_particle_online(batch_data)
                    loss_value, lr, train_prediction = deepModel.train_batch(
                        batch_data, batch_label, sess)
                    train_error_list.append(
                        error_rate(train_prediction, batch_label))
                # display after each epoch
                stop_time = time.time() - start_time
                eval_prediction = deepModel.evaluation(eval_data, sess,
                                                       label=eval_label)
                eval_error_rate = error_rate(eval_prediction, eval_label)
                train_error_mean = np.mean(train_error_list)
                print('>> epoch: %d, train loss: %.2f, lr: %.6f, '
                      'toleration:%d, train error: %.2f%%, valid error: %.2f%%'
                      % (epoch, loss_value, lr, toleration_patience,
                         train_error_mean, eval_error_rate))
                train_error.append(train_error_mean)
                valid_error.append(eval_error_rate)
                eval_time += 1
                train_error_list = []
                all_error.append(eval_error_rate)
                if eval_error_rate < best_eval_error_rate:
                    best_eval_error_rate = eval_error_rate
                    toleration_patience = 10
                    saver.save(sess, model_save_file)
                else:
                    if epoch > 50:
                        toleration_patience = toleration_patience - 1
                if toleration_patience == 0:
                    break
        good_enough = True

    print("Accuracy: before retrain: %.2f%%, after retrain: %.2f%%" %
          (100.0 - eval_before_retrain, 100.0 - best_eval_error_rate))
    print("Retrain <Successful>!")
from dataLoader import DataLoader

# Global variables to be accessed between files.
# (Note: `global` at module level is redundant but harmless; module-level
# assignments are already importable from other files.)
global data_loader
data_loader = DataLoader()

global locations
locations = data_loader.setup_locations()

global distances
distances = data_loader.setup_distances()

global all_packages
all_packages = data_loader.setup_packages(locations)
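# A hedged consumption sketch from another file; the module name shared_data
# is hypothetical (use whatever this file is actually called):
#
#   from shared_data import locations, distances, all_packages
#   print(len(all_packages), "packages across", len(locations), "locations")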
def starcraft_svm_test():
    # Create DataLoader instance to load and format data
    dataLoader = DataLoader()
    logging.info("Program started")
    logging.info("Loading starcraft data")
    # Read the SkillCraft dataset; the class index is the second column
    dataLoader.read(filename="data/SkillCraft1_Dataset.csv", classIndex=1,
                    numOfFeatures=15)
    multi_label_count = dataLoader.labelCount(8)
    # Creates plots for a few of the data features
    # dataLoader.visualize()
    # Normalize data values from 0 - 1
    # dataLoader.normalize()
    # Create new labels to fit into binary classification
    dataLoader.scaleToBinary(5)
    label_count = dataLoader.binaryLabelCount(5)
    logging.info("Number of examples per class")
    logging.info("Casual - (1): " + str(label_count[0]))
    logging.info("Hardcore - (-1): " + str(label_count[1]))
    label_count = dataLoader.labelCount(8)
    logDataCount(label_count)
    """
    # Create SVM
    svm = SVM()

    # Train and predict for binary svm
    logging.info("Running SVM for binary classification")

    # Train for binary single run with these objects
    logging.info("Single binary SVM")
    svm.train(dataLoader.x_train, dataLoader.y_train, dataLoader.x_test,
              dataLoader.y_test)

    # Train and test binary svm multiple times for all available binary variables
    logging.info("Multiple runs with different parameters - binary SVM")
    svm.train(dataLoader.x_train, dataLoader.y_train, dataLoader.x_test,
              dataLoader.y_test, iterate=True)

    # Save binary results to excel sheet
    logging.info("Saving binary SVM results")
    svm.results.to_excel(writer_starcraft, sheet_name='binary-svm')

    # MULTI CLASS SVM
    logging.info("Running SVM for multiclass classification")

    # Train and predict for multi-class data using the linear svm from the
    # liblinear implementation
    logging.info("Running SVM for multiclass classification with liblinear implementation")
    svm.train(dataLoader.x_train, dataLoader.multi_y_train, dataLoader.x_test,
              dataLoader.multi_y_test, binary=False)
    logging.info("Saving multiclass liblinear results")
    svm.results.to_excel(writer_starcraft, sheet_name='multiclass-liblinear')

    # Train for multi-class single run with these objects using the libsvm
    # implementation
    logging.info("Running SVM for multiclass classification with libsvm implementation")
    svm.train(dataLoader.x_train, dataLoader.multi_y_train, dataLoader.x_test,
              dataLoader.multi_y_test, binary=False, linear=False)
    logging.info("Saving multiclass libsvm results")
    svm.results.to_excel(writer_starcraft, sheet_name='multiclass-libsvm')

    # Train and test multi-class svm multiple times for all available
    # multi-class variables
    logging.info("Running SVM for multiclass classification for all available multi-class variables")
    svm.train(dataLoader.x_train, dataLoader.multi_y_train, dataLoader.x_test,
              dataLoader.multi_y_test, iterate=True, binary=False)
    logging.info("Saving multiclass multiple-runs results")
    svm.results.to_excel(writer_starcraft, sheet_name='multiclass-multiple-variables')

    # Train and test multi-class svm multiple times with KPCA-LDA
    logging.info("Running SVM for multiclass classification with KPCA-LDA")
    svm.train(dataLoader.x_train, dataLoader.multi_y_train, dataLoader.x_test,
              dataLoader.multi_y_test, iterate=True, binary=False,
              decomposition=True)
    logging.info("Saving multiclass multiple-runs results")
    svm.results.to_excel(writer_starcraft, sheet_name='multiclass-kpca-lda')

    # KNN and NC
    nearest(dataLoader.x_train, dataLoader.y_train, dataLoader.x_test,
            dataLoader.y_test, dataLoader.multi_y_train,
            dataLoader.multi_y_test, writer_starcraft)
    """
    clustering(dataLoader.x_train, dataLoader.y_train, dataLoader.x_test,
               dataLoader.y_test)
    # Write all the results
    writer_starcraft.save()
def Run_SRNN_NormalCase(args, no_dataset):
    data_path, graph_path = Data_path(no_dataset)
    log_path = Log_path(no_dataset)

    # Construct the DataLoader object that loads data
    dataloader = DataLoader(args)
    dataloader.load_data(data_path)

    # Construct the ST-graph object that reads graph
    stgraph = ST_GRAPH(args)
    stgraph.readGraph(dataloader.num_sensor, graph_path)

    # Initialize net
    net = SRNN(args)
    net.setStgraph(stgraph)
    print('- Number of trainable parameters:',
          sum(p.numel() for p in net.parameters() if p.requires_grad))

    # optimizer = torch.optim.Adam(net.parameters(), lr=args.learning_rate)
    # optimizer = torch.optim.RMSprop(net.parameters(), lr=args.learning_rate,
    #                                 momentum=0.0001, centered=True)
    optimizer = torch.optim.Adagrad(net.parameters())

    best_eval_loss = 10000
    best_epoch = 0

    print('')
    print('---- Train and Evaluation ----')
    eval_loss_res = np.zeros((args.num_epochs + 1, 2))
    for e in range(args.num_epochs):
        epoch = e + 1

        #### Training ####
        print('-- Training, epoch {}/{}'.format(epoch, args.num_epochs))
        loss_epoch = 0

        # For each batch
        for b in range(dataloader.num_batches_train):
            batch = b + 1
            start = time.time()

            # Get batch data
            x = dataloader.next_batch_train()

            # Loss for this batch
            loss_batch = 0

            # For each sequence in the batch
            for sequence in range(dataloader.batch_size):
                # put node and edge features
                stgraph.putSequenceData(x[sequence])

                # get data to feed
                data_nodes, data_temporalEdges, data_spatialEdges = stgraph.getSequenceData()

                # put a sequence to net
                loss_output, data_nodes, outputs = forward(
                    net, optimizer, args, stgraph, data_nodes,
                    data_temporalEdges, data_spatialEdges)
                loss_output.backward()
                loss_batch += loss_RMSE(data_nodes[-1], outputs[-1],
                                        dataloader.scaler)

                # Clip gradients
                torch.nn.utils.clip_grad_norm_(net.parameters(),
                                               args.grad_clip)

                # Update parameters
                optimizer.step()

            end = time.time()
            loss_batch = loss_batch / dataloader.batch_size
            loss_epoch += loss_batch
            print('Train: {}/{}, train_loss = {:.3f}, time/batch = {:.3f}'.format(
                e * dataloader.num_batches_train + batch,
                args.num_epochs * dataloader.num_batches_train,
                loss_batch, end - start))

        # Compute loss for the entire epoch
        loss_epoch /= dataloader.num_batches_train
        print('(epoch {}), train_loss = {:.3f}'.format(epoch, loss_epoch))

        # Save the model after each epoch
        save_path = Save_path(no_dataset, epoch)
        print('Saving model to ' + save_path)
        torch.save({
            'epoch': epoch,
            'state_dict': net.state_dict(),
            'optimizer_state_dict': optimizer.state_dict()
        }, save_path)

        #### Evaluation ####
        print('-- Evaluation, epoch {}/{}'.format(epoch, args.num_epochs))
        loss_epoch = 0
        for b in range(dataloader.num_batches_eval):
            batch = b + 1
            start = time.time()

            # Get batch data
            x = dataloader.next_batch_eval()

            # Loss for this batch
            loss_batch = 0
            for sequence in range(dataloader.batch_size):
                # put node and edge features
                stgraph.putSequenceData(x[sequence])

                # get data to feed
                data_nodes, data_temporalEdges, data_spatialEdges = stgraph.getSequenceData()

                # put a sequence to net
                _, data_nodes, outputs = forward(net, optimizer, args, stgraph,
                                                 data_nodes,
                                                 data_temporalEdges,
                                                 data_spatialEdges)
                loss_batch += loss_RMSE(data_nodes[-1], outputs[-1],
                                        dataloader.scaler)

            end = time.time()
            loss_batch = loss_batch / dataloader.batch_size
            loss_epoch += loss_batch
            print('Eval: {}/{}, eval_loss = {:.3f}, time/batch = {:.3f}'.format(
                e * dataloader.num_batches_eval + batch,
                args.num_epochs * dataloader.num_batches_eval,
                loss_batch, end - start))

        loss_epoch /= dataloader.num_batches_eval
        eval_loss_res[e] = (epoch, loss_epoch)

        # Update best validation loss until now
        if loss_epoch < best_eval_loss:
            best_eval_loss = loss_epoch
            best_epoch = epoch
        print('(epoch {}), eval_loss = {:.3f}'.format(epoch, loss_epoch))

    # Record the best epoch and best validation loss overall
    print('Best epoch: {}, Best evaluation loss {:.3f}'.format(
        best_epoch, best_eval_loss))
    eval_loss_res[-1] = (best_epoch, best_eval_loss)
    np.savetxt(log_path, eval_loss_res, fmt='%d, %.3f')
    print('- Eval result has been saved in ', log_path)
    print('')
def train():
    parser = OptionParser()
    parser.add_option("--train_good", dest="train_good",
                      help="Input good particles", metavar="FILE")
    parser.add_option("--train_bad", dest="train_bad",
                      help="Input bad particles", metavar="FILE")
    parser.add_option("--particle_number", type="int", dest="train_number",
                      help="Number of positive samples to train.",
                      metavar="VALUE", default=-1)
    parser.add_option("--bin_size", type="int", dest="bin_size",
                      help="image size reduction", metavar="VALUE", default=3)
    parser.add_option("--coordinate_symbol", dest="coordinate_symbol",
                      help="The symbol of the coordinate file, like '_manualPick'",
                      metavar="STRING")
    parser.add_option("--particle_size", type="int", dest="particle_size",
                      help="the size of the particle.", metavar="VALUE",
                      default=-1)
    parser.add_option("--validation_ratio", type="float",
                      dest="validation_ratio", help="the ratio.",
                      metavar="VALUE", default=0.1)
    parser.add_option("--model_retrain", action="store_true",
                      dest="model_retrain",
                      help="train the model using the pre-trained model as parameters initialization.",
                      default=False)
    parser.add_option("--model_load_file", dest="model_load_file",
                      help="pre-trained model", metavar="FILE")
    parser.add_option("--logdir", dest="logdir",
                      help="directory of logfiles", metavar="DIRECTORY",
                      default="Logfile")
    parser.add_option("--model_save_file", dest="model_save_file",
                      help="save the model to file", metavar="FILE")
    (opt, args) = parser.parse_args()

    np.random.seed(1234)

    # define the input size of the model
    model_input_size = [100, 64, 64, 1]
    num_classes = 2  # the number of output classes
    batch_size = model_input_size[0]

    if not os.access(opt.logdir, os.F_OK):
        os.mkdir(opt.logdir)

    # load training dataset
    dataLoader = DataLoader()
    train_data, train_label, eval_data, eval_label = dataLoader.load_trainData_From_RelionStarFile(
        opt.train_good, opt.particle_size, model_input_size,
        opt.validation_ratio, opt.train_number, opt.bin_size)

    # Check if train_data exists
    try:
        train_data
    except NameError:
        print("ERROR: in function load.loadInputTrainData.")
        return None
    else:
        print("Load training data successfully!")

    # shuffle training data
    train_data, train_label = shuffle_in_unison_inplace(train_data,
                                                        train_label)
    eval_data, eval_label = shuffle_in_unison_inplace(eval_data, eval_label)

    train_x = train_data.reshape(train_data.shape[0], 64, 64, 1)
    test_x = eval_data.reshape(eval_data.shape[0], 64, 64, 1)
    print("shape of training data: ", train_x.shape, test_x.shape)
    train_y = to_categorical(train_label, 2)
    test_y = to_categorical(eval_label, 2)
    print(train_y.shape, test_y.shape)

    datagen = ImageDataGenerator(featurewise_center=True,
                                 featurewise_std_normalization=True,
                                 rotation_range=20,
                                 width_shift_range=0.0,
                                 height_shift_range=0.0,
                                 horizontal_flip=True,
                                 vertical_flip=True)
    datagen.fit(train_x)

    model = Sequential()
    model.add(Conv2D(32, kernel_size=(8, 8), strides=(1, 1),
                     activation='relu', input_shape=(64, 64, 1)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(64, kernel_size=(8, 8), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(128, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(1024, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    for layer in model.layers:
        print(layer.name, layer.output_shape)

    logdir = opt.logdir + '/' + datetime.now().strftime("%Y%m%d-%H%M%S")
    tensorboard_callback = TensorBoard(log_dir=logdir)
    checkpoint = ModelCheckpoint('best_model.h5', monitor='val_acc',
                                 verbose=1, save_best_only=True, period=1)
    reduce_lr_plateau = ReduceLROnPlateau(monitor='val_acc', patience=10,
                                          verbose=1)
    callbacks = [checkpoint, reduce_lr_plateau, tensorboard_callback]

    model.compile(optimizer=SGD(0.01), loss="binary_crossentropy",
                  metrics=["accuracy"])
    # integer division so steps_per_epoch is an int (the original used / 32)
    model.fit_generator(datagen.flow(train_x, train_y, batch_size=batch_size),
                        steps_per_epoch=len(train_x) // 32,
                        epochs=30,
                        validation_data=(test_x, test_y),
                        callbacks=callbacks)
    model.save(opt.model_save_file)
    accuracy = model.evaluate(x=test_x, y=test_y, batch_size=batch_size)
    print("Accuracy:", accuracy[1])
import numpy as np
import tensorflow as tf

from myModel import MyModel
from dataLoader import DataLoader

if __name__ == '__main__':
    ENABLE_SAVE_MODEL = True
    MODEL_NAME = 'mini'

    # 4 mnist
    # H, W, C = 28, 28, 1
    # (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

    # 4 ukiyoe
    DATA_PATH = './data/'
    H, W, C = 224, 224, 3
    RH, RW = 224, 224
    x_train, y_train, x_test, y_test = DataLoader(0.2).load(DATA_PATH)
    if C == 1:
        x_train = np.sum(x_train, axis=-1) / 3
        x_test = np.sum(x_test, axis=-1) / 3
    x_train = x_train.astype('float32') / 255
    x_test = x_test.astype('float32') / 255
    x_train = x_train.reshape(x_train.shape[0], H, W, C)
    x_test = x_test.reshape(x_test.shape[0], H, W, C)

    model = MyModel()
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy()
    optimizer = tf.keras.optimizers.Adam()
    train_loss = tf.keras.metrics.Mean(name='train_loss')
            'loss': loss,
        }, os.path.join(args.exp_dir, 'unfinished_model.pt'))
        epoch += 1

    cost_time = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(cost_time // 60,
                                                        cost_time % 60))
    print('Best Train Acc is {:.4f}'.format(best_train_acc))
    print('Best Val Acc is {:.4f}'.format(best_acc))
    model.load_state_dict(best_model)
    return model, cost_time, best_acc, best_train_acc


if __name__ == '__main__':
    print('DataSets: ' + args.dataset)
    print('ResNet Depth: ' + str(args.depth))
    loader = DataLoader(args.dataset, batch_size=args.batch_size)
    dataloaders, dataset_sizes = loader.load_data()
    num_classes = 10
    if args.dataset == 'cifar-10':
        num_classes = 10
    if args.dataset == 'cifar-100':
        num_classes = 100
    model = resnet_cifar(depth=args.depth, num_classes=num_classes)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9,
                                nesterov=True, weight_decay=1e-4)
    # define loss and optimizer
    criterion = nn.CrossEntropyLoss()
    scheduler = MultiStepLR(optimizer,
                            milestones=[args.epoch * 0.4, args.epoch * 0.6,
                                        args.epoch * 0.8],
                            gamma=0.1)
        update_checkpoint_link([('epoch_%d.pt' % best_epoch, 'best.pt'),
                                ('epoch_%d.pt' % epoch, 'last.pt')])
        epoch += 1

    cost_time = time.time() - since
    print('Training complete in {:.0f}h{:.0f}m{:.0f}s'.format(
        (cost_time // 60) // 60, (cost_time // 60) % 60, cost_time % 60))
    return model, cost_time, best_acc, best_train_acc


if __name__ == '__main__':
    loader = DataLoader(args.dataset, batch_size=args.batch_size,
                        seed=args.seed)
    dataloaders, dataset_sizes = loader.load_data(args.img_size)
    num_classes = 10
    if args.dataset == 'cifar-10':
        num_classes = 10
    if args.dataset == 'cifar-100':
        num_classes = 100
    if args.dataset == 'VOCpart':
        num_classes = len(dataloaders['train'].dataset.classes)
        assert args.img_size == 128, 'only supports --img_size 128'
    model = resnet_std(depth=args.depth,
                       num_classes=num_classes,
                       ifmask=args.ifmask,