def run_benchmark(self, output_file_path):
    """
    Run the benchmark for all learners on every dataset provided.
    The result will be written to the provided file.
    """
    dataloader = DataLoader()
    f = open(output_file_path, 'w')

    # header row: dataset, learner_1_name, learner_2_name, ..., learner_n_name
    f.write("dataset")
    for learner in self.learners:
        f.write(", " + learner.name())

    # write scores for all data sets
    for dataset_id in self.dataset_ids:
        print "Benchmarking dataset: " + str(dataset_id)
        f.write("\n" + str(dataset_id))
        train_data = dataloader.load_sequences_from_file("../data/" + str(dataset_id) + ".pautomac" + ".train")
        test_data = dataloader.load_sequences_from_file("../data/" + str(dataset_id) + ".pautomac" + ".test")
        solution_data = dataloader.load_probabilities_from_file("../data/" + str(dataset_id) + ".pautomac_solution" + ".txt")
        for learner in self.learners:
            print "Training learner: " + learner.name()
            learner.train(train_data, test_data)
            print "Evaluating learner: " + learner.name()
            score = learner.evaluate(test_data, solution_data)
            print "Achieved score: " + str(score)
            # right-align the score in a field of at least 8 characters
            str_score = " {0:.1f}".format(score)
            while len(str_score) < 8:
                str_score = " " + str_score
            f.write(", " + str_score)
    f.close()
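# Usage sketch (assumptions): `Benchmark` and `SpectralLearner` are hypothetical
# placeholder names for the object that owns run_benchmark() and one of its learners;
# only the call pattern and the output layout are illustrated here.
benchmark = Benchmark(learners=[SpectralLearner()], dataset_ids=[1, 2, 3])
benchmark.run_benchmark("../results/pautomac_scores.csv")
# The output file then contains one row per dataset id and one comma-separated,
# right-aligned score column per learner.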
def extractData():
    parser = OptionParser()
    parser.add_option("--inputDir", dest="inputDir", help="Input directory", metavar="DIRECTORY")
    parser.add_option("--mrc_number", dest="mrc_number", help="Number of mrc files to be trained.", metavar="VALUE", default=-1)
    parser.add_option("--coordinate_symbol", dest="coordinate_symbol", help="The symbol of the coordinate file, like '_manualPick'", metavar="STRING")
    parser.add_option("--particle_size", dest="particle_size", help="the size of the particle.", metavar="VALUE", default=-1)
    parser.add_option("--save_dir", dest="save_dir", help="save the training samples to this directory", metavar="DIRECTORY", default="../trained_model")
    parser.add_option("--save_file", dest="save_file", help="save the training samples to file", metavar="FILE")
    (opt, args) = parser.parse_args()

    inputDir = opt.inputDir
    particle_size = int(opt.particle_size)
    coordinate_symbol = opt.coordinate_symbol
    mrc_number = int(opt.mrc_number)
    output_dir = opt.save_dir
    output_filename = opt.save_file

    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
    if particle_size == -1:
        print("particle size should be a positive value!")
        return

    output_filename = os.path.join(output_dir, output_filename)
    DataLoader.extractData(inputDir, particle_size, coordinate_symbol, mrc_number, output_filename)
def load_data(filePath: str,
              label_txt_filePath: str,
              shuffle: bool = True,
              seq_length: int = 3000,
              batch_size: int = 64,
              training: bool = True):
    voc = Vocab()
    dataLoader = DataLoader()
    # all sequences
    dataLoader.sequences = dataLoader.read_fasta_file(fasta_file_path=filePath)
    # training set
    dataLoader.train_seq = dataLoader.sequences[:900]
    # test set
    dataLoader.test_seq = dataLoader.sequences[900:1000]
    # labels, 0/1
    dataLoader.labels = dataLoader.read_label_txt(label_file_path=label_txt_filePath)
    # vectorized representation of the training set
    dataLoader.train_vectorized_seq = voc.sequences_to_ids(dataLoader.train_seq)
    # vectorized representation of the test set
    dataLoader.test_vectorized_seq = voc.sequences_to_ids(dataLoader.test_seq)
    # print(dataLoader.train_vectorized_seq)
    # print(dataLoader.test_vectorized_seq)
    # x_batch, y_batch = dataLoader.get_batch(shuffle=shuffle, seq_length=seq_length, batch_size=batch_size, training=training)
    # print("x_batch.shape={}, y_batch.shape={}".format(x_batch.shape, y_batch.shape))
    # print("x_batch[0]:{}".format(x_batch[0]))
    # print("y_batch[0]:{}".format(y_batch[0]))
    return voc, dataLoader
def _reading_data():
    print(config.USER)
    # step 2: load the data
    # loading the data covers:
    #   - how the data is read
    #   - how the data is preprocessed
    #   - any special data-cleaning steps
    trainFilepath = os.path.join(os.getcwd(), "data", config.FILENAME)
    trainDataLoader = DataLoader(trainFilepath)
    train_data = trainDataLoader.load_data(useSpark=False, interactive=False)
    train_data.save_data(os.getcwd())
def main():
    ranker = SVMRank()
    file_name = 'input/BioASQ-trainingDataset6b.json'
    data = DataLoader(file_name)
    data.load_ner_entities()
    questions = data.get_questions_of_type(C.FACTOID_TYPE)[:419]
    for i, question in enumerate(questions):
        ranked_sentences = question.ranked_sentences()
        X, y = get_features(question, ranked_sentences)
        ranker.feed(X, y, i)
    ranker.train_from_feed()
    ranker.save('weights_2')
def main():
    file_name = 'input/BioASQ-task6bPhaseB-testset3.json'
    file_name = 'input/BioASQ-trainingDataset6b.json'
    file_name = 'input/BioASQ-trainingDataset5b.json'
    file_name = 'input/phaseB_5b_05.json'  # the last assignment is the file actually used
    save_model_file_name = 'weights_2'
    ranker = SVMRank(save_model_file_name)
    data = DataLoader(file_name)
    data.load_ner_entities()
    ans_file = 'output/factoid_list_%s.json' % data.name

    questions = data.get_questions_of_type(C.FACTOID_TYPE)
    for i, question in enumerate(tqdm(questions)):
        ranked_sentences = question.ranked_sentences()
        X, candidates = get_only_features(question, ranked_sentences)
        top_answers = ranker.classify_from_feed(X, candidates, i)
        question.exact_answer = [[answer] for answer in top_answers[:5]]
        # question.exact_answer = [answer for answer in top_answers]
        # print question.exact_answer_ref
        # print '\n'
        # print top5
        # print '\n'
        # print '\n\n\n'

    questions = data.get_questions_of_type(C.LIST_TYPE)
    for i, question in enumerate(tqdm(questions)):
        ranked_sentences = question.ranked_sentences()
        X, candidates = get_only_features(question, ranked_sentences)
        top_answers = ranker.classify_from_feed(X, candidates, i)
        question.exact_answer = [[answer] for answer in top_answers[:10]]

    data.save_factoid_list_answers(ans_file)
def run_cv(fold_iterator, logger, params_dict, upsample=True):
    for traindirs, testdirs in fold_iterator:
        # TRAIN LOCAL PREDICTION MODEL
        # Generators
        logger.info('############ FOLD #############')
        logger.info('Training folders are {}'.format(traindirs))
        training_generator = DataLoader(data_dir,
                                        traindirs,
                                        32,
                                        width_template=params_dict['width'],
                                        upsample=upsample)
        validation_generator = DataLoader(data_dir,
                                          testdirs,
                                          32,
                                          width_template=params_dict['width'],
                                          type='val',
                                          upsample=upsample)
        # Design model
        model = create_model(params_dict['width'] + 1,
                             params_dict['h1'],
                             params_dict['h2'],
                             params_dict['h3'],
                             embed_size=params_dict['embed_size'],
                             drop_out_rate=params_dict['dropout_rate'],
                             use_batch_norm=params_dict['use_batchnorm'])
        # Train model on training dataset
        '''
        model.fit_generator(generator=training_generator,
                            validation_data=validation_generator,
                            use_multiprocessing=True,
                            epochs=params_dict['n_epochs'],
                            workers=6)
        '''
        try:
            model.load_weights(os.path.join(checkpoint_dir, 'model22.h5'))
        except OSError:
            print('here')
            model.fit_generator(generator=training_generator,
                                validation_data=validation_generator,
                                use_multiprocessing=True,
                                epochs=params_dict['n_epochs'],
                                workers=4,
                                max_queue_size=20)
            model.save_weights(os.path.join(checkpoint_dir, 'model.h5'))
        metrics = model.evaluate_generator(generator=validation_generator,
                                           workers=4,
                                           max_queue_size=20)
        logger.info(metrics)
def get_data_loaders(data_root, speaker_id, test_shuffle=True):
    data_loaders = {}
    local_conditioning = hparams.cin_channels > 0
    for phase in ["train", "test"]:
        train = phase == "train"
        X = FileSourceDataset(
            RawAudioDataSource(data_root,
                               speaker_id=speaker_id,
                               train=train,
                               test_size=hparams.test_size,
                               test_num_samples=hparams.test_num_samples,
                               random_state=hparams.random_state))
        if local_conditioning:
            Mel = FileSourceDataset(
                MelSpecDataSource(data_root,
                                  speaker_id=speaker_id,
                                  train=train,
                                  test_size=hparams.test_size,
                                  test_num_samples=hparams.test_num_samples,
                                  random_state=hparams.random_state))
            assert len(X) == len(Mel)
            print("Local conditioning enabled. Shape of a sample: {}.".format(
                Mel[0].shape))
        else:
            Mel = None
        print("[{}]: length of the dataset is {}".format(phase, len(X)))

        if train:
            lengths = np.array(X.file_data_source.lengths)
            # Prepare sampler
            sampler = PartialyRandomizedSimilarTimeLengthSampler(
                lengths, batch_size=hparams.batch_size)
            shuffle = False
        else:
            sampler = None
            shuffle = test_shuffle

        dataset = PyTorchDataset(X, Mel)
        data_loader = DataLoader(dataset,
                                 batch_size=hparams.batch_size,
                                 num_workers=hparams.num_workers,
                                 sampler=sampler,
                                 shuffle=shuffle,
                                 collate_fn=collate_fn,
                                 pin_memory=hparams.pin_memory)

        speaker_ids = {}
        for idx, (x, c, g) in enumerate(dataset):
            if g is not None:
                try:
                    speaker_ids[g] += 1
                except KeyError:
                    speaker_ids[g] = 1
        if len(speaker_ids) > 0:
            print("Speaker stats:", speaker_ids)

        data_loaders[phase] = data_loader

    return data_loaders
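# Usage sketch (assumptions): the "./data/ljspeech" path and the loop body are
# illustrative only, and the structure of each batch depends on collate_fn, which
# is not shown in this snippet.
loaders = get_data_loaders("./data/ljspeech", speaker_id=None)
for phase in ["train", "test"]:
    for batch in loaders[phase]:
        # feed `batch` to the model's training or evaluation step here
        break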
def _build_data(self, data_dir='train_dir', num_classes=10, mode='train'):
    loader = DataLoader(data_dir=data_dir,
                        num_classes=num_classes,
                        mode=mode,
                        height=self.height,
                        width=self.width)
    dataset = tf.data.Dataset.from_generator(
        generator=loader.generator,
        output_types=(tf.float32, tf.int32),
        output_shapes=(tf.TensorShape([self.height, self.width, 3]),
                       tf.TensorShape([self.num_classes])))
    return dataset
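# Usage sketch (assumptions): the batch size, the shuffle/prefetch buffer sizes and the
# reinitializable iterator attribute `self.it` are illustrative, following the TF1-style
# shuffle/batch/prefetch/make_initializer pattern used by the test() methods in this collection.
trainset = self._build_data(data_dir='train_dir', num_classes=self.num_classes, mode='train')
trainset = trainset.shuffle(100).batch(32).prefetch(10)
train_init = self.it.make_initializer(trainset)
# later: sess.run(train_init) before iterating one epoch of training batches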
def prepare_data_loader_train_10_splits(texture_train_data_set_path,
                                        texture_train_label_set_path,
                                        texture_val_data_set_path,
                                        texture_val_label_set_path,
                                        texture_batch_size,
                                        num_workers,
                                        device):
    data_loader_list = []
    for i in range(10):
        idx = i + 1
        print("Split: {0}".format(idx))
        # format into local variables so the path templates keep their
        # placeholders for the next split
        train_data_path = texture_train_data_set_path.format(idx)
        train_label_path = texture_train_label_set_path.format(idx)
        val_data_path = texture_val_data_set_path.format(idx)
        val_label_path = texture_val_label_set_path.format(idx)

        dL = DataLoader()
        texture_train_set, train_set_size = dL.get_tensor_set(
            train_data_path, train_label_path, device)
        texture_val_set, val_set_size = dL.get_tensor_set(
            val_data_path, val_label_path, device)
        print("Train set size: {0}".format(train_set_size))
        print("Val set size: {0}".format(val_set_size))

        texture_train_data_loader = torch.utils.data.DataLoader(
            texture_train_set,
            batch_size=texture_batch_size,
            shuffle=True,
            num_workers=num_workers)
        texture_val_data_loader = torch.utils.data.DataLoader(texture_val_set,
                                                              num_workers=1,
                                                              shuffle=False,
                                                              pin_memory=True)
        data_loader_dict = {
            "train": texture_train_data_loader,
            "val": texture_val_data_loader
        }
        data_loader_list.append(data_loader_dict)
    return data_loader_list
def starcraft_sp_test():
    # Create DataLoader instance to load and format data
    dataLoader = DataLoader()
    logging.info("Program started")
    logging.info("Loading starcraft data")

    # Read skillcraft dataset, the class index is the second column
    dataLoader.read(filename="data/SkillCraft1_Dataset.csv", classIndex=1, numOfFeatures=15)

    # Normalize data values from 0 - 1
    # dataLoader.normalize()

    # Create new labels to fit into binary classification
    dataLoader.scaleToBinary(5)

    # Spectral Clustering
    # Binary
    clustering(dataLoader.x_train, dataLoader.y_train, writer_starcraft, 'starcraft-binary', multiple=True, binary=True)
    # Multiclass
    # clustering(dataLoader.x_train, dataLoader.multi_y_train, writer_starcraft, 'starcraft-multiclass', multiple=True, binary=False)

    # Write all the results
    writer_starcraft.save()
def prepare_data_loader_test_10_splits(texture_test_data_set_path,
                                       texture_test_label_set_path,
                                       device):
    data_loader_list = []
    for i in range(10):
        idx = i + 1
        print("Split: {0}".format(idx))
        # format into local variables so the path templates keep their
        # placeholders for the next split
        test_data_path = texture_test_data_set_path.format(idx)
        test_label_path = texture_test_label_set_path.format(idx)

        dL = DataLoader()
        texture_test_set, test_set_size = dL.get_tensor_set(
            test_data_path, test_label_path, device)
        print("Test set size: {0}".format(test_set_size))

        test_data_loader = torch.utils.data.DataLoader(texture_test_set,
                                                       num_workers=1,
                                                       shuffle=False,
                                                       pin_memory=True)
        data_loader_list.append(test_data_loader)
    return data_loader_list
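# Hypothetical driver for the split helpers above: the "{0}" placeholder in each path
# template is replaced by the split index (1..10) inside the helpers, and the (x, y)
# unpacking assumes get_tensor_set builds a dataset of (data, label) pairs.
test_loaders = prepare_data_loader_test_10_splits(
    "data/split_{0}/test_data.pt", "data/split_{0}/test_labels.pt", device="cuda")
for split_idx, loader in enumerate(test_loaders, start=1):
    for batch_x, batch_y in loader:
        # run the per-split evaluation here
        pass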
def test_real_dataset(create_obj_func, src_name=None, trg_name=None, show=False, block_figure_on_end=False):
    print('Running {} ...'.format(os.path.basename(__file__)))

    if src_name is None:
        if len(sys.argv) > 2:
            src_name = sys.argv[2]
        else:
            raise Exception('Source dataset not specified')
    if trg_name is None:
        if len(sys.argv) > 3:
            trg_name = sys.argv[3]
        else:
            raise Exception('Target dataset not specified')

    np.random.seed(random_seed())
    tf.set_random_seed(random_seed())
    tf.reset_default_graph()

    print("========== Test on real data ==========")

    users_params = dict()
    users_params = parse_arguments(users_params)
    data_format = 'mat'
    if 'format' in users_params:
        data_format, users_params = extract_param('format', data_format, users_params)

    data_loader = DataLoader(src_domain=src_name,
                             trg_domain=trg_name,
                             data_path=data_dir(),
                             data_format=data_format,
                             cast_data=users_params['cast_data'])
    assert users_params['batch_size'] % data_loader.num_src_domain == 0
    print('users_params:', users_params)

    learner = create_obj_func(users_params)
    learner.dim_src = data_loader.data_shape
    learner.dim_trg = data_loader.data_shape
    learner.x_trg_test = data_loader.trg_test[0][0]
    learner.y_trg_test = data_loader.trg_test[0][1]

    learner._init(data_loader)
    learner._build_model()
    learner._fit_loop()
def test(self, data_dir='test_b', model_dir=None, output_dir=None, threshold=0.5):
    print("testing starts.")
    loader = DataLoader(data_dir=data_dir,
                        mode='test',
                        height=self.height,
                        width=self.width,
                        label_value=self.label_values)
    testset = tf.data.Dataset.from_generator(
        generator=loader.generator,
        output_types=(tf.string, tf.int32, tf.float32),
        output_shapes=(tf.TensorShape([]),
                       tf.TensorShape([2]),
                       tf.TensorShape([self.height, self.width, 3])))
    testset = testset.batch(1)
    testset = testset.prefetch(10)
    test_init = self.it.make_initializer(testset)

    saver = tf.train.Saver()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        saver.restore(sess, model_dir)
        sess.run(test_init)
        queue = multiprocessing.Queue(maxsize=30)
        writer_process = multiprocessing.Process(
            target=writer,
            args=[output_dir, self.label_values, queue, 'stop'])
        writer_process.start()
        print('writing predictions...')
        try:
            while True:
                img, path, size, output_image = sess.run(
                    [self.img, self.path, self.size, self.logits])
                queue.put(('continue', path, size, img, output_image))
        except tf.errors.OutOfRangeError:
            queue.put(('stop', None, None, None, None))
    print('testing finished.')
def test(self, data_dir='test', model_dir=None, output_dir='result', batch_size=10):
    print("testing starts.")
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    # load test data
    loader = DataLoader(data_dir=data_dir,
                        num_classes=self.num_classes,
                        mode='test',
                        height=self.height,
                        width=self.width)
    testset = tf.data.Dataset.from_generator(
        generator=loader.generator,
        output_types=(tf.string, tf.float32),
        output_shapes=(tf.TensorShape([]),
                       tf.TensorShape([self.height, self.width, 3])))
    testset = testset.shuffle(100)
    testset = testset.batch(batch_size)
    testset = testset.prefetch(20)
    test_init = self.it.make_initializer(testset)

    saver = tf.train.Saver()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        saver.restore(sess, model_dir)
        sess.run(test_init)
        queue = multiprocessing.Queue(maxsize=30)
        writer_process = multiprocessing.Process(
            target=writer,
            args=[output_dir, batch_size, queue, 'stop'])
        writer_process.start()
        print('writing predictions...')
        try:
            while True:
                img_name, pre_label = sess.run([self.img_name, self.prediction_value])
                queue.put(('continue', img_name, pre_label))
        except tf.errors.OutOfRangeError:
            queue.put(('stop', None, None))
    print('testing finished.')
def pick(self, mrc_filename):
    """Do the picking job through tensorflow.

    This function reads the micrograph data based on the given micrograph filename,
    then does the auto-picking based on the pre-trained CNN model.

    Args:
        mrc_filename: string, the filename of the target micrograph.

    Returns:
        list_coordinate: a list whose length is the number of picked particles.
            Each element is itself a list of length 4: the first element is the
            y-axis coordinate, the second is the x-axis coordinate, the third is
            the predicted score, and the fourth is the micrograph filename.
    """
    # read the micrograph image data
    print(mrc_filename)
    header, body = DataLoader.readMrcFile(mrc_filename)
    num_col = header[0]
    num_row = header[1]
    body_2d = np.array(body, dtype=np.float32).reshape(num_row, num_col)

    # preprocess the micrograph
    body_2d, bin_size = DataLoader.preprocess_micrograph(body_2d)

    # Edge detection to get the ice noise mask
    # a binary matrix, 1 stands for the ice noise site
    # mask = edge_detection_ice(body_2d)

    step_size = 4
    candidate_patches = None
    candidate_patches_exist = False
    num_total_patch = 0
    patch_size = int(self.particle_size/bin_size)
    # the window size used for peak detection
    local_window_size = int(0.6*patch_size/step_size)

    #print("image_col:", body_2d.shape[0])
    #print("particle_size:", patch_size)
    #print("step_size:", step_size)
    map_col = int((body_2d.shape[0]-patch_size)/step_size)
    map_row = int((body_2d.shape[1]-patch_size)/step_size)

    #prediction = np.zeros((map_col, map_row), dtype = float)
    time1 = time.time()
    particle_candidate_all = []
    map_index_col = 0
    for col in range(0, body_2d.shape[0]-patch_size+1, step_size):
        for row in range(0, body_2d.shape[1]-patch_size+1, step_size):
            # extract the particle patch
            patch = np.copy(body_2d[col:(col+patch_size), row:(row+patch_size)])
            # preprocess the particle patch
            patch = DataLoader.preprocess_particle(patch, self.model_input_size)
            particle_candidate_all.append(patch)
            num_total_patch = num_total_patch + 1
        map_index_col = map_index_col + 1

    map_index_row = map_index_col - map_col + map_row
    #print("map_col:", map_col)
    #print("map_row:", map_row)
    #print(len(particle_candidate_all))
    #print("map_index_col:", map_index_col)
    #print("map_index_row:", map_index_row)
    #print("col*row:", map_index_col*map_index_row)

    # reshape it to fit the input format of the model
    particle_candidate_all = np.array(particle_candidate_all).reshape(
        num_total_patch, self.model_input_size[1], self.model_input_size[2], 1)

    # predict
    predictions = self.deepModel.evaluation(particle_candidate_all, self.sess)
    predictions = predictions[:, 1:2]
    predictions = predictions.reshape(map_index_col, map_index_row)

    time_cost = time.time() - time1
    print("time cost: %d s" % time_cost)
    #display.save_image(prediction, "prediction.png")

    # get the prediction value to be a positive sample, it is a value between 0~1
    # the following code is not tested
    # do a connected component analysis
    # prediction = detete_large_component(prediction)

    # do a local peak detection to get the best coordinates
    # list_coordinate is a 2D list of shape (number_particle, 3)
    # each element in list_coordinate is [x_coordinate, y_coordinate, prediction_value]
    list_coordinate = self.peak_detection(predictions, local_window_size)

    # add the mrc filename to each coordinate and transform the coordinates
    # back to the original (unbinned) scale
    for i in range(len(list_coordinate)):
        list_coordinate[i].append(mrc_filename)
        list_coordinate[i][0] = (list_coordinate[i][0]*step_size+patch_size/2)*bin_size
        list_coordinate[i][1] = (list_coordinate[i][1]*step_size+patch_size/2)*bin_size
    return list_coordinate
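# Usage sketch (assumptions): the micrograph glob, the output path and the `autopicker`
# object (built elsewhere from the class that owns pick()) are illustrative. It collects
# one per-micrograph coordinate list per call, matching the per-particle
# [coordinate, coordinate, score, mrc_filename] entries described in the docstring above,
# and pickles the nested list so it can later be analysed.
import glob
import pickle

results = []
for mrc_filename in glob.glob("../micrographs/*.mrc"):
    results.append(autopicker.pick(mrc_filename))
with open("../pick_results/autopick_results.pickle", "wb") as f:
    pickle.dump(results, f)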
from dataLoader import DataLoader

loader = DataLoader()
loader.loadAll()

fileobj = open("csv/subjectAreaDump.csv", 'w')
for id, paper in loader.papers.iteritems():
    if paper.accepted:
        fileobj.write("%s|%d|%s" % (paper.primarySpecificSubjectArea, id, paper.title))
        for subj in paper.specificSubjectAreas:
            fileobj.write("|" + subj)
        fileobj.write("\n")
fileobj.close()
from pathlib import Path
from flask import Flask, render_template, make_response, jsonify, request, send_from_directory
import configurations
from analyzeResults import AnalyzeResults
from dataLoader import DataLoader
from hitCounter import HitCounter
import numpy as np
from vistDataset import VistDataset
import base64
import time

app = Flask(__name__)

data_loader = DataLoader(root_path=configurations.root_data)
hit_counter = HitCounter(root_path=configurations.root_data, story_max_hits=configurations.max_story_submit)
vist_dataset = VistDataset(root_path=configurations.root_data, hit_counter=hit_counter, samples_num=configurations.samples)
analyze_results = AnalyzeResults(data_root=configurations.root_data, data_loader=data_loader, vist_dataset=vist_dataset)


@app.route('/api/images/<image_id>', methods=['GET'])
def serve_image(image_id):
    print("Requested image file: {}".format(image_id))
    image_path = data_loader._find_file(image_id)
    if image_path is None:
import numpy as np
import tensorflow as tf

from myModel import MyModel
from dataLoader import DataLoader

if __name__ == '__main__':
    ENABLE_SAVE_MODEL = True
    MODEL_NAME = 'mini'

    # 4 mnist
    # H, W, C = 28, 28, 1
    # (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

    # 4 ukiyoe
    DATA_PATH = './data/'
    H, W, C = 224, 224, 3
    RH, RW = 224, 224
    x_train, y_train, x_test, y_test = DataLoader(0.2).load(DATA_PATH)

    if C == 1:
        x_train = np.sum(x_train, axis=-1) / 3
        x_test = np.sum(x_test, axis=-1) / 3

    x_train = x_train.astype('float32') / 255
    x_test = x_test.astype('float32') / 255
    x_train = x_train.reshape(x_train.shape[0], H, W, C)
    x_test = x_test.reshape(x_test.shape[0], H, W, C)

    model = MyModel()
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy()
    optimizer = tf.keras.optimizers.Adam()
    train_loss = tf.keras.metrics.Mean(name='train_loss')
from numpy import *
from decimal import *
from sys import *
from learner import Learner
from utilities import *
from dataLoader import DataLoader
import time

list1 = [[1, 2], [3, 4], [5, 6]]
list2 = [2, 3]

#for x in xrange(0, len(list1), 2):
#    print list1[x]

dataloader = DataLoader()
train_data = dataloader.load_sequences_from_file("../data/" + "1" + ".pautomac" + ".test")
#comps = collect_unique_symbol_compositions(train_data, 2)
MathiasLearner.train(train_data)
#print comps.index([1, 1])
def train():
    parser = OptionParser()
    parser.add_option("--train_inputDir", dest="train_inputDir", help="Input directory", metavar="DIRECTORY")
    parser.add_option("--train_inputFile", dest="train_inputFile", help="Input file", metavar="FILE")
    parser.add_option("--train_type", dest="train_type", help="Training type, 1|2|3|4.", metavar="VALUE", default=2)
    parser.add_option("--particle_number", dest="train_number", help="Number of positive samples to train.", metavar="VALUE", default=-1)
    parser.add_option("--mrc_number", dest="mrc_number", help="Number of mrc files to be trained.", metavar="VALUE", default=-1)
    parser.add_option("--coordinate_symbol", dest="coordinate_symbol", help="The symbol of the coordinate file, like '_manualPick'", metavar="STRING")
    parser.add_option("--particle_size", dest="particle_size", help="the size of the particle.", metavar="VALUE", default=-1)
    parser.add_option("--validation_ratio", dest="validation_ratio", help="the ratio.", metavar="VALUE", default=0.1)
    parser.add_option("--model_retrain", action="store_true", dest="model_retrain",
                      help="train the model using the pre-trained model as parameters initialization.", default=False)
    parser.add_option("--model_load_file", dest="model_load_file", help="pre-trained model", metavar="FILE")
    parser.add_option("--model_save_dir", dest="model_save_dir", help="save the model to this directory", metavar="DIRECTORY", default="../trained_model")
    parser.add_option("--model_save_file", dest="model_save_file", help="save the model to file", metavar="FILE")
    (opt, args) = parser.parse_args()

    # set the tensorflow seed
    tf.set_random_seed(1234)
    # set the numpy seed
    np.random.seed(1234)

    # define the input size of the model
    model_input_size = [100, 64, 64, 1]
    num_class = 2                    # the number of the class
    batch_size = model_input_size[0]

    # define input parameters
    train_type = int(opt.train_type)
    train_inputDir = opt.train_inputDir
    train_inputFile = opt.train_inputFile
    train_number = float(opt.train_number)
    mrc_number = int(opt.mrc_number)
    coordinate_symbol = opt.coordinate_symbol
    debug_dir = '../train_output'    # output dir
    particle_size = int(opt.particle_size)
    validation_ratio = float(opt.validation_ratio)

    # define the save model
    model_retrain = opt.model_retrain
    model_load_file = opt.model_load_file
    model_save_dir = opt.model_save_dir
    model_save_file = os.path.join(model_save_dir, opt.model_save_file)
    if not os.access(model_save_dir, os.F_OK):
        os.mkdir(model_save_dir)
    if not os.access(debug_dir, os.F_OK):
        os.mkdir(debug_dir)

    # define the learning rate decay parameters
    # for more information, refer to the function tf.train.exponential_decay()
    learning_rate = 0.01
    learning_rate_decay_factor = 0.95
    # this value will be changed based on the train_size and batch size
    learning_rate_decay_steps = 400
    learning_rate_staircase = True
    # momentum
    momentum = 0.9

    # load training dataset
    dataLoader = DataLoader()
    if train_type == 1:
        # load train data from mrc file dir
        train_number = int(train_number)
        train_data, train_label, eval_data, eval_label = dataLoader.load_trainData_From_mrcFileDir(
            train_inputDir, particle_size, model_input_size, validation_ratio,
            coordinate_symbol, mrc_number, train_number)
    elif train_type == 2:
        # load train data from numpy data struct
        train_number = int(train_number)
        train_data, train_label, eval_data, eval_label = dataLoader.load_trainData_From_ExtractedDataFile(
            train_inputDir, train_inputFile, model_input_size, validation_ratio, train_number)
    elif train_type == 3:
        # load train data from relion .star file
        train_number = int(train_number)
        train_data, train_label, eval_data, eval_label = dataLoader.load_trainData_From_RelionStarFile(
            train_inputFile, particle_size, model_input_size, validation_ratio, train_number)
    elif train_type == 4:
        # load train data from prepicked results
        train_data, train_label, eval_data, eval_label = dataLoader.load_trainData_From_PrePickedResults(
            train_inputDir, train_inputFile, particle_size, model_input_size, validation_ratio, train_number)
    else:
        print("ERROR: invalid value of train_type:", train_type)

    display.show_particle(train_data, os.path.join(debug_dir, 'positive.png'))

    # test whether train_data exists
    try:
        train_data
    except NameError:
        print("ERROR: in function load.loadInputTrainData.")
        return None
    else:
        print("Load training data successfully!")

    # shuffle the training data
    train_data, train_label = shuffle_in_unison_inplace(train_data, train_label)
    eval_data, eval_label = shuffle_in_unison_inplace(eval_data, eval_label)
    train_size = train_data.shape[0]
    eval_size = eval_data.shape[0]

    # initialize the decay_steps based on train_size and batch size
    # change the learning rate each 2 epochs
    learning_rate_decay_steps = 10*(train_size // batch_size)

    # initialize the parameters of deepModel
    deepModel = DeepModel(particle_size, model_input_size, num_class)
    deepModel.init_learning_rate(learning_rate=learning_rate,
                                 learning_rate_decay_factor=learning_rate_decay_factor,
                                 decay_steps=learning_rate_decay_steps,
                                 staircase=learning_rate_staircase)
    deepModel.init_momentum(momentum=momentum)

    # initialize the model
    # define the computation procedure of optimizer, loss, lr, prediction, eval_prediction
    deepModel.init_model_graph_train()
    saver = tf.train.Saver(tf.all_variables())

    start_time = time.time()
    init = tf.initialize_all_variables()
    with tf.Session(config=tf.ConfigProto(log_device_placement=False)) as sess:
        # initialize all the parameters
        sess.run(init)

        max_epochs = 200              # the max number of epochs to train the model
        best_eval_error_rate = 100
        toleration_patience = 10
        toleration_patience_flag = 0
        eval_frequency = train_size // batch_size   # the frequency to evaluate the evaluation dataset
        for step in xrange(int(max_epochs * train_size) // batch_size):
            # get the batch training data
            offset = (step * batch_size) % (train_size - batch_size)
            batch_data = train_data[offset:(offset+batch_size), ...]
            batch_label = train_label[offset:(offset+batch_size)]
            # online augmentation
            #batch_data = DataLoader.preprocess_particle_online(batch_data)

            # do the computation
            loss_value, lr, train_prediction = deepModel.train_batch(batch_data, batch_label, sess)

            if step % eval_frequency == 0:
                stop_time = time.time() - start_time
                start_time = time.time()
                eval_prediction = deepModel.evaluation(eval_data, sess)
                eval_error_rate = error_rate(eval_prediction, eval_label)
                print('epoch: %.2f , %.2f ms' % (step * batch_size / train_size, 1000 * stop_time / eval_frequency))
                print('train loss: %.6f,\t learning rate: %.6f' % (loss_value, lr))
                print('train error: %.6f%%,\t valid error: %.6f%%' % (error_rate(train_prediction, batch_label), eval_error_rate))
                if eval_error_rate < best_eval_error_rate:
                    best_eval_error_rate = eval_error_rate
                    toleration_patience = 10
                else:
                    toleration_patience = toleration_patience - 1
                if toleration_patience == 0:
                    saver.save(sess, model_save_file)
                    break
def test_my_own_png(self):
    # load the mnist test data CSV file into a list
    test_data_list = DataLoader.load_my_data()
    self.__test_png(self.n, test_data_list)
args = get_args()
setup_seed(args.seed)
device = args.device
checkpoint_dir = args.checkpoint_dir
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)

data = np.load(args.dataset_path)
model = net().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

dataLoader = DataLoader(data['X'],
                        data['Y'],
                        train_val_split=[0.7, 0.15, 0.15],
                        batch_size=args.batch_size,
                        device=device)
loader_train, loader_val, loader_test = dataLoader.get_loader()

trainer = Trainer(model, optimizer)
loss_fn = torch.nn.functional.cross_entropy
trainer.train_with_val(loss_fn,
                       loader_train=loader_train,
                       loader_val=loader_val,
                       epochs=args.epochs,
                       save_path=checkpoint_dir + 'model.pth',
                       save_best_only=True,
                       monitor_on='acc')
trainer.test(loader_test, loss_fn, info='Test ')
def learn_test(expr):
    loader = DataLoader()
    dataset = loader.loadData(dataset=expr)  # dataset options: electricity, traffic, BLE
    pastObserve = pastObserves[expr]
    o_columns = dataset.columns
    predCol = o_columns[-1]
    lenAll = len(dataset)
    lenx = int(lenAll * .75)

    test_orig = []
    mean_errors = []
    error_stds = []
    all_errors = []
    all_predictions = []

    values = dataset.values
    origData = values
    # normalize
    parameters = dataset.values.shape[1]
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled = scaler.fit_transform(values)
    reframed = series_to_supervised(scaled, pastObserve, 1)

    # drop columns we don't want to predict
    droppings = []
    for i in range(1, pastObserve + 1):
        x = [a for a in range(parameters * (i - 1), parameters * i - 1)]
        droppings.extend(x)
    reframed.drop(reframed.columns[droppings], axis=1, inplace=True)

    valuesTrans = reframed.values
    test = valuesTrans
    # split into input and outputs
    train_X_all, train_y_all = valuesTrans[:, :-1], valuesTrans[:, -1]
    test_X, test_y = test[:, :-1], test[:, -1]

    trainingModels = []
    for i in range(modelsNo):
        deepModel = create_model(parameters, pastObserve)
        trainingModels.append(deepModel)

    dy = 0
    sparsity = 3
    for model in trainingModels:
        # fit network
        partsLen = int(len(train_X_all) / sparsity) * sparsity
        a = np.arange(partsLen)
        a = a.reshape(sparsity, int(partsLen / sparsity))
        ixs = []
        # just consider part of the dataset, not all of it
        for t in range(sparsity):
            if (t == dy):
                ixs.append(a[t])
                # ixs.append(a[t+1])  # for considering 40% sparsity
                # ixs.append(a[t+2])  # for considering 60% sparsity
        ixs = np.array(ixs)
        train_ixs = ixs.flatten()
        train_X, train_y = train_X_all[train_ixs], train_y_all[train_ixs]
        model.fit(train_X, train_y, epochs=20, batch_size=20, verbose=2)
        dy += 1

        # calculate predictions
        predictions = model.predict(test_X)
        predictions = predictions.reshape((len(predictions), 1))
        pads = np.zeros(len(test_y) * (parameters - 1))
        pads = pads.reshape(len(test_y), parameters - 1)
        inv_yhat = concatenate((pads, predictions), axis=1)
        inv_yhat = scaler.inverse_transform(inv_yhat)
        inv_yhat = inv_yhat[:, -1]
        inv_yhat = np.around(inv_yhat, decimals=2)

        # invert scaling for actual
        test_y = test_y.reshape((len(test_y), 1))
        inv_test = concatenate((test_X[:, pastObserve:], test_y), axis=1)
        test_orig = scaler.inverse_transform(inv_test)
        origY = test_orig[:, -1]

        meanErr, std, errors = report_errors(origY, inv_yhat, errorType[expr])
        mean_errors.append(meanErr)
        error_stds.append(std)
        all_errors.append(errors)
        all_predictions.append(inv_yhat)
        print(min(origY), max(origY))
        print(min(inv_yhat), max(inv_yhat))
        print('Test Mean Error: %.3f ' % meanErr)

    p_cols = []
    df = DataFrame(test_orig, columns=o_columns)
    for k in range(len(all_predictions)):
        colName = 'predict_' + str(k + 1)
        p_cols.append(colName)
        df[colName] = all_predictions[k]
    for k in range(len(all_predictions)):
        errName = 'error_' + str(k + 1)
        df[errName] = all_errors[k]

    print(errorType[expr])
    print(mean_errors)

    if not os.path.exists(models_output_folder):
        os.makedirs(models_output_folder)
    outDetails_filename = models_output_folder + 'predictions_details_%s.csv' % expr
    out_filename = models_output_folder + 'predictions_output_%s.csv' % expr
    df.to_csv(outDetails_filename, index=False)

    models_prediction_cols = p_cols
    models_prediction_cols.append(predCol)
    df_modelOutput = df[models_prediction_cols]
    df_modelOutput.to_csv(out_filename, index=False)
def starcraft_svm_test():
    # Create DataLoader instance to load and format data
    dataLoader = DataLoader()
    logging.info("Program started")
    logging.info("Loading starcraft data")

    # Read skillcraft dataset, the class index is the second column
    dataLoader.read(filename="data/SkillCraft1_Dataset.csv", classIndex=1, numOfFeatures=15)
    multi_label_count = dataLoader.labelCount(8)

    # Creates plots for a few of the data features
    # dataLoader.visualize()

    # Normalize data values from 0 - 1
    # dataLoader.normalize()

    # Create new labels to fit into binary classification
    dataLoader.scaleToBinary(5)
    label_count = dataLoader.binaryLabelCount(5)
    logging.info("Number of examples per class")
    logging.info("Casual - (1): " + str(label_count[0]))
    logging.info("Hardcore - (-1): " + str(label_count[1]))

    label_count = dataLoader.labelCount(8)
    logDataCount(label_count)

    """
    # Create SVM
    svm = SVM()

    # Train and predict for binary svm
    logging.info("Running SVM for binary classification")

    # Train for binary single run with these objects
    logging.info("Single binary SVM")
    svm.train(dataLoader.x_train, dataLoader.y_train, dataLoader.x_test, dataLoader.y_test)

    # Train and test binary svm multiple times for all available binary variables
    logging.info("Multiple runs with different parameters - binary SVM")
    svm.train(dataLoader.x_train, dataLoader.y_train, dataLoader.x_test, dataLoader.y_test, iterate=True)

    # Save binary results to excel sheet
    logging.info("Saving binary SVM results")
    svm.results.to_excel(writer_starcraft, sheet_name='binary-svm')

    # MULTI CLASS SVM
    logging.info("Running SVM for multiclass classification")

    # Train and predict for multi-class data using the linear svm from liblinear implementation
    logging.info("Running SVM for multiclass classification with liblinear implementation")
    svm.train(dataLoader.x_train, dataLoader.multi_y_train, dataLoader.x_test, dataLoader.multi_y_test, binary=False)
    logging.info("Saving multiclass liblinear results")
    svm.results.to_excel(writer_starcraft, sheet_name='multiclass-liblinear')

    # Train for multi-class single run with these objects using the libsvm implementation
    logging.info("Running SVM for multiclass classification with libsvm implementation")
    svm.train(dataLoader.x_train, dataLoader.multi_y_train, dataLoader.x_test, dataLoader.multi_y_test, binary=False, linear=False)
    logging.info("Saving multiclass libsvm results")
    svm.results.to_excel(writer_starcraft, sheet_name='multiclass-libsvm')

    # Train and test multi-class svm multiple times for all available multi-class variables
    logging.info("Running SVM for multiclass classification for all available multi-class variables")
    svm.train(dataLoader.x_train, dataLoader.multi_y_train, dataLoader.x_test, dataLoader.multi_y_test, iterate=True, binary=False)
    logging.info("Saving multiclass multiple-runs results")
    svm.results.to_excel(writer_starcraft, sheet_name='multiclass-multiple-variables')

    # Train and test multi-class svm multiple times with KPCA-LDA
    logging.info("Running SVM for multiclass classification with KPCA-LDA")
    svm.train(dataLoader.x_train, dataLoader.multi_y_train, dataLoader.x_test, dataLoader.multi_y_test, iterate=True, binary=False, decomposition=True)
    logging.info("Saving multiclass multiple-runs results")
    svm.results.to_excel(writer_starcraft, sheet_name='multiclass-kpca-lda')

    # KNN and NC
    nearest(dataLoader.x_train, dataLoader.y_train, dataLoader.x_test, dataLoader.y_test, dataLoader.multi_y_train, dataLoader.multi_y_test, writer_starcraft)
    """

    clustering(dataLoader.x_train, dataLoader.y_train, dataLoader.x_test, dataLoader.y_test)

    # Write all the results
    writer_starcraft.save()
        update_checkpoint_link([('epoch_%d.pt' % best_epoch, 'best.pt'),
                                ('epoch_%d.pt' % epoch, 'last.pt')])
        epoch += 1

    cost_time = time.time() - since
    print('Training complete in {:.0f}h{:.0f}m{:.0f}s'.format(
        (cost_time // 60) // 60, (cost_time // 60) % 60, cost_time % 60))
    return model, cost_time, best_acc, best_train_acc


if __name__ == '__main__':
    loader = DataLoader(args.dataset, batch_size=args.batch_size, seed=args.seed)
    dataloaders, dataset_sizes = loader.load_data(args.img_size)
    num_classes = 10
    if args.dataset == 'cifar-10':
        num_classes = 10
    if args.dataset == 'cifar-100':
        num_classes = 100
    if args.dataset == 'VOCpart':
        num_classes = len(dataloaders['train'].dataset.classes)
        assert args.img_size == 128, 'only supports --img_size 128'
    model = resnet_std(depth=args.depth,
                       num_classes=num_classes,
                       ifmask=args.ifmask,
unique_name = 'stack_2406_2x_SumCorr_movie_DW'
#unique_name = 'stack_3025_2x_SumCorr_movie_DW'

coordinates = []
class_number = []
starfile = os.path.join(basepath, unique_name + new + '.star')
with open(starfile) as fin:
    idx = 0
    for l in fin:
        idx += 1
        if idx <= 5 or l.strip() == '':
            continue
        t = map(float, l.strip().split())
        coordinates.append([int(t[0]), int(t[1])])
        class_number.append(int(t[2]))

plot = 'test_plot_%d' % peek_cls + new + '.png'
filename = os.path.join(mrcpath, unique_name + '.mrc')
header, body = DataLoader.readMrcFile(filename)
n_col = header[0]
n_row = header[1]
print n_col, n_row

body_2d = np.array(body, dtype=np.float32).reshape(n_row, n_col, 1)
body_2d, bin_size = DataLoader.preprocess_micrograph(body_2d)

coordinates = np.array(coordinates)
coordinates = coordinates / bin_size
plot_circle_in_micrograph(body_2d, coordinates, class_number, 180 / bin_size, plot, color='white')
def analysis_pick_results(pick_results_file, reference_coordinate_dir, reference_coordinate_symbol,
                          particle_size, minimum_distance_rate):
    """Load the picking results from a binary file and compare them with the reference coordinates.

    This function analyses the picking results against the reference coordinates and
    calculates the recall, the precision and the deviation from the center.

    Args:
        pick_results_file: string, the file name of the pre-picked results.
        reference_coordinate_dir: string, the directory of the reference coordinate files.
        reference_coordinate_symbol: the symbol of the coordinate file, like '_manualpick'.
        particle_size: int, the size of the particle.
        minimum_distance_rate: float, default 0.2. A picked coordinate is considered a true
            positive only when the distance between the picked coordinate and the reference
            coordinate is less than minimum_distance_rate multiplied by particle_size.
    """
    with open(pick_results_file, 'rb') as f:
        coordinate = pickle.load(f)
    """
    coordinate: a list whose length is the number of picked micrograph files.
        Each element is itself a list containing all coordinates from the same
        micrograph; its length is the number of particles. Each particle entry
        is a small list of length 4:
            [0] the x-axis coordinate,
            [1] the y-axis coordinate,
            [2] the prediction score,
            [3] the micrograph name.
    """
    tp = 0
    total_pick = 0
    total_reference = 0
    coordinate_total = []
    for i in range(len(coordinate)):
        mrc_filename = os.path.basename(coordinate[i][0][3])
        #print(mrc_filename)
        reference_coordinate_file = mrc_filename.replace('.mrc', reference_coordinate_symbol+'.star')
        reference_coordinate_file = os.path.join(reference_coordinate_dir, reference_coordinate_file)
        #print(reference_coordinate_file)
        if os.path.isfile(reference_coordinate_file):
            reference_coordinate = DataLoader.read_coordinate_from_star(reference_coordinate_file)
            """
            reference_coordinate: a list whose length is the number of picked particles.
                Each element is a small list of length 2: the x-axis coordinate and the
                y-axis coordinate.
            """
            tp_sigle, average_distance = AutoPicker.calculate_tp(coordinate[i], reference_coordinate,
                                                                 particle_size*minimum_distance_rate)
            #print("tp:", tp_sigle)
            #print("average_distance:", average_distance)
            # calculate the number of true positives when the threshold is set to 0.5
            tp_sigle = 0
            total_reference = total_reference + len(reference_coordinate)
            for j in range(len(coordinate[i])):
                coordinate_total.append(coordinate[i][j])
                if coordinate[i][j][2] > 0.5:
                    total_pick = total_pick + 1
                    if coordinate[i][j][4] == 1:
                        tp = tp + 1
                        tp_sigle = tp_sigle + 1
            print(tp_sigle/len(reference_coordinate))
        else:
            print("Can not find the reference coordinate:" + reference_coordinate_file)
    precision = tp/total_pick
    recall = tp/total_reference
    print("(threshold 0.5)precision:%f recall:%f" % (precision, recall))

    # sort the coordinates based on the prediction score in descending order
    coordinate_total = sorted(coordinate_total, key=itemgetter(2), reverse=True)
    total_tp = []
    total_recall = []
    total_precision = []
    total_probability = []
    total_average_distance = []
    total_distance = 0
    tp_tem = 0
    for i in range(len(coordinate_total)):
        if coordinate_total[i][4] == 1:
            tp_tem = tp_tem + 1
            total_distance = total_distance + coordinate_total[i][5]
        precision = tp_tem/(i+1)
        recall = tp_tem/total_reference
        total_tp.append(tp_tem)
        total_recall.append(recall)
        total_precision.append(precision)
        total_probability.append(coordinate_total[i][2])
        if tp_tem == 0:
            average_distance = 0
        else:
            average_distance = total_distance/tp_tem
        total_average_distance.append(average_distance)

    # write the list results to a file
    directory_pick = os.path.dirname(pick_results_file)
    total_results_file = os.path.join(directory_pick, 'results.txt')
    f = open(total_results_file, 'w')
    # write total_tp
    f.write(','.join(map(str, total_tp))+'\n')
    f.write(','.join(map(str, total_recall))+'\n')
    f.write(','.join(map(str, total_precision))+'\n')
    f.write(','.join(map(str, total_probability))+'\n')
    f.write(','.join(map(str, total_average_distance))+'\n')
    f.write('#total autopick number:%d\n' % (len(coordinate_total)))
    f.write('#total manual pick number:%d\n' % (total_reference))
    f.write('#the first row is the number of true positives\n')
    f.write('#the second row is the recall\n')
    f.write('#the third row is the precision\n')
    f.write('#the fourth row is the probability\n')
    f.write('#the fifth row is the distance\n')
    # show the recall and precision
    times_of_manual = len(coordinate_total)//total_reference + 1
    for i in range(times_of_manual):
        print('autopick_total sort, take the head number of total_manualpick * ratio %d' % (i+1))
        f.write('#autopick_total sort, take the head number of total_manualpick * ratio %d \n' % (i+1))
        if i == times_of_manual-1:
            print('precision:%f \trecall:%f' % (total_precision[-1], total_recall[-1]))
            f.write('precision:%f \trecall:%f \n' % (total_precision[-1], total_recall[-1]))
        else:
            print('precision:%f \trecall:%f' % (total_precision[(i+1)*total_reference-1], total_recall[(i+1)*total_reference-1]))
            f.write('precision:%f \trecall:%f \n' % (total_precision[(i+1)*total_reference-1], total_recall[(i+1)*total_reference-1]))
    f.close()
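# Hypothetical invocation of analysis_pick_results(): the pickle path, reference
# directory and '_manualpick' symbol are illustrative only; particle_size and the
# 0.2 distance rate follow the defaults described in the docstring above.
analysis_pick_results(pick_results_file="../pick_results/autopick_results.pickle",
                      reference_coordinate_dir="../micrographs/",
                      reference_coordinate_symbol="_manualpick",
                      particle_size=180,
                      minimum_distance_rate=0.2)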
def train(self):
    training_data = DataLoader.load_nmist_train_data()
    self.__train(self.n, training_data)
def Run_SRNN_NormalCase(args, no_dataset):
    data_path, graph_path = Data_path(no_dataset)
    log_path = Log_path(no_dataset)

    # Construct the DataLoader object that loads data
    dataloader = DataLoader(args)
    dataloader.load_data(data_path)

    # Construct the ST-graph object that reads graph
    stgraph = ST_GRAPH(args)
    stgraph.readGraph(dataloader.num_sensor, graph_path)

    # Initialize net
    net = SRNN(args)
    net.setStgraph(stgraph)

    print('- Number of trainable parameters:',
          sum(p.numel() for p in net.parameters() if p.requires_grad))

    # optimizer = torch.optim.Adam(net.parameters(), lr=args.learning_rate)
    # optimizer = torch.optim.RMSprop(net.parameters(), lr=args.learning_rate, momentum=0.0001, centered=True)
    optimizer = torch.optim.Adagrad(net.parameters())

    best_eval_loss = 10000
    best_epoch = 0

    print('')
    print('---- Train and Evaluation ----')
    eval_loss_res = np.zeros((args.num_epochs + 1, 2))
    for e in range(args.num_epochs):
        epoch = e + 1

        #### Training ####
        print('-- Training, epoch {}/{}'.format(epoch, args.num_epochs))
        loss_epoch = 0

        # For each batch
        for b in range(dataloader.num_batches_train):
            batch = b + 1
            start = time.time()

            # Get batch data
            x = dataloader.next_batch_train()

            # Loss for this batch
            loss_batch = 0

            # For each sequence in the batch
            for sequence in range(dataloader.batch_size):
                # put node and edge features
                stgraph.putSequenceData(x[sequence])

                # get data to feed
                data_nodes, data_temporalEdges, data_spatialEdges = stgraph.getSequenceData()

                # put a sequence to net
                loss_output, data_nodes, outputs = forward(net, optimizer, args, stgraph,
                                                           data_nodes, data_temporalEdges,
                                                           data_spatialEdges)
                loss_output.backward()
                loss_batch += loss_RMSE(data_nodes[-1], outputs[-1], dataloader.scaler)

                # Clip gradients
                torch.nn.utils.clip_grad_norm_(net.parameters(), args.grad_clip)

                # Update parameters
                optimizer.step()

            end = time.time()
            loss_batch = loss_batch / dataloader.batch_size
            loss_epoch += loss_batch

            print('Train: {}/{}, train_loss = {:.3f}, time/batch = {:.3f}'.format(
                e * dataloader.num_batches_train + batch,
                args.num_epochs * dataloader.num_batches_train,
                loss_batch, end - start))

        # Compute loss for the entire epoch
        loss_epoch /= dataloader.num_batches_train
        print('(epoch {}), train_loss = {:.3f}'.format(epoch, loss_epoch))

        # Save the model after each epoch
        save_path = Save_path(no_dataset, epoch)
        print('Saving model to ' + save_path)
        torch.save(
            {
                'epoch': epoch,
                'state_dict': net.state_dict(),
                'optimizer_state_dict': optimizer.state_dict()
            }, save_path)

        #### Evaluation ####
        print('-- Evaluation, epoch {}/{}'.format(epoch, args.num_epochs))
        loss_epoch = 0
        for b in range(dataloader.num_batches_eval):
            batch = b + 1
            start = time.time()

            # Get batch data
            x = dataloader.next_batch_eval()

            # Loss for this batch
            loss_batch = 0
            for sequence in range(dataloader.batch_size):
                # put node and edge features
                stgraph.putSequenceData(x[sequence])

                # get data to feed
                data_nodes, data_temporalEdges, data_spatialEdges = stgraph.getSequenceData()

                # put a sequence to net
                _, data_nodes, outputs = forward(net, optimizer, args, stgraph,
                                                 data_nodes, data_temporalEdges,
                                                 data_spatialEdges)
                loss_batch += loss_RMSE(data_nodes[-1], outputs[-1], dataloader.scaler)

            end = time.time()
            loss_batch = loss_batch / dataloader.batch_size
            loss_epoch += loss_batch
            print('Eval: {}/{}, eval_loss = {:.3f}, time/batch = {:.3f}'.format(
                e * dataloader.num_batches_eval + batch,
                args.num_epochs * dataloader.num_batches_eval,
                loss_batch, end - start))

        loss_epoch /= dataloader.num_batches_eval
        eval_loss_res[e] = (epoch, loss_epoch)

        # Update best validation loss until now
        if loss_epoch < best_eval_loss:
            best_eval_loss = loss_epoch
            best_epoch = epoch
        print('(epoch {}), eval_loss = {:.3f}'.format(epoch, loss_epoch))

    # Record the best epoch and best validation loss overall
    print('Best epoch: {}, Best evaluation loss {:.3f}'.format(best_epoch, best_eval_loss))
    eval_loss_res[-1] = (best_epoch, best_eval_loss)
    np.savetxt(log_path, eval_loss_res, fmt='%d, %.3f')
    print('- Eval result has been saved in ', log_path)
    print('')
def integrated_benchmark(dataset_path):
    """
    Variables:
        Dataset size: number of columns
        Dataset distribution: column length distribution
        threshold
        query column
    """
    loader = DataLoader("")
    dataset = loader.load_dataset(dataset_path)
    bf_lists, lsh_list = init(dataset)

    print("""
    Benchmark 1
    Goal: Measure scalability of different methods
    Variable: the size of datasets. size: 400, 600, 800, 1000
    Fix: threshold = 0.6
         query column = median col
    Output: Runtime
            precision, recall, f1
    """)
    labels = ["bloom filter", "lsh", "lsh ensemble", "lsh + bloom filter"]
    time_for_each_size = np.empty((len(dataset), len(labels)), dtype=float)
    x_axis = np.empty(len(dataset), dtype=int)
    for i, cols in enumerate(dataset):
        candidate_index = len(cols) // 2  # median col
        brute_force_result = brute_force(candidate_index, cols, 0.6)
        print("brute_force finished\n")
        time = benchmark(cols, candidate_index, 0.6, bf_lists[i], lsh_list[i],
                         brute_force_result, "Benchmark-1-cols-size-" + str(len(cols)))
        time_for_each_size[i] = time
        x_axis[i] = len(cols)
    fig, ax = plt.subplots()
    for i in range(len(labels)):
        ax.plot(x_axis, time_for_each_size[:, i], 'o-', label=labels[i])
    ax.legend()
    ax.set_title("Benchmark-1-cols-size")
    ax.set_xticks(x_axis)
    ax.set_xlabel("size")
    ax.set_ylabel("time(s)")
    fig.tight_layout()
    # plt.show()
    fig.savefig("./bench_results/Benchmark-1-cols-size")

    print("""
    Benchmark 2
    Goal: Measure the effect of threshold
    Variable: threshold: 0.1 0.3 0.5 0.7 0.9
    Fix: dataset size = median col
    Output: Runtime
            precision, recall, f1
    """)
    threshold_list = [0.1, 0.3, 0.5, 0.7, 0.9]
    time_for_each_threshold = np.empty((len(threshold_list), len(labels)), dtype=float)
    x_axis = np.empty(len(threshold_list), dtype=float)
    cols_index = len(dataset) // 2
    cols = dataset[cols_index]
    for i in range(len(threshold_list)):
        threshold = threshold_list[i]
        candidate_index = len(cols) // 2  # median col
        brute_force_result = brute_force(candidate_index, cols, threshold)
        print("brute_force finished\n")
        time = benchmark(cols, candidate_index, threshold, bf_lists[cols_index],
                         lsh_list[cols_index], brute_force_result,
                         "Benchmark-2-threshold-" + str(int(threshold * 100)) + "%")
        time_for_each_threshold[i] = time
        x_axis[i] = threshold
    fig, ax = plt.subplots()
    for i in range(len(labels)):
        ax.plot(x_axis, time_for_each_threshold[:, i], 'o-', label=labels[i])
    ax.legend()
    ax.set_title("Benchmark-2-threshold")
    ax.set_xticks(x_axis)
    ax.set_xlabel("threshold")
    ax.set_ylabel("time(s)")
    fig.tight_layout()
    # plt.show()
    fig.savefig("./bench_results/Benchmark-2-threshold")

    print("""
    Benchmark 3
    Goal: Measure the effect of query column
    Variable: query column = small col, median col, large col
    Fix: dataset size = median size cols
         threshold = 0.6
    Output: Runtime
            precision, recall, f1
    """)
    cols_index = len(dataset) // 2
    cols = dataset[cols_index]
    label = ["small-col", "median-col", "large-col"]
    for i, candidate_index in enumerate([0, len(cols) // 2, len(cols) - 1]):
        brute_force_result = brute_force(candidate_index, cols, 0.6)
        benchmark(cols, candidate_index, 0.6, bf_lists[cols_index], lsh_list[cols_index],
                  brute_force_result, "Benchmark-3-candidate-" + label[i])
def main(pretrain_checkpoint_dir,
         train_summary_writer,
         vocab: Vocab,
         dataloader: DataLoader,
         batch_size: int = 64,
         embedding_dim: int = 256,
         seq_length: int = 3000,
         gen_seq_len: int = 3000,
         gen_rnn_units: int = 1024,
         disc_rnn_units: int = 1024,
         epochs: int = 40000,
         pretrain_epochs: int = 4000,
         learning_rate: float = 1e-4,
         rollout_num: int = 2,
         gen_pretrain: bool = False,
         disc_pretrain: bool = False,
         load_gen_weights: bool = False,
         load_disc_weights: bool = False,
         save_gen_weights: bool = True,
         save_disc_weights: bool = True,
         disc_steps: int = 3):
    gen = Generator(dataloader=dataloader,
                    vocab=vocab,
                    batch_size=batch_size,
                    embedding_dim=embedding_dim,
                    seq_length=seq_length,
                    checkpoint_dir=pretrain_checkpoint_dir,
                    rnn_units=gen_rnn_units,
                    start_token=0,
                    learning_rate=learning_rate)
    if load_gen_weights:
        gen.load_weights()
    if gen_pretrain:
        gen_pre_trainer = GenPretrainer(gen,
                                        dataloader=dataloader,
                                        vocab=vocab,
                                        pretrain_epochs=pretrain_epochs,
                                        tb_writer=train_summary_writer,
                                        learning_rate=learning_rate)
        print('Start pre-training generator...')
        gen_pre_trainer.pretrain(gen_seq_len=gen_seq_len, save_weights=save_gen_weights)

    disc = Discriminator(vocab_size=vocab.vocab_size,
                         embedding_dim=embedding_dim,
                         rnn_units=disc_rnn_units,
                         batch_size=batch_size,
                         checkpoint_dir=pretrain_checkpoint_dir,
                         learning_rate=learning_rate)
    if load_disc_weights:
        disc.load_weights()
    if disc_pretrain:
        disc_pre_trainer = DiscPretrainer(disc,
                                          gen,
                                          dataloader=dataloader,
                                          vocab=vocab,
                                          pretrain_epochs=pretrain_epochs,
                                          tb_writer=train_summary_writer,
                                          learning_rate=learning_rate)
        print('Start pre-training discriminator...')
        disc_pre_trainer.pretrain(save_disc_weights)

    rollout = Rollout(generator=gen,
                      discriminator=disc,
                      vocab=vocab,
                      batch_size=batch_size,
                      seq_length=seq_length,
                      rollout_num=rollout_num)

    with tqdm(desc='Epoch: ', total=epochs, dynamic_ncols=True) as pbar:
        for epoch in range(epochs):
            fake_samples = gen.generate()
            rewards = rollout.get_reward(samples=fake_samples)
            gen_loss = gen.train_step(fake_samples, rewards)
            real_samples, _ = dataloader.get_batch(shuffle=shuffle,
                                                   seq_length=seq_length,
                                                   batch_size=batch_size,
                                                   training=True)
            disc_loss = 0
            for i in range(disc_steps):
                disc_loss += disc.train_step(fake_samples, real_samples) / disc_steps

            with train_summary_writer.as_default():
                tf.summary.scalar('gen_train_loss', gen_loss, step=epoch)
                tf.summary.scalar('disc_train_loss', disc_loss, step=epoch)
                tf.summary.scalar('total_train_loss', disc_loss + gen_loss, step=epoch)
            pbar.set_postfix(gen_train_loss=tf.reduce_mean(gen_loss),
                             disc_train_loss=tf.reduce_mean(disc_loss),
                             total_train_loss=tf.reduce_mean(gen_loss + disc_loss))

            if (epoch + 1) % 5 == 0 or (epoch + 1) == 1:
                print('Saving weights...')
                # save the weights
                gen.model.save_weights(gen.checkpoint_prefix)
                disc.model.save_weights(disc.checkpoint_prefix)
                # gen.model.save('gen.h5')
                # disc.model.save('disc.h5')

                # test the discriminator
                fake_samples = gen.generate(gen_seq_len)
                real_samples = dataloader.get_batch(shuffle=shuffle,
                                                    seq_length=gen_seq_len,
                                                    batch_size=batch_size,
                                                    training=False)
                disc_loss = disc.test_step(fake_samples, real_samples)
                # test the generator
                gen_loss = gen.test_step()
                # compute the BLEU score
                # bleu_score = get_bleu_score(true_seqs=real_samples, genned_seqs=fake_samples)
                genned_sentences = vocab.extract_seqs(fake_samples)
                # print(genned_sentences)
                # print(vocab.idx2char[fake_samples[0]])

                # log the test losses
                with train_summary_writer.as_default():
                    tf.summary.scalar('disc_test_loss', tf.reduce_mean(disc_loss), step=epoch)
                    tf.summary.scalar('gen_test_loss', tf.reduce_mean(gen_loss), step=epoch)
                    # tf.summary.scalar('bleu_score', tf.reduce_mean(bleu_score),
                    #                   step=epoch + gen_pretrain * pretrain_epochs)
            pbar.update()
def train():
    parser = OptionParser()
    parser.add_option("--train_good", dest="train_good",
                      help="Input good particles ", metavar="FILE")
    parser.add_option("--train_bad", dest="train_bad",
                      help="Input bad particles", metavar="FILE")
    parser.add_option("--particle_number", type="int", dest="train_number",
                      help="Number of positive samples to train.", metavar="VALUE", default=-1)
    parser.add_option("--bin_size", type="int", dest="bin_size",
                      help="image size reduction", metavar="VALUE", default=3)
    parser.add_option("--coordinate_symbol", dest="coordinate_symbol",
                      help="The symbol of the coordinate file, like '_manualPick'", metavar="STRING")
    parser.add_option("--particle_size", type="int", dest="particle_size",
                      help="the size of the particle.", metavar="VALUE", default=-1)
    parser.add_option("--validation_ratio", type="float", dest="validation_ratio",
                      help="the ratio.", metavar="VALUE", default=0.1)
    parser.add_option("--model_retrain", action="store_true", dest="model_retrain",
                      help="train the model using the pre-trained model as parameters initialization.",
                      default=False)
    parser.add_option("--model_load_file", dest="model_load_file",
                      help="pre-trained model", metavar="FILE")
    parser.add_option("--logdir", dest="logdir",
                      help="directory of logfiles", metavar="DIRECTORY", default="Logfile")
    parser.add_option("--model_save_file", dest="model_save_file",
                      help="save the model to file", metavar="FILE")
    (opt, args) = parser.parse_args()

    np.random.seed(1234)

    # define the input size of the model
    model_input_size = [100, 64, 64, 1]
    num_classes = 2                     # the number of output classes
    batch_size = model_input_size[0]

    if not os.access(opt.logdir, os.F_OK):
        os.mkdir(opt.logdir)

    # load training dataset
    dataLoader = DataLoader()
    train_data, train_label, eval_data, eval_label = dataLoader.load_trainData_From_RelionStarFile(
        opt.train_good, opt.particle_size, model_input_size, opt.validation_ratio,
        opt.train_number, opt.bin_size)

    # Check if train_data exists
    try:
        train_data
    except NameError:
        print("ERROR: in function load.loadInputTrainData.")
        return None
    else:
        print("Load training data successfully!")

    # shuffle training data
    train_data, train_label = shuffle_in_unison_inplace(train_data, train_label)
    eval_data, eval_label = shuffle_in_unison_inplace(eval_data, eval_label)

    train_x = train_data.reshape(train_data.shape[0], 64, 64, 1)
    test_x = eval_data.reshape(eval_data.shape[0], 64, 64, 1)
    print("shape of training data: ", train_x.shape, test_x.shape)

    train_y = to_categorical(train_label, 2)
    test_y = to_categorical(eval_label, 2)
    print(train_y.shape, test_y.shape)

    datagen = ImageDataGenerator(featurewise_center=True,
                                 featurewise_std_normalization=True,
                                 rotation_range=20,
                                 width_shift_range=0.0,
                                 height_shift_range=0.0,
                                 horizontal_flip=True,
                                 vertical_flip=True)
    datagen.fit(train_x)

    model = Sequential()
    model.add(Conv2D(32, kernel_size=(8, 8), strides=(1, 1), activation='relu',
                     input_shape=(64, 64, 1)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(64, kernel_size=(8, 8), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(128, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(1024, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    for layer in model.layers:
        print(layer.name, layer.output_shape)

    logdir = opt.logdir + '/' + datetime.now().strftime("%Y%m%d-%H%M%S")
    tensorboard_callback = TensorBoard(log_dir=logdir)
    checkpoint = ModelCheckpoint('best_model.h5', monitor='val_acc', verbose=1,
                                 save_best_only=True, period=1)
    reduce_lr_plateau = ReduceLROnPlateau(monitor='val_acc', patience=10, verbose=1)
    callbacks = [checkpoint, reduce_lr_plateau, tensorboard_callback]

    model.compile(optimizer=SGD(0.01), loss="binary_crossentropy", metrics=["accuracy"])
    model.fit_generator(datagen.flow(train_x, train_y, batch_size=batch_size),
                        steps_per_epoch=len(train_x) / 32,
                        epochs=30,
                        validation_data=(test_x, test_y),
                        callbacks=callbacks)
    model.save(opt.model_save_file)

    accuracy = model.evaluate(x=test_x, y=test_y, batch_size=batch_size)
    print("Accuracy:", accuracy[1])
from dataLoader import DataLoader
from crfBrandDetector import CrfBrandDetector

if __name__ == '__main__':
    print('Preparing Data...')
    df = DataLoader().get_data()
    print('Building Model...')
    crf_model = CrfBrandDetector()
    print('Fitting...')
    x_train, x_test, y_train, y_test = crf_model.train_test_split(df)
    crf_model.fit(x_train, y_train)
    crf_model.report_classification(x_test, y_test)
    print('Accuracy: {}'.format(crf_model.evaluate(x_test, y_test)))
    pred = crf_model.predict(x_test)
    pred.to_csv('./pred.csv', index=False)
            'loss': loss,
        }, os.path.join(args.exp_dir, 'unfinished_model.pt'))
        epoch += 1

    cost_time = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(cost_time//60, cost_time % 60))
    print('Best Train Acc is {:.4f}'.format(best_train_acc))
    print('Best Val Acc is {:.4f}'.format(best_acc))
    model.load_state_dict(best_model)
    return model, cost_time, best_acc, best_train_acc


if __name__ == '__main__':
    print('DataSets: ' + args.dataset)
    print('ResNet Depth: ' + str(args.depth))
    loader = DataLoader(args.dataset, batch_size=args.batch_size)
    dataloaders, dataset_sizes = loader.load_data()
    num_classes = 10
    if args.dataset == 'cifar-10':
        num_classes = 10
    if args.dataset == 'cifar-100':
        num_classes = 100
    model = resnet_cifar(depth=args.depth, num_classes=num_classes)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9,
                                nesterov=True, weight_decay=1e-4)
    # define loss and optimizer
    criterion = nn.CrossEntropyLoss()
    scheduler = MultiStepLR(optimizer,
                            milestones=[args.epoch*0.4, args.epoch*0.6, args.epoch*0.8],
                            gamma=0.1)