def main(training_from_scratch, args):
    if training_from_scratch:
        text = open(args.filename, 'rb').read().decode(encoding='utf-8')
        # note that we are replacing the text here
        text, char2idx, idx2char = preprocessing(text, args.checkpoint_dir, args.minocc)
        vocab_size = len(idx2char)
        config = Config(vocab_size, args.epochs)
        model = build_model(config)
    else:
        model = tf.keras.models.load_model(args.checkpoint)
        char2idx = unpickle(args.checkpoint_dir, 'char2idx')
        idx2char = unpickle(args.checkpoint_dir, 'idx2char')
        text = unpickle(args.checkpoint_dir, 'dataset')
        vocab_size = len(idx2char)
        config = Config(vocab_size, args.epochs, args.initepochs)
    text_as_int = np.array([char2idx[c] for c in text])  # works because text is a list of words
    train_model(args.checkpoint_dir, text_as_int, model, config)
def gen_dataset(N, data_path, isMC=True):
    data_path = os.path.abspath(data_path)
    ut.create_dir(data_path)
    if isMC:
        movie_dict = ut.unpickle(os.path.join("data", "movies_single_index.pkl"))
    else:
        movie_dict = ut.unpickle(os.path.join("data", "movies_multi_index.pkl"))  # Need to make this new index thing
    full_ids = list(movie_dict.keys())
    movie_ids = random.sample(full_ids, N)
    ids_l = len(movie_ids)
    shuffle(movie_ids)
    train_ids = movie_ids[:int(ids_l * SPLIT[0])]
    val_ids = movie_ids[int(ids_l * SPLIT[0]):int(ids_l * (SPLIT[0] + SPLIT[1]))]
    test_ids = movie_ids[int(ids_l * (SPLIT[0] + SPLIT[1])):-1]
    print("Generating Training Set...")
    generate_set(train_ids, movie_dict, data_path, "train", isMC)
    print("Generating Validation Set...")
    generate_set(val_ids, movie_dict, data_path, "val", isMC)
    print("Generating Test Set...")
    generate_set(test_ids, movie_dict, data_path, "test", isMC)
def prepare_for_rec(out_dir):
    meta = {}
    meanImg = readAndResize('/database/test.jpg', SIZE, CHANNELS)
    meanImgstd = readAndResize('/database/test.jpg', SIZE_STD, CHANNELS)
    meta['data_mean'] = meanImg
    meta['data_mean_std'] = meanImgstd
    util.pickle(os.path.join(out_dir, "batches.meta"), meta)
    trainingMeta = util.unpickle(os.path.join(out_dir, "trainingMeta.meta"))
    testMeta = util.unpickle(os.path.join(out_dir, "testMeta.meta"))
    print "prepare for training"
    random.shuffle(trainingMeta)
    make_list_batches(trainingMeta, out_dir, NUM_PER_PATCH)
    # for test (original passed trainingMeta here, leaving the shuffled testMeta unused)
    print "prepare for test"
    random.shuffle(testMeta)
    make_list_batches(testMeta, out_dir, NUM_PER_PATCH, 8000)
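# ---------------------------------------------------------------------------
# Note on the pickle helpers used throughout these snippets: most of them call
# a small project-local util module rather than the standard library directly.
# The exact signature varies between projects (a few, such as the first
# snippet, pass a directory plus a name), so the sketch below is only an
# assumption of the common single-path variant; the name pickle_to is
# hypothetical and chosen here only to avoid shadowing the pickle module.
import pickle


def unpickle(filename):
    # load a pickled object (batch dicts, metadata, trained classifiers, ...)
    with open(filename, 'rb') as f:
        return pickle.load(f)


def pickle_to(filename, obj):
    # assumed counterpart of the util.pickle(path, obj) calls in these snippets
    with open(filename, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)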
def __init__(self):
    # self.classifier = util.unpickle(os.path.dirname(os.path.realpath(__file__)) + "/cl_gun.txt")
    # self.featurenum = util.unpickle(os.path.dirname(os.path.realpath(__file__)) + "/featurenum_gun.txt")
    print "Learner initialized for presidential_elections"
    self.classifier = util.unpickle(os.path.dirname(os.path.realpath(__file__)) + "/cl_immigration.txt")
    self.featurenum = util.unpickle(os.path.dirname(os.path.realpath(__file__)) + "/featurenum_immigration.txt")
    print "Learner initialized"
def __init__(self, data_file, root_path, data_mode="train", random_transform=False,
             batch_size=128, crop_width=224, crop_height=224):
    # read image-name image-index map from file
    self.data = unpickle(data_file)
    self.num_classes = len(self.data['index_map'])
    self.data_mode = data_mode
    self.random_transform = random_transform
    self.root_path = root_path
    if data_mode == "all":
        index_map = self.data['index_map']
    elif data_mode == "val":
        index_map = self.data['index_map_val']
    elif data_mode == "train":
        index_map = self.data['index_map_train']
    else:
        print "data_mode: " + data_mode + " not valid"
        import pdb
        pdb.set_trace()
        sys.exit(1)
    # get batch queue
    self.batch_queue = []
    has_add = True
    while has_add:
        has_add = False
        for i in range(self.num_classes):
            if len(index_map[i]) > 0:
                index = index_map[i].pop()
                self.batch_queue.append(index)
                has_add = True
    self.num_images = len(self.batch_queue)
    # init current index and batch size
    self.batch_size = batch_size
    self.prev_batch_size = batch_size
    self.crop_width = crop_width
    self.crop_height = crop_height
    self.batch_index = 1
    self.epoch = 1
    # read data mean from file
    data_mean_file = unpickle(data_file + MEAN_FILE_EXT)
    self.data_mean = data_mean_file['data']
def prepareTrain(folderCls, imgStdCls, meanImg_dir, out_dir):
    global NUM_PER_BATCH
    meanImg = util.unpickle(meanImg_dir + '/meanImg')
    meanImgStd = util.unpickle(meanImg_dir + '/meanImgStd')
    allImgMeta = collectAndShuffle(folderCls, imgStdCls)
    meta = {}
    meta['data_mean'] = meanImg
    meta['data_mean_std'] = meanImgStd
    util.pickle(os.path.join(out_dir, "batches.meta"), meta)
    makeBatches(allImgMeta, out_dir, NUM_PER_BATCH)
    out_file = out_dir + "/imglist"
    util.pickle(out_file, allImgMeta)
def prepareTrain(train_dir, stdImgfolder, out_dir, meanImg_dir, startIdx):
    global NUM_PER_BATCH
    meanImg = util.unpickle(meanImg_dir + '/meanImg')
    meanImgStd = util.unpickle(meanImg_dir + '/meanImgStd')
    allImgMeta, allLabels = collectAndShuffle(train_dir, stdImgfolder)
    meta = {}
    meta['data_mean'] = meanImg
    meta['data_mean_std'] = meanImgStd
    util.pickle(os.path.join(out_dir, "batches.meta"), meta)
    makeBatches(allImgMeta, out_dir, NUM_PER_BATCH, startIdx)
    out_file = out_dir + "/imglist"
    util.pickle(out_file, allImgMeta)
def processTest(test_list, out_dir, startIdx):
    global NUM_PER_PATCH
    meta = util.unpickle(os.path.join(out_dir, "batches.meta"))
    allLabels = meta['label_names']
    fileList = open(test_list, 'rb').readlines()
    random.shuffle(fileList)
    print "####### Got %d classes ######" % len(allLabels)
    print "####### Got %d images ######" % len(fileList)
    numImg = len(fileList)
    numBatches = numImg / NUM_PER_PATCH
    # the last batch keeps the remainder
    if numImg % NUM_PER_PATCH != 0:
        numBatches += 1
    print 'Going to make %d batches' % numBatches
    for idx_batch in range(numBatches):
        # if idx_batch < numBatches - 2:
        #     continue
        print "### Making the %dth batch ###" % idx_batch
        b_start = NUM_PER_PATCH * idx_batch
        b_end = NUM_PER_PATCH * (idx_batch + 1)
        if idx_batch == numBatches - 1:
            b_start = numImg - NUM_PER_PATCH
            b_end = numImg
        batchMeta = fileList[b_start:b_end]
        data, labels, imgnames = getBatch(batchMeta, allLabels)
        out_fname = os.path.join(out_dir, "data_batch_%04d" % (idx_batch + startIdx))
        print "saving to %s" % out_fname
        util.pickle(out_fname, {'data': data, 'labels': labels, 'images': imgnames})
def processTest(test_dir, out_dir, startIdx):
    global NUM_PER_PATCH
    out_file = out_dir + "/imglist"
    trsimgMeta, trsLabels = util.unpickle(out_file)
    allImgMeta = []
    allLabels = []
    subdirnames = os.listdir(test_dir)
    list.sort(subdirnames)
    # for classLabel, subdir in enumerate(trsLabels):
    for num, test_sub_dir in enumerate(subdirnames):
        label = trsLabels.index(test_sub_dir)
        imgnames = os.listdir(os.path.join(test_dir, test_sub_dir))
        fullnames = [os.path.join(test_dir, test_sub_dir, name) for name in imgnames]
        meta = zip(fullnames, [label] * len(fullnames))
        allImgMeta += meta
    print "####### Got %d classes ######" % len(subdirnames)
    print "####### Got %d images ######" % len(allImgMeta)
    print "shuffling..."
    random.shuffle(allImgMeta)
    makeBatches(allImgMeta, out_dir, NUM_PER_PATCH, startIdx)
def __init__(self, data_file, mean_file, root_path, data_mode="train",
             batch_index=0, epoch_index=1, random_transform=False,
             batch_size=128, crop_width=224, crop_height=224, buffer_size=2):
    # init data queues
    self.raw_image_queue = Queue(buffer_size)
    self.batch_data_queue = Queue(buffer_size)
    self.stop = False
    print 'Loading data from ' + str(data_file)
    self.data = unpickle(data_file)
    # init read/transform image objects
    self.readImage = ReadImage(self.raw_image_queue, self.data, mean_file, root_path,
                               data_mode, batch_size, batch_index, epoch_index)
    self.processImage = ProcessImage(self.data, self.raw_image_queue, self.batch_data_queue,
                                     crop_width, crop_height, self.readImage.data_mean,
                                     data_mode, random_transform)
def convert_weight_matrix_default(pretrain_file, tf_modelfile):
    print('Loading word embedding...')
    embedding = reduce_word_embedding()
    print('Loading models...')
    d = unpickle(pretrain_file)
    vars = {}
    print('Converting model...')
    for vname in d.keys():
        vars[vname] = d[vname].get_value()
    # slice theano variables
    gWx = vars['encoder_W']   # 620x4800
    gWh = vars['encoder_U']   # 2400x4800
    gb = vars['encoder_b']    # 1x4800
    U = vars['encoder_Ux']    # 2400x2400
    W = vars['encoder_Wx']    # 620x2400
    b = vars['encoder_bx']    # 1x2400
    # arrange to tensorflow format
    gate_w = np.concatenate([gWx, gWh], axis=0)
    gate_b = gb
    candx_w = W
    candu_w = U
    candx_b = b
    weight_dict = {
        'Gates/Linear/Matrix': gate_w,
        'Gates/Linear/Bias': gate_b,
        'CandidateW/Linear/Matrix': candx_w,
        'CandidateW/Linear/Bias': candx_b,
        'CandidateU/Linear/Matrix': candu_w,
        'word_embedding/map': embedding
    }
    print('Saving...')
    np.save(tf_modelfile, weight_dict)
    print('Model converted successfully!')
def __init__(self, data_path, split):
    dataset = util.unpickle(data_path)
    self.data = dataset['data']
    self.class_mapping = dataset['class_mapping']  # Maps raw class to mapped class
    self.inv_class_mapping = {self.class_mapping[c]: c for c in self.class_mapping}
    self.ids = sorted(list(self.data.keys()))
    random.Random(0).shuffle(self.ids)
    self.split = np.array(split) / np.sum(split)
    self.split_sizes = np.floor(self.split * len(self.ids))
    self.split_index = 0
    self.split_starts = np.insert(np.cumsum(self.split_sizes), 0, [0])
    self.split_starts = [int(split_start) for split_start in self.split_starts]
    self.id_sets = []
    for i in range(len(self.split_sizes) - 1):
        self.id_sets.append(set(self.ids[self.split_starts[i]:self.split_starts[i + 1]]))
    np.random.seed(0)
    self.noisy_real = np.random.rand(len(self.ids)) * 0.1 + 0.9
    self.noisy_fake = np.random.rand(len(self.ids)) * 0.1
def makew(name, idx, shape, params=None):
    foo = u.unpickle('/data/t-maoqu/convnets/netsaves/imnet21841aggregated/7.1500')
    # foo = u.unpickle('/home/NORTHAMERICA/t-maoqu/share/net/47.250')
    if name == 'transfer':
        return 0.1 * n.ones((shape[0], shape[1]), n.single)
    for i in foo['model_state']['layers']:
        if i['name'] == name:
            return i['weights'][idx]
def load_features(self):
    features = unpickle(self.parameters["pickle_features"])
    self.x_train = features['x_train']
    self.x_test = features['x_test']
    self.y_train = features['y_train']
    self.y_test = features['y_test']
    self.x_scaler = features['x_scaler']
    self.parameters = features['parameters']
def test():
    from util import unpickle
    import json
    from inference_utils.question_generator_util import SentenceGenerator
    from w2v_answer_encoder import MultiChoiceQuestionManger

    config = MLPConfig()
    model = SequenceMLP(config, phase='test')
    model.build()
    prob = model.prob

    # Load vocabulary
    to_sentence = SentenceGenerator(trainset='trainval')
    # create multiple choice question manager
    mc_manager = MultiChoiceQuestionManger(subset='trainval', answer_coding='sequence')

    sess = tf.Session()
    # Load model
    ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
    checkpoint_path = ckpt.model_checkpoint_path
    saver = tf.train.Saver()
    saver.restore(sess, checkpoint_path)

    # get data
    result = []
    dataset = unpickle('data/rescore_dev.pkl')
    for itr, datum in enumerate(dataset):
        seq_index, att_mask, label = _process_datum(datum)
        quest_id = datum['quest_id']
        quest = seq_index[0].tolist()
        feed_dict = model.fill_feed_dict([seq_index, att_mask])
        scores = sess.run(prob, feed_dict=feed_dict)
        idx = scores.argmax()
        # parse question and answer
        question = to_sentence.index_to_question([0] + quest)
        mc_ans = mc_manager.get_candidate_answers(quest_id)
        vaq_answer = mc_ans[idx]
        real_answer = mc_ans[label.argmax()]
        # add result
        result.append({u'answer': vaq_answer, u'question_id': quest_id})
        # show results
        if itr % 100 == 0:
            print('============== %d ============' % itr)
            print('question id: %d' % quest_id)
            print('question\t: %s' % question)
            print('answer\t: %s' % real_answer)
            print('VAQ answer\t: %s (%0.2f)' % (vaq_answer, scores[idx]))
    quest_ids = [res[u'question_id'] for res in result]

    # save results
    tf.logging.info('Saving results')
    res_file = 'result/rescore_dev_dev.json'
    json.dump(result, open(res_file, 'w'))
    from vqa_eval import evaluate_model
    acc = evaluate_model(res_file, quest_ids)
    print('Overall accuracy: %0.2f' % acc)
def main():
    num_args = len(sys.argv)
    # load result from file
    num_nets = num_args - 1
    assert num_nets > 0
    errors = []
    # 0th net
    # result['labels']
    # result['preds']
    result = unpickle(sys.argv[1])
    errors.append(evaluate_result(result, sys.argv[1]))
    num_batches = len(result['labels'])
    # import pdb; pdb.set_trace()
    # collect all results
    for ii in range(num_nets - 1):
        result_ii = unpickle(sys.argv[ii + 2])
        # evaluate result_ii
        errors.append(evaluate_result(result_ii, sys.argv[ii + 2]))
        # check num of batches is consistent
        num_batches_ii = len(result_ii['labels'])
        for jj in range(num_batches):
            # check label is consistent
            assert np.array_equal(result_ii['labels'][jj], result['labels'][jj])
            # nc result['pred'][jj]
            result['preds'][jj] += result_ii['preds'][jj]
    pickle('combine_result', result)
    # classifier mean/std accuracy
    errors = np.array(errors)
    # import pdb; pdb.set_trace()
    print "mean: ", str(100 * np.mean(errors)), " std: ", str(100 * np.std(errors))
    # evaluate combined result
    evaluate_result(result, "After combine")
def main(word_level, path_to_model):
    """Run the bot."""
    global update_id
    global lang_model
    model = tf.keras.models.load_model(os.path.join(path_to_model, "checkpoint_release.h5"))
    char2idx = unpickle(path_to_model, 'char2idx')
    idx2char = unpickle(path_to_model, 'idx2char')
    if word_level:
        lang_model = WordLanguageModel(model, char2idx, idx2char)
    else:
        lang_model = CharLanguageModel(model, char2idx, idx2char)

    # Telegram Bot Authorization Token
    bot = telegram.Bot(open("access_token.txt", 'r').read())

    # get the first pending update_id, this is so we can skip over it in case
    # we get an "Unauthorized" exception.
    try:
        update_id = bot.get_updates()[0].update_id
    except IndexError:
        update_id = None

    logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    while True:
        try:
            echo(bot)
        except NetworkError:
            sleep(1)
        except Unauthorized:
            # The user has removed or blocked the bot.
            update_id += 1
def recvServer(self):
    try:
        msg = self.sub_socket.recv(flags=zmq.core.NOBLOCK)
    except zmq.core.error.ZMQError:
        pass
    else:
        sep = msg.find(':')
        to = msg[0:sep]
        foreign = util.unpickle(msg[sep + 1:])
        for n, obj in foreign.items():
            if n == self.p1.mini.name:
                del foreign[n]
        self.updatePlayers(foreign)
def prepareTrain(train_dir, out_dir, meanImg_dir):
    global NUM_PER_PATCH
    # train_dir = "/data1/LSVRC2010/train"
    # out_dir = "/data2/ILSVRC2010/train_batches"
    meanImg = util.unpickle(meanImg_dir + '/meanImg')
    allImgMeta, allLabels = collectAndShuffle(train_dir)
    meta = {}
    meta['data_mean'] = meanImg
    meta['label_names'] = allLabels
    util.pickle(os.path.join(out_dir, "batches.meta"), meta)
    makeBatches(allImgMeta, out_dir, NUM_PER_PATCH)
    out_file = out_dir + "/imglist"
    util.pickle(out_file, [allImgMeta, allLabels])
def weight_from_checkpoint(path, name, idx, shape):
    checkpoint = util.unpickle(path)
    layers = checkpoint['model_state']['layers']
    layer_names = [layer['name'] for layer in layers]
    match = [i for i, layer_name in enumerate(layer_names) if layer_name == name]
    if len(match) > 1:
        raise Exception('More than one matching layer found.')
    if len(match) < 1:
        raise Exception('No matching layer found.')
    weights = layers[match[0]]['weights'][idx]
    if weights.shape != shape:
        print 'Shape mismatch:'
        print 'yours:', shape
        print 'mine:', weights.shape
    return weights
def getMeanImg(raw_dir):
    global SIZE, CHANNELS
    flist = os.listdir(raw_dir)
    list.sort(flist)
    globalSum = np.zeros(SIZE * SIZE * CHANNELS, dtype=np.float64)
    globalCount = 0
    raw_info = []
    for label, fname in enumerate(flist):
        print "Reading", fname
        data = util.unpickle(raw_dir + '/' + fname)
        dataSum = np.sum(data, axis=0, dtype=np.float64)
        globalSum += dataSum
        globalCount += data.shape[0]
        raw_info.append((fname, label, data.shape[0]))  # (name of the label, label value, number of images)
    meanImg = globalSum / globalCount
    return meanImg, raw_info
def __init__(self, root, json, vocab, transform=None):
    """Set the path for images, captions and vocabulary wrapper.

    Args:
        root: image directory.
        json: coco annotation file path.
        vocab: vocabulary wrapper.
        transform: image transformer.
    """
    self.root = root
    self.coco = COCO(json)
    self.ids = list(self.coco.anns.keys())
    self.vocab = vocab
    self.transform = transform
    # self.tokenized_text_list = unpickle('tokenized_bokete_text.pkl')
    self.tokenized_text_list = unpickle('tokenized_text_list_mecab.pkl')
def get_next_batch(self):
    if self.show:
        epoch, batchnum, bidx = LabeledDataProvider.get_next_batch(self)
        datadic = unpickle(self.data_dir + "data_batch_" + str(batchnum))
        datadic['data'] = n.require(datadic['data'] / 256., requirements='C', dtype=n.single)
        datadic['labels'] = n.require(datadic['labels'].reshape(1, datadic['data'].shape[1]),
                                      dtype=n.single, requirements='C')
        return epoch, batchnum, [datadic['data'], datadic['labels']]
    else:
        epoch, batchnum, bidx = LabeledDataProvider.get_next_batch(self)
        if self.test:
            datadic = self.data.get_batch(bidx + len(self.data.train_batch_range))
        else:
            datadic = self.data.get_batch(bidx)
        datadic['data'] = n.require(datadic['data'] / 256., requirements='C', dtype=n.single)
        datadic['labels'] = n.require(datadic['labels'].reshape(1, datadic['data'].shape[1]),
                                      dtype=n.single, requirements='C')
        return epoch, batchnum, [datadic['data'], datadic['labels']]
def __init__(self, meta_path, crop_border, multiview_test):
    self.batch_meta = unpickle(meta_path)
    self.border_size = crop_border
    self.num_colors = 3
    self.outer_size = int((self.batch_meta['data_mean'].size / self.num_colors) ** 0.5)
    self.inner_size = self.outer_size - self.border_size * 2
    self.multiview = multiview_test
    self.num_views = 5 * 2
    self.data_mult = self.num_views if self.multiview else 1
    self.label_types = self.batch_meta['label_types']
    self.label_names = self.batch_meta['label_names']
    self.data_mean = self.batch_meta['data_mean'].reshape((self.num_colors, self.outer_size, self.outer_size))
    self.data_mean = self.data_mean[:, self.border_size:self.border_size + self.inner_size,
                                    self.border_size:self.border_size + self.inner_size]
    self.data_mean = self.data_mean.reshape((self.get_data_dims(), 1))
    self.cropped_data = np.zeros((self.get_data_dims(), self.data_mult), dtype=np.single)
def train():
    from util import unpickle
    import os
    import numpy.random as nr

    # Create training directory.
    train_dir = FLAGS.train_dir
    if not tf.gfile.IsDirectory(train_dir):
        tf.logging.info("Creating training directory: %s", train_dir)
        tf.gfile.MakeDirs(train_dir)

    # Create model
    max_iter = 1000000
    config = MLPConfig()
    model = SequenceMLP(config, phase='train')
    model.build()
    loss = model.tot_loss
    # global_step = model.global_stepgg
    train_op = tf.train.AdamOptimizer(learning_rate=5e-4).minimize(loss)

    sess = tf.Session()
    sess.run(tf.initialize_all_variables())
    dataset = unpickle('data/rescore_trainval.pkl')
    # import pdb; pdb.set_trace()  # debugging breakpoint disabled so training can run unattended
    saver = tf.train.Saver(max_to_keep=5)
    num = len(dataset)
    for i in range(max_iter):
        sample_id = nr.randint(0, num)
        datum = dataset[sample_id]
        seq_index, att_mask, label = _process_datum(datum)
        feed_dict = model.fill_feed_dict([seq_index, att_mask, label])
        _, obj = sess.run([train_op, model.tot_loss], feed_dict=feed_dict)
        if i % 100 == 0:
            tf.logging.info('Iteration %d, loss=%0.2f' % (i, obj))
        if i % 5000 == 0:
            saver.save(sess, os.path.join(FLAGS.train_dir, 'model-%d.ckpt' % i))
def processTest(test_dir, out_dir, startIdx):
    global NUM_PER_PATCH
    out_file = out_dir + "/imglist"
    trsimgMeta, trsLabels = util.unpickle(out_file)
    allImgMeta = []
    allLabels = []
    subdirnames = os.listdir(test_dir)
    list.sort(subdirnames)
    for classLabel, subdir in enumerate(trsLabels):
        for num, test_sub_dir in enumerate(subdirnames):
            # print classLabel
            # print test_sub_dir
            # print subdir
            if test_sub_dir == subdir:
                imgnames = os.listdir(os.path.join(test_dir, subdir))
                fullnames = [os.path.join(test_dir, subdir, name) for name in imgnames]
                meta = zip(fullnames, [classLabel] * len(fullnames))
                allImgMeta += meta
                # allLabels.append(subdir)
                # name, label pair
                # meta = zip(fullnames, [string.atoi(subdir)] * len(fullnames))
    print "####### Got %d classes ######" % len(subdirnames)
    print "####### Got %d images ######" % len(allImgMeta)
    print "shuffling..."
    random.shuffle(allImgMeta)
    # return allImgMeta, allLabels
    # allImgMeta, testLabels = collectAndShuffle(test_dir)
    makeBatches(allImgMeta, out_dir, NUM_PER_PATCH, startIdx)
def _active_learning_for_learner_strategy(
        self, label: str, learner: BaseEstimator, sampling_strategy: Callable,
        active_learning_data: ActiveLearningData, semi_sup: bool = False) -> Stats:
    data_for_plotting = []
    file_path_pkl, file_path_csv, learner_name, sampling_strategy_name = self._get_output_path(
        label, learner, sampling_strategy)
    # used for label propagation
    labeled_indices = []
    if os.path.exists(file_path_pkl):
        if self.verbose:
            print('Available, retrieving...')
        return util.unpickle(file_path_pkl)
    # initialize stats
    stats = self._initialize_stats(label, learner_name, sampling_strategy_name)
    # initial training
    clf, stats, data_for_plotting = self._active_learning_initial_training(
        semi_sup, stats, data_for_plotting, learner, sampling_strategy,
        active_learning_data, labeled_indices)
    # actively learn one analyst query at a time
    for i in range(self.active_learning_budget):
        stats, data_for_plotting, labeled_indices = self._active_learning_single_query(
            i, semi_sup, clf, sampling_strategy, active_learning_data, stats,
            data_for_plotting, labeled_indices)
    # persist the results
    util.pickle_object(stats, file_path_pkl)
    util.write_as_csv(pd.DataFrame(data_for_plotting), file_path_csv)
    return stats
def build_tree(filename):
    """Build the tree of classes from a json specification."""
    with open(filename, 'rb') as json_file:
        json_data = json.load(json_file)
    meta = unpickle('cifar-100-python/meta')
    tree = _build_tree(json_data, meta['fine_label_names'])[0]
    # Get the number of classes per level
    n_classes = []
    queue = deque(tree.children)
    while queue:
        n_classes.append(len(queue))
        for i in range(len(queue)):
            node = queue.popleft()
            node.index = i
            if node.children:
                for child in node.children:
                    queue.append(child)
    return tree, n_classes
def preprocessing():
    folder = r'D:\work\sunxiuyu\SVHN\large-lcn'
    # outfolder = r'D:\work\sunxiuyu\SVHN\svhn-valid-large-1'
    datasize = 32 * 32 * 3
    # meta = util.unpickle(metafile)
    # mean = np.zeros(datasize, np.double)
    num = 0
    begin = 25
    for i in range(begin, begin + 16):
        batch_file = os.path.join(folder, 'data_batch_' + str(i))
        print batch_file
        buffer = util.unpickle(batch_file)
        data = buffer['data']
        dim2 = len(data)
        data = np.transpose(data)
        dim1 = len(data)
        print dim1
        newbuffer = np.zeros((dim1, dim2), np.single)
        for i in range(0, len(data)):
            img1 = data[i].reshape(3, 32, 32)
            img = np.zeros((32, 32, 3), np.single)
            result = np.zeros((3, 32, 32), np.single)
            img[:, :, 0] = img1[0, :, :]
            img[:, :, 1] = img1[1, :, :]
            img[:, :, 2] = img1[2, :, :]
            # cv2.imshow("img1", img)
            # cv2.waitKey(0)
            result[0, :, :] = img[:, :, 0]
            result[1, :, :] = img[:, :, 1]
            result[2, :, :] = img[:, :, 2]
            # print result[0, :, :]
            newbuffer[i] = result.reshape(3 * 32 * 32)
        newbuffer = np.transpose(newbuffer)
        buffer['data'] = newbuffer
        util.pickle(batch_file, buffer)
    return
def train_multiclassifier(dataset_path, weights):
    CE_CRITERION = nn.CrossEntropyLoss(weight=weights)

    train_path = os.path.join(DATASET_RAW_PATH, dataset_path, TRAIN_DATA)
    train_data = MovieDataset(train_path)
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
    val_path = os.path.join(DATASET_RAW_PATH, dataset_path, VAL_DATA)
    val_data = MovieDataset(val_path)
    val_loader = torch.utils.data.DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=True)

    # Instantiate the models
    MC = MultiClassifier().to(device)
    Optimizer = torch.optim.Adam(MC.parameters(), lr=ADAM_ALPHA, betas=ADAM_BETA,
                                 weight_decay=REGULARIZATION)

    print('Training.')
    for val_batch_i, val_batch_data in enumerate(val_loader):
        val_imgs, val_labels, val_indices = val_batch_data
        val_imgs = Variable(val_imgs.type(FloatTensor)).to(device)
        val_labels = torch.stack(val_labels)
        val_labels = torch.transpose(val_labels, 0, 1)
        val_labels = Variable(val_labels).to(device)
        break

    training_loss_data_path = os.path.join(DATASET_RAW_PATH, "0_0000005" + dataset_path + ".csv")
    training_loss_data = []
    for epoch_index in range(N_EPOCHS):  # loop over the dataset multiple times
        for batch_index, batch_data in enumerate(train_loader):
            # get the inputs
            imgs, labels, indices = batch_data
            imgs = Variable(imgs.type(FloatTensor)).to(device)
            labels = torch.stack(labels)
            labels = torch.transpose(labels, 0, 1)
            labels = Variable(labels).to(device)
            # labels = Variable(labels.to(device))

            # zero the parameter gradients
            Optimizer.zero_grad()

            # forward + backward + optimize
            outputs = MC(imgs)
            loss = CE_CRITERION(outputs, torch.max(labels, 1)[1])
            loss.backward()
            Optimizer.step()

            if (batch_index % 100) == 0:
                val_outputs = MC(val_imgs)
                val_loss = CE_CRITERION(val_outputs, torch.max(val_labels, 1)[1])
                training_loss_data.append((loss.item(), val_loss.item()))

            # Print Loss
            if epoch_index % PRINT_INTERVAL == 0 and not batch_index:
                print('Epoch: %d \tTraining Loss: %.3f' % (epoch_index, loss))
                val_outputs = MC(val_imgs)
                val_loss = CE_CRITERION(val_outputs, torch.max(val_labels, 1)[1])
                print('Epoch: %d \tValidation Loss: %.3f' % (epoch_index, val_loss))
        # break

    print('Finished Training.\nTesting on Validation Set.')
    save_training_loss(training_loss_data_path, training_loss_data)

    # Validation Set
    # val_path = os.path.join(DATASET_RAW_PATH, dataset_path, VAL_DATA)
    # val_data = MovieDataset(val_path)
    # val_loader = torch.utils.data.DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=True)
    lenient_predicted = 0
    correct_predicted = 0
    total_predicted = 0
    classes = ut.unpickle(CLASS_INDECES_RAW_PATH)
    movie_dict = ut.unpickle("data/movies_multi_index.pkl")
    with torch.no_grad():
        for data in val_loader:
            imgs, labels, indices = data
            imgs = Variable(imgs.type(FloatTensor)).to(device)
            labels = torch.stack(labels)
            labels = torch.transpose(labels, 0, 1)
            labels = Variable(labels).to(device)
            outputs = MC(imgs)
            outputs = F.softmax(outputs.data, dim=1)
            _, predicted = torch.max(outputs.data, 1)
            # total_predicted += labels.size(0)
            # correct_predicted += (labels[predicted] == 1).sum().item()
            print("MOVIE IDS")
            print(indices)
            for batch_i in range(predicted.size(0)):
                total_predicted += 1
                correct_predicted += (labels[batch_i][predicted[batch_i]].item() == 1)
                lenient_predicted += (predicted[batch_i].item() in movie_dict[indices[batch_i]])
                print("Movie Id: %s \tPrediction: %s \tGround Truth: %s" %
                      (indices[batch_i], classes[predicted[batch_i]],
                       classes[torch.argmax(labels[batch_i])]))

    print('Accuracy of the Multi-Class Classifier on Validation Set: %d %%' %
          (100 * correct_predicted / total_predicted))
    print('Lenient Accuracy of the Multi-Class Classifier on Validation Set %d %%' %
          (100 * lenient_predicted / total_predicted))

    # classes = ut.unpickle(CLASS_INDECES_RAW_PATH)
    class_correct = list(0. for i in range(len(classes)))
    class_total = list(0. for i in range(len(classes)))
    with torch.no_grad():
        for data in val_loader:
            imgs, labels, indices = data
            imgs = Variable(imgs.type(FloatTensor)).to(device)
            labels = torch.stack(labels)
            labels = torch.transpose(labels, 0, 1)
            labels = Variable(labels).to(device)
            outputs = MC(imgs)
            _, predicted = torch.max(outputs, 1)
            '''
            c = (labels[predicted] == 1).squeeze()
            for i in range(len(labels)):
                label = labels[i]
                class_correct[label] += c[i].item()
                class_total[label] += 1
            '''
            for batch_i in range(predicted.size(0)):
                label = np.argmax(labels[batch_i])
                class_total[label] += 1
                class_correct[label] += (labels[batch_i][predicted[batch_i]].item() == 1)

    for i in range(len(classes)):
        if class_total[i] == 0:
            print('Accuracy of %5s : N/A' % (classes[i]))
        else:
            print('Accuracy of %5s : %2d %%' % (classes[i], 100 * class_correct[i] / class_total[i]))
from util import plot_array_image

DATA_PATH = '/home/snwiz/data/imagenet12/code/data'
num_class = 1
MEAN_FILE_EXT = "_mean"
if num_class == 1000:
    INPUT_FILE = DATA_PATH + "/imagenet_data"
else:
    INPUT_FILE = DATA_PATH + "/imagenet_data_tiny" + str(int(num_class))
OUTPUT_FILE = INPUT_FILE + MEAN_FILE_EXT
IMAGE_SIZE = 256
VERIFY_RESULT = True
# VERIFY_RESULT = False

out = unpickle(INPUT_FILE)
num = 0
m = n.zeros((IMAGE_SIZE, IMAGE_SIZE, 3), n.float32)
if VERIFY_RESULT:
    sum_m = n.zeros((IMAGE_SIZE, IMAGE_SIZE, 3), n.float32)
for cls_index in range(num_class):
    num_cls_index = len(out['index_map_train'][cls_index])
    for index in range(num_cls_index):
        i = out['index_map'][cls_index][index]
        image_path = out['image_path'][i]
        im = Image.open(image_path)
        # if cls_index == 6 and index == 0:
        #     im.show()
        #     import pdb; pdb.set_trace()
# code to tag words based on BILOU
from sklearn.svm import LinearSVC
import util
import nltk.classify
import nltk
from nltk import word_tokenize, sent_tokenize
import parseTrial
from collections import defaultdict
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import jsonrpc
from simplejson import loads

classifier = util.unpickle("trained_tagger.pickle")
server = jsonrpc.ServerProxy(jsonrpc.JsonRpc20(),
                             jsonrpc.TransportTcpIp(addr=("127.0.0.1", 8080)))


def initializeModules():
    ps = PorterStemmer()
    pattern = re.compile("[?!.-;:]+")
    commaPattern = re.compile("[,]+")
    return [ps, pattern, commaPattern]


# Get the tokenized form of the sentence, the POS tags of the sentence and the parse tree
def getSentenceFeatures(line):
    line = line.strip()
    wordTokenized = word_tokenize(line)
    posTags = util.runPOS(line)
def load_model(self):
    if self.x_test is None:
        self.load_features()
    self.model = unpickle(self.parameters["pickle_model"])
def __init__(self, data_path):
    dataset = util.unpickle(data_path)
    self.data = dataset['data']
    self.ids = list(self.data.keys())
def __init__(self,
             raw_image_queue,   # shared Queue to store raw image
             data,              # data file contains image path information
             mean_file,         # mean file
             root_path,         # root path of images
             data_mode,         # 'all', 'train', 'val'
             batch_size=128,    # size of batch
             batch_index=0,     # start batch index
             epoch_index=1      # start epoch index
             ):
    threading.Thread.__init__(self, name="Load Image Thread")
    self.stop = False
    self.sharedata = raw_image_queue
    self.data = data
    self.num_classes = len(self.data['val'])
    self.data_mode = data_mode
    self.root_path = root_path
    if data_mode == "val":
        self.images = self.data['val']
        self.total_samples = self.data['num_data_val']
        self.shuffle = False
        print 'Validation data is not randomized'
    elif data_mode == "train":
        self.images = self.data['train']
        self.total_samples = self.data['num_data_train']
        self.shuffle = False
        # self.shuffle = True
        print 'Training data shuffle: ', self.shuffle
    else:
        print "data_mode: " + str(data_mode) + " not valid"
        import pdb
        pdb.set_trace()
        sys.exit(1)
    # iterator on classes
    self.iclass = -1
    # iterator for samples of each class
    self.isamples = self.num_classes * [-1]
    # class_iter = range(num_classes)
    # if shuffle: random.shuffle(class_iter)
    # classes_iter = []
    # for i in range(num_classes):
    #     classes_iter.append(range(len(images[i])))
    #     if shuffle: random.shuffle(classes_iter[i])
    # # get batch queue
    # self.batch_queue = []
    # has_add = True
    # while has_add:
    #     has_add = False
    #     for i in range(self.num_classes):
    #         if len(index_map[i]) > 0:
    #             index = index_map[i].pop()
    #             self.batch_queue.append(index)
    #             has_add = True
    # self.num_images = len(self.batch_queue)

    # init current index and batch size
    self.total_processed = 0
    self.batch_size = batch_size
    self.batch_index = batch_index
    self.epoch_index = epoch_index
    # read data mean from file
    data_mean_file = unpickle(mean_file)
    self.data_mean = data_mean_file['data']
    self.data_mean = self.data_mean.astype(n.float32)
    # store it as uint8
    # self.data_mean = n.round(self.data_mean).astype(n.uint8)
    print data_mode + ': total_samples: ' + str(self.total_samples) \
        + ' batch_size: ' + str(batch_size) \
        + ' num_batches: ' + str(self.get_num_batches())
def makeb(name, shape, params=None):
    foo = u.unpickle('/data/t-maoqu/convnets/netsaves/imnet21841aggregated/7.1500')
    # foo = u.unpickle('/home/NORTHAMERICA/t-maoqu/share/net/47.250')
    for i in foo['model_state']['layers']:
        if i['name'] == name:
            return i['biases']
def __init__(self, word2vec_file):
    d = unpickle(word2vec_file)
    self._vocab = d['vocab']
    self._word_vectors = d['word_vectors']
    self._create_index_dict(d['word2idx'])
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import util
import nltk.classify

featuresSet = util.unpickle("featuresSet_best.txt")
classifier = nltk.classify.SklearnClassifier(LinearSVC())
classifier.train(featuresSet)
util.writePickle(classifier, "trained_tagger.pickle")
def _ensemble(self, label: str) -> List[Stats]:
    active_learning_data = self._active_learning_data_split(label)
    stats = self._initialize_stats(label, 'VotingClassifier', 'entropy_sampling')
    file_path_pkl, file_path_csv, learner_name, sampling_strategy_name = self._get_output_path(
        label, VotingClassifier([]), entropy_sampling)
    if os.path.exists(file_path_pkl):
        if self.verbose:
            print('Available, retrieving...')
        return [util.unpickle(file_path_pkl)]

    # supervised: active learners
    rf, _, _ = self._active_learning_initial_training(
        False, stats, [], self.active_learning_rf, entropy_sampling, active_learning_data, [])
    lr, _, _ = self._active_learning_initial_training(
        False, stats, [], self.active_learning_lr, entropy_sampling, active_learning_data, [])
    gb, _, _ = self._active_learning_initial_training(
        False, stats, [], self.active_learning_gb, entropy_sampling, active_learning_data, [])

    # semi-supervised: label propagation
    # labeled_indices = []
    # lp, _, _ = self._active_learning_initial_training(True, stats, [], None, entropy_sampling,
    #                                                   active_learning_data, labeled_indices)

    # unsupervised
    prevalence = len(active_learning_data.y_train_start[
        active_learning_data.y_train_start == True]) / len(active_learning_data.y_train_start)
    iforest = IsolationForest(contamination=prevalence, behaviour='new',
                              n_estimators=self.clf_n_estimator)
    x = pd.concat([active_learning_data.x_train_start,
                   active_learning_data.x_train_pool]).reset_index(drop=True)
    iforest.fit(x)

    predictions = self._ensemble_predictions(rf, lr, iforest, gb, active_learning_data)
    metrics = self._get_metrics(active_learning_data.y_dev, predictions)
    data_for_plotting = [self._get_plotting_row(-1, metrics, 0, 0)]
    metrics = util.add_prefix_to_dict_keys(metrics, 'initial_')
    stats = util.merge_dicts(stats, metrics)

    for i in range(self.active_learning_budget):
        rf, _, _ = self._active_learning_single_query_supervised(rf, active_learning_data)
        lr, _, _ = self._active_learning_single_query_supervised(lr, active_learning_data)
        gb, _, _ = self._active_learning_single_query_supervised(gb, active_learning_data)
        # lp, _, _ = self._active_learning_single_query_semi_sup(lp, labeled_indices,
        #                                                        active_learning_data, entropy_sampling)
        predictions = self._ensemble_predictions(rf, lr, iforest, gb, active_learning_data)
        metrics = self._get_metrics(active_learning_data.y_dev, predictions)
        data_for_plotting.append(self._get_plotting_row(i, metrics, 0, 0))
        if i + 1 in self.active_learning_log_intervals:
            metrics = util.add_prefix_to_dict_keys(metrics, f'sample_{i+1}_')
            stats = util.merge_dicts(stats, metrics)

    util.pickle_object(stats, file_path_pkl)
    util.write_as_csv(pd.DataFrame(data_for_plotting), file_path_csv)
    return [stats]
label_names = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
labels_dict = {i: x for i, x in enumerate(label_names)}
writer = csv.writer(open(output_file, 'wb'))
writer.writerow(['id', 'label'])
counter = 1

###
# 0th net
# result['labels']
# result['preds']
result = unpickle(sys.argv[1])
num_batches = len(result['labels'])

for ii in range(num_nets - 1):
    result_ii = unpickle(sys.argv[ii + 2])
    # check num of batches is consistent
    num_batches_ii = len(result_ii['labels'])
    for jj in range(num_batches):
        # check label is consistent
        assert np.array_equal(result_ii['labels'][jj], result['labels'][jj])
        # nc result['pred'][jj]
        result['preds'][jj] += result_ii['preds'][jj]

# print len(result['preds'])
# pickle(output_file, result)
def load(datadic, ind, filename):
    datadic[ind] = unpickle(filename)
def cifar100toimages():
    file = r'D:\work\sunxiuyu\cifar-100-python\test'
    outfolder = r'D:\work\sunxiuyu\cifar-10-py-colmajor\tmp'
    dict = util.unpickle(file)
    numlcass = np.array(dict['fine_labels'], np.int).max()
    fine_classes = {}
    mean = np.zeros(32 * 32 * 3, dtype=np.double)
    for i in range(0, len(dict['data'])):
        mean += dict['data'][i]
    mean = mean / len(dict['data'])

    metafile = r'D:\work\sunxiuyu\cifar-10-py-colmajor\cifar10\batches.meta'
    outmetafile = r'D:\work\sunxiuyu\cifar-10-py-colmajor\cifar-100\batches.meta'
    meta = util.unpickle(metafile)
    for key in meta:
        print key
    meta['label_names'] = [str(i) for i in range(0, 100)]
    meta['data_mean'] = meta['data_mean'].reshape(3072)
    meta['data_mean'][0:1024] = mean[2048:3072]     # b
    meta['data_mean'][1024:2048] = mean[1024:2048]  # g
    meta['data_mean'][2048:3072] = mean[0:1024]     # r
    util.pickle(outmetafile, meta)
    return

    for i in range(0, len(dict['data'])):
        fine_classes_idx = dict['fine_labels'][i]
        # m_data = dict['data'][i] - mean
        m_data = dict['data'][i]
        if fine_classes_idx in fine_classes:
            fine_classes[fine_classes_idx].append({
                'data': m_data,
                'label': dict['fine_labels'][i],
                'filename': dict['filenames'][i]
            })
        else:
            fine_classes[fine_classes_idx] = [{
                'data': m_data,
                'label': dict['fine_labels'][i],
                'filename': dict['filenames'][i]
            }]
        pass

    # random shuffle
    for i in range(0, len(fine_classes)):
        indexs = range(0, len(fine_classes[i]))
        np.random.shuffle(indexs)
        fine_classes[i] = [fine_classes[i][x] for x in indexs]

    # save image patches
    if not os.path.exists(outfolder):
        os.makedirs(outfolder)
    for i in range(0, len(fine_classes)):
        class_folder = os.path.join(outfolder, str(i))
        if not os.path.exists(class_folder):
            os.makedirs(class_folder)
        for image in fine_classes[i]:
            # save image patch
            m_data = image['data']
            r = m_data[0:1024]
            g = m_data[1024:2048]
            b = m_data[2048:3072]
            r = r.reshape(32, 32)
            g = g.reshape(32, 32)
            b = b.reshape(32, 32)
            color_img = np.zeros((32, 32, 3), dtype=np.uint8)
            color_img[:, :, 0] = b[:, :]
            color_img[:, :, 1] = g[:, :]
            color_img[:, :, 2] = r[:, :]
            imagefile = os.path.join(class_folder, image['filename'])
            cv2.imwrite(imagefile, color_img)
def _learn_anomalies(self, label: str) -> List[Stats]:
    data_for_plotting = []
    learner = self.active_learning_rf

    # TERRIBLE hack to pass the name of the sampling strategy
    def iforest_sampling(contamination):
        return IsolationForest(contamination=contamination,
                               n_estimators=self.clf_n_estimator,
                               behaviour='new', n_jobs=-1,
                               random_state=self.random_seed)

    file_path_pkl, file_path_csv, learner_name, sampling_strategy_name = self._get_output_path(
        label, learner, iforest_sampling)
    if os.path.exists(file_path_pkl):
        if self.verbose:
            print('Available, retrieving...')
        return util.unpickle(file_path_pkl)

    active_learning_data = self._active_learning_data_split(label)
    x_dev = active_learning_data.x_dev
    x_train_start = active_learning_data.x_train_start
    y_train_start = active_learning_data.y_train_start

    # initialize stats
    stats = self._initialize_stats(label, learner_name, sampling_strategy_name)

    def update_stats(stats_, data_for_plotting_, prefix, x_, y_, i_):
        learner.fit(x_, y_)
        predicted = learner.predict(x_dev)
        scores = learner.predict_proba(x_dev)[:, 1]
        metrics = self._get_metrics(actual=active_learning_data.y_dev,
                                    predicted=predicted, scores=scores)
        data_for_plotting_.append(
            self._get_plotting_row(i_, metrics, elapsed_train=0, elapsed_query=0))
        metrics = util.add_prefix_to_dict_keys(metrics, prefix)
        stats_ = util.merge_dicts(stats_, metrics)
        return stats_, data_for_plotting_

    # initial training
    stats, data_for_plotting = update_stats(stats, data_for_plotting, 'initial_',
                                            x_train_start, y_train_start, -1)

    # isolation forest
    y_train_start = active_learning_data.y_train_start
    prevalence = len(y_train_start[y_train_start == True]) / len(y_train_start)
    iforest = iforest_sampling(prevalence)
    iforest.fit(pd.concat([active_learning_data.x_train_start,
                           active_learning_data.x_train_pool]))
    anomaly_indices_sorted = np.argsort(
        iforest.score_samples(active_learning_data.x_train_pool))

    # pretend that we're active learning; however we're just going through a
    # sorted list of anomalies (starting with the most anomalous)
    for i in range(self.active_learning_budget):
        x_extra = active_learning_data.x_train_pool.iloc[anomaly_indices_sorted[:i + 1]]
        y_extra = active_learning_data.y_train_pool.iloc[anomaly_indices_sorted[:i + 1]]
        x = pd.concat([active_learning_data.x_train_start, x_extra])
        y = pd.concat([active_learning_data.y_train_start, y_extra])
        stats, data_for_plotting = update_stats(stats, data_for_plotting,
                                                f'sample_{i+1}_', x, y, i)

    # persist the results
    util.pickle_object(stats, file_path_pkl)
    util.write_as_csv(pd.DataFrame(data_for_plotting), file_path_csv)
    return [stats]
def send(players, recipient='all'):
    pub_socket.send(recipient + ':' + util.pickle(players))


player_list = []
player_details = {}
while True:
    time.sleep(0.016)
    send(player_details)
    try:
        msg = pull_socket.recv_pyobj(flags=zmq.core.NOBLOCK)
    except zmq.core.error.ZMQError:
        pass
    else:
        if msg['cmd'] == 'new':
            player = util.unpickle(msg['player'])
            print 'new player %s' % player.name
            player_details[player.name] = player
        elif msg['cmd'] == 'update':
            player = util.unpickle(msg['player'])
            del player_details[player.name]
            player_details[player.name] = player
        elif msg['cmd'] == 'update_stop':
            player = util.unpickle(msg['player'])
            del player_details[player.name]
            player_details[player.name] = player
def load(self, batch_idx):
    print "loading batch_idx: ", batch_idx, " batch: ", self.batch_range[batch_idx]
    self.datadic[batch_idx] = unpickle(self.data_dir + "data_batch_" + str(self.batch_range[batch_idx]))
def main():
    """Main function"""
    # Parameters
    train_data_size = 40000
    batch_size = 64
    epoch = 30
    step_size = train_data_size // batch_size
    display_size = step_size // 2
    learning_rate = 0.001
    train_filename = 'cifar-100-python/train'
    test_filename = 'cifar-100-python/test'
    text_label_filename = 'cifar-100-python/meta'

    # 1. Read training and test data.
    print("Reading training and test data files...")
    train_data = util.unpickle(train_filename)
    test_data = util.unpickle(test_filename)
    text_labels = util.unpickle(text_label_filename)
    raw_train_images, raw_train_label = util.split_images_and_labels(train_data)
    raw_test_images, test_labels = util.split_images_and_labels(test_data)
    fine_label_names, coarse_label_names = util.split_labels(text_labels)
    # Construct fine class to superclass mapping
    superclass_mapping = util.construct_superclass_mapping(
        train_data[b'fine_labels'], train_data[b'coarse_labels'])
    fine_labels = util.decode_binary_text(fine_label_names)
    coarse_labels = util.decode_binary_text(coarse_label_names)

    # 2. Pre-process the data
    # Calculate mean image using train data and subtract all images from it
    print("Pre-processing the data...")
    # Cast to float type first TODO and maybe normalize the data
    raw_train_images_float = raw_train_images.astype(float)
    raw_test_images_float = raw_test_images.astype(float)
    mean_image = raw_train_images[:train_data_size].sum(axis=0) / train_data_size
    raw_train_images_float -= mean_image
    raw_test_images_float -= mean_image
    # Format that we want (-1, 3, 32, 32)
    formatted_train_images = util.format_data(raw_train_images_float)
    test_images = util.format_data(raw_test_images_float)

    # 3. Split the train images and labels into train and validation set
    train_images, train_labels, vali_images, vali_labels = util.split_train_and_validation(
        formatted_train_images, raw_train_label, train_data_size)
    vali_super_labels = np.array(train_data[b'coarse_labels'][train_data_size:])
    test_super_labels = np.array(test_data[b'coarse_labels'])

    # 4. Construct the graph
    print("Constructing the graph...")
    # Inputs
    x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
    y_real = tf.placeholder(tf.int64, shape=(None,))
    augmentation = tf.map_fn(tf.image.random_flip_up_down, x)
    # Outputs, cross entropy calculation, and optimizer
    y_predict = util.lenet_5(x)
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=y_predict, labels=y_real)
    loss_op = tf.reduce_mean(cross_entropy)
    train_op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss_op)

    # 5. Calculate confusion matrix and accuracy (batch and validation)
    # Fine class label prediction: confusion matrix, correct predictions, and accuracy
    labels_predicted = tf.argmax(y_predict, 1)
    confusion_matrix_fine = tf.confusion_matrix(y_real, labels_predicted)
    correct_prediction = tf.equal(labels_predicted, y_real)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    correct_top_5 = tf.nn.in_top_k(y_predict, y_real, 5)
    accuracy_fine_top_5 = tf.reduce_mean(tf.cast(correct_top_5, tf.float32))
    # Super class label prediction: confusion matrix, correct predictions, and accuracy
    mapped_labels = tf.placeholder(tf.int64, shape=(None,))
    confusion_matrix_super = tf.confusion_matrix(y_real, mapped_labels)
    correct_prediction_super = tf.equal(mapped_labels, y_real)
    accuracy_super = tf.reduce_mean(tf.cast(correct_prediction_super, tf.float32))
    top_5_labels = tf.nn.top_k(y_predict, 5)
    # Add results to summaries
    loss_summary = tf.summary.scalar('Loss', loss_op)
    accuracy_summary = tf.summary.scalar('Accuracy: Fine Labels', accuracy)
    accuracy_summary_super = tf.summary.scalar('Accuracy: Super Labels', accuracy_super)

    # 6. Start the training
    print("Start training...")
    total_steps = 0  # count the number of steps it takes throughout training
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        train_writer = tf.summary.FileWriter('logs' + '/train', sess.graph)
        vali_writer = tf.summary.FileWriter('logs' + '/validation', sess.graph)
        for epoch_count in range(1, epoch + 1):
            for step_count in range(1, step_size + 1):
                total_steps += 1
                # Input training data
                train_batch, label_batch = util.get_random_batch(train_images, train_labels, batch_size)
                # train_batch = sess.run(augmentation, feed_dict={x: train_batch})
                sess.run(train_op, feed_dict={x: train_batch, y_real: label_batch})
                train_loss_summary, train_acc_summary, loss, acc = sess.run(
                    [loss_summary, accuracy_summary, loss_op, accuracy],
                    feed_dict={x: train_batch, y_real: label_batch})
                train_writer.add_summary(train_loss_summary, total_steps)
                train_writer.add_summary(train_acc_summary, total_steps)
                # Print validation accuracy every so often
                if step_count % display_size == 0:
                    # Validation: fine class label accuracy measurement
                    vali_acc_fine = sess.run(accuracy, feed_dict={x: vali_images, y_real: vali_labels})
                    vali_acc_summary_fine = sess.run(accuracy_summary,
                                                     feed_dict={x: vali_images, y_real: vali_labels})
                    vali_writer.add_summary(vali_acc_summary_fine, total_steps)
                    print('Validation fine label accuracy: {:.5f}'.format(vali_acc_fine))
                    # Validation: super class label accuracy measurement
                    labels = sess.run(labels_predicted,
                                      feed_dict={x: vali_images, y_real: vali_super_labels})
                    new_labels = util.map_class(labels, superclass_mapping)
                    vali_acc_super = sess.run(accuracy_super,
                                              feed_dict={x: vali_images, y_real: vali_super_labels,
                                                         mapped_labels: new_labels})
                    vali_acc_summary_super = sess.run(accuracy_summary_super,
                                                      feed_dict={x: vali_images, y_real: vali_super_labels,
                                                                 mapped_labels: new_labels})
                    vali_writer.add_summary(vali_acc_summary_super, total_steps)
                    print('Number of steps: {}'.format(total_steps))
                    print('Validation super label accuracy: {:.5f}'.format(vali_acc_super))
            print("{} epochs finished".format(epoch_count))

        # Test: Fine class label accuracy measurement and confusion matrix
        test_acc_fine_top_1 = sess.run(accuracy, feed_dict={x: test_images, y_real: test_labels})
        test_acc_fine_top_5 = sess.run(accuracy_fine_top_5, feed_dict={x: test_images, y_real: test_labels})
        con_matrix_fine = sess.run(confusion_matrix_fine, feed_dict={x: test_images, y_real: test_labels})

        # Display the first ten images
        first_ten_predictions = sess.run(labels_predicted,
                                         feed_dict={x: test_images, y_real: test_labels})[:10]
        prediction_text_labels = util.map_text_labels(first_ten_predictions, fine_labels)
        true_text_labels = util.map_text_labels(test_labels[:10], fine_labels)
        print(prediction_text_labels)
        print(true_text_labels)
        first_ten_images = util.format_data(raw_test_images[:10])
        display_image = util.combine_ten_images(first_ten_images)
        plt.imshow(display_image)
        plt.savefig("Result", bbox_inches='tight')

        # Test: Super class label accuracy measurement
        labels = sess.run(labels_predicted, feed_dict={x: test_images, y_real: test_super_labels})
        new_labels = util.map_class(labels, superclass_mapping)
        test_acc_super = sess.run(accuracy_super,
                                  feed_dict={x: test_images, y_real: test_super_labels,
                                             mapped_labels: new_labels})
        con_matrix_super = sess.run(confusion_matrix_super,
                                    feed_dict={x: test_images, y_real: test_super_labels,
                                               mapped_labels: new_labels})
        top_5_labels = sess.run(top_5_labels, feed_dict={x: test_images})[1]
        util.map_all_classes(top_5_labels, superclass_mapping)
        correctness_test_top_5 = util.correct_in_top_5_super(top_5_labels, test_super_labels)
        test_acc_top_5_super = sum(correctness_test_top_5) / len(correctness_test_top_5)

        # Save our result
        output_result([
            'Number of steps taken: {}\n'.format(total_steps),
            'Test fine label accuracy (top 1): {:.5f}\n'.format(test_acc_fine_top_1),
            'Test fine label accuracy (top 5): {:.5f}\n'.format(test_acc_fine_top_5),
            'Test super label accuracy (top 1): {:.5f}\n'.format(test_acc_super),
            'Test super label accuracy (top 5): {:.5f}\n'.format(test_acc_top_5_super)
        ])
        save_confusion_matrix(con_matrix_fine, 'Fine-Label-Confusion-Matrix.txt')
        save_confusion_matrix(con_matrix_super, 'Super-Label-Confusion-Matrix.txt')
        save_heatmap(con_matrix_fine, 'Heatmap-fine-label')
        save_heatmap(con_matrix_super, 'Heatmap-super-label')
        print("Training finished!")
flags.DEFINE_string(
    'save_dir',
    '/work/cse496dl/cpack/Assignment_3/models/2/maxcompression_encoder_homework_3-0',
    'directory where VAE model graph and weights are saved')
flags.DEFINE_integer('batch_size', 250, '')
flags.DEFINE_integer('latent_size', 32, '')
flags.DEFINE_integer('max_epoch', 100, '')
flags.DEFINE_integer('early_stop', 15, '')
FLAGS = flags.FLAGS

#############
# CIFAR 100 #
#############
cifar100_test = {}
cifar100_train = {}

# Load the raw CIFAR-100 data.
cifar100_test = util.unpickle(FLAGS.data_dir + 'cifar-100-python/test')
cifar100_train = util.unpickle(FLAGS.data_dir + 'cifar-100-python/train')
train_data = cifar100_train[b'data']
test_data = cifar100_test[b'data']
train_data = np.reshape(train_data, (50000, 3, 32, 32)).transpose(0, 2, 3, 1).astype(float)
test_data = np.reshape(test_data, (10000, 3, 32, 32)).transpose(0, 2, 3, 1).astype(float)

################
# VAE Modeling #
################
img_shape = [32, 32, 3]
tf.reset_default_graph()