def main(training_from_scratch, args):

    if (training_from_scratch):

        text = open(args.filename, 'rb').read().decode(encoding='utf-8')
        text, char2idx, idx2char = preprocessing(
            text, args.checkpoint_dir,
            args.minocc)  # note that we are replacing the text here

        vocab_size = len(idx2char)
        config = Config(vocab_size, args.epochs)

        model = build_model(config)
    else:

        model = tf.keras.models.load_model(args.checkpoint)
        char2idx = unpickle(args.checkpoint_dir, 'char2idx')
        idx2char = unpickle(args.checkpoint_dir, 'idx2char')
        text = unpickle(args.checkpoint_dir, 'dataset')

        vocab_size = len(idx2char)
        config = Config(vocab_size, args.epochs, args.initepochs)

    text_as_int = np.array([char2idx[c] for c in text
                            ])  # works because text is a list of words
    train_model(args.checkpoint_dir, text_as_int, model, config)
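Every example on this page leans on a project-local util module for its pickling helpers; the module itself is never shown, and its signature varies between projects (some call unpickle(path), others unpickle(directory, name)). The following is only a minimal sketch of what such helpers typically look like, with the optional two-argument form as an assumption:

import os
import pickle as _pickle


def pickle(path, obj):
    # Serialize obj to path; the real helpers may pick a different protocol.
    with open(path, 'wb') as f:
        _pickle.dump(obj, f, protocol=_pickle.HIGHEST_PROTOCOL)


def unpickle(path, name=None):
    # Load a pickled object; some examples pass (directory, filename) instead.
    if name is not None:
        path = os.path.join(path, name)
    with open(path, 'rb') as f:
        return _pickle.load(f)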
Example #2
def gen_dataset(N, data_path, isMC=True):
    data_path = os.path.abspath(data_path)
    ut.create_dir(data_path)

    if (isMC):
        movie_dict = ut.unpickle(
            os.path.join("data", "movies_single_index.pkl"))
    else:
        movie_dict = ut.unpickle(os.path.join(
            "data",
            "movies_multi_index.pkl"))  #Need to make this new index thing

    full_ids = list(movie_dict.keys())
    movie_ids = random.sample(full_ids, N)

    ids_l = len(movie_ids)
    shuffle(movie_ids)
    train_ids = movie_ids[:int(ids_l * SPLIT[0])]
    val_ids = movie_ids[int(ids_l * SPLIT[0]):int(ids_l *
                                                  (SPLIT[0] + SPLIT[1]))]
    test_ids = movie_ids[int(ids_l * (SPLIT[0] + SPLIT[1])):]  # slice through the end; [:-1] would drop the last id

    print("Generating Training Set...")
    generate_set(train_ids, movie_dict, data_path, "train", isMC)
    print("Generating Validation Set...")
    generate_set(val_ids, movie_dict, data_path, "val", isMC)
    print("Generating Test Set...")
    generate_set(test_ids, movie_dict, data_path, "test", isMC)
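gen_dataset also relies on a module-level SPLIT constant (train/val/test fractions) and a bare shuffle import that are not shown here; a plausible header for the snippet, with the split values purely illustrative:

import os
import random
from random import shuffle

import util as ut  # assumed project-local helpers (unpickle, create_dir)

SPLIT = (0.8, 0.1, 0.1)  # illustrative train/val/test fractions, not from the source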
def prepare_for_rec(out_dir):

    meta = {}

    meanImg = readAndResize('/database/test.jpg',SIZE, CHANNELS)
    meanImgstd = readAndResize('/database/test.jpg', SIZE_STD, CHANNELS)

    meta['data_mean'] = meanImg
    meta['data_mean_std'] = meanImgstd
    util.pickle(os.path.join(out_dir,"batches.meta"), meta)
    

    trainingMeta = util.unpickle(os.path.join(out_dir, "trainingMeta.meta"))

    testMeta = util.unpickle(os.path.join(out_dir, "testMeta.meta"))

    print "prepare for training"
    random.shuffle(trainingMeta)
    make_list_batches(trainingMeta,out_dir,NUM_PER_PATCH)



    #for test
    print "prepare for test"
    random.shuffle(testMeta)
    make_list_batches(testMeta, out_dir, NUM_PER_PATCH, 8000)  # pass the shuffled test metadata, not the training set
	def __init__(self):
		# self.classifier = util.unpickle(os.path.dirname(os.path.realpath(__file__)) + "/cl_gun.txt")
		# self.featurenum = util.unpickle(os.path.dirname(os.path.realpath(__file__)) + "/featurenum_gun.txt")
		print "Learner initialized for presidential_elections"
		self.classifier = util.unpickle(os.path.dirname(os.path.realpath(__file__)) + "/cl_immigration.txt")
		self.featurenum = util.unpickle(os.path.dirname(os.path.realpath(__file__)) + "/featurenum_immigration.txt")
		print "Learner initialized"
Example #5
    def __init__(self,
                 data_file,
                 root_path,
                 data_mode="train",
                 random_transform=False,
                 batch_size=128,
                 crop_width=224,
                 crop_height=224):
        # read image-name image-index map from file
        self.data = unpickle(data_file)
        self.num_classes = len(self.data['index_map'])
        self.data_mode = data_mode
        self.random_transform = random_transform
        self.root_path = root_path
        if data_mode == "all":
            index_map = self.data['index_map']
        elif data_mode == "val":
            index_map = self.data['index_map_val']
        elif data_mode == "train":
            index_map = self.data['index_map_train']
        else:
            print "data_mode: " + data_mode + " not valid"
            import pdb
            pdb.set_trace()
            sys.exit(1)

        # get batch queue
        self.batch_queue = []
        has_add = True
        while has_add:
            has_add = False
            for i in range(self.num_classes):
                if len(index_map[i]) > 0:
                    index = index_map[i].pop()
                    self.batch_queue.append(index)
                    has_add = True

        self.num_images = len(self.batch_queue)

        #init current index and batch size
        self.batch_size = batch_size
        self.prev_batch_size = batch_size
        self.crop_width = crop_width
        self.crop_height = crop_height
        self.batch_index = 1
        self.epoch = 1

        # read data mean from file
        data_mean_file = unpickle(data_file + MEAN_FILE_EXT)
        self.data_mean = data_mean_file['data']
def prepareTrain(folderCls, imgStdCls, meanImg_dir, out_dir):
    global NUM_PER_BATCH
    
    meanImg = util.unpickle(meanImg_dir + '/meanImg')
    meanImgStd = util.unpickle(meanImg_dir + '/meanImgStd')
    allImgMeta = collectAndShuffle(folderCls, imgStdCls)
    meta = {}
    meta['data_mean'] = meanImg
    meta['data_mean_std'] = meanImgStd
    util.pickle( os.path.join(out_dir, "batches.meta"), meta)

    makeBatches(allImgMeta, out_dir, NUM_PER_BATCH)
    out_file = out_dir + "/imglist"
    util.pickle(out_file, allImgMeta)
Example #7
def prepareTrain(train_dir, stdImgfolder, out_dir, meanImg_dir, startIdx):
    global NUM_PER_BATCH


    meanImg = util.unpickle(meanImg_dir + '/meanImg')
    meanImgStd = util.unpickle(meanImg_dir + '/meanImgStd')
    allImgMeta, allLabels = collectAndShuffle(train_dir, stdImgfolder)
    meta = {}
    meta['data_mean'] = meanImg
    meta['data_mean_std'] = meanImgStd
    util.pickle( os.path.join(out_dir, "batches.meta"), meta)

    makeBatches(allImgMeta, out_dir, NUM_PER_BATCH, startIdx)
    out_file = out_dir + "/imglist"
    util.pickle(out_file, allImgMeta)
Example #8
def processTest(test_list, out_dir, startIdx):
    global NUM_PER_PATCH
    meta = util.unpickle(os.path.join(out_dir, "batches.meta"))
    allLabels = meta['label_names']


    fileList = open(test_list,'rb').readlines()
    random.shuffle(fileList)

    print "####### Got %d classes ######" % len(allLabels)
    print "####### Got %d images ######" % len(fileList)

    numImg = len(fileList)
    numBatches = numImg / NUM_PER_PATCH  # integer division; any remainder gets an extra batch below
    if numImg % NUM_PER_PATCH != 0:
        numBatches += 1

    print 'Going to make %d batches' % numBatches
    for idx_batch in range(numBatches):
        #        if idx_batch < numBatches - 2:
        #            continue
        print "### Making the %dth batch ###" % idx_batch
        b_start = NUM_PER_PATCH * idx_batch
        b_end = NUM_PER_PATCH * (idx_batch + 1)
        if idx_batch == numBatches - 1:
            b_start = numImg - NUM_PER_PATCH
            b_end = numImg
        batchMeta = fileList[b_start:b_end]

        data, labels, imgnames = getBatch(batchMeta,allLabels)
        out_fname = os.path.join(out_dir, "data_batch_%04d" % (idx_batch+startIdx))
        print "saving to %s" % out_fname
        util.pickle(out_fname, {'data':data, 'labels':labels, 'images':imgnames})
def processTest(test_dir, out_dir, startIdx):
    global NUM_PER_PATCH

    out_file = out_dir + "/imglist"
    trsimgMeta, trsLabels = util.unpickle(out_file)



    allImgMeta = []
    allLabels = []
    subdirnames = os.listdir(test_dir)
    subdirnames.sort()
    #for classLabel, subdir in enumerate(trsLabels):
    for num,test_sub_dir in enumerate(subdirnames):
        label = trsLabels.index(test_sub_dir)
        imgnames = os.listdir(os.path.join(test_dir, test_sub_dir))
        fullnames = [os.path.join(test_dir, test_sub_dir, name) for name in imgnames]
        meta = zip(fullnames, [label] * len(fullnames))
        allImgMeta += meta

    print "####### Got %d classes ######" % len(subdirnames)
    print "####### Got %d images ######" % len(allImgMeta)
    print "shuffling..."
    random.shuffle(allImgMeta)

    makeBatches(allImgMeta, out_dir, NUM_PER_PATCH, startIdx)
Example #10
 def __init__(self,
              data_file,
              mean_file,
              root_path,
              data_mode="train",
              batch_index=0,
              epoch_index=1,
              random_transform=False,
              batch_size=128,
              crop_width=224,
              crop_height=224,
              buffer_size=2):
     # init data Q
     self.raw_image_queue = Queue(buffer_size)
     self.batch_data_queue = Queue(buffer_size)
     self.stop = False
     print 'Loading data from ' + str(data_file)
     self.data = unpickle(data_file)
     # init read/transform image object
     self.readImage = ReadImage(self.raw_image_queue, self.data, mean_file,
                                root_path, data_mode, batch_size,
                                batch_index, epoch_index)
     self.processImage = ProcessImage(self.data, self.raw_image_queue,
                                      self.batch_data_queue, crop_width,
                                      crop_height, self.readImage.data_mean,
                                      data_mode, random_transform)
Example #11
def convert_weight_matrix_default(pretrain_file, tf_modelfile):
    print('Loading word embedding...')
    embedding = reduce_word_embedding()
    print('Loading models...')
    d = unpickle(pretrain_file)
    vars = {}
    print('Converting model...')
    for vname in d.keys():
        vars[vname] = d[vname].get_value()
    # slice theano variables
    gWx = vars['encoder_W']  # 620x4800
    gWh = vars['encoder_U']  # 2400x4800
    gb = vars['encoder_b']  # 1x4800
    U = vars['encoder_Ux']  # 2400x2400
    W = vars['encoder_Wx']  # 620x2400
    b = vars['encoder_bx']  # 1x2400
    # arrange to tensorflow format
    gate_w = np.concatenate([gWx, gWh], axis=0)
    gate_b = gb
    candx_w = W
    candu_w = U
    candx_b = b
    weight_dict = {
        'Gates/Linear/Matrix': gate_w,
        'Gates/Linear/Bias': gate_b,
        'CandidateW/Linear/Matrix': candx_w,
        'CandidateW/Linear/Bias': candx_b,
        'CandidateU/Linear/Matrix': candu_w,
        'word_embedding/map': embedding
    }
    print('Saving...')
    np.save(tf_modelfile, weight_dict)
    print('Model converted successfully!')
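convert_weight_matrix_default repacks the theano GRU parameters (encoder_W/U/b and the candidate weights) into the tensorflow-style names above and saves them with np.save; a hypothetical invocation, with both file names as placeholders:

# Hypothetical call; both file names are placeholders, not from the source.
convert_weight_matrix_default('pretrained_encoder.pkl', 'encoder_tf_weights.npy')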
Example #12
 def __init__(self, data_path, split):
     dataset = util.unpickle(data_path)
     self.data = dataset['data']
     self.class_mapping = dataset[
         'class_mapping']  # Maps raw class to mapped class
     self.inv_class_mapping = {
         self.class_mapping[c]: c
         for c in self.class_mapping
     }
     self.ids = sorted(list(self.data.keys()))
     random.Random(0).shuffle(self.ids)
     self.split = np.array(split) / np.sum(split)
     self.split_sizes = np.floor(self.split * len(self.ids))
     self.split_index = 0
     self.split_starts = np.insert(np.cumsum(self.split_sizes), 0, [0])
     self.split_starts = [
         int(split_start) for split_start in self.split_starts
     ]
     self.id_sets = []
     for i in range(len(self.split_sizes) - 1):
         self.id_sets.append(
             set(self.ids[self.split_starts[i]:self.split_starts[i + 1]]))
     np.random.seed(0)
     self.noisy_real = np.random.rand(len(self.ids)) * 0.1 + 0.9
     self.noisy_fake = np.random.rand(len(self.ids)) * 0.1
Example #13
 def __init__( self,
               data_file,
               mean_file,
               root_path,
               data_mode = "train",
               batch_index = 0,
               epoch_index = 1,
               random_transform = False,
               batch_size = 128,
               crop_width = 224,
               crop_height = 224,
               buffer_size = 2
               ):
     # init data Q
     self.raw_image_queue = Queue( buffer_size )
     self.batch_data_queue = Queue( buffer_size )
     self.stop = False
     print 'Loading data from ' + str(data_file)
     self.data = unpickle(data_file)
     # init read/transform image object
     self.readImage = ReadImage(self.raw_image_queue, self.data, mean_file, root_path,
             data_mode, batch_size, batch_index, epoch_index )
     self.processImage = ProcessImage(self.data,
        self.raw_image_queue, self.batch_data_queue, crop_width, crop_height,
        self.readImage.data_mean, data_mode, random_transform)
Example #14
def makew(name, idx, shape, params=None):
    foo=u.unpickle('/data/t-maoqu/convnets/netsaves/imnet21841aggregated/7.1500')
    #foo=u.unpickle('/home/NORTHAMERICA/t-maoqu/share/net/47.250')
    if name=='transfer':
        return 0.1*n.ones((shape[0], shape[1]), n.single)
    for i in foo['model_state']['layers']:
        if i['name']==name:
            return i['weights'][idx]
Example #15
 def load_features(self):
     features = unpickle(self.parameters["pickle_features"])
     self.x_train = features['x_train']
     self.x_test = features['x_test']
     self.y_train = features['y_train']
     self.y_test = features['y_test']
     self.x_scaler = features['x_scaler']
     self.parameters = features['parameters']
Example #16
def test():
    from util import unpickle
    import json
    from inference_utils.question_generator_util import SentenceGenerator
    from w2v_answer_encoder import MultiChoiceQuestionManger

    config = MLPConfig()
    model = SequenceMLP(config, phase='test')
    model.build()
    prob = model.prob

    # Load vocabulary
    to_sentence = SentenceGenerator(trainset='trainval')
    # create multiple choice question manger
    mc_manager = MultiChoiceQuestionManger(subset='trainval',
                                           answer_coding='sequence')

    sess = tf.Session()
    # Load model
    ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
    checkpoint_path = ckpt.model_checkpoint_path
    saver = tf.train.Saver()
    saver.restore(sess, checkpoint_path)

    # get data
    result = []
    dataset = unpickle('data/rescore_dev.pkl')
    for itr, datum in enumerate(dataset):
        seq_index, att_mask, label = _process_datum(datum)
        quest_id = datum['quest_id']
        quest = seq_index[0].tolist()
        feed_dict = model.fill_feed_dict([seq_index, att_mask])
        scores = sess.run(prob, feed_dict=feed_dict)
        idx = scores.argmax()
        # parse question and answer
        question = to_sentence.index_to_question([0] + quest)
        mc_ans = mc_manager.get_candidate_answers(quest_id)
        vaq_answer = mc_ans[idx]
        real_answer = mc_ans[label.argmax()]
        # add result
        result.append({u'answer': vaq_answer, u'question_id': quest_id})
        # show results
        if itr % 100 == 0:
            print('============== %d ============' % itr)
            print('question id: %d' % quest_id)
            print('question\t: %s' % question)
            print('answer\t: %s' % real_answer)
            print('VAQ answer\t: %s (%0.2f)' % (vaq_answer, scores[idx]))

    quest_ids = [res[u'question_id'] for res in result]
    # save results
    tf.logging.info('Saving results')
    res_file = 'result/rescore_dev_dev.json'
    json.dump(result, open(res_file, 'w'))
    from vqa_eval import evaluate_model
    acc = evaluate_model(res_file, quest_ids)
    print('Overall accuracy: %0.2f' % acc)
Example #17
   def __init__( self, data_file, root_path, data_mode = "train", random_transform = False,
           batch_size = 128, crop_width = 224, crop_height = 224 ):
      # read image-name image-index map from file
      self.data = unpickle( data_file )
      self.num_classes = len( self.data['index_map'] )
      self.data_mode = data_mode
      self.random_transform = random_transform
      self.root_path = root_path
      if data_mode == "all":
         index_map = self.data['index_map']
      elif data_mode == "val":
         index_map = self.data['index_map_val']
      elif data_mode == "train":
         index_map = self.data['index_map_train']
      else:
         print "data_mode: " + data_mode + " not valid"
         import pdb; pdb.set_trace()
         sys.exit(1)

      # get batch queue
      self.batch_queue = []
      has_add = True
      while has_add:
         has_add = False
         for i in range( self.num_classes ):
            if len(index_map[i]) > 0:
               index = index_map[i].pop()
               self.batch_queue.append( index )
               has_add = True

      self.num_images = len( self.batch_queue )

      #init current index and batch size
      self.batch_size = batch_size
      self.prev_batch_size = batch_size
      self.crop_width = crop_width
      self.crop_height = crop_height
      self.batch_index = 1
      self.epoch = 1
      
      # read data mean from file
      data_mean_file = unpickle( data_file + MEAN_FILE_EXT )
      self.data_mean = data_mean_file['data']
Example #18
def main():
   num_args = len(sys.argv)
   # load result from file
   num_nets = num_args - 1

   assert( num_nets > 0 )
   errors = []

   # 0th net
   # result['labels']
   # result['preds']
   result = unpickle( sys.argv[1] ) 
   errors.append( evaluate_result( result, sys.argv[1] ) )
   num_batches = len( result['labels'] )

   #import pdb; pdb.set_trace()
   # collect all results
   for ii in range( num_nets - 1 ):
      result_ii = unpickle( sys.argv[ii+2] )
      # evaluate result_ii
      errors.append( evaluate_result( result_ii, sys.argv[ii+2] ) )
      # check num of batches is consistent
      num_batches_ii = len( result_ii['labels'] )
      for jj in range( num_batches ):
         # check label is consistent
         assert( np.array_equal( 
            result_ii['labels'][jj], result['labels'][jj] ) )
         # nc result['pred'][jj]
         result['preds'][jj] += result_ii['preds'][jj]
         
   pickle( 'combine_result', result )

   # classifier mean/std accuracy
   errors = np.array( errors )
   #import pdb; pdb.set_trace()
   print "mean: " , str(100*np.mean( errors )) , " std: " , str(100*(np.std( errors )))
   # evaluate result
   evaluate_result( result, "After combine" )
Example #19
def main(word_level, path_to_model):
    """Run the bot."""
    global update_id
    global lang_model

    model = tf.keras.models.load_model(os.path.join(path_to_model,"checkpoint_release.h5"))

    char2idx = unpickle(path_to_model, 'char2idx')
    idx2char = unpickle(path_to_model, 'idx2char')



    if( word_level ):
        lang_model = WordLanguageModel(model, char2idx, idx2char)        
    else:
        lang_model = CharLanguageModel(model, char2idx, idx2char)

    # Telegram Bot Authorization Token
    bot = telegram.Bot(open("access_token.txt", 'r').read())

    # get the first pending update_id, this is so we can skip over it in case
    # we get an "Unauthorized" exception.
    try:
        update_id = bot.get_updates()[0].update_id
    except IndexError:
        update_id = None

    logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    while True:
        try:
            echo(bot)
        except NetworkError:
            sleep(1)
        except Unauthorized:
            # The user has removed or blocked the bot.
            update_id += 1
Example #20
    def recvServer(self):
        try:
            msg = self.sub_socket.recv(flags=zmq.core.NOBLOCK)
        except zmq.core.error.ZMQError:
            pass
        else:
            sep = msg.find(':')
            to = msg[0:sep]
            foreign = util.unpickle(msg[sep+1:])

            for n, obj in foreign.items():
                if n == self.p1.mini.name:
                    del foreign[n]

            self.updatePlayers(foreign)
def prepareTrain(train_dir, out_dir, meanImg_dir):
    global NUM_PER_PATCH
    #train_dir = "/data1/LSVRC2010/train"
    #out_dir = "/data2/ILSVRC2010/train_batches"
    
    meanImg = util.unpickle(meanImg_dir + '/meanImg')
    allImgMeta, allLabels = collectAndShuffle(train_dir)
    meta = {}
    meta['data_mean'] = meanImg
    meta['label_names'] = allLabels
    util.pickle( os.path.join(out_dir, "batches.meta"), meta)

    makeBatches(allImgMeta, out_dir, NUM_PER_PATCH)
    out_file = out_dir + "/imglist"
    util.pickle(out_file, [allImgMeta, allLabels])
Example #22
def weight_from_checkpoint(path, name, idx, shape):
	checkpoint = util.unpickle(path)
	layers = checkpoint['model_state']['layers']
	layer_names = [layer['name'] for layer in layers]
	match = [i for i, layer_name in enumerate(layer_names) if layer_name==name]
	if len(match)>1:
		raise Exception('More than one matching layer found.')
	if len(match)<1:
		raise Exception('No matching layer found.')
	weights = layers[match[0]]['weights'][idx]
	if weights.shape != shape:
		print 'Shape mismatch:'
		print 'yours:', shape
		print 'mine:', weights.shape
	return weights
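weight_from_checkpoint reads a checkpoint in the cuda-convnet model_state/layers layout, so it is typically used to seed a layer of a new net from an earlier run; a hypothetical call, with the path, layer name, and expected shape purely illustrative:

# Hypothetical usage; checkpoint path, layer name and shape are placeholders.
W_init = weight_from_checkpoint('/path/to/netsaves/47.250', 'fc10', 0, (4096, 10))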
def getMeanImg(raw_dir):
    global SIZE, CHANNELS
    flist = os.listdir(raw_dir)
    flist.sort()
    globalSum = np.zeros(SIZE*SIZE*CHANNELS, dtype=np.float64)
    globalCount = 0
    raw_info = []
    for label, fname in enumerate(flist):
        print "Reading", fname
        data = util.unpickle(raw_dir + '/' + fname)
        dataSum = np.sum(data, axis=0, dtype = np.float64)
        globalSum += dataSum
        globalCount += data.shape[0]
        raw_info.append((fname, label, data.shape[0])) #(name of the label, label value, number of images)
    meanImg = globalSum / globalCount
    return meanImg, raw_info
Example #24
    def __init__(self, root, json, vocab, transform=None):
        """Set the path for images, captions and vocabulary wrapper.
        
        Args:
            root: image directory.
            json: coco annotation file path.
            vocab: vocabulary wrapper.
            transform: image transformer.
        """
        self.root = root
        self.coco = COCO(json)
        self.ids = list(self.coco.anns.keys())
        self.vocab = vocab
        self.transform = transform

        # self.tokenized_text_list = unpickle('tokenized_bokete_text.pkl')
        self.tokenized_text_list = unpickle('tokenized_text_list_mecab.pkl')
Example #25
 def get_next_batch(self):
     if self.show:
         epoch, batchnum, bidx = LabeledDataProvider.get_next_batch(self)
         datadic = unpickle(self.data_dir+"data_batch_"+str(batchnum))
         datadic['data'] = n.require(datadic['data']/256., requirements='C', dtype=n.single)
         datadic['labels'] = n.require(datadic['labels'].reshape(1,datadic['data'].shape[1]), dtype=n.single, requirements='C')
         return epoch, batchnum, [datadic['data'], datadic['labels']]
         
     else:
         epoch, batchnum, bidx = LabeledDataProvider.get_next_batch(self)
         if self.test:
             datadic = self.data.get_batch(bidx + len(self.data.train_batch_range))
         else:
             datadic = self.data.get_batch(bidx)
         datadic['data'] = n.require(datadic['data']/256., requirements='C', dtype=n.single)
         datadic['labels'] = n.require(datadic['labels'].reshape(1,datadic['data'].shape[1]), dtype=n.single, requirements='C')
         return epoch, batchnum, [datadic['data'], datadic['labels']]
Example #26
    def __init__(self, meta_path, crop_border, multiview_test):
        self.batch_meta = unpickle(meta_path)

        self.border_size = crop_border
        self.num_colors = 3
        self.outer_size = int((self.batch_meta['data_mean'].size / self.num_colors) ** 0.5)
        self.inner_size = self.outer_size - self.border_size*2
        self.multiview = multiview_test
        self.num_views = 5 * 2
        self.data_mult = self.num_views if self.multiview else 1

        self.label_types = self.batch_meta['label_types']
        self.label_names = self.batch_meta['label_names']
        
        self.data_mean = self.batch_meta['data_mean'].reshape((self.num_colors, self.outer_size, self.outer_size))
        self.data_mean = self.data_mean[:,self.border_size:self.border_size+self.inner_size,self.border_size:self.border_size+self.inner_size]
        self.data_mean = self.data_mean.reshape((self.get_data_dims(), 1))

        self.cropped_data = np.zeros((self.get_data_dims(), self.data_mult), dtype=np.single)
Example #27
def train():
    from util import unpickle
    import os
    import numpy.random as nr

    # Create training directory.
    train_dir = FLAGS.train_dir
    if not tf.gfile.IsDirectory(train_dir):
        tf.logging.info("Creating training directory: %s", train_dir)
        tf.gfile.MakeDirs(train_dir)

    # Create model
    max_iter = 1000000
    config = MLPConfig()
    model = SequenceMLP(config, phase='train')
    model.build()

    loss = model.tot_loss
    # global_step = model.global_stepgg
    train_op = tf.train.AdamOptimizer(learning_rate=5e-4).minimize(loss)
    sess = tf.Session()
    sess.run(tf.initialize_all_variables())

    dataset = unpickle('data/rescore_trainval.pkl')
    import pdb
    pdb.set_trace()
    saver = tf.train.Saver(max_to_keep=5)
    num = len(dataset)
    for i in range(max_iter):
        sample_id = nr.randint(0, num)
        datum = dataset[sample_id]
        seq_index, att_mask, label = _process_datum(datum)
        feed_dict = model.fill_feed_dict([seq_index, att_mask, label])
        _, obj = sess.run([train_op, model.tot_loss], feed_dict=feed_dict)

        if i % 100 == 0:
            tf.logging.info('Iteration %d, loss=%0.2f' % (i, obj))

        if i % 5000 == 0:
            saver.save(sess, os.path.join(FLAGS.train_dir,
                                          'model-%d.ckpt' % i))
Example #28
def processTest(test_dir, out_dir, startIdx):
    global NUM_PER_PATCH

    out_file = out_dir + "/imglist"
    trsimgMeta, trsLabels = util.unpickle(out_file)



    allImgMeta = []
    allLabels = []
    subdirnames = os.listdir(test_dir)
    subdirnames.sort()
    for classLabel, subdir in enumerate(trsLabels):
        for num,test_sub_dir in enumerate(subdirnames):
            # print classLabel
            # print test_sub_dir
            # print subdir
            if test_sub_dir == subdir:
                imgnames = os.listdir(os.path.join(test_dir, subdir))
                fullnames = [os.path.join(test_dir, subdir, name) for name in imgnames]
                meta = zip(fullnames, [classLabel] * len(fullnames))
                allImgMeta += meta

        # #allLabels.append(subdir)
        # 
        # 
        # # name, label pair
        # #meta = zip(fullnames, [string.atoi(subdir)] * len(fullnames))
        # 
        # 
    print "####### Got %d classes ######" % len(subdirnames)
    print "####### Got %d images ######" % len(allImgMeta)
    print "shuffling..."
    random.shuffle(allImgMeta)
    #return allImgMeta, allLabels


    #allImgMeta, testLabels = collectAndShuffle(test_dir)

    makeBatches(allImgMeta, out_dir, NUM_PER_PATCH, startIdx)
Example #29
    def _active_learning_for_learner_strategy(
            self,
            label: str,
            learner: BaseEstimator,
            sampling_strategy: Callable,
            active_learning_data: ActiveLearningData,
            semi_sup: bool = False) -> Stats:
        data_for_plotting = []
        file_path_pkl, file_path_csv, learner_name, sampling_strategy_name = self._get_output_path(
            label, learner, sampling_strategy)

        # used for label propagation
        labeled_indices = []

        if os.path.exists(file_path_pkl):
            if self.verbose:
                print('Available, retrieving...')
            return util.unpickle(file_path_pkl)

        # initialize stats
        stats = self._initialize_stats(label, learner_name,
                                       sampling_strategy_name)

        # initial training
        clf, stats, data_for_plotting = self._active_learning_initial_training(
            semi_sup, stats, data_for_plotting, learner, sampling_strategy,
            active_learning_data, labeled_indices)

        # actively learn one analyst query at a time
        for i in range(self.active_learning_budget):
            stats, data_for_plotting, labeled_indices = self._active_learning_single_query(
                i, semi_sup, clf, sampling_strategy, active_learning_data,
                stats, data_for_plotting, labeled_indices)

        # persist the results
        util.pickle_object(stats, file_path_pkl)
        util.write_as_csv(pd.DataFrame(data_for_plotting), file_path_csv)

        return stats
Example #30
def build_tree(filename):
    """
    Build the tree of classes from a json specification
    """
    with open(filename, 'rb') as json_file:
        json_data = json.load(json_file)

    meta = unpickle('cifar-100-python/meta')
    tree = _build_tree(json_data, meta['fine_label_names'])[0]

    # Get the number of classes per level
    n_classes = []
    queue = deque(tree.children)
    while queue:
        n_classes.append(len(queue))
        for i in range(len(queue)):
            node = queue.popleft()
            node.index = i
            if node.children:
                for child in node.children:
                    queue.append(child)

    return tree, n_classes
Example #31
def preprocessing():
    folder = r'D:\work\sunxiuyu\SVHN\large-lcn'
    #outfolder = r'D:\work\sunxiuyu\SVHN\svhn-valid-large-1'
    datasize = 32 * 32 * 3
    #meta = util.unpickle(metafile)
    #mean = np.zeros(datasize,np.double)
    num = 0
    begin = 25
    for i in range(begin, begin + 16):
        batch_file = os.path.join(folder, 'data_batch_' + str(i))
        print batch_file
        buffer = util.unpickle(batch_file)
        data = buffer['data']
        dim2 = len(data)
        data = np.transpose(data)
        dim1 = len(data)
        print dim1
        newbuffer = np.zeros((dim1, dim2), np.single)
        for i in range(0, len(data)):
            img1 = data[i].reshape(3, 32, 32)
            img = np.zeros((32, 32, 3), np.single)
            result = np.zeros((3, 32, 32), np.single)
            img[:, :, 0] = img1[0, :, :]
            img[:, :, 1] = img1[1, :, :]
            img[:, :, 2] = img1[2, :, :]
            #cv2.imshow( "img1",img )
            #cv2.waitKey(0)
            result[0, :, :] = img[:, :, 0]
            result[1, :, :] = img[:, :, 1]
            result[2, :, :] = img[:, :, 2]
            #print result[0,:,:]
            newbuffer[i] = result.reshape(3 * 32 * 32)
        newbuffer = np.transpose(newbuffer)
        buffer['data'] = newbuffer
        util.pickle(batch_file, buffer)
    return
Example #32
#

label_names = [ 'airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck' ]
labels_dict = { i: x for i, x in enumerate( label_names ) }

writer = csv.writer( open( output_file, 'wb' ))
writer.writerow( [ 'id', 'label' ] )
counter = 1

###

# 0th net
# result['labels']
# result['preds']
result = unpickle( sys.argv[1] ) 
num_batches = len( result['labels'] )

for ii in range( num_nets - 1 ):
	result_ii = unpickle( sys.argv[ii+2] )
	# check num of batches is consistent
	num_batches_ii = len( result_ii['labels'] )
	for jj in range( num_batches ):
		# check label is consistent
		assert( np.array_equal( result_ii['labels'][jj], result['labels'][jj] ))
		# nc result['pred'][jj]
		result['preds'][jj] += result_ii['preds'][jj]
		
#print len( result['preds'] )
#pickle( output_file, result )
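The snippet above stops before the id,label rows are actually written; a minimal sketch of the missing step, assuming each result['preds'][jj] is an (N, 10) array of summed per-net scores:

# Sketch only: argmax the combined scores and emit (id, label) rows.
for jj in range(num_batches):
    predicted = np.argmax(result['preds'][jj], axis=1)
    for label_idx in predicted:
        writer.writerow([counter, labels_dict[int(label_idx)]])
        counter += 1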
Example #33
def train_multiclassifier(dataset_path, weights):
    CE_CRITERION = nn.CrossEntropyLoss(weight=weights)

    train_path = os.path.join(DATASET_RAW_PATH, dataset_path, TRAIN_DATA)
    train_data = MovieDataset(train_path)
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=BATCH_SIZE,
                                               shuffle=True)

    val_path = os.path.join(DATASET_RAW_PATH, dataset_path, VAL_DATA)
    val_data = MovieDataset(val_path)
    val_loader = torch.utils.data.DataLoader(val_data,
                                             batch_size=BATCH_SIZE,
                                             shuffle=True)
    # Instantiate the models
    MC = MultiClassifier().to(device)
    Optimizer = torch.optim.Adam(MC.parameters(),
                                 lr=ADAM_ALPHA,
                                 betas=ADAM_BETA,
                                 weight_decay=REGULARIZATION)
    print('Training.')
    for val_batch_i, val_batch_data in enumerate(val_loader):
        val_imgs, val_labels, val_indices = val_batch_data
        val_imgs = Variable(val_imgs.type(FloatTensor)).to(device)
        val_labels = torch.stack(val_labels)
        val_labels = torch.transpose(val_labels, 0, 1)
        val_labels = Variable(val_labels).to(device)
        break

    training_loss_data_path = os.path.join(DATASET_RAW_PATH,
                                           "0_0000005" + dataset_path + ".csv")
    training_loss_data = []
    for epoch_index in range(N_EPOCHS):  # loop over the dataset multiple times
        for batch_index, batch_data in enumerate(train_loader):
            # get the inputs
            imgs, labels, indices = batch_data
            imgs = Variable(imgs.type(FloatTensor)).to(device)
            labels = torch.stack(labels)
            labels = torch.transpose(labels, 0, 1)
            labels = Variable(labels).to(device)
            #labels = Variable(labels.to(device))
            # zero the parameter gradients
            Optimizer.zero_grad()

            # forward + backward + optimize
            outputs = MC(imgs)
            loss = CE_CRITERION(outputs, torch.max(labels, 1)[1])
            loss.backward()
            Optimizer.step()
            if (batch_index % 100) == 0:
                val_outputs = MC(val_imgs)
                val_loss = CE_CRITERION(val_outputs,
                                        torch.max(val_labels, 1)[1])
                training_loss_data.append((loss.item(), val_loss.item()))
            # Print Loss
            if epoch_index % PRINT_INTERVAL == 0 and not batch_index:  # print on the first batch of every PRINT_INTERVAL-th epoch
                print('Epoch: %d \tTraining Loss: %.3f' % (epoch_index, loss))
                val_outputs = MC(val_imgs)
                val_loss = CE_CRITERION(val_outputs,
                                        torch.max(val_labels, 1)[1])
                print('Epoch: %d \tValidation Loss: %.3f' %
                      (epoch_index, val_loss))
                #    break
    print('Finished Training. Testing on Validation Set.')

    save_training_loss(training_loss_data_path, training_loss_data)
    #Validation Set
    #val_path = os.path.join(DATASET_RAW_PATH, dataset_path, VAL_DATA)
    #val_data = MovieDataset(val_path)
    #val_loader = torch.utils.data.DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=True)
    lenient_predicted = 0
    correct_predicted = 0
    total_predicted = 0
    classes = ut.unpickle(CLASS_INDECES_RAW_PATH)
    movie_dict = ut.unpickle("data/movies_multi_index.pkl")
    with torch.no_grad():
        for data in val_loader:
            imgs, labels, indices = data
            imgs = Variable(imgs.type(FloatTensor)).to(device)
            labels = torch.stack(labels)
            labels = torch.transpose(labels, 0, 1)
            labels = Variable(labels).to(device)
            outputs = MC(imgs)
            outputs = F.softmax(outputs.data, dim=1)
            _, predicted = torch.max(outputs.data, 1)
            #total_predicted += labels.size(0)
            #correct_predicted += (labels[predicted] == 1).sum().item()
            print("MOVIE IDS")
            print(indices)
            for batch_i in range(predicted.size(0)):
                total_predicted += 1
                correct_predicted += (
                    labels[batch_i][predicted[batch_i]].item() == 1)
                lenient_predicted += (predicted[batch_i].item()
                                      in movie_dict[indices[batch_i]])
                print("Movie Id: %s \tPrediction: %s \tGround Truth: %s" %
                      (indices[batch_i], classes[predicted[batch_i]],
                       classes[torch.argmax(labels[batch_i])]))

    print('Accuracy of the Multi-Class Classifier on Validation Set: %d %%' %
          (100 * correct_predicted / total_predicted))
    print(
        'Lenient Accuracy of the Multi-Class Classifier on Validation Set %d %%'
        % (100 * lenient_predicted / total_predicted))

    #classes = ut.unpickle(CLASS_INDECES_RAW_PATH)
    class_correct = list(0. for i in range(len(classes)))
    class_total = list(0. for i in range(len(classes)))
    with torch.no_grad():
        for data in val_loader:
            imgs, labels, indices = data
            imgs = Variable(imgs.type(FloatTensor)).to(device)
            labels = torch.stack(labels)
            labels = torch.transpose(labels, 0, 1)
            labels = Variable(labels).to(device)
            outputs = MC(imgs)
            _, predicted = torch.max(outputs, 1)
            '''
            c = (labels[predicted] == 1).squeeze()
            for i in range(len(labels)):
                label = labels[i]
                class_correct[label] += c[i].item()
                class_total[label] += 1
            '''
            for batch_i in range(predicted.size(0)):
                label = np.argmax(labels[batch_i])
                class_total[label] += 1
                class_correct[label] += (
                    labels[batch_i][predicted[batch_i]].item() == 1)

    for i in range(len(classes)):
        if class_total[i] == 0:
            print('Accuracy of %5s : N/A' % (classes[i]))
        else:
            print('Accuracy of %5s : %2d %%' %
                  (classes[i], 100 * class_correct[i] / class_total[i]))
Example #34
from util import plot_array_image

DATA_PATH = '/home/snwiz/data/imagenet12/code/data'
num_class = 1
MEAN_FILE_EXT = "_mean"
if num_class == 1000:
    INPUT_FILE = DATA_PATH + "/imagenet_data"
else:
    INPUT_FILE = DATA_PATH + "/imagenet_data_tiny" + str(int(num_class) ) 

OUTPUT_FILE = INPUT_FILE + MEAN_FILE_EXT
IMAGE_SIZE = 256
VERIFY_RESULT = True
#VERIFY_RESULT = False

out = unpickle( INPUT_FILE );

num = 0;
m = n.zeros( (IMAGE_SIZE, IMAGE_SIZE,3), n.float32 )
if VERIFY_RESULT:
    sum_m  = n.zeros( (IMAGE_SIZE, IMAGE_SIZE,3), n.float32 )

for cls_index in range( num_class ):
    num_cls_index = len(out['index_map_train'][cls_index])
    for index in range( num_cls_index ):
        i = out['index_map'][cls_index][index]
        image_path = out['image_path'][i]
        im = Image.open( image_path )
        #if cls_index == 6 and index == 0:
        #    im.show()
        #    import pdb; pdb.set_trace()
Example #35
#code to tag words based on BILOU
from sklearn.svm import LinearSVC
import util
import nltk.classify
import nltk
from nltk import word_tokenize, sent_tokenize
import parseTrial
from collections import defaultdict
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import jsonrpc
from simplejson import loads

classifier = util.unpickle("trained_tagger.pickle")

server = jsonrpc.ServerProxy(jsonrpc.JsonRpc20(),
                             jsonrpc.TransportTcpIp(addr=("127.0.0.1", 8080)))


def initializeModules():
    ps = PorterStemmer()
    pattern = re.compile("[?!.-;:]+")
    commaPattern = re.compile("[,]+")
    return [ps, pattern, commaPattern]


#Get the tokenized form of the sentence, the postags of the sentence and the parse tree
def getSentenceFeatures(line):
    line = line.strip()
    wordTokenized = word_tokenize(line)
Example #36
#code to tag words based on BILOU
from sklearn.svm import LinearSVC
import util
import nltk.classify
import nltk
from nltk import word_tokenize,sent_tokenize
import parseTrial
from collections import defaultdict
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import jsonrpc
from simplejson import loads

classifier = util.unpickle("trained_tagger.pickle")

server = jsonrpc.ServerProxy(jsonrpc.JsonRpc20(),
                             jsonrpc.TransportTcpIp(addr=("127.0.0.1", 8080)))


def initializeModules():
	ps = PorterStemmer()
	pattern = re.compile("[?!.-;:]+")
	commaPattern = re.compile("[,]+")
	return [ps,pattern,commaPattern]

#Get the tokenized form of the sentence, the postags of the sentence and the parse tree
def getSentenceFeatures(line):
	line = line.strip()
	wordTokenized = word_tokenize(line)
	posTags = util.runPOS(line)
Example #37
 def load_model(self):
     if self.x_test is None:
         self.load_features()
     self.model = unpickle(self.parameters["pickle_model"])
Example #38
 def __init__(self, data_path):
     dataset = util.unpickle(data_path)
     self.data = dataset['data']
     self.ids = list(self.data.keys())
Example #39
from util import plot_array_image

DATA_PATH = '/home/snwiz/data/imagenet12/code/data'
num_class = 1
MEAN_FILE_EXT = "_mean"
if num_class == 1000:
    INPUT_FILE = DATA_PATH + "/imagenet_data"
else:
    INPUT_FILE = DATA_PATH + "/imagenet_data_tiny" + str(int(num_class))

OUTPUT_FILE = INPUT_FILE + MEAN_FILE_EXT
IMAGE_SIZE = 256
VERIFY_RESULT = True
#VERIFY_RESULT = False

out = unpickle(INPUT_FILE)

num = 0
m = n.zeros((IMAGE_SIZE, IMAGE_SIZE, 3), n.float32)
if VERIFY_RESULT:
    sum_m = n.zeros((IMAGE_SIZE, IMAGE_SIZE, 3), n.float32)

for cls_index in range(num_class):
    num_cls_index = len(out['index_map_train'][cls_index])
    for index in range(num_cls_index):
        i = out['index_map'][cls_index][index]
        image_path = out['image_path'][i]
        im = Image.open(image_path)
        #if cls_index == 6 and index == 0:
        #    im.show()
        #    import pdb; pdb.set_trace()
Example #40
    def __init__(
            self,
            raw_image_queue,  # shared Queue to store raw image
            data,  # data file contains image path information
            mean_file,  # mean file
            root_path,  # root path of images
            data_mode,  # 'all','train','val'
            batch_size=128,  # size of batch
            batch_index=0,  # start batch index
            epoch_index=1  # start epoch index
    ):
        threading.Thread.__init__(self, name="Load Image Thread")
        self.stop = False
        self.sharedata = raw_image_queue
        self.data = data
        self.num_classes = len(self.data['val'])
        self.data_mode = data_mode
        self.root_path = root_path
        if data_mode == "val":
            self.images = self.data['val']
            self.total_samples = self.data['num_data_val']
            self.shuffle = False
            print 'Validation data is not randomized'
        elif data_mode == "train":
            self.images = self.data['train']
            self.total_samples = self.data['num_data_train']
            self.shuffle = False
            #self.shuffle = True
            print 'Training data shuffle: ', self.shuffle
        else:
            print "data_mode: " + str(data_mode) + " not valid"
            import pdb
            pdb.set_trace()
            sys.exit(1)
        # iterator on classes
        self.iclass = -1
        # iterator for samples of each class
        self.isamples = self.num_classes * [-1]

        # class_iter = range(num_classes)
        # if shuffle: random.shuffle(class_iter)
        # classes_iter = []
        # for i in range(num_classes):
        #    classes_iter.append(range(len(images[i])))
        #    if shuffle: random.shuffle(classes_iter[i])

        # # get batch queue
        # self.batch_queue = []
        # has_add = True
        # while has_add:
        #    has_add = False
        #    for i in range( self.num_classes ):
        #       if len(index_map[i]) > 0:
        #          index = index_map[i].pop()
        #          self.batch_queue.append( index )
        #          has_add = True

        # self.num_images = len( self.batch_queue )

        #init current index and batch size
        self.total_processed = 0
        self.batch_size = batch_size
        self.batch_index = batch_index
        self.epoch_index = epoch_index
        # read data mean from file
        data_mean_file = unpickle(mean_file)
        self.data_mean = data_mean_file['data']
        self.data_mean = self.data_mean.astype(n.float32)
        # store it as uint8
        #self.data_mean = n.round( self.data_mean).astype(n.uint8)
        print data_mode + ': total_samples: ' + str(self.total_samples) \
            + ' batch_size: ' + str(batch_size) \
            + ' num_batches: ' + str(self.get_num_batches())
Example #41
def makeb(name, shape, params=None):
    foo=u.unpickle('/data/t-maoqu/convnets/netsaves/imnet21841aggregated/7.1500')
    #foo=u.unpickle('/home/NORTHAMERICA/t-maoqu/share/net/47.250')
    for i in foo['model_state']['layers']:
        if i['name']==name:
            return i['biases']
Example #42
 def __init__(self, word2vec_file):
     d = unpickle(word2vec_file)
     self._vocab = d['vocab']
     self._word_vectors = d['word_vectors']
     self._create_index_dict(d['word2idx'])
Example #43
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import util
import nltk.classify

featuresSet = util.unpickle("featuresSet_best.txt")
classifier = nltk.classify.SklearnClassifier(LinearSVC())
classifier.train(featuresSet)

util.writePickle(classifier,"trained_tagger.pickle")
Example #44
    def _ensemble(self, label: str) -> List[Stats]:
        active_learning_data = self._active_learning_data_split(label)
        stats = self._initialize_stats(label, 'VotingClassifier',
                                       'entropy_sampling')
        file_path_pkl, file_path_csv, learner_name, sampling_strategy_name = self._get_output_path(
            label, VotingClassifier([]), entropy_sampling)
        if os.path.exists(file_path_pkl):
            if self.verbose:
                print('Available, retrieving...')
            return [util.unpickle(file_path_pkl)]

        # supervised
        # active learners
        rf, _, _ = self._active_learning_initial_training(
            False, stats, [], self.active_learning_rf, entropy_sampling,
            active_learning_data, [])
        lr, _, _ = self._active_learning_initial_training(
            False, stats, [], self.active_learning_lr, entropy_sampling,
            active_learning_data, [])
        gb, _, _ = self._active_learning_initial_training(
            False, stats, [], self.active_learning_gb, entropy_sampling,
            active_learning_data, [])

        # semi-supervised: label propagation
        # labeled_indices = []
        # lp, _, _ = self._active_learning_initial_training(True, stats, [], None, entropy_sampling,
        #                                                   active_learning_data, labeled_indices)

        # unsupervised
        prevalence = len(active_learning_data.y_train_start[
            active_learning_data.y_train_start == True]) / len(
                active_learning_data.y_train_start)
        iforest = IsolationForest(contamination=prevalence,
                                  behaviour='new',
                                  n_estimators=self.clf_n_estimator)
        x = pd.concat([
            active_learning_data.x_train_start,
            active_learning_data.x_train_pool
        ]).reset_index(drop=True)
        iforest.fit(x)

        predictions = self._ensemble_predictions(rf, lr, iforest, gb,
                                                 active_learning_data)
        metrics = self._get_metrics(active_learning_data.y_dev, predictions)
        data_for_plotting = [self._get_plotting_row(-1, metrics, 0, 0)]
        metrics = util.add_prefix_to_dict_keys(metrics, 'initial_')
        stats = util.merge_dicts(stats, metrics)

        for i in range(self.active_learning_budget):
            rf, _, _ = self._active_learning_single_query_supervised(
                rf, active_learning_data)
            lr, _, _ = self._active_learning_single_query_supervised(
                lr, active_learning_data)
            gb, _, _ = self._active_learning_single_query_supervised(
                gb, active_learning_data)
            # lp, _, _ = self._active_learning_single_query_semi_sup(lp, labeled_indices, active_learning_data,
            #                                                        entropy_sampling)

            predictions = self._ensemble_predictions(rf, lr, iforest, gb,
                                                     active_learning_data)
            metrics = self._get_metrics(active_learning_data.y_dev,
                                        predictions)

            data_for_plotting.append(self._get_plotting_row(i, metrics, 0, 0))
            if i + 1 in self.active_learning_log_intervals:
                metrics = util.add_prefix_to_dict_keys(metrics,
                                                       f'sample_{i+1}_')
                stats = util.merge_dicts(stats, metrics)

        util.pickle_object(stats, file_path_pkl)
        util.write_as_csv(pd.DataFrame(data_for_plotting), file_path_csv)

        return [stats]
label_names = [
    'airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse',
    'ship', 'truck'
]
labels_dict = {i: x for i, x in enumerate(label_names)}

writer = csv.writer(open(output_file, 'wb'))
writer.writerow(['id', 'label'])
counter = 1

###

# 0th net
# result['labels']
# result['preds']
result = unpickle(sys.argv[1])
num_batches = len(result['labels'])

for ii in range(num_nets - 1):
    result_ii = unpickle(sys.argv[ii + 2])
    # check num of batches is consistent
    num_batches_ii = len(result_ii['labels'])
    for jj in range(num_batches):
        # check label is consistent
        assert (np.array_equal(result_ii['labels'][jj], result['labels'][jj]))
        # nc result['pred'][jj]
        result['preds'][jj] += result_ii['preds'][jj]

#print len( result['preds'] )
#pickle( output_file, result )
Example #46
def load(datadic, ind, filename):
    datadic[ind] = unpickle(filename)
Example #47
def cifar100toimages():
    file = r'D:\work\sunxiuyu\cifar-100-python\test'
    outfolder = r'D:\work\sunxiuyu\cifar-10-py-colmajor\tmp'
    dict = util.unpickle(file)
    numclass = np.array(dict['fine_labels'], np.int).max()
    fine_classes = {}
    mean = np.zeros(32 * 32 * 3, dtype=np.double)
    for i in range(0, len(dict['data'])):
        mean += dict['data'][i]
    mean = mean / len(dict['data'])

    metafile = r'D:\work\sunxiuyu\cifar-10-py-colmajor\cifar10\batches.meta'
    outmetafile = r'D:\work\sunxiuyu\cifar-10-py-colmajor\cifar-100\batches.meta'
    meta = util.unpickle(metafile)
    for key in meta:
        print key
    meta['label_names'] = [str(i) for i in range(0, 100)]
    meta['data_mean'] = meta['data_mean'].reshape(3072)
    meta['data_mean'][0:1024] = mean[2048:3072]  #b
    meta['data_mean'][1024:2048] = mean[1024:2048]  #g
    meta['data_mean'][2048:3072] = mean[0:1024]  #r
    util.pickle(outmetafile, meta)
    return

    for i in range(0, len(dict['data'])):
        fine_classes_idx = dict['fine_labels'][i]
        #m_data = dict['data'][i] - mean
        m_data = dict['data'][i]
        if fine_classes_idx in fine_classes:
            fine_classes[fine_classes_idx].append({
                'data':
                m_data,
                'label':
                dict['fine_labels'][i],
                'filename':
                dict['filenames'][i]
            })
        else:
            fine_classes[fine_classes_idx] = [{
                'data': m_data,
                'label': dict['fine_labels'][i],
                'filename': dict['filenames'][i]
            }]
        pass

    # random shuffle
    for i in range(0, len(fine_classes)):
        indexs = range(0, len(fine_classes[i]))
        np.random.shuffle(indexs)
        fine_classes[i] = [fine_classes[i][x] for x in indexs]

    #save image patches
    if not os.path.exists(outfolder):
        os.makedirs(outfolder)
    for i in range(0, len(fine_classes)):
        class_folder = os.path.join(outfolder, str(i))
        if not os.path.exists(class_folder):
            os.makedirs(class_folder)
        for image in fine_classes[i]:
            #save image patch
            m_data = image['data']
            r = m_data[0:1024]
            g = m_data[1024:2048]
            b = m_data[2048:3072]

            r = r.reshape(32, 32)
            g = g.reshape(32, 32)
            b = b.reshape(32, 32)

            color_img = np.zeros((32, 32, 3), dtype=np.uint8)
            color_img[:, :, 0] = b[:, :]
            color_img[:, :, 1] = g[:, :]
            color_img[:, :, 2] = r[:, :]

            imagefile = os.path.join(class_folder, image['filename'])
            cv2.imwrite(imagefile, color_img)
Example #48
    def _learn_anomalies(self, label: str) -> List[Stats]:
        data_for_plotting = []
        learner = self.active_learning_rf

        # TERRIBLE hack to pass the name of the sampling strategy
        def iforest_sampling(contamination):
            return IsolationForest(contamination=contamination,
                                   n_estimators=self.clf_n_estimator,
                                   behaviour='new',
                                   n_jobs=-1,
                                   random_state=self.random_seed)

        file_path_pkl, file_path_csv, learner_name, sampling_strategy_name = self._get_output_path(
            label, learner, iforest_sampling)

        if os.path.exists(file_path_pkl):
            if self.verbose:
                print('Available, retrieving...')
            return util.unpickle(file_path_pkl)

        active_learning_data = self._active_learning_data_split(label)
        x_dev = active_learning_data.x_dev
        x_train_start = active_learning_data.x_train_start
        y_train_start = active_learning_data.y_train_start

        # initialize stats
        stats = self._initialize_stats(label, learner_name,
                                       sampling_strategy_name)

        def update_stats(stats_, data_for_plotting_, prefix, x_, y_, i_):
            learner.fit(x_, y_)
            predicted = learner.predict(x_dev)
            scores = learner.predict_proba(x_dev)[:, 1]
            metrics = self._get_metrics(actual=active_learning_data.y_dev,
                                        predicted=predicted,
                                        scores=scores)
            data_for_plotting_.append(
                self._get_plotting_row(i_,
                                       metrics,
                                       elapsed_train=0,
                                       elapsed_query=0))
            metrics = util.add_prefix_to_dict_keys(metrics, prefix)
            stats_ = util.merge_dicts(stats_, metrics)
            return stats_, data_for_plotting_

        # initial training
        stats, data_for_plotting = update_stats(stats, data_for_plotting,
                                                'initial_', x_train_start,
                                                y_train_start, -1)

        # isolation forest
        y_train_start = active_learning_data.y_train_start
        prevalence = len(
            y_train_start[y_train_start == True]) / len(y_train_start)
        iforest = iforest_sampling(prevalence)
        iforest.fit(
            pd.concat([
                active_learning_data.x_train_start,
                active_learning_data.x_train_pool
            ]))
        anomaly_indices_sorted = np.argsort(
            iforest.score_samples(active_learning_data.x_train_pool))

        # pretend that we're active learning
        # however we're just going through a sorted list of anomalies (starting with the most anomalous)
        for i in range(self.active_learning_budget):
            x_extra = active_learning_data.x_train_pool.iloc[
                anomaly_indices_sorted[:i + 1]]
            y_extra = active_learning_data.y_train_pool.iloc[
                anomaly_indices_sorted[:i + 1]]
            x = pd.concat([active_learning_data.x_train_start, x_extra])
            y = pd.concat([active_learning_data.y_train_start, y_extra])
            stats, data_for_plotting = update_stats(stats, data_for_plotting,
                                                    f'sample_{i+1}_', x, y, i)

        # persist the results
        util.pickle_object(stats, file_path_pkl)
        util.write_as_csv(pd.DataFrame(data_for_plotting), file_path_csv)

        return [stats]
Example #49
def send(players, recipient='all'):
    pub_socket.send(recipient + ':' + util.pickle(players))

player_list = []
player_details = {}

while True:
    
    time.sleep(0.016)

    send(player_details)

    try:
        msg = pull_socket.recv_pyobj(flags=zmq.core.NOBLOCK)
    except zmq.core.error.ZMQError:
        pass
    else:
        if msg['cmd'] == 'new':
            player = util.unpickle(msg['player'])
            print 'new player %s' % player.name
            player_details[player.name] = player
        elif msg['cmd'] == 'update':
            player = util.unpickle(msg['player'])
            del player_details[player.name]
            player_details[player.name] = player
        elif msg['cmd'] == 'update_stop':
            player = util.unpickle(msg['player'])
            del player_details[player.name]
            player_details[player.name] = player
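
The loop above assumes that pub_socket and pull_socket already exist. A hedged
sketch of the socket setup such a broadcast server typically uses (the ports and
addresses here are assumptions, not taken from the original example):

import zmq

ctx = zmq.Context()

pub_socket = ctx.socket(zmq.PUB)    # broadcasts player state to all clients
pub_socket.bind('tcp://*:5556')

pull_socket = ctx.socket(zmq.PULL)  # receives 'new'/'update' commands
pull_socket.bind('tcp://*:5557')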

Example #50
0
    def load(self, batch_idx):
        print "loading batch_idx: ", batch_idx, " batch: ", self.batch_range[batch_idx]
        self.datadic[batch_idx] = unpickle(self.data_dir + "data_batch_" + str(self.batch_range[batch_idx]))
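
The unpickle helper used here is the usual CIFAR-style batch loader. A common
Python 3 implementation (an assumption, since the original util module is not
shown in this example):

import pickle

def unpickle(path):
    with open(path, 'rb') as f:
        # CIFAR batches were pickled under Python 2, so keys come back as bytes.
        return pickle.load(f, encoding='bytes')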
Example #51
0
def main():
    """ Main function """

    # Parameters
    train_data_size = 40000
    batch_size = 64
    epoch = 30
    step_size = train_data_size // batch_size
    display_size = step_size // 2
    learning_rate = 0.001

    train_filename = 'cifar-100-python/train'
    test_filename = 'cifar-100-python/test'
    text_label_filename = 'cifar-100-python/meta'

    # 1. Read training and test data.
    print("Reading training and test data files...")
    train_data = util.unpickle(train_filename)
    test_data = util.unpickle(test_filename)
    text_labels = util.unpickle(text_label_filename)

    raw_train_images, raw_train_label = util.split_images_and_labels(train_data)
    raw_test_images, test_labels = util.split_images_and_labels(test_data)
    fine_label_names, coarse_label_names = util.split_labels(text_labels)

    # Construct fine class to superclass mapping
    superclass_mapping = util.construct_superclass_mapping(
        train_data[b'fine_labels'],
        train_data[b'coarse_labels'])
    fine_labels = util.decode_binary_text(fine_label_names)
    coarse_labels = util.decode_binary_text(coarse_label_names)


    # 2. Pre-process the data
    # Calculate the mean image from the training data and subtract it from all images
    print("Pre-processing the data...")
    # Cast to float type first (TODO: consider normalizing the data as well)
    raw_train_images_float = raw_train_images.astype(float)
    raw_test_images_float = raw_test_images.astype(float)

    mean_image = raw_train_images[:train_data_size].sum(axis=0) / train_data_size
    raw_train_images_float -= mean_image
    raw_test_images_float -= mean_image

    # Format that we want (-1, 3, 32, 32)
    formatted_train_images = util.format_data(raw_train_images_float)
    test_images = util.format_data(raw_test_images_float)

    # 3. Split the train images and labels into train and validation set
    train_images, train_labels, vali_images, vali_labels = util.split_train_and_validation(
        formatted_train_images, raw_train_label, train_data_size)
    vali_super_labels = np.array(train_data[b'coarse_labels'][train_data_size:])
    test_super_labels = np.array(test_data[b'coarse_labels'])

    # 4. Construct the graph
    print("Constructing the graph...")
    # Inputs
    x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
    y_real = tf.placeholder(tf.int64, shape=(None,))

    augmentation = tf.map_fn(tf.image.random_flip_up_down, x)

    # Outputs, cross entropy calculation, and optimizer
    y_predict = util.lenet_5(x)
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=y_predict, labels=y_real)
    loss_op = tf.reduce_mean(cross_entropy)
    train_op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss_op)

    # 5. Calculate confusion matrix and accuracy (batch and validation)
    # Fine class label prediction: confusion matrix, correct predictions, and accuracy
    labels_predicted = tf.argmax(y_predict, 1)
    confusion_matrix_fine = tf.confusion_matrix(y_real, labels_predicted)
    correct_prediction = tf.equal(labels_predicted, y_real)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    correct_top_5 = tf.nn.in_top_k(y_predict, y_real, 5)
    accuracy_fine_top_5 = tf.reduce_mean(tf.cast(correct_top_5, tf.float32))

    # Super class label prediction: confusion matrix, correct predictions, and accuracy
    mapped_labels = tf.placeholder(tf.int64, shape=(None,))
    confusion_matrix_super = tf.confusion_matrix(y_real, mapped_labels)
    correct_prediction_super = tf.equal(mapped_labels, y_real)
    accuracy_super = tf.reduce_mean(tf.cast(correct_prediction_super, tf.float32))

    top_5_labels = tf.nn.top_k(y_predict, 5)

    # Add results to summaries
    loss_summary = tf.summary.scalar('Loss', loss_op)
    accuracy_summary = tf.summary.scalar('Accuracy: Fine Labels', accuracy)
    accuracy_summary_super = tf.summary.scalar('Accuracy: Super Labels', accuracy_super)

    # 6. Start the training
    print("Start training...")
    total_steps = 0     # count the number of steps it takes throughout training

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        train_writer = tf.summary.FileWriter('logs' + '/train', sess.graph)
        vali_writer = tf.summary.FileWriter('logs' + '/validation', sess.graph)

        for epoch_count in range(1, epoch + 1):
            for step_count in range(1, step_size + 1):
                total_steps += 1

                # Input training data
                train_batch, label_batch = util.get_random_batch(train_images, train_labels, batch_size)
                #train_batch = sess.run(augmentation, feed_dict={x: train_batch})

                sess.run(train_op, feed_dict={x: train_batch, y_real: label_batch})

                train_loss_summary, train_acc_summary, loss, acc = sess.run(
                    [loss_summary, accuracy_summary, loss_op, accuracy],
                    feed_dict={x: train_batch, y_real: label_batch})

                train_writer.add_summary(train_loss_summary, total_steps)
                train_writer.add_summary(train_acc_summary, total_steps)

                # Print validation accuracy every so often
                if step_count % display_size == 0:
                    # Validation: fine class label accuracy measurement
                    vali_acc_fine = sess.run(accuracy, feed_dict={x: vali_images, y_real: vali_labels})
                    vali_acc_summary_fine = sess.run(accuracy_summary, feed_dict={x: vali_images, y_real: vali_labels})
                    vali_writer.add_summary(vali_acc_summary_fine, total_steps)
                    print('Validation fine label accuracy: {:.5f}'.format(vali_acc_fine))

                    # Validation: super class label accuracy measurement
                    labels = sess.run(labels_predicted, feed_dict={x: vali_images, y_real: vali_super_labels})
                    new_labels = util.map_class(labels, superclass_mapping)

                    vali_acc_super = sess.run(accuracy_super,
                        feed_dict={x: vali_images, y_real: vali_super_labels, mapped_labels: new_labels})
                    vali_acc_summary_super = sess.run(accuracy_summary_super,
                        feed_dict={x: vali_images, y_real: vali_super_labels, mapped_labels: new_labels})
                    vali_writer.add_summary(vali_acc_summary_super, total_steps)
                    print('Number of steps: {}'.format(total_steps))
                    print('Validation super label accuracy: {:.5f}'.format(vali_acc_super))
            print("{} epochs finished".format(epoch_count))

        # Test: Fine class label accuracy measurement and confusion matrix
        test_acc_fine_top_1 = sess.run(accuracy, feed_dict={x: test_images, y_real: test_labels})
        test_acc_fine_top_5 = sess.run(accuracy_fine_top_5, feed_dict={x: test_images, y_real: test_labels})

        con_matrix_fine = sess.run(confusion_matrix_fine, feed_dict={x: test_images, y_real: test_labels})

        # Display the first ten images
        first_ten_predictions = sess.run(labels_predicted, feed_dict={x: test_images, y_real: test_labels})[:10]
        prediction_text_labels = util.map_text_labels(first_ten_predictions, fine_labels)
        true_text_labels = util.map_text_labels(test_labels[:10], fine_labels)
        print(prediction_text_labels)
        print(true_text_labels)

        first_ten_images = util.format_data(raw_test_images[:10])
        display_image = util.combine_ten_images(first_ten_images)
        plt.imshow(display_image)
        plt.savefig("Result", bbox_inches='tight')

        # Test: Super class label accuracy measurement
        labels = sess.run(labels_predicted, feed_dict={x: test_images, y_real: test_super_labels})
        new_labels = util.map_class(labels, superclass_mapping)
        test_acc_super = sess.run(accuracy_super,
            feed_dict={x: test_images, y_real: test_super_labels, mapped_labels: new_labels})
        con_matrix_super = sess.run(confusion_matrix_super,
            feed_dict={x: test_images, y_real: test_super_labels, mapped_labels: new_labels})

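        # Evaluate tf.nn.top_k on the test set; index [1] keeps the indices half of
        # the (values, indices) pair, i.e. the five most likely fine classes per image.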
        top_5_labels = sess.run(top_5_labels, feed_dict={x: test_images})[1]
        util.map_all_classes(top_5_labels, superclass_mapping)
        correctness_test_top_5 = util.correct_in_top_5_super(top_5_labels, test_super_labels)
        test_acc_top_5_super = sum(correctness_test_top_5) / len(correctness_test_top_5)

        # Save our result
        output_result(
            [
                'Number of steps taken: {}\n'.format(total_steps),
                'Test fine label accuracy (top 1): {:.5f}\n'.format(test_acc_fine_top_1),
                'Test fine label accuracy (top 5): {:.5f}\n'.format(test_acc_fine_top_5),
                'Test super label accuracy (top 1): {:.5f}\n'.format(test_acc_super),
                'Test super label accuracy (top 5): {:.5f}\n'.format(test_acc_top_5_super)
            ]
        )
        save_confusion_matrix(con_matrix_fine, 'Fine-Label-Confusion-Matrix.txt')
        save_confusion_matrix(con_matrix_super, 'Super-Label-Confusion-Matrix.txt')
        save_heatmap(con_matrix_fine, 'Heatmap-fine-label')
        save_heatmap(con_matrix_super, 'Heatmap-super-label')

        print("Training finished!")
Example #52
0
    def __init__( self,
                  raw_image_queue,  # shared Queue to store raw image
                  data,             # data file contians image path information
                  mean_file,        # mean file
                  root_path,        # root path of images
                  data_mode,        # 'all','train','val'
                  batch_size = 128, # size of batch
                  batch_index = 0,  # start batch index
                  epoch_index = 1   # start epoch index
                  ):
        threading.Thread.__init__( self, name = "Load Image Thread" )
        self.stop = False
        self.sharedata = raw_image_queue
        self.data = data
        self.num_classes = len(self.data['val'])
        self.data_mode = data_mode
        self.root_path = root_path
        if data_mode == "val":
           self.images = self.data['val']
           self.total_samples = self.data['num_data_val']
           self.shuffle = False
           print 'Validation data is not randomized'
        elif data_mode == "train":
           self.images = self.data['train']
           self.total_samples = self.data['num_data_train']
           self.shuffle = False 
           #self.shuffle = True
           print 'Training data shuffle: ', self.shuffle
        else:
           print "data_mode: " + str(data_mode) + " not valid"
           import pdb; pdb.set_trace()
           sys.exit(1)
        # iterator on classes
        self.iclass = -1
        # iterator for samples of each class
        self.isamples = self.num_classes * [-1]

        # class_iter = range(num_classes)
        # if shuffle: random.shuffle(class_iter)
        # classes_iter = []
        # for i in range(num_classes):
        #    classes_iter.append(range(len(images[i])))
        #    if shuffle: random.shuffle(classes_iter[i])

        # # get batch queue
        # self.batch_queue = []
        # has_add = True
        # while has_add:
        #    has_add = False
        #    for i in range( self.num_classes ):
        #       if len(index_map[i]) > 0:
        #          index = index_map[i].pop()
        #          self.batch_queue.append( index )
        #          has_add = True

        # self.num_images = len( self.batch_queue )

        #init current index and batch size
        self.total_processed = 0
        self.batch_size = batch_size
        self.batch_index = batch_index
        self.epoch_index = epoch_index
        # read data mean from file
        data_mean_file = unpickle(mean_file)
        self.data_mean = data_mean_file['data']
        self.data_mean = self.data_mean.astype(n.float32)
        # store it as uint8
        #self.data_mean = n.round( self.data_mean).astype(n.uint8)
        print data_mode + ': total_samples: ' + str(self.total_samples) \
            + ' batch_size: ' + str(batch_size) \
            + ' num_batches: ' + str(self.get_num_batches())
Example #53
0
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import util
import nltk.classify

featuresSet = util.unpickle("featuresSet_best.txt")
classifier = nltk.classify.SklearnClassifier(LinearSVC())
classifier.train(featuresSet)

util.writePickle(classifier, "trained_tagger.pickle")
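
SklearnClassifier.train expects a list of (feature_dict, label) pairs. A minimal
usage sketch with made-up feature names (not the pickled featuresSet from the
example above):

import nltk.classify
from sklearn.svm import LinearSVC

featuresSet = [
    ({'contains(good)': True, 'contains(bad)': False}, 'pos'),
    ({'contains(good)': False, 'contains(bad)': True}, 'neg'),
]
classifier = nltk.classify.SklearnClassifier(LinearSVC())
classifier.train(featuresSet)

print(classifier.classify({'contains(good)': True, 'contains(bad)': False}))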
Example #54
0
flags.DEFINE_string(
    'save_dir',
    '/work/cse496dl/cpack/Assignment_3/models/2/maxcompression_encoder_homework_3-0',
    'directory where VAE model graph and weights are saved')
flags.DEFINE_integer('batch_size', 250, '')
flags.DEFINE_integer('latent_size', 32, '')
flags.DEFINE_integer('max_epoch', 100, '')
flags.DEFINE_integer('early_stop', 15, '')
FLAGS = flags.FLAGS

#############
# CIFAR 100 #
#############
cifar100_test = {}
cifar100_train = {}
# Load the raw CIFAR-100 data.
cifar100_test = util.unpickle(FLAGS.data_dir + 'cifar-100-python/test')
cifar100_train = util.unpickle(FLAGS.data_dir + 'cifar-100-python/train')

train_data = cifar100_train[b'data']
test_data = cifar100_test[b'data']

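# Each CIFAR-100 row is a flat 3072-vector stored channel-first (3 x 32 x 32);
# reshape to NCHW and transpose to NHWC so each image becomes (32, 32, 3).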
train_data = np.reshape(train_data,
                        (50000, 3, 32, 32)).transpose(0, 2, 3, 1).astype(float)
test_data = np.reshape(test_data,
                       (10000, 3, 32, 32)).transpose(0, 2, 3, 1).astype(float)

################
# VAE Modeling #
################
img_shape = [32, 32, 3]
tf.reset_default_graph()