def main():
    if len(sys.argv) < 5:  # dataset, base_dir, out_file and phase are all required
        print_usage()

    dataset = sys.argv[1]
    base_dir = sys.argv[2]
    out_file = sys.argv[3]
    phase = sys.argv[4]
    names = util.load_names(dataset, phase)
    lbls = util.load_labels(dataset, phase)
    centers = []

    for idx, strin in enumerate(lbls):
        # load label data
        joints = np.asarray(np.reshape(strin.split(), (21, 3)),
                            dtype=np.float32)
        # convert label data from world coordinates to pixel locations
        joints, skel_camcoords = util.world2pixel(joints, dataset)
        # calculate centers
        c = util.get_center_fpad(joints)
        c = np.asarray(c, dtype=np.float32)
        centers.append(c.reshape((1, 3)))
        if idx % 500 == 0:
            print('{}/{}'.format(idx + 1, len(names)))

    util.save_results(centers, out_file)
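
# print_usage() is referenced above but not shown in this snippet. A minimal
# sketch of what such a helper might do, assuming it simply reports the expected
# arguments and exits (the script name here is a guess):
import sys

def print_usage():
    print('usage: compute_centers.py <dataset> <base_dir> <out_file> <phase>')
    sys.exit(1)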
Example #2
def compute_validation_predictions(model_id, validation_set):
	d = importlib.import_module("nets.net_" + model_id)
	model, X, y = d.define_net()

	model.load_params_from(params.SAVE_URL + "/" + model_id + "/best_weights")

	# Lower batch size since TTA multiplies batch size by 16
	params.BATCH_SIZE = 32

	io = ImageIO()
	mean, std = io.load_mean_std()

	# Read training labels for the keys
	y = util.load_labels()
	keys = y.index.values

	model.batch_iterator_predict = TTABatchIterator(keys, params.BATCH_SIZE, std, mean, cv = True)
	print "TTAs per image: %i, augmented batch size: %i" % (model.batch_iterator_predict.ttas, model.batch_iterator_predict.ttas * params.BATCH_SIZE)

	padded_batches = ceil(validation_set.shape[0]/float(params.BATCH_SIZE))

	pred = model.predict_proba(validation_set)
	pred = pred.reshape(padded_batches, model.batch_iterator_predict.ttas, params.BATCH_SIZE)
	pred = np.mean(pred, axis = 1)
	pred = pred.reshape(padded_batches * params.BATCH_SIZE)

	# Remove padded lines
	pred = pred[:validation_set.shape[0]]

	return pred
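
# The reshape/mean/reshape sequence above averages the TTA copies of each image
# and then drops the rows added by batch padding. A standalone numpy sketch of
# that bookkeeping, with made-up sizes (not values from the project):
import numpy as np
from math import ceil

n_images, ttas, batch_size = 10, 16, 4
padded_batches = int(ceil(n_images / float(batch_size)))

pred = np.random.rand(padded_batches * ttas * batch_size)  # one score per augmented, padded sample
pred = pred.reshape(padded_batches, ttas, batch_size)
pred = pred.mean(axis=1)                                   # average over the augmentations
pred = pred.reshape(padded_batches * batch_size)
pred = pred[:n_images]                                     # strip the padding rows
print(pred.shape)                                          # (10,)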
Example #3
    def get_labels(self):
        """
        get_labels()
        Read the labels from the memcache entry stored by the 'post' view.
        Returns:
            An array containing the sorted labels.
        """
        return util.load_labels(self.request, self.response)
Example #4
def main():
    dataset = None
    if len(sys.argv) > 1:
        dataset = sys.argv[1]
    metadata = util.get_metadata((dataset + "_metadata") if dataset else None)
    
    mfcc = dict(
        zip([metadata[i][0] for i in range(1, len(metadata))],
            util.load_features((dataset + "_features") if dataset else None)))

    # Load pyAudioAnalysis features
    with open("F", "rb") as f:
        feats, files = pickle.load(f, encoding="latin1")
    files = [f.split(".")[0].split("XC")[-1] for f in files]
    F = dict(zip(files, feats))
    full_dataset = True
    for item in metadata[1:]:
        if item[0] not in F:
            full_dataset = False
    X2, X3 = [], []
    if full_dataset:
        X3 = [np.concatenate((F[item[0]], mfcc[item[0]]), axis=0) for item in metadata[1:]]
        X2 = [F[item[0]] for item in metadata[1:]]
    X1 = [mfcc[item[0]] for item in metadata[1:]]

    for X in [X1, X2]:
        NUM_RUNS = 50
        Y = util.load_labels((dataset + "_metadata") if dataset else None)
        samples = range(len(X))  # alternatives: range(1, len(X), 12), random.sample(range(len(X)), 25)
        samps = samples  # alternatively: range(len(X))
        x = [X[i] for i in samps]
        y = [Y[i] for i in samples]
        N_ESTIMATORS = 20
        avg_mat = None 

        for run in range(NUM_RUNS): 
            clf = RandomForestClassifier(n_estimators=N_ESTIMATORS, max_features=20, oob_score=True).fit(X, Y)
            similarity = dict()
            for dt in clf.estimators_:
                leaves = dt.apply(X)
                for i in samps:
                    for j in samps:
                        if leaves[i] == leaves[j]:
                            similarity[(i,j)] = similarity.get((i,j), 0) + 1

            mat = np.array([[(1.0 - similarity.get((i,j), 0)/N_ESTIMATORS)**2 for j in samples] for i in samples])
            mat = squareform(mat)
            if avg_mat is None:
                avg_mat = mat
            else:
                avg_mat = np.add(avg_mat, mat)  
        avg_mat = avg_mat / NUM_RUNS
        linkage_matrix = linkage(avg_mat, "single")
        matplotlib.rcParams['lines.linewidth'] = 2.5
        dendrogram(linkage_matrix, color_threshold=0.8, labels=y, show_leaf_counts=True)
        plt.xlabel("label")
        plt.ylabel("distance")
        plt.show()
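
# squareform() is used because scipy.cluster.hierarchy.linkage expects a
# condensed (1-D, upper-triangular) distance matrix rather than the full square
# matrix built above. A tiny hand-made example of that conversion:
import numpy as np
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import linkage

dists = np.array([[0.0, 0.2, 0.9],
                  [0.2, 0.0, 0.8],
                  [0.9, 0.8, 0.0]])  # symmetric, zero diagonal
condensed = squareform(dists)        # array([0.2, 0.9, 0.8])
print(linkage(condensed, "single"))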
def profile(subset=1000, multi=True, n_threads = 4, batch_size=64, thread_pool=False):

    # Load a bunch of imagenames
    y = util.load_labels()
    y = y[:subset]
    keys = y.index.values

    #Create sublists (batches)
    batched_keys = util.chunks(keys, batch_size)

    if multi:
        augment_multithreaded(batched_keys, n_threads=n_threads, thread_pool=thread_pool)
    else:
        augment_singlethreaded(batched_keys)
Example #6
def profile(subset=1000,
            multi=True,
            n_threads=4,
            batch_size=64,
            thread_pool=False):

    # Load a bunch of imagenames
    y = util.load_labels()
    y = y[:subset]
    keys = y.index.values

    #Create sublists (batches)
    batched_keys = util.chunks(keys, batch_size)

    if multi:
        augment_multithreaded(batched_keys,
                              n_threads=n_threads,
                              thread_pool=thread_pool)
    else:
        augment_singlethreaded(batched_keys)
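
# util.chunks is not shown in these snippets; a minimal sketch of a batching
# helper like it (an assumption about its behaviour, not the project's actual
# implementation):
def chunks(seq, size):
    """Split seq into consecutive batches of at most `size` items."""
    return [seq[i:i + size] for i in range(0, len(seq), size)]

# chunks(list(range(10)), 4) -> [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]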
Example #7
def get_dataloader(sts, labels=None, keys=['obs1', 'obs2', 'hyp1', 'hyp2'], \
                   batch_size=64, num_buckets=10, bucket_ratio=.5, \
                   ctx=mx.gpu(), max_seq_length=25, sample_num=None):
    '''
    This function uses the helpers above, taking the sentence file path,
    label file path, batch_size, num_buckets and bucket_ratio to build the
    dataloader for the model. sample_num controls how many samples from the
    dataset the model will use; it defaults to None, i.e. use them all.
    '''
    if labels:
        sentences = load_sentences(sts, keys=keys)
        sentences = sentences[:sample_num]
        labels = load_labels(labels)[:sample_num]
        
        try:
            assert len(sentences) == len(labels)
        except AssertionError:
            logger.error('Number of sentences does not match number of labels!')
            exit(-1)

        dataset = to_dataset(sentences, labels, ctx=ctx, batch_size=batch_size, \
                             max_seq_length=max_seq_length)

        dataloader = to_dataloader(dataset=dataset, batch_size=batch_size, \
                                   num_buckets=num_buckets, bucket_ratio=bucket_ratio)
    else:
        dataset = to_dataset(sts, labels, ctx=ctx, batch_size=batch_size, \
                             max_seq_length=max_seq_length)
        dataloader = []
        for sample in dataset:
            batch = []
            for emb in sample:
                batch.append(nd.array(emb.reshape(1, *emb.shape)))
            dataloader.append(batch)

    return dataloader
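
# A hedged usage sketch of get_dataloader; the file paths below are placeholders,
# not paths from the project:
# train_loader = get_dataloader('data/train_sentences.jsonl',
#                               labels='data/train_labels.lst',
#                               batch_size=64,
#                               num_buckets=10,
#                               bucket_ratio=.5,
#                               max_seq_length=25)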
Example #8
def main():
    dataset = None
    if len(sys.argv) > 1:
        dataset = sys.argv[1]
    metadata = util.get_metadata((dataset + "_metadata") if dataset else None)
    
    mfcc = dict(
        zip([metadata[i][0] for i in range(1, len(metadata))],
            util.load_features((dataset + "_features") if dataset else None)))
    feats, files = None,None
    with open("F", "rb") as f:
        feats, files = pickle.load(f, encoding="latin1")
    files = [f.split(".")[0].split("XC")[-1] for f in files]
    F = dict(zip(files, feats))
    full_dataset = True
    for item in metadata[1:]:
        if item[0] not in F:
            full_dataset = False
    X2, X3 = [], [] 
    if full_dataset:
        X3 = [np.concatenate((F[item[0]], mfcc[item[0]]), axis=0) for item in metadata[1:]]
        X2 = [F[item[0]] for item in metadata[1:]] 
    X1 = [mfcc[item[0]] for item in metadata[1:]] 
    Y = util.load_labels((dataset + "_metadata") if dataset else None)#"bbsmd.csv")

    for X in [X1, X2] if full_dataset else [X1,]:
        print("------")
       
        classifiers = [ RandomForestClassifier(n_estimators=50, max_features=15, oob_score=True),
            KNeighborsClassifier(3),
            svm.SVC(kernel='linear', C=1),
            svm.SVC(gamma=2, C=1),
            GaussianNB()
        ]
        for clf in classifiers:
            scores = cross_val_score(clf, X, Y, cv=5)
            score = sum(scores)/len(scores)
            print(type(clf).__name__, "\t", score)
Example #9
import image_loops

##################################
# This script creates hdf5 files for RGB-D data since different processing is required
# Run for both phase='test' and phase='train' to get both datafiles
# colormap and depth2cords cython functions are used here
##################################

DIR = "../labels/"
phase = 'train'
h5_fn = os.path.join(DIR, ('rgbd_' + phase + '_data_.h5'))

base_dir = '/home/bilbeisi/REN/cropped/'

names = util.load_names('fpad', phase)
labels = util.load_labels('fpad', phase)
cnames = util.load_names('fpac', phase)
centers = util.load_centers('fpad', phase).astype(float)

imgs = np.zeros((len(names), 4, 96, 96), dtype=np.float32)
lbls = np.zeros((len(labels), 63), dtype=np.float64)

cube_size = 150  # cube size in mm for cropping

for idx, name in enumerate(names):
    cname = cnames[idx]
    img = util.load_image('fpad', os.path.join('/home/bilbeisi/REN/', name))
    img[img == 0] = 1

    cimg = util.load_image('fpac', os.path.join('/home/bilbeisi/REN/', cname))
    cimg = cimg.astype(float)
Example #10
    cv_folds = args.cv_folds
    cv_lno = args.cv_lno
    n_jobs = args.n_jobs

    if calibrate is None:
        calibrate = False
    else:
        calibrate = bool(calibrate)
        print(calibrate)

    if n_jobs is not None:
        n_jobs = int(n_jobs)

    # load filenames and labels
    sample_images = util.load_sample_images(out_dir)
    samples, cats, labels = util.load_labels(out_dir)

    if sample_weight is not None:
        # get labels for sample_weight category
        c = np.where(cats == sample_weight)[0][0]
        ln = np.unique([l[c] for l in labels])
        ln.sort()
        ln = list(ln)
        if '' in ln:
            del ln[ln.index('')]
        label_names_sw = ln
        labels_sw = np.array(
            [ln.index(l) if l in ln else -1 for l in labels[:, c]])
    if group is not None:
        # get labels for group category
        if group == sample_weight:
Example #11
def tf_classify():
    # TODO: python -m scripts.label_image     --graph=tf_files/retrained_graph.pb      --image=test/aurelia.jpeg
    import socket

    print("In tf_classify handler from {}".format(socket.getfqdn()))

    file_name = "models/mobilenet/example/3475870145_685a19116d.jpg"
    file_name = "https://www.eopugetsound.org/sites/default/files/styles/magazinewidth_592px/public/topical_article/images/moon_jellyfish.jpg?itok=Esreg6zX"

    # Get payload
    payload = request.get_json(silent=True, force=True)

    if payload is None:
        if request.get_data() is not None:
            payload = json.loads(request.get_data())

    if payload is not None:
        if payload.get("nlp").get("entities").get("url"):
            file_name = payload.get("nlp").get("entities").get("url")[0].get(
                "raw")

            # Load model file
            model_file = "models/mobilenet/retrained_graph.pb"
            label_file = "models/mobilenet/retrained_labels.txt"
            input_height = 224
            input_width = 224
            input_mean = 128
            input_std = 128
            input_layer = "input"
            output_layer = "final_result"

            graph = util.load_graph(model_file)
            t = util.read_tensor_from_image_file(file_name,
                                                 input_height=input_height,
                                                 input_width=input_width,
                                                 input_mean=input_mean,
                                                 input_std=input_std)

            input_name = "import/" + input_layer
            output_name = "import/" + output_layer
            input_operation = graph.get_operation_by_name(input_name)
            output_operation = graph.get_operation_by_name(output_name)

            with tf.Session(graph=graph) as sess:
                start = time.time()
                results = sess.run(output_operation.outputs[0],
                                   {input_operation.outputs[0]: t})
                end = time.time()

            results = np.squeeze(results)

            top_k = results.argsort()[-5:][::-1]
            labels = util.load_labels(label_file)

            print('\nEvaluation time (1-image): {:.3f}s\n'.format(end - start))
            template = "{} (score={:0.5f})"

            print(top_k)

            for i in top_k:
                print(template.format(labels[i], results[i]))

            # I really don't know, my best guess is []

            if results[0] < 0.1:
                response = ("I really don't know, my best guess is that this "
                            "looks like a " + labels[top_k[0]])
            else:
                response = 'I think this is a ' + labels[top_k[0]]

            return jsonify(
                status=200,
                replies=[{
                    'type': 'text',
                    'content': response
                }],
                conversation={'memory': {
                    'plankton': labels[top_k[0]]
                }})
def predict(model_id, raw, validation, train, n_eyes, average_over_eyes):
    params.DISABLE_CUDNN = True
    params.MULTIPROCESS = False

    d = importlib.import_module("nets.net_" + model_id)
    model, X, y = d.define_net()
    model.load_params_from(params.SAVE_URL + "/" + model_id + "/best_weights")

    f = get_iter_func(model)

    # Decrease batch size because TTA increases it 16-fold
    # Uses too much memory otherwise
    params.BATCH_SIZE = 8

    io = ImageIO()
    mean, std = io.load_mean_std()

    if validation or train:
        y = util.load_labels()
    else:
        y = util.load_sample_submission()

    keys = y.index.values

    tta_bi = TTABatchIterator(keys,
                              params.BATCH_SIZE,
                              std,
                              mean,
                              cv=validation or train,
                              n_eyes=n_eyes)
    print "TTAs per image: %i, augmented batch size: %i" % (
        tta_bi.ttas, tta_bi.ttas * params.BATCH_SIZE * n_eyes)

    if validation:
        X_test = np.load(params.IMAGE_SOURCE + "/X_valid.npy")
    elif train:
        X_test = np.load(params.IMAGE_SOURCE + "/X_train.npy")
    else:
        X_test = np.arange(y.shape[0])

    padded_batches = ceil(X_test.shape[0] / float(params.BATCH_SIZE))

    pred = get_activations(X_test, tta_bi, f)

    concat_preds = []

    for batch_pred in pred:
        hidden = batch_pred[0]
        output = batch_pred[1]

        concat = np.concatenate([output, hidden], axis=1)

        #if average_over_eyes:
        #means = concat.reshape(concat.shape[0] / 2, 2, concat.shape[1])
        #means = means.mean(axis = 1)
        #means = np.repeat(means, 2, axis = 0)

        concat_preds.append(concat)

    pred = np.vstack(concat_preds)
    output_units = pred.shape[1]

    #pred = model.predict_proba(X_test)
    pred = pred.reshape(padded_batches, tta_bi.ttas, params.BATCH_SIZE,
                        output_units)
    pred = np.mean(pred, axis=1)
    pred = pred.reshape(padded_batches * params.BATCH_SIZE, output_units)

    # Remove padded lines
    pred = pred[:X_test.shape[0]]

    # Save unrounded
    #y.loc[keys] = pred

    if validation:
        filename = params.SAVE_URL + "/" + model_id + "/raw_predictions_validation.npy"
    elif train:
        filename = params.SAVE_URL + "/" + model_id + "/raw_predictions_train.npy"
    else:
        filename = params.SAVE_URL + "/" + model_id + "/raw_predictions_test.npy"

    np.save(filename, pred)
    #y.to_csv(filename)
    print "Saved raw predictions to " + filename

    if not raw and not validation and not train:
        W = np.load(params.SAVE_URL + "/" + model_id +
                    "/optimal_thresholds.npy")

        pred = weighted_round(pred, W)

        pred = pred[:, np.newaxis]  # add axis for pd compatibility

        hist, _ = np.histogram(pred, bins=5)
        print "Distribution over class predictions on test set: ", hist / float(
            y.shape[0])

        y.loc[keys] = pred

        y.to_csv(params.SAVE_URL + "/" + model_id + "/submission.csv")

        print "Gzipping..."

        if not params.ON_COMA:
            call("gzip -c " + params.SAVE_URL + "/" + model_id +
                 "/submission.csv > " + params.SAVE_URL + "/" + model_id +
                 "/submission.csv.gz",
                 shell=True)

        print "Done! File saved to models/" + model_id + "/submission.csv"
def define_net():
    define_net_specific_parameters()
    io = ImageIO()

    # Read pandas csv labels
    y = util.load_labels()

    if params.SUBSET != 0:
        y = y[:params.SUBSET]

    X = np.arange(y.shape[0])

    mean, std = io.load_mean_std(circularized=params.CIRCULARIZED_MEAN_STD)
    keys = y.index.values

    if params.AUGMENT:
        train_iterator = AugmentingParallelBatchIterator(keys,
                                                         params.BATCH_SIZE,
                                                         std,
                                                         mean,
                                                         y_all=y)
    else:
        train_iterator = ParallelBatchIterator(keys,
                                               params.BATCH_SIZE,
                                               std,
                                               mean,
                                               y_all=y)

    test_iterator = ParallelBatchIterator(keys,
                                          params.BATCH_SIZE,
                                          std,
                                          mean,
                                          y_all=y)

    if params.REGRESSION:
        y = util.float32(y)
        y = y[:, np.newaxis]

    if 'gpu' in theano.config.device:
        # Half of coma does not support cuDNN, check whether we can use it on this node
        # If not, use cuda_convnet bindings
        from theano.sandbox.cuda.dnn import dnn_available
        if dnn_available() and not params.DISABLE_CUDNN:
            from lasagne.layers import dnn
            Conv2DLayer = dnn.Conv2DDNNLayer
            MaxPool2DLayer = dnn.MaxPool2DDNNLayer
        else:
            from lasagne.layers import cuda_convnet
            Conv2DLayer = cuda_convnet.Conv2DCCLayer
            MaxPool2DLayer = cuda_convnet.MaxPool2DCCLayer
    else:
        Conv2DLayer = layers.Conv2DLayer
        MaxPool2DLayer = layers.MaxPool2DLayer

    Maxout = layers.pool.FeaturePoolLayer

    net = NeuralNet(
        layers=[
            ('input', layers.InputLayer),
            ('conv0', Conv2DLayer),
            ('pool0', MaxPool2DLayer),
            ('conv1', Conv2DLayer),
            ('pool1', MaxPool2DLayer),
            ('conv2', Conv2DLayer),
            ('pool2', MaxPool2DLayer),
            ('conv3', Conv2DLayer),
            ('pool3', MaxPool2DLayer),
            ('conv4', Conv2DLayer),
            ('pool4', MaxPool2DLayer),
            ('dropouthidden1', layers.DropoutLayer),
            ('hidden1', layers.DenseLayer),
            ('maxout1', Maxout),
            ('dropouthidden2', layers.DropoutLayer),
            ('hidden2', layers.DenseLayer),
            ('maxout2', Maxout),
            ('dropouthidden3', layers.DropoutLayer),
            ('output', layers.DenseLayer),
        ],
        input_shape=(None, params.CHANNELS, params.PIXELS, params.PIXELS),
        conv0_num_filters=32,
        conv0_filter_size=(5, 5),
        conv0_stride=(2, 2),
        pool0_pool_size=(2, 2),
        pool0_stride=(2, 2),
        conv1_num_filters=64,
        conv1_filter_size=(3, 3),
        conv1_border_mode='same',
        pool1_pool_size=(2, 2),
        pool1_stride=(2, 2),
        conv2_num_filters=128,
        conv2_filter_size=(3, 3),
        conv2_border_mode='same',
        pool2_pool_size=(2, 2),
        pool2_stride=(2, 2),
        conv3_num_filters=192,
        conv3_filter_size=(3, 3),
        conv3_border_mode='same',
        pool3_pool_size=(2, 2),
        pool3_stride=(2, 2),
        conv4_num_filters=256,
        conv4_filter_size=(3, 3),
        conv4_border_mode='same',
        pool4_pool_size=(2, 2),
        pool4_stride=(2, 2),
        hidden1_num_units=1024,
        hidden2_num_units=1024,
        dropouthidden1_p=0.5,
        dropouthidden2_p=0.5,
        dropouthidden3_p=0.5,
        maxout1_pool_size=2,
        maxout2_pool_size=2,
        output_num_units=1 if params.REGRESSION else 5,
        output_nonlinearity=None
        if params.REGRESSION else nonlinearities.softmax,
        update_learning_rate=theano.shared(
            util.float32(params.START_LEARNING_RATE)),
        update_momentum=theano.shared(util.float32(params.MOMENTUM)),
        custom_score=('kappa', quadratic_kappa),
        regression=params.REGRESSION,
        batch_iterator_train=train_iterator,
        batch_iterator_test=test_iterator,
        on_epoch_finished=[
            AdjustVariable('update_learning_rate',
                           start=params.START_LEARNING_RATE),
            stats.Stat(),
            ModelSaver()
        ],
        max_epochs=500,
        verbose=1,

        # Only relevant when create_validation_split = True
        eval_size=0.1,

        # Need to specify splits manually like indicated below!
        create_validation_split=params.SUBSET > 0,
    )

    # It is recommended to use the same training/validation split every model for ensembling and threshold optimization
    #
    # To set specific training/validation split:
    net.X_train = np.load(params.IMAGE_SOURCE + "/X_train.npy")
    net.X_valid = np.load(params.IMAGE_SOURCE + "/X_valid.npy")
    net.y_train = np.load(params.IMAGE_SOURCE + "/y_train.npy")
    net.y_valid = np.load(params.IMAGE_SOURCE + "/y_valid.npy")

    return net, X, y
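
# A minimal sketch of how this net might be driven; this is an assumption about
# the training entry point, not the project's actual train script:
# net, X, y = define_net()
# net.fit(X, y)                                     # nolearn's NeuralNet training loop
# net.save_params_to(params.SAVE_URL + "/<model_id>/best_weights")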
from matplotlib import pyplot as plt
np.set_printoptions(threshold=np.nan)

########################
## This is the validation script for RGB-D. The creation and cropping of the RGB-D images are done in craete_rgbd_hdf5.py, because the "images" cannot be stored in the intermediate step between creation/cropping and moving to hdf5.
## Some directories may need to be created before running some validation segments if they do not exist.
## All preprocessing of the labels and centers is identical to that of depth, so there is no need to redo it.
########################

dataset = 'rgbd'
phase = 'test'
root_dir = '/home/bilbeisi/REN/'

############################# Create RGB-D Images #################################
names = util.load_names('fpad', phase)
labels = util.load_labels('fpad', phase)
cnames = util.load_names('fpac', phase)
centers = util.load_centers('fpad', phase).astype(float)
imgs = np.zeros((len(names), 4, 96, 96), dtype=np.float32)
lbls = np.zeros((len(labels), 63), dtype=np.float64)

cube_size = 150  # cube size in mm for cropping

for idx, name in enumerate(names):
    if idx % 1000 == 0:
        cname = cnames[idx]
        img = util.load_image('fpad', os.path.join('/home/bilbeisi/REN/',
                                                   name))
        img[img == 0] = 1

        cimg = util.load_image('fpac',
Example #15
# Remember to change dataset depending on type of data (fpad for depth, and fpac for rgb)
# Run for both phase='test' and phase='train' to get both datafiles
##################################

################ Confirm dataset before running this script! ###############################
dataset = 'fpad'  # fpac or fpad

DIR = "../labels/"
phase = 'test'  # test/train
h5_fn = os.path.join(DIR, (dataset + '_' + phase + '_data.h5'))

############## Dir containing preprocessed images ###############################
base_dir = '/home/bilbeisi/REN/cropped/'

names = util.load_names(dataset, phase)
labels = util.load_labels(dataset, phase)

if dataset == 'fpad':
    imgs = np.zeros((len(names), 1, 96, 96), dtype=np.float32)  # depth
else:
    imgs = np.zeros((len(names), 3, 96, 96), dtype=np.float32)
lbls = np.zeros((len(labels), 63), dtype=np.float64)

for idx, name in enumerate(names):
    if dataset == 'fpac':
        name = name.replace('.jpeg', '.png')
    img = util.load_image(dataset, os.path.join(base_dir, name))
    img = img.astype(float)
    # revert back to normalized -1,1 since images were saved in 0,255 to allow viewing/verifying
    img[:] *= 2
    img[:] /= 255
def predict(model_id, raw, validation, train, n_eyes, average_over_eyes):
	params.DISABLE_CUDNN = True
	params.MULTIPROCESS = False

	d = importlib.import_module("nets.net_" + model_id)
	model, X, y = d.define_net()
	model.load_params_from(params.SAVE_URL + "/" + model_id + "/best_weights")

	f = get_iter_func(model)

	# Decrease batch size because TTA increases it 16-fold
	# Uses too much memory otherwise
	params.BATCH_SIZE = 8

	io = ImageIO()
	mean, std = io.load_mean_std()

	if validation or train:
		y = util.load_labels()
	else:
		y = util.load_sample_submission()

	keys = y.index.values

	tta_bi = TTABatchIterator(keys, params.BATCH_SIZE, std, mean, cv = validation or train, n_eyes = n_eyes)
	print "TTAs per image: %i, augmented batch size: %i" % (tta_bi.ttas, tta_bi.ttas * params.BATCH_SIZE * n_eyes)

	if validation:
		X_test = np.load(params.IMAGE_SOURCE + "/X_valid.npy")
	elif train:
		X_test = np.load(params.IMAGE_SOURCE + "/X_train.npy")
	else:
		X_test = np.arange(y.shape[0])

	padded_batches = ceil(X_test.shape[0]/float(params.BATCH_SIZE))

	pred = get_activations(X_test, tta_bi, f)
	
	concat_preds = []

	for batch_pred in pred:
		hidden = batch_pred[0]
		output = batch_pred[1]

		concat = np.concatenate([output, hidden], axis = 1)

		#if average_over_eyes:
			#means = concat.reshape(concat.shape[0] / 2, 2, concat.shape[1])
			#means = means.mean(axis = 1)
			#means = np.repeat(means, 2, axis = 0)

		concat_preds.append(concat)

	pred = np.vstack(concat_preds)
	output_units = pred.shape[1]

	#pred = model.predict_proba(X_test)
	pred = pred.reshape(padded_batches, tta_bi.ttas, params.BATCH_SIZE, output_units)
	pred = np.mean(pred, axis = 1)
	pred = pred.reshape(padded_batches * params.BATCH_SIZE, output_units)

	# Remove padded lines
	pred = pred[:X_test.shape[0]]

	# Save unrounded
	#y.loc[keys] = pred

	if validation:
		filename = params.SAVE_URL + "/" + model_id + "/raw_predictions_validation.npy"
	elif train:
		filename = params.SAVE_URL + "/" + model_id + "/raw_predictions_train.npy"
	else:
		filename = params.SAVE_URL + "/" + model_id + "/raw_predictions_test.npy"

	np.save(filename, pred)
	#y.to_csv(filename)
	print "Saved raw predictions to " + filename

	if not raw and not validation and not train:
		W = np.load(params.SAVE_URL + "/" + model_id + "/optimal_thresholds.npy")

		pred = weighted_round(pred, W)

	pred = pred[:, np.newaxis] # add axis for pd compatibility

		hist, _ = np.histogram(pred, bins=5)
		print "Distribution over class predictions on test set: ", hist / float(y.shape[0])

		y.loc[keys] = pred

		y.to_csv(params.SAVE_URL + "/" + model_id + "/submission.csv")

		print "Gzipping..."

		if not params.ON_COMA:
			call("gzip -c " + params.SAVE_URL + "/" + model_id + "/submission.csv > " + params.SAVE_URL + "/" + model_id + "/submission.csv.gz", shell=True)

		print "Done! File saved to models/" + model_id + "/submission.csv"
Example #17
def singlePipeline(nr_centroids, nr_it, 
                   label_path = "../data/preprocessed.h5", 
                   clsfr = "SGD", 
                   calc_centroids = True, 
                   dogfeed=True, 
                   train_model=True,
                   cache_size=4000,
                   degree=3,
                   tol=1e-3,
                   max_iter=-1,
                   kernel='rbf',
                   model_file='UNSPECIFIED'):
    
    
    
    if calc_centroids:
        print "calculating centroids..."
        #Finds the features using kmeans
        kmTrainer = kmeans.kMeansTrainer(nr_centroids = nr_centroids, nr_it = nr_it)    
        centroids = kmTrainer.fit()
        kmTrainer.save_centroids(centroids)
        
        print "calculating activations..."
        #Calculates the activaiton of the test set
        act_calc = act.ActivationCalculation()
        features = act_calc.pipeline(centroids)  
    else:
        print "loading activations from file..."
        #loads feature data
        feature_data = h5py.File("../data/activations_train/"+str(nr_centroids)+"activationkmeans.h5")
        features = feature_data["activations"]

    
    

    
    print "Loading labels from file..."
    #get the labels
    labels = util.load_labels(label_path)
    label_names = util.load_label_names(label_path)
    print "Got labels"

    
    
    if clsfr == "SGD": 
        if train_model:
            #Train the SGD classifier
            print "Begin training of SGD..."
            train.trainSGD(features, labels, nr_centroids)
            print "Training done"
        
        if not dogfeed:
            return
        
        print "Dogfeeding"
        #Predict based on SGD training
        print "Begin SGD predictions..."
        classified = classifier.predict(features, nr_centroids, degree=degree, cache_size=cache_size)
        print "Predicting done"        
        
    elif clsfr == "SVC" or clsfr == "NUSVR": 
        
        if train_model:
            print "Begin training of Model..."
            if clsfr=="SVC":
                #Train SVC classifier
                model = svc.train_svc(features, 
                                      labels, 
                                      nr_centroids,
                                      degree=degree,
                                      cache_size=cache_size,
                                      tol=tol,
                                      max_iter=max_iter,
                                      kernel = kernel)
            else :
                #Train SVC classifier
                model = svc.train_svc(features, 
                                      labels, 
                                      nr_centroids,
                                      degree=degree,
                                      cache_size=cache_size,
                                      tol=tol,
                                      max_iter=max_iter,
                                      kernel=kernel)    
            print "Training done"
        else:
            print "Loading model"
            model = joblib.load(model_file)
            
        
        if not dogfeed:
            return   
        
        print "Dogfeeding"
        #Predict based on SVC training
        print "Begin SVC predictions..."
        classified = model.predict_proba(features)
        print "Predicting done"
        
    
    else:
        print "Selected classifier not available, please use an available classifier"
        return
       
    
       
       
    print "Calculating log loss..."
    summing = 0
    correct = 0
    
    np.savetxt("meuk.csv", classified, delimiter=";")
    
    loss = metrics.log_loss(labels, classified)
    print loss

    print -np.mean(np.log(classified)[np.arange(len(labels)), labels])
    
    #calculate the log loss
    for i, label in enumerate(labels):
        
        actual = labels[i]   
        
        
        if(classified[i][label] == 0):
            summing+= np.log(10e-15)
        else:
            summing+= np.log(classified[i][label])
        if actual == np.argmax(classified[i]):
            correct += 1

    image = np.zeros((len(label_names),len(labels)))  
    
    for j, label_index in enumerate(labels):
        image[label_index,j] = 1

    scipy.misc.imsave('correct.png', image)
    scipy.misc.imsave('predicted.png', classified.T)
    
    error = image - classified.T
    
    scipy.misc.imsave('error.png', error)
    
    
    print "Calculation finished"  

    summing = -summing/len(labels)
    print "log loss: ", summing 
    print "correct/amount_of_labels: ", correct/len(labels)
    print "lowest classification score: ", np.min(classified)
   
#    print summing
    np.savetxt( "realLabel.csv", labels, delimiter=";")
   # np.savetxt( "SGD_label.csv", max_SGD, delimiter=";")  
    
    if calc_centroids is False:
        feature_data.close()       
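
# The hand-rolled loop above recomputes what sklearn.metrics.log_loss already
# returns (up to the clipping floor); a small self-contained check with made-up
# probabilities:
import numpy as np
from sklearn import metrics

y_true = np.array([0, 2, 1])             # true classes for 3 samples
probs = np.array([[0.7, 0.1, 0.1, 0.1],  # predicted class probabilities
                  [0.2, 0.2, 0.5, 0.1],
                  [0.1, 0.6, 0.2, 0.1]])

eps = 1e-15
clipped = np.clip(probs[np.arange(len(y_true)), y_true], eps, 1.0)
manual = -np.mean(np.log(clipped))       # mean negative log-likelihood of the true class

print(manual)
print(metrics.log_loss(y_true, probs, labels=[0, 1, 2, 3]))  # should agree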
def define_net():
    define_net_specific_parameters()
    io = ImageIO()

    # Read pandas csv labels
    y = util.load_labels()

    if params.SUBSET != 0:
        y = y[:params.SUBSET]

    X = np.arange(y.shape[0])

    mean, std = io.load_mean_std(circularized=params.CIRCULARIZED_MEAN_STD)
    keys = y.index.values

    if params.AUGMENT:
        train_iterator = AugmentingParallelBatchIterator(keys, params.BATCH_SIZE, std, mean, y_all = y)
    else:
        train_iterator = ParallelBatchIterator(keys, params.BATCH_SIZE, std, mean, y_all = y)

    test_iterator = ParallelBatchIterator(keys, params.BATCH_SIZE, std, mean, y_all = y)

    if params.REGRESSION:
        y = util.float32(y)
        y = y[:, np.newaxis]

    if 'gpu' in theano.config.device:
        # Half of coma does not support cuDNN, check whether we can use it on this node
        # If not, use cuda_convnet bindings
        from theano.sandbox.cuda.dnn import dnn_available
        if dnn_available():
            from lasagne.layers import dnn
            Conv2DLayer = dnn.Conv2DDNNLayer
            MaxPool2DLayer = dnn.MaxPool2DDNNLayer
        else:
            from lasagne.layers import cuda_convnet
            Conv2DLayer = cuda_convnet.Conv2DCCLayer
            MaxPool2DLayer = cuda_convnet.MaxPool2DCCLayer
    else:
        Conv2DLayer = layers.Conv2DLayer
        MaxPool2DLayer = layers.MaxPool2DLayer

    Maxout = layers.pool.FeaturePoolLayer

    net = NeuralNet(
        layers=[
            ('input', layers.InputLayer),
            ('conv0', Conv2DLayer),
            ('pool0', MaxPool2DLayer),
            ('conv1', Conv2DLayer),
            ('pool1', MaxPool2DLayer),
            ('conv2', Conv2DLayer),
            ('pool2', MaxPool2DLayer),
            ('conv3', Conv2DLayer),
            ('pool3', MaxPool2DLayer),
            ('conv4', Conv2DLayer),
            ('pool4', MaxPool2DLayer),
            ('dropouthidden1', layers.DropoutLayer),
            ('hidden1', layers.DenseLayer),
            ('maxout1', Maxout),
            ('dropouthidden2', layers.DropoutLayer),
            ('hidden2', layers.DenseLayer),
            ('maxout2', Maxout),
            ('dropouthidden3', layers.DropoutLayer),
            ('output', layers.DenseLayer),
        ],

        input_shape=(None, params.CHANNELS, params.PIXELS, params.PIXELS),

        conv0_num_filters=32, conv0_filter_size=(5, 5), conv0_stride=(2, 2), pool0_pool_size=(2, 2), pool0_stride=(2, 2),
        conv1_num_filters=64, conv1_filter_size=(5, 5), conv1_border_mode = 'same', pool1_pool_size=(2, 2), pool1_stride=(2, 2),
        conv2_num_filters=128, conv2_filter_size=(3, 3), conv2_border_mode = 'same', pool2_pool_size=(2, 2), pool2_stride=(2, 2),
        conv3_num_filters=192, conv3_filter_size=(3, 3), conv3_border_mode = 'same', pool3_pool_size=(2, 2), pool3_stride=(2, 2),
        conv4_num_filters=256, conv4_filter_size=(3, 3), conv4_border_mode = 'same', pool4_pool_size=(2, 2), pool4_stride=(2, 2),

        hidden1_num_units=1024,
        hidden2_num_units=1024,

        dropouthidden1_p=0.5,
        dropouthidden2_p=0.5,
        dropouthidden3_p=0.5,

        maxout1_pool_size=2,
        maxout2_pool_size=2,

        output_num_units=1 if params.REGRESSION else 5,
        output_nonlinearity=None if params.REGRESSION else nonlinearities.softmax,

        update_learning_rate=theano.shared(util.float32(params.START_LEARNING_RATE)),
        update_momentum=theano.shared(util.float32(params.MOMENTUM)),
        custom_score=('kappa', quadratic_kappa),

        regression=params.REGRESSION,
        batch_iterator_train=train_iterator,
        batch_iterator_test=test_iterator,
        on_epoch_finished=[
            AdjustVariable('update_learning_rate', start=params.START_LEARNING_RATE),
            stats.Stat(),
            ModelSaver()
        ],
        max_epochs=500,
        verbose=1,

        # Only relevant when create_validation_split = True
        eval_size=0.1,

        # Need to specify splits manually like indicated below!
        create_validation_split=params.SUBSET>0,
    )

    # It is recommended to use the same training/validation split every model for ensembling and threshold optimization
    #
    # To set specific training/validation split:
    net.X_train = np.load(params.IMAGE_SOURCE + "/X_train.npy")
    net.X_valid = np.load(params.IMAGE_SOURCE + "/X_valid.npy")
    net.y_train = np.load(params.IMAGE_SOURCE + "/y_train.npy")
    net.y_valid = np.load(params.IMAGE_SOURCE + "/y_valid.npy")

    return net, X, y
Example #19

if __name__ == "__main__":

    # # Serve the app with gevent
    # http_server = WSGIServer(("0.0.0.0", 5000), app)
    # http_server.serve_forever()

    MODEL_PATH = "models/model_cpc_1.tflite"
    LABELS_PATH = "models/imageLabels.txt"

    interpreter = tf.lite.Interpreter(
      model_path = MODEL_PATH)
    interpreter.allocate_tensors()

    labels = load_labels(LABELS_PATH)

    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()
    print(f"INPUT {input_details}")
    print(f"OUTPUT {output_details}")

    floating_model = input_details[0]['dtype'] == np.float32

    # NxHxWxC, H:1, W:2
    height = input_details[0]['shape'][1]
    width = input_details[0]['shape'][2]
    input_mean = 127.5
    input_std = 127.5
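
    # A minimal sketch of running one inference with the interpreter configured
    # above; a random tensor stands in for a real preprocessed image, and a
    # 3-channel input is assumed:
    img = np.random.randint(0, 256, size=(1, height, width, 3), dtype=np.uint8)
    if floating_model:
        img = (np.float32(img) - input_mean) / input_std

    interpreter.set_tensor(input_details[0]['index'], img)
    interpreter.invoke()
    output = np.squeeze(interpreter.get_tensor(output_details[0]['index']))

    for i in output.argsort()[-5:][::-1]:  # top-5 classes, best first
        print(labels[i], output[i])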

    print("Running server on http://127.0.0.1:5000/")
Example #20
File: similarity.py Project: Shwam/Birds
def main():
    dataset = None
    if len(sys.argv) > 1:
        dataset = sys.argv[1]
    metadata = util.get_metadata((dataset + "_metadata") if dataset else None)

    mfcc = dict(
        zip([metadata[i][0] for i in range(1, len(metadata))],
            util.load_features((dataset + "_features") if dataset else None)))

    # Load pyAudioAnalysis features
    with open("F", "rb") as f:
        feats, files = pickle.load(f, encoding="latin1")
    files = [f.split(".")[0].split("XC")[-1] for f in files]
    F = dict(zip(files, feats))
    full_dataset = True
    for item in metadata[1:]:
        if item[0] not in F:
            full_dataset = False
    X2, X3 = [], []
    if full_dataset:
        X3 = [
            np.concatenate((F[item[0]], mfcc[item[0]]), axis=0)
            for item in metadata[1:]
        ]
        X2 = [F[item[0]] for item in metadata[1:]]
    X1 = [mfcc[item[0]] for item in metadata[1:]]

    #X = util.load_features((dataset + "_features") if dataset else None)
    for X in [X1, X2]:
        labels = []
        avg_mat = None
        all_sims = dict()
        Y = util.load_labels((dataset + "_metadata") if dataset else None)
        samples = range(
            len(X))  #range(1, len(X), 12)#random.sample(range(len(X)), 25)
        samps = range(len(X))  #samples
        x = [X[i] for i in samps]
        y = [Y[i] for i in samples]

        N_ESTIMATORS = 80
        NUM_RUNS = 5

        for run in range(NUM_RUNS):
            clf = RandomForestClassifier(n_estimators=N_ESTIMATORS,
                                         max_features=25,
                                         oob_score=True).fit(X, Y)
            similarity = dict()
            for dt in clf.estimators_:
                leaves = dt.apply(X)
                for i in samps:
                    for j in samps:
                        if leaves[i] == leaves[j]:
                            similarity[(i, j)] = similarity.get(
                                (i, j), 0) + (1 / N_ESTIMATORS)

            species_similarity = dict()
            for i in samps:
                for j in samps:
                    species_similarity[(Y[i], Y[j])] = species_similarity.get(
                        (Y[i], Y[j]), 0) + similarity.get(
                            (i, j), 0)**2 / (Y.count(Y[i]) * Y.count(Y[j]))

            for k in species_similarity:
                species_similarity[k] = species_similarity[k]**(0.5)

            labels = clf.classes_
            for i in range(len(labels)):
                normal = species_similarity[(labels[i], labels[i])]
                for j in range(i, len(labels)):
                    k = labels[i], labels[j]
                    species_similarity[k] /= normal
                    species_similarity[(k[1], k[0])] = species_similarity[k]
                    all_sims[k] = all_sims.get(
                        k, 0) + species_similarity[k] / NUM_RUNS

            mat = np.array([[(1.0 - species_similarity.get((i, j), 0))**2
                             for j in labels] for i in labels])
            print(mat)
            mat = squareform(mat)
            if avg_mat is None:
                avg_mat = mat
            else:
                avg_mat = np.add(avg_mat, mat)
        avg_mat = avg_mat / NUM_RUNS
        print(avg_mat)
        for k in all_sims:
            if k[0] != k[1] and all_sims[k] > 0.1:
                print("{}\t{}\t{}".format(k[0], k[1], all_sims[k]))
        linkage_matrix = linkage(avg_mat, "single")
        matplotlib.rcParams['lines.linewidth'] = 2.5
        dendrogram(linkage_matrix,
                   color_threshold=0.65,
                   labels=labels,
                   show_leaf_counts=True)
        plt.xlabel("label")
        plt.ylabel("distance")
        plt.show()
Example #21
def main(data_path, out_path, labelfile, resume, batch_size, epochs,
         resnet_depth, train):
    try:
        if not os.path.isdir(out_path):
            os.makedirs(out_path)

        training_data, validation_data = load_data(data_path)
        imChannels, _, _ = training_data[0][0].shape

        # load labels in decoder format
        labels, alpha_len = load_labels(labelfile)

        if resume is None:  # start training from beginning
            model = Model(out_path, resnet_depth, imChannels, alpha_len,
                          labels)

            # weight_decay == l2 lambda
            # SGD tends to get better end-results
            # Learning-rate is reduced on plateau, check model.py for details.
            optimizer = optim.SGD(model.parameters(),
                                  lr=1e-4,
                                  weight_decay=0.2)
            #optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=0.2)

            # save the  summary of the model
            with open(out_path + os.sep + "modelsummary_ctc.txt", 'w') as f:
                with redirect_stdout(f):
                    print(str(model))

        else:  # resume from checkpoint
            if resume == "best":
                print("Resuming from best checkpoint")
                checkpoint = torch.load(
                    os.path.join(out_path, 'checkpoint_best.pth.tar'))
            else:
                print("Resuming from last checkpoint")
                checkpoint = torch.load(
                    os.path.join(out_path, 'checkpoint.pth.tar'))

            model = Model(checkpoint['model_params']['output'],
                          checkpoint['model_params']['resnet_depth'],
                          checkpoint['model_params']['imChannels'],
                          checkpoint['model_params']['alphabet_length'],
                          checkpoint['model_params']['labels'],
                          last_epoch=checkpoint['epoch'])

            model.load_state_dict(checkpoint['model_states'])

            optimizer = optim.SGD(model.parameters(),
                                  lr=1e-4,
                                  weight_decay=0.2)
            #optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=0.2)

            optimizer.load_state_dict(checkpoint['optimizer'])

            # optimizer states have to be moved to GPU manually
            if torch.cuda.is_available():
                for state in optimizer.state.values():
                    for k, v in state.items():
                        if torch.is_tensor(v):
                            state[k] = v.cuda()

        # logger visualizes training process, can be followed during training
        logger = HistoryLogger(model, out_path, batch_size, epochs,
                               validation_data[0], validation_data[1],
                               training_data[0], training_data[1])

        if train:
            model.fit(training_data,
                      validation_data,
                      optimizer,
                      batch_size=batch_size,
                      epochs=epochs,
                      logger=logger)

        torch.save(model, out_path + os.sep + "model_final.pth.tar")

    except Exception as err:
        print(err.args)
        raise
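
# A sketch of the checkpoint layout the resume branch above expects, assuming a
# saver that mirrors the keys it reads ('epoch', 'model_states', 'optimizer',
# 'model_params'); the model attributes used here are hypothetical:
import os
import torch

def save_checkpoint(model, optimizer, epoch, out_path, best=False):
    checkpoint = {
        'epoch': epoch,
        'model_states': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'model_params': {
            'output': out_path,
            'resnet_depth': model.resnet_depth,        # assumed attribute
            'imChannels': model.imChannels,            # assumed attribute
            'alphabet_length': model.alphabet_length,  # assumed attribute
            'labels': model.labels,                    # assumed attribute
        },
    }
    name = 'checkpoint_best.pth.tar' if best else 'checkpoint.pth.tar'
    torch.save(checkpoint, os.path.join(out_path, name))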
#     lbls[lid] = np.reshape(joints, (63))
#
# lbls = np.reshape(lbls,(-1,63))
#
# x = util.normalize_pose(dataset, lbls, centers, 150, fx, fy)
#
# util.save_results(x, out_file)
# ################################################################################



########################### Test RGB Normalized joints: norm to 2D pixel to 3D World back to 2D pixel and plot #################################
### Test label normalization by projecting the normalized joints onto some RGB image samples
### this segment is only for validation
#############################################################################################################################################
lbls = util.load_labels(dataset,phase) ### load test/train data
names = util.load_names(dataset,phase)
centers = util.load_centers(dataset,phase).astype(float)
fx, fy, ux, uy =  util.get_param(dataset)

lbls = [s.split() for s in lbls]
lbls = np.reshape(np.asarray(lbls, dtype=np.float32),(-1,63))
lbls = util.transform_pose(dataset, lbls, centers, 150, fx, fy) # norm to 2D pixel

centers = np.reshape(centers,(-1,3))

for idx, name in enumerate(names):
    if idx%1000 == 0:
        lbl = util.pixel2world(lbls[idx], dataset) # pixel to 3D world
        lbl, skel_camcoords = util.world2pixel(lbl, dataset) # back to 2d pixel from 3D world
        img = util.load_image(dataset, os.path.join(root_dir, name))
Example #23
########################
## This is the preprocessing and validation script for Depth.
## Keep the segment you would like to use and comment out the rest,
## because there are multiple data files (labels before/after normalization) that would otherwise conflict.
########################

dataset = 'fpad'
phase = 'train'  ## test/train
root_dir = '/home/bilbeisi/REN/'

############################ Draw pose on depth samples #################################
### draw pose on some depth samples to validate world2pixel and image/label loading
### this segment is only for validation
##############################################################################################
lbls = util.load_labels(dataset, phase)  ### load test/train data
names = util.load_names(dataset, phase)
centers = util.load_centers(dataset, phase).astype(float)

for idx, name in enumerate(names):
    if idx % 1000 == 0:
        lbl = np.asarray(np.reshape(lbls[idx].split(), (21, 3)),
                         dtype=np.float32)
        lbl, skel_camcoords = util.world2pixel(lbl, dataset)
        img = util.load_image(dataset, os.path.join(root_dir, name))
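        # scale the raw depth values into 0-255 for visualisation
        # (1160 appears to be the maximum depth value in this dataset)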
        img /= 1160
        img *= 255
        points = centers[idx]
        img = util.draw_pose(dataset, img, lbl, 3, (255, 0, 0), points)
        cv2.imwrite(
            root_dir + 'samples/depth/' + phase + '_' + str(idx) + '.png', img)
Example #24
def train_classifier(feature_name, train_batch_num, base_npz_dir,
                     test_batches):
    test_acc = []
    base_path = util.get_base_path()
    categories = util.get_categories()
    train_batches = range(0, train_batch_num)
    #test_batches = range(train_batch_num,train_batch_num+1) JC edit
    set_name = 'setb50k'
    label_set_name = set_name
    subset = ''  #'_pca1'
    classifier_paramstring = ''
    if do_norm: classifier_paramstring += 'N'
    if props['C'] != 0:
        classifier_paramstring += 'C%d' % props['C']
    out_fn = os.path.join(
        base_npz_dir, feature_name, '%s%s_%s%s_%d-%d.pickle' %
        (classifier_type, classifier_paramstring, set_name, subset,
         train_batches[0], train_batches[-1]))
    if do_norm:
        out_fn_norm = os.path.join(
            base_npz_dir, feature_name,
            'norm_%s%s_%d.pickle' % (set_name, subset, train_batches[0]))
    print 'Training %s...' % out_fn

    if classifier_type == 'sgd_svm':
        is_incremental = True
    else:
        is_incremental = False

    norm = dict()
    clf = None

    for i_batch, train_batch in enumerate(train_batches + test_batches):
        fn = os.path.join(base_npz_dir, feature_name,
                          '%s_%05d%s.npz' % (set_name, train_batch, subset))
        print 'Processing feature file %s.' % fn
        print fn
        with np.load(fn) as file_contents:

            data = file_contents['data']

        true_labels, _ = util.load_labels(label_set_name, train_batch)

        if do_norm:
            if i_batch == 0:
                # Initial batch to determine mean and variance for normalization
                norm['mean'] = np.expand_dims(data.mean(axis=0), 0)
                norm['std'] = np.expand_dims(data.std(axis=0), 0)
                norm['std'] = np.maximum(norm['std'], 0.01)
                with open(out_fn_norm, 'wb') as fid:
                    pickle.dump(norm, fid)

            data -= norm['mean']
            data /= norm['std']
            print 'Data after normalization: Mean %f, Std %f' % (data.mean(
                axis=0).mean(axis=0), data.std(axis=0).mean(axis=0))

        if is_incremental:
            # Incremental: Do training every training iteration
            # Do testing not just on test but also during training before feeding the new training data
            do_train = (i_batch < len(train_batches))
            do_test = (i_batch > 0)
            use_data = data
            use_true_labels = true_labels
        else:
            # Non-incremental: Train once when all training batches have been collected
            do_train = (i_batch == len(train_batches) - 1)
            do_test = (i_batch >= len(train_batches))
            # data collection phase
            if not do_test:
                if i_batch == 0:
                    data_all = data
                    all_true_labels = true_labels
                else:
                    data_all = np.concatenate((data_all, data), axis=0)
                    all_true_labels = np.concatenate(
                        (all_true_labels, true_labels), axis=0)
            use_data = data_all
            use_true_labels = all_true_labels

        print '  use data %s.' % str(use_data.shape)
        print '  use labels %s' % str(use_true_labels.shape)

        if do_test:
            # After some batch training has been done, predict performance
            pred_labels = clf.predict(data)
            acc = float(sum(pred_labels == true_labels)) / true_labels.size
            test_acc.append(acc)
            print '  Batch accuracy: %.1f%%' % (acc * 100)

        if do_train:
            if classifier_type == 'sgd_svm':
                clf = train_sgd(clf, 'hinge', use_data, use_true_labels)
            elif classifier_type == 'svm':
                clf = train_svm(clf, use_data, use_true_labels, props)
            pred_labels = clf.predict(use_data)
            acc = float(
                sum(pred_labels == use_true_labels)) / use_true_labels.size
            print '  Train accuracy: %.1f%%' % (acc * 100)
            # Dump classifier data at every iteration
            with open(out_fn, 'wb') as fid:
                pickle.dump(clf, fid)
    return np.mean(test_acc)