Example #1
def load(subset=None, min_occ=1, min_files=1):
    """
    Loads the raw text data that constitutes the Microsoft
    Sentence Completion Challenge (stored in ./data/).
    Processes the data, tokenizes and parses it, and returns
    the results.

    Returned is a tuple (train_sents, question_groups, answers,
    feature_sizes).
    The 'train_sents' object is a numpy array of shape
    (token_count, feature_count). Feature columns are first textual
    (orth, lemma, lemma_4), then syntactic (pos, dependency-type).
    The [-2] column contains syntactic-parent indices, and the [-1]
    column denotes which sentence the token belongs to. The
    'question_groups' object is an iterable of question groups. Each
    group consists of 5 sentences (one of which is correct), and each
    sentence is a parsed-text array as described above. The 'answers'
    object is a numpy array of shape (question_group_count, ) that
    contains the indices of the correct sentences in the question
    groups. The 'feature_sizes' object is a numpy array containing
    the dimensionality of each feature.

    :param subset: The number of training files to process.
        If None (default), all of the files are processed.
    :param min_occ: Minimum number of occurrences of a token (word)
        required for it to be included in the vocabulary.
        The default value (1) uses all words that occurred in the trainset.
    :param min_files: Minimum number of files a term has to occur in
        for it to be included in the vocabulary.
    """
    dir = os.path.join("data", "processed")
    if not os.path.exists(dir):
        os.makedirs(dir)
    name_base = "subset_%r-min_occ_%r-min_files_%r" % (subset, min_occ,
                                                       min_files)

    #   look for the cached processed data, return if present
    file_name = os.path.join(dir, name_base + ".pkl")
    data = util.try_pickle_load(file_name)
    if data is not None:
        return data

    #   did not find cached data, will have to process it
    #   log the loading process also to a file
    log_name = os.path.join(dir, name_base + ".log")
    log.addHandler(logging.FileHandler(log_name))

    #   process the data, cache it and return
    data = _load(subset, min_occ, min_files)
    util.try_pickle_dump(data, file_name)
    return data
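
A minimal usage sketch for the 'load' function above (the module name "data" and the pruning thresholds are assumptions, not taken from the source):

import data

#   load the processed dataset with illustrative pruning thresholds
train_sents, question_groups, answers, feature_sizes = data.load(
    subset=None, min_occ=5, min_files=2)

print "Train tokens x features:", train_sents.shape
print "Number of question groups:", len(question_groups)
print "Answers shape:", answers.shape
print "Feature dimensionalities:", feature_sizes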
Example #2
def main(x_path, y_path):
    x = try_pickle_load(x_path)
    y = try_pickle_load(y_path)
    print "Shape of loaded x data is", x.shape
    print "Shape of loaded y data is", y.shape
    assert (x.shape[0] == y.shape[0])

    test_size = int(x.shape[0] * TEST_SIZE)
    train_size = x.shape[0] - test_size
    assert (train_size + test_size == x.shape[0])
    print "Train size", train_size
    print "Test size", test_size

    indices = np.arange(x.shape[0])
    np.random.shuffle(indices)

    train_ind = indices[:train_size]
    test_ind = indices[train_size:]

    train_set_x = x[train_ind]
    test_set_x = x[test_ind]
    train_set_y = y[train_ind]
    test_set_y = y[test_ind]

    folder_name = os.path.split(x_path)[0]
    print "Folder to save", folder_name

    try_pickle_dump(train_set_x, os.path.join(folder_name, "x_train.bin"))
    try_pickle_dump(test_set_x, os.path.join(folder_name, "x_test.bin"))
    try_pickle_dump(train_set_y, os.path.join(folder_name, "y_train.bin"))
    try_pickle_dump(test_set_y, os.path.join(folder_name, "y_test.bin"))

    print "Done"
def faces(fold):
    """
    Retrieves a list of face images. Images are numpy arrays
    of (img_height, img_width, RGB) shape. The images represent
    the clipped and masked face images from the given fold
    of the FDDB database.

    :param fold: int indicating which fold is
        desired. In [1, 10] range.
    """
    log.info("Retrieving face images for fold %s", str(fold))

    #   generate file name in which this fold's face images are stored
    faces_file_name = os.path.join(
        FACE_ONLY_ROOT, "fddb_facesonly_fold_{:02d}.zip".format(fold))

    #   try to load and return pickled data
    face_images = util.try_pickle_load(faces_file_name, zip=False)
    if face_images is not None:
        return face_images

    #   resulting face images
    #   each image is a numpy array of RGB components of
    #   (img_height, img_width, 3) shape
    face_images = []

    #   go through all the photos in the fold
    #   and their FDDB ellipse info (face annotations)
    for photo_path, (masks, bboxes) in image_face_masks_bboxes(fold).items():

        log.info("Processing photo %s", photo_path)

        #   load photo
        log.debug("Loading photo")
        photo_RGB = cv2.imread(photo_path, 1)

        #   for each ellipse get its mask and bbox
        for mask, bbox in zip(masks, bboxes):

            #   apply the bounding box
            log.debug("Applying mask and bounds")
            face_img = np.array(photo_RGB[bbox[0][0]:bbox[1][0],
                                          bbox[0][1]:bbox[1][1], :])

            #   apply the mask
            face_mask = mask[bbox[0][0]:bbox[1][0], bbox[0][1]:bbox[1][1]]
            face_img[np.logical_not(face_mask), :] = 0

            #   store the image
            face_images.append(face_img)

    #   store image data for subsequent usage
    if not util.try_pickle_dump(face_images, faces_file_name, zip=False):
        raise "Failed to pickle face images"

    return face_images
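
A short usage sketch for 'faces' (fold numbers follow the FDDB convention and lie in the [1, 10] range):

face_images = faces(1)
print "Fold 1 contains", len(face_images), "face images"
print "First face image shape (height, width, RGB):", face_images[0].shape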
def image_face_masks_bboxes(fold):
    """
    Returns a dictionary in which keys are file
    paths of images belonging to the fold.
    Values are tuples (masks, bboxes), where "masks"
    are boolean face-ellipse masks for that image
    and "bboxes" are bounding-box info for that image.
    The returned dictionary is ordered the same way
    the ellipse info file is.
    """
    log.info("Retrieving image masks for fold %s", str(fold))

    #   file name of the cached version
    masks_file_name = os.path.join(
        FACE_MASK_ROOT, "fddb_face_masks_fold{:02d}.zip".format(fold))

    #   try to load and return pickled data
    masks = util.try_pickle_load(masks_file_name, zip=False)
    if masks is not None:
        return masks

    #   there is no pickled version, we need to create the masks
    masks_dict = collections.OrderedDict()

    for photo_path, elipses in image_elipses(fold).items():
        log.info("Processing photo %s", photo_path)

        #   load photo
        log.debug("Loading photo")
        photo_RGB = cv2.imread(photo_path, 1)
        photo_shape = photo_RGB.shape[:2]

        #   for each ellipse get its mask and bbox, and store them
        #   first prepare the numpy arrays in which they are stored
        masks = np.zeros((len(elipses), photo_shape[0], photo_shape[1]),
                         dtype=np.bool)
        bboxes = np.zeros((len(elipses), 2, 2), dtype=np.int32)
        #   then put those arrays into the dict
        masks_dict[photo_path] = (masks, bboxes)
        #   and then fill up the arrays with real data
        for elipse_ind, elipse in enumerate(elipses):

            log.debug("Calculating mask and bounds")
            mask, bbox = __elipsis_mask_and_box(photo_shape, elipse)
            masks[elipse_ind] = mask
            bboxes[elipse_ind] = bbox

    #   store image data for subsequent usage
    if not util.try_pickle_dump(masks_dict, masks_file_name, zip=False):
        raise "Failed to pickle face masks"

    return masks_dict
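
A sketch of consuming the returned dictionary (keys are photo paths, values are per-face mask and bounding-box arrays):

masks_bboxes = image_face_masks_bboxes(1)
print "Fold 1 has", len(masks_bboxes), "annotated photos"
for photo_path, (masks, bboxes) in masks_bboxes.items():
    #   masks: boolean array of shape (face_count, img_height, img_width)
    #   bboxes: int array of shape (face_count, 2, 2), each bbox being
    #   [[row_min, col_min], [row_max, col_max]]
    print photo_path, "has", masks.shape[0], "annotated faces"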
def main(show=False):
    logger.info("... loading data")
    logger.debug("Theano.config.floatX is %s" % theano.config.floatX)

    #   samples is list of Sample objects
    samples = load_dataset(DATASET_PATH)
    samples = list(samples)

    #   use only subset of data TODO remove this
    # DATA_TO_USE = 60
    # samples = samples[:DATA_TO_USE]

    random.seed(23455)
    random.shuffle(samples)

    train_samples, test_samples = split_samples(samples, 0.1)
    del samples

    cc = ClassCounter()

    x_train = generate_x(train_samples)
    x_test = generate_x(test_samples)
    y_train = generate_targets(train_samples, cc)
    y_test = generate_targets(test_samples, cc)
    del train_samples
    del test_samples

    cc.log_stats()

    try_pickle_dump(x_train, OUT_PATH + "x_train.bin")
    try_pickle_dump(x_test, OUT_PATH + "x_test.bin")
    try_pickle_dump(y_train, OUT_PATH + "y_train.bin")
    try_pickle_dump(y_test, OUT_PATH + "y_test.bin")

    # print x_train[0][0, 0, 80:90, 80:90]
    # print x_test[0][0, 0, 80:90, 80:90]

    if show:
        n_imgs = 5
        for j in xrange(n_imgs):
            pylab.subplot(2, n_imgs, 0 * n_imgs + j + 1)
            pylab.axis('off')
            pylab.imshow(x_train[0][j, 0, :, :])  # rgb
        for j in xrange(n_imgs):
            pylab.subplot(2, n_imgs, 1 * n_imgs + j + 1)
            pylab.gray()
            pylab.axis('off')
            pylab.imshow(y_train[j, :, :])
        pylab.show()
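
The 'split_samples' helper is not shown in these examples; a minimal sketch of what it might do, assuming it splits an (already shuffled) sample list by a given fraction:

def split_samples(samples, test_fraction):
    #   hypothetical sketch: the last test_fraction of the samples
    #   becomes the test/validation part
    test_size = int(len(samples) * test_fraction)
    train_size = len(samples) - test_size
    return samples[:train_size], samples[train_size:]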
def histograms(fold):
    """
    Generates YIQ component histograms for
    face and not-face parts of the images
    of the given fold(s) of the FDDB database.

    Returns a tuple (hist_face, hist_noface).

    :type fold: int or iterable of ints
    :param fold: When an int: the number of the FDDB
        fold. When an iterable: a collection of FDDB fold
        numbers whose histograms are averaged.
    """

    if not isinstance(fold, int):
        #   fold param is an iterable
        #   get individual fold histograms
        hists_face, hists_noface = zip(*[histograms(f) for f in fold])

        #   sum them up and return
        fold_count = len(hists_face)
        hist_face = sum(hists_face[1:], hists_face[0]) / fold_count
        hist_noface = sum(hists_noface[1:], hists_noface[0]) / fold_count
        return (hist_face, hist_noface)

    #   generate file name in which this fold's histograms are stored
    hist_file_name = os.path.join(
        HIST_ROOT, "fddb_YIQ_histogram_fold_{:02d}.pkl".format(fold))

    #   try to load and return pickled histogram data
    pickled_hist = util.try_pickle_load(hist_file_name)
    if pickled_hist is not None:
        return pickled_hist

    #   failed to load pickled data, create histograms

    #  prepare histograms
    #  first dimension indicates Y, I or Q,
    #  second dimension are bins
    hist_face = np.zeros((3, 256), np.int)
    hist_noface = np.zeros((3, 256), np.int)

    #   go through all the photos in the fold
    #   and their FDDB ellipse info (face annotations)
    for photo_path, (masks, bboxes) in image_face_masks_bboxes(fold).items():

        log.info("Processing photo %s", photo_path)

        #   load photo, convert to YIQ
        log.debug("Loading photo")
        photo_RGB = cv2.imread(photo_path, 1)
        log.debug("Converting to YIQ")
        photo_YIQ = util.rgb_to_yiq(photo_RGB)

        #   create masks from ellipses and OR them into one mask
        log.debug("Creating faces mask")
        mask_face = masks.any(axis=0)
        mask_noface = np.logical_not(mask_face)

        #   add current image histograms to total histograms
        log.debug("Histogramming")
        for component_ind in range(3):
            hist_face[component_ind, :] += np.histogram(
                photo_YIQ[mask_face, component_ind],
                __bin_edges[component_ind]
            )[0]
            hist_noface[component_ind, :] += np.histogram(
                photo_YIQ[mask_noface, component_ind],
                __bin_edges[component_ind]
            )[0]

    #   normalize histograms
    hist_face = hist_face.astype(np.float) / hist_face[1, :].sum()
    hist_noface = hist_noface.astype(np.float) / hist_noface[1, :].sum()

    #   store histogram data for subsequent usage
    if not util.try_pickle_dump((hist_face, hist_noface), hist_file_name):
        raise "Failed to pickle histograms"

    return (hist_face, hist_noface)
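
Usage sketch: 'histograms' accepts either a single fold or an iterable of folds, in which case the per-fold histograms are averaged:

#   histograms for a single fold
hist_face, hist_noface = histograms(1)
print "Face histogram shape (YIQ components x bins):", hist_face.shape

#   averaged histograms over several folds
hist_face_all, hist_noface_all = histograms(range(1, 6))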
def main(conf, gen_func, n_layers, show=False):
    """
    conf: dictionary
        configuration dictionary, from json file
    gen_func: function
        function used for generating inputs to network
    n_layers: int
        number of layers of the Laplacian pyramid used as input
    show: bool
        if True, a few parsed images will be shown as a result
    """
    logger.info("... loading data")
    logger.debug("Theano.config.floatX is %s" % theano.config.floatX)

    #   samples is list of Sample objects
    dataset_path = conf['training']['dataset-folder']
    samples = load_dataset(dataset_path)
    samples = list(samples)

    random.seed(conf['training']['shuffle-seed'])
    random.shuffle(samples)

    validation_size = float(conf['training']['validation-percent']) / 100.0
    train_samples, validation_samples = split_samples(samples, validation_size)
    del samples

    out_folder = conf['training']['out-folder']
    write_samples_log(train_samples,
                      os.path.join(out_folder, "samples_train.log"))
    write_samples_log(validation_samples,
                      os.path.join(out_folder, "samples_validation.log"))

    cc = ClassCounter()

    x_train = generate_x(train_samples, n_layers, gen_func)
    x_validation = generate_x(validation_samples, n_layers, gen_func)
    y_train = generate_targets(train_samples, cc)
    y_validation = generate_targets(validation_samples, cc)
    del train_samples
    del validation_samples

    try_pickle_dump(x_train, os.path.join(out_folder, "x_train.bin"))
    try_pickle_dump(x_validation, os.path.join(out_folder, "x_validation.bin"))
    try_pickle_dump(y_train, os.path.join(out_folder, "y_train.bin"))
    try_pickle_dump(y_validation, os.path.join(out_folder, "y_validation.bin"))

    #   if test data defined
    if 'test' in conf:
        logger.info("Found test configuration, generating test data")
        test_samples = load_dataset(conf['test']['dataset-folder'])
        test_samples = list(test_samples)
        write_samples_log(test_samples,
                          os.path.join(out_folder, "samples_test.log"))
        x_test = generate_x(test_samples, n_layers, gen_func)
        y_test = generate_targets(test_samples, cc)

        try_pickle_dump(x_test, os.path.join(out_folder, "x_test.bin"))
        try_pickle_dump(y_test, os.path.join(out_folder, "y_test.bin"))

    cc.log_stats()

    if show:
        #   show few parsed samples from train set
        n_imgs = 5
        for j in xrange(n_imgs):
            pylab.subplot(3, n_imgs, 0 * n_imgs + j + 1)
            pylab.axis('off')
            pylab.imshow(x_train[0][j, 0, :, :])  # Y
        for j in xrange(n_imgs):
            pylab.subplot(3, n_imgs, 1 * n_imgs + j + 1)
            pylab.gray()
            pylab.axis('off')
            pylab.imshow(x_train[0][j, 3, :, :])  # depth
        for j in xrange(n_imgs):
            pylab.subplot(3, n_imgs, 2 * n_imgs + j + 1)
            pylab.gray()
            pylab.axis('off')
            pylab.imshow(y_train[j, :, :])
        pylab.show()
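
The configuration dictionary is parsed from a JSON file; a sketch of the keys this 'main' reads (paths and values are illustrative, and the input-generating function is a hypothetical placeholder):

conf = {
    "training": {
        "dataset-folder": "./data/train",
        "shuffle-seed": 23455,
        "validation-percent": 10,
        "out-folder": "./data/out",
    },
    #   optional: if a "test" section is present, test data is generated too
    "test": {
        "dataset-folder": "./data/test",
    },
}
#   main(conf, gen_func=some_input_generator, n_layers=3, show=False)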
Example #11
def eval_model(conf,
               train_fn,
               test_fn,
               n_train_batches,
               n_test_batches,
               layers,
               pre_fn=None,
               l_rate_wrapper=None):
    """
    Function for training and validating models.

    conf: dictionary
        configuration params
    train_fn: theano function
        training function
    test_fn: theano function
        validation function
    n_train_batches: int
        number of batches for training
    n_test_batches: int
        number of batches for validation
    layers: list
        list of layers, used to extract best params
    pre_fn: function
        function to be called before training
    l_rate_wrapper: UpdateParameters object
        learning rate wrapper object

    returns: (best_validation_error, best_iter, best_params)
        the best validation error, iteration and parameters
    """
    assert (type(conf) is dict)
    n_epochs = conf['epochs']
    if n_epochs < 0:
        n_epochs = maxint

    # how often to lower learning rate if no improvement
    epochs_check_learn_rate = None
    if 'learning-rate-decrease-params' in conf:
        lrdp_params = conf['learning-rate-decrease-params']
        epochs_check_learn_rate = lrdp_params['no-improvement-epochs']
        min_learning_rate = lrdp_params['min-learning-rate']

    # file for dumping weights
    now = datetime.now()
    weights_filename = "network-%d-%d.bin" % (now.hour, now.minute)

    logger.info('... training')

    #   early-stopping parameters
    # look at this many iterations regardless
    patience = n_train_batches * 20  # skip first 20 epochs
    # wait this much longer when a new best is found
    patience_increase = 1.1
    # a relative improvement of this much is considered significant
    improvement_threshold = 0.998
    # go through this many minibatches before checking the network
    # on the validation set; in this case we check every epoch
    validation_frequency = min(n_train_batches, patience / 2)
    logger.debug('Validation frequency is %d' % validation_frequency)

    set_layers_training_mode(layers, 1)

    best_validation_loss = numpy.inf
    best_iter = 0
    best_epoch = 0  # best epoch for train cost
    best_params = []
    best_train_cost = numpy.inf

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1

        # function to be called before every epoch
        if pre_fn is not None:
            pre_fn()

        training_costs = numpy.zeros((n_train_batches), dtype='float32')
        for minibatch_index in xrange(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            cost_ij = train_fn(minibatch_index)
            training_costs[minibatch_index] = cost_ij
            # logger.info('training @ iter = %d, cost %f' % (iter, cost_ij))
            stdout.write('.')
            stdout.flush()

            if (iter + 1) % validation_frequency == 0:
                stdout.write('\n')  # newline after iteration dots

                set_layers_training_mode(layers, 0)

                # compute zero-one loss on validation set
                validation = [test_fn(i) for i in xrange(n_test_batches)]
                set_layers_training_mode(layers, 1)

                validation_losses = [v[0] for v in validation]
                validation_costs = [v[1] for v in validation]

                # class accuracies
                correct = numpy.zeros((layers[0].n_classes), dtype='int32')
                total = numpy.zeros((layers[0].n_classes), dtype='int32')
                for v in validation:
                    correct += v[2]
                    total += v[3]
                validation_class_accuracy = calc_class_accuracy(correct, total)

                this_validation_loss = numpy.mean(validation_losses)
                logger.info(
                    'epoch %i, minibatch %i/%i, validation error %f %%', epoch,
                    minibatch_index + 1, n_train_batches,
                    this_validation_loss * 100.)
                logger.info('validation cost: %f',
                            numpy.mean(validation_costs))
                logger.info('mean class accuracy: %f %%',
                            validation_class_accuracy * 100.)

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                            improvement_threshold:
                        patience = max(
                            patience, 10 * n_train_batches +
                            int(iter * patience_increase + 1))
                        logger.info("Patience increased to %d epochs",
                                    int(patience / n_train_batches))

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter
                    # save model parameters
                    best_params = [l.get_weights() for l in layers]
                    try_pickle_dump(best_params, weights_filename)

                    logger.info(('     epoch %i, minibatch %i/%i,'
                                 'validation error of best model %f %%') %
                                (epoch, minibatch_index + 1, n_train_batches,
                                 this_validation_loss * 100.))

            if patience <= iter:
                logger.info("Ran out of patience")
                done_looping = True
                break

        train_cost = numpy.mean(training_costs)
        logger.info('Average training cost %f', train_cost)
        if train_cost < best_train_cost * improvement_threshold:
            best_train_cost = train_cost
            best_epoch = epoch

        # lower learning rate if no improvement
        learn_rate = l_rate_wrapper.learning_rate.get_value()
        if learn_rate > min_learning_rate and\
                (epoch - best_epoch + 1) % epochs_check_learn_rate == 0:
            l_rate_wrapper.lower_rate_by_factor(0.5)

    logger.info('Optimization complete.')

    return best_validation_loss, best_iter, best_params
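
A sketch of the configuration dictionary 'eval_model' expects (key names are taken from the code above, values are illustrative); the training/validation functions, batch counts and layers are assumed to be built beforehand:

conf = {
    "epochs": 100,  # a negative value means "train until patience runs out"
    #   optional: learning-rate decrease settings
    "learning-rate-decrease-params": {
        "no-improvement-epochs": 5,
        "min-learning-rate": 0.0001,
    },
}
#   best_loss, best_iter, best_params = eval_model(
#       conf, train_fn, test_fn, n_train_batches, n_test_batches, layers,
#       l_rate_wrapper=l_rate_wrapper)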
Example #12
def eval_model(conf, train_fn, test_fn, n_train_batches, n_test_batches,
               layers, pre_fn=None, l_rate_wrapper=None):
    """
    Function for training and validating models.

    conf: dictionary
        configuration params
    train_fn: theano function
        training function
    test_fn: theano function
        validation function
    n_train_batches: int
        number of batches for training
    n_test_batches: int
        number of batches for validation
    layers: list
        list of layers, used to extract best params
    pre_fn: function
        function to be called before training
    l_rate_wrapper: UpdateParameters object
        learning rate wrapper object

    returns: (best_validation_error, best_iter, best_params)
        the best validation error, iteration and parameters
    """
    assert(type(conf) is dict)
    n_epochs = conf['epochs']
    if n_epochs < 0:
        n_epochs = maxint

    # how often to lower learning rate if no improvement
    epochs_check_learn_rate = None
    if 'learning-rate-decrease-params' in conf:
        lrdp_params = conf['learning-rate-decrease-params']
        epochs_check_learn_rate = lrdp_params['no-improvement-epochs']
        min_learning_rate = lrdp_params['min-learning-rate']

    # file for dumping weights
    now = datetime.now()
    weights_filename = "network-%d-%d.bin" % (now.hour, now.minute)

    logger.info('... training')

    #   early-stopping parameters
    # look at this many iterations regardless
    patience = n_train_batches * 20  # skip first 20 epochs
    # wait this much longer when a new best is found
    patience_increase = 1.1
    # a relative improvement of this much is considered significant
    improvement_threshold = 0.998
    # go through this many minibatches before checking the network
    # on the validation set; in this case we check every epoch
    validation_frequency = min(n_train_batches, patience / 2)
    logger.debug('Validation frequency is %d' % validation_frequency)

    set_layers_training_mode(layers, 1)

    best_validation_loss = numpy.inf
    best_iter = 0
    best_epoch = 0  # best epoch for train cost
    best_params = []
    best_train_cost = numpy.inf

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1

        # function to be called before every epoch
        if pre_fn is not None:
            pre_fn()

        training_costs = numpy.zeros((n_train_batches), dtype='float32')
        for minibatch_index in xrange(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            cost_ij = train_fn(minibatch_index)
            training_costs[minibatch_index] = cost_ij
            # logger.info('training @ iter = %d, cost %f' % (iter, cost_ij))
            stdout.write('.')
            stdout.flush()

            if (iter + 1) % validation_frequency == 0:
                stdout.write('\n')  # newline after iteration dots

                set_layers_training_mode(layers, 0)

                # compute zero-one loss on validation set
                validation = [test_fn(i) for i in xrange(n_test_batches)]
                set_layers_training_mode(layers, 1)

                validation_losses = [v[0] for v in validation]
                validation_costs = [v[1] for v in validation]

                # class accuracies
                correct = numpy.zeros((layers[0].n_classes), dtype='int32')
                total = numpy.zeros((layers[0].n_classes), dtype='int32')
                for v in validation:
                    correct += v[2]
                    total += v[3]
                validation_class_accuracy = calc_class_accuracy(correct, total)

                this_validation_loss = numpy.mean(validation_losses)
                logger.info('epoch %i, minibatch %i/%i, validation error %f %%',
                            epoch, minibatch_index + 1, n_train_batches,
                            this_validation_loss * 100.)
                logger.info('validation cost: %f',
                            numpy.mean(validation_costs))
                logger.info('mean class accuracy: %f %%',
                            validation_class_accuracy * 100.)

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                            improvement_threshold:
                        patience = max(patience,
                                       10 * n_train_batches + int(iter * patience_increase + 1))
                        logger.info("Patience increased to %d epochs",
                                    int(patience / n_train_batches))

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter
                    # save model parameters
                    best_params = [l.get_weights() for l in layers]
                    try_pickle_dump(best_params, weights_filename)

                    logger.info(('     epoch %i, minibatch %i/%i,'
                                 'validation error of best model %f %%') %
                                (epoch, minibatch_index + 1, n_train_batches,
                                 this_validation_loss * 100.))

            if patience <= iter:
                logger.info("Ran out of patience")
                done_looping = True
                break

        train_cost = numpy.mean(training_costs)
        logger.info('Average training cost %f', train_cost)
        if train_cost < best_train_cost * improvement_threshold:
            best_train_cost = train_cost
            best_epoch = epoch

        # lower learning rate if no improvement
        learn_rate = l_rate_wrapper.learning_rate.get_value()
        if learn_rate > min_learning_rate and\
                (epoch - best_epoch + 1) % epochs_check_learn_rate == 0:
            l_rate_wrapper.lower_rate_by_factor(0.5)

    logger.info('Optimization complete.')

    return best_validation_loss, best_iter, best_params
Example #13
def load_ngrams(n,
                features_use,
                tree,
                subset=None,
                min_occ=1,
                min_files=1,
                remove_subst_tokens=False):
    """
    Loads the dataset for the Microsoft Sentence Completion Challenge,
    processed into ngrams.

    The raw dataset is loaded and processed using the 'load' function,
    to which 'subset', 'min_occ' and 'min_files' are forwarded.

    The resulting dataset is then processed into ngrams using the
    'ngrams' function, to which the 'n' and 'tree' parameters are forwarded.
    This is then cached on disk for subsequent usage.

    The resulting ngrams are pruned of unwanted features as indicated
    by the 'features_use' parameter.

    Returns a tuple (sents, q_groups, answers, feature_sizes).
    This reflects the value returned by the 'load' function, except that
    'sents' and 'q_groups' are not just features extracted from text,
    but ngrams built from those features.
    """

    features_use = np.array(features_use, dtype=bool)

    log.info("Loading %d-grams, %s, features_use: %s", n,
             "tree" if tree else "linear",
             "".join([str(int(i)) for i in features_use]))

    dir = os.path.join("data", "processed")
    if not os.path.exists(dir):
        os.makedirs(dir)
    name_base = "%s-%d_grams-subset_%r-min_occ_%r-min_files_%r" % (
        "tree" if tree else "linear", n, subset, min_occ, min_files)

    #   tree-grams can all be seen as a feature-subset of 4-grams
    if tree and n < 4:
        ngrams_all = load_ngrams(4, np.ones(features_use.size,
                                            dtype=bool), tree, subset, min_occ,
                                 min_files, remove_subst_tokens)
    else:
        #   look for the cached ngrams with all the features
        file_name = os.path.join(dir, name_base + ".pkl")
        ngrams_all = util.try_pickle_load(file_name)
        #   it is possible that sentences are split
        #   in order to avoid a Python bug with storing large arrays
        if ngrams_all is not None and isinstance(ngrams_all[0], list):
            sents = np.vstack(ngrams_all[0])
            ngrams_all = (sents, ) + ngrams_all[1:]

    #   if unable to load cached data, create it
    if ngrams_all is None:
        #   load data
        tokens, q_groups, answers, ftr_sizes = load(subset, min_occ, min_files)

        #   tokens that should be present in ngrams
        #   the purpose is to remove ngrams containing tokens that are
        #   substitutes for removed ones
        invalid_tokens = None
        if remove_subst_tokens:
            invalid_tokens = dict(zip(range(3), ftr_sizes[:3] - 1))
            log.info("Invalid tokens: %r", invalid_tokens)

        #   define a function for generating ngrams, and process
        #   trainset and questions
        _ngrams = lambda tokens: ngrams(n, tree, tokens, invalid_tokens)
        sent_ngrams = _ngrams(tokens)
        q_ngrams = [map(_ngrams, qg) for qg in q_groups]

        #   store the processed data for subsequent usage
        #   split sent ngrams to avoid Py bug with pickling large arrays
        util.try_pickle_dump(
            (np.vsplit(sent_ngrams,
                       np.arange(1, 10) *
                       (len(sent_ngrams) / 10)), q_ngrams, answers, ftr_sizes),
            file_name)
        ngrams_all = (sent_ngrams, q_ngrams, answers, ftr_sizes)

    #   remove unwanted features from ngrams_all
    used_ftr = np.arange(ngrams_all[0].shape[1])[np.tile(features_use, n)]
    sents = ngrams_all[0][:, used_ftr]
    q_groups = [[q[:, used_ftr] for q in qg] for qg in ngrams_all[1]]

    return (sents, q_groups) + ngrams_all[2:]
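
A usage sketch for 'load_ngrams' (the feature flags below are illustrative; the actual number of per-token features depends on the processing done in 'load'):

#   linear 3-grams, keeping only the first two feature columns
features_use = [1, 1, 0, 0, 0]
sents, q_groups, answers, feature_sizes = load_ngrams(
    3, features_use, tree=False, subset=None, min_occ=5, min_files=2)

print "Ngram array shape:", sents.shape
print "Number of question groups:", len(q_groups)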
def main(conf, gen_func, n_layers, show=False):
    """
    conf: dictionary
        configuration dictionary, from json file
    gen_func: function
        function used for generating inputs to network
    n_layers: int
        number of layers of the Laplacian pyramid used as input
    show: bool
        if True, a few parsed images will be shown as a result
    """
    logger.info("... loading data")
    logger.debug("Theano.config.floatX is %s" % theano.config.floatX)

    #   samples is list of Sample objects
    dataset_path = conf["training"]["dataset-folder"]
    samples = load_dataset(dataset_path)
    samples = list(samples)

    if "data-subset" in conf["training"]:
        #   use only subset of data
        data_to_use = conf["training"]["data-subset"]
        logger.info("Using only subset of %d samples", data_to_use)
        samples = samples[:data_to_use]

    random.seed(conf["training"]["shuffle-seed"])
    random.shuffle(samples)

    out_folder = conf["training"]["out-folder"]

    #   if test data defined
    if "test-percent" in conf["training"]:
        logger.info("Found test configuration, generating test data")
        test_size = float(conf["training"]["test-percent"]) / 100.0
        samples, test_samples = split_samples(samples, test_size)

        write_samples_log(test_samples, os.path.join(out_folder, "samples_test.log"))
        x_test = generate_x(test_samples, n_layers, gen_func)
        y_test = generate_targets(test_samples)

        try_pickle_dump(x_test, os.path.join(out_folder, "x_test.bin"))
        try_pickle_dump(y_test, os.path.join(out_folder, "y_test.bin"))
    else:
        logger.info("No test set configuration present")

    validation_size = float(conf["training"]["validation-percent"]) / 100.0
    train_samples, validation_samples = split_samples(samples, validation_size)
    del samples

    write_samples_log(train_samples, os.path.join(out_folder, "samples_train.log"))
    write_samples_log(validation_samples, os.path.join(out_folder, "samples_validation.log"))

    x_train = generate_x(train_samples, n_layers, gen_func)
    x_validation = generate_x(validation_samples, n_layers, gen_func)
    y_train = generate_targets(train_samples)
    y_validation = generate_targets(validation_samples)
    del train_samples
    del validation_samples

    try_pickle_dump(x_train, os.path.join(out_folder, "x_train.bin"))
    try_pickle_dump(x_validation, os.path.join(out_folder, "x_validation.bin"))
    try_pickle_dump(y_train, os.path.join(out_folder, "y_train.bin"))
    try_pickle_dump(y_validation, os.path.join(out_folder, "y_validation.bin"))

    if show:
        #   show few parsed samples from train set
        n_imgs = 5
        for j in xrange(n_imgs):
            pylab.subplot(2, n_imgs, 0 * n_imgs + j + 1)
            pylab.axis("off")
            pylab.imshow(x_train[0][j, 0, :, :])  # rgb
        for j in xrange(n_imgs):
            pylab.subplot(2, n_imgs, 1 * n_imgs + j + 1)
            pylab.gray()
            pylab.axis("off")
            pylab.imshow(y_train[j, :, :])
        pylab.show()