    gradient_checker = ModelGradientChecker(
        CostMinimizationObjective(cost=cost_function,
                                  data_provider=validation_data_provider,
                                  regularizer=regularizer))


    n_epochs = 1
    n_batches = train_data_provider.batches_per_epoch * n_epochs

    time_start = time.time()

    costs = []
    prev_weights = tweet_model.pack()
    for batch_index, iteration_info in enumerate(optimizer):
        costs.append(iteration_info['cost'])

        if batch_index % 10 == 0:
            X_valid, Y_valid, meta_valid = validation_data_provider.next_batch()

            Y_hat = tweet_model.fprop(X_valid, meta=meta_valid)
            assert np.all(np.abs(Y_hat.sum(axis=1) - 1) < 1e-6)

            # Full gradient checking is accurate but very slow, so it is
            # skipped here (the model in this scope is tweet_model):
            # grad_check = gradient_checker.check(tweet_model)
            grad_check = "skipped"

            acc = np.mean(np.argmax(Y_hat, axis=1) == np.argmax(Y_valid, axis=1))

            print "B: {}, A: {}, C: {}, Prop1: {}, Param size: {}, g: {}".format(
                batch_index,
                acc, costs[-1],
                np.argmax(Y_hat, axis=1).mean(),
                np.mean(np.abs(tweet_model.pack())),
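

# ---------------------------------------------------------------------------
# Imports that optimize_and_save below relies on.  Only the standard-library
# and nltk imports are spelled out here; the model and optimization classes
# (LabelledSequenceMinibatchProvider, CrossEntropy, CostMinimizationObjective,
# AdaGrad, L2Regularizer, SGD) are assumed to come from the project's own
# modules, whose import paths are not shown in this fragment.
# ---------------------------------------------------------------------------
import gzip
import json
import pickle
import random

import numpy as np
from nltk.tokenize import WordPunctTokenizer
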
def optimize_and_save(model, alphabet, n_batches, data_file_name,
                      chars_or_words, result_file_name):

    print result_file_name

    with gzip.open(data_file_name) as data_file:
        data = json.loads(data_file.read())
        X, Y = map(list, zip(*data))

        # shuffle
        combined = zip(X, Y)
        random.shuffle(combined)
        X, Y = map(list, zip(*combined))

        # map the smiley labels to integer class ids: ":)" -> 0, ":(" -> 1
        Y = [[":)", ":("].index(y) for y in Y]

    if chars_or_words == 'chars':
        # split each tweet into a list of characters
        X = [list(x) for x in X]
    elif chars_or_words == 'words':
        # tokenize and replace out-of-alphabet tokens with the UNKNOWN symbol
        tokenizer = WordPunctTokenizer()
        new_X = []
        for x in X:
            new_X.append(
                [w if w in alphabet else 'UNKNOWN'
                 for w in tokenizer.tokenize(x)])
        X = new_X
    else:
        raise ValueError(
            "I don't know what '{}' means :(".format(chars_or_words))


    # hold out the last 500 (already shuffled) examples for validation
    train_data_provider = LabelledSequenceMinibatchProvider(
        X=X[:-500],
        Y=Y[:-500],
        batch_size=50,
        padding='PADDING')

    validation_data_provider = LabelledSequenceMinibatchProvider(
        X=X[-500:],
        Y=Y[-500:],
        batch_size=500,
        padding='PADDING')

    cost_function = CrossEntropy()

    objective = CostMinimizationObjective(
        cost=cost_function,
        data_provider=train_data_provider)

    update_rule = AdaGrad(
        gamma=0.05,
        model_template=model)

    regularizer = L2Regularizer(lamb=1e-4)

    optimizer = SGD(
        model=model,
        objective=objective,
        update_rule=update_rule,
        regularizer=regularizer)

    print model

    monitor_info = []
    iteration_info = []
    for batch_index, info in enumerate(optimizer):
        iteration_info.append(info)

        if batch_index % 10 == 0:
            X_valid, Y_valid, meta_valid = validation_data_provider.next_batch()

            Y_hat = model.fprop(X_valid, meta=meta_valid)
            assert np.all(np.abs(Y_hat.sum(axis=1) - 1) < 1e-6)

            acc = np.mean(np.argmax(Y_hat, axis=1) == np.argmax(Y_valid, axis=1))
            prop_1 = np.argmax(Y_hat, axis=1).mean()

            monitor_info.append({
                'batch_index': batch_index,
                'acc': acc,
                'prop_1': prop_1,
            })

            print "B: {}, A: {}, C: {}, Prop1: {}, Param size: {}".format(
                batch_index,
                acc, info['cost'],
                prop_1,
                np.mean(np.abs(model.pack())))

        if batch_index == n_batches - 1:
            break

    result = {
        'model': model,
        'iteration_info': iteration_info,
        'monitor_info': monitor_info,
        }

    # the binary pickle protocol requires the file to be opened in binary mode
    with open(result_file_name, 'wb') as result_file:
        pickle.dump(result, result_file, protocol=-1)
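

# ---------------------------------------------------------------------------
# Toy, self-contained sketch of the quantities monitored in the loops above
# (accuracy, the fraction of examples predicted as class 1, and the mean
# absolute parameter value).  The arrays are made up for illustration and
# only numpy is assumed.
# ---------------------------------------------------------------------------
def _demo_monitoring_metrics():
    # Fake softmax outputs for four examples over two classes (rows sum to 1)
    # and the matching one-hot labels.
    Y_hat = np.array([[0.9, 0.1],
                      [0.2, 0.8],
                      [0.6, 0.4],
                      [0.3, 0.7]])
    Y_valid = np.array([[1, 0],
                        [0, 1],
                        [0, 1],
                        [0, 1]])

    # Same sanity check as in the training loops: each row is a distribution.
    assert np.all(np.abs(Y_hat.sum(axis=1) - 1) < 1e-6)

    # Accuracy: how often the predicted class matches the one-hot label.
    acc = np.mean(np.argmax(Y_hat, axis=1) == np.argmax(Y_valid, axis=1))

    # prop_1: fraction of examples predicted as class 1; a quick check that
    # the classifier has not collapsed onto a single class.
    prop_1 = np.argmax(Y_hat, axis=1).mean()

    # "Param size" in the logs is np.mean(np.abs(model.pack())); here a fake
    # parameter vector stands in for the packed model weights.
    fake_params = np.array([0.5, -1.0, 0.25, -0.25])
    param_size = np.mean(np.abs(fake_params))

    print "acc: {}, prop_1: {}, param size: {}".format(acc, prop_1, param_size)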
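

# ---------------------------------------------------------------------------
# Toy sketch of the 'chars' vs. 'words' preprocessing branch in
# optimize_and_save.  The example tweet and alphabet are made up; the only
# external dependency is nltk's WordPunctTokenizer, the tokenizer used above.
# ---------------------------------------------------------------------------
def _demo_preprocessing():
    x = "so happy it works :)"
    alphabet = set(['so', 'happy', 'it', ':)', 'UNKNOWN', 'PADDING'])

    # chars_or_words == 'chars': every character becomes one input symbol.
    as_chars = list(x)

    # chars_or_words == 'words': tokenize, then map any token that is not in
    # the alphabet to the special 'UNKNOWN' symbol, as in the function above.
    tokenizer = WordPunctTokenizer()
    as_words = [w if w in alphabet else 'UNKNOWN'
                for w in tokenizer.tokenize(x)]

    print as_chars
    print as_words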