Example #1
File: train_sst.py  Project: framr/ml
    random.seed(31415)
    np.random.seed(9265)
    word_vectors = np.concatenate((
        (np.random.rand(num_words, dim_vectors) - .5) / dim_vectors, 
        np.zeros((num_words, dim_vectors))), 
        axis=0
    )

    params['sgd']['step'] = 0.2
    params['sgd']['iterations'] = 40000
    params['sgd']['tolerance'] = 1e-48
    params['sgd']['anneal_every'] = 20000
    params['sgd']['anneal_factor'] = 0.5

    word_vectors0 = sgd(
        lambda vec: word2vec_sgd_wrapper(skipgram, tokens, vec, dataset, params, neg_sampling_cost_and_gradient), 
        word_vectors, params, postprocessing=normalize_rows, use_saved=True, print_every=100, save_params_every=5000)

    # sanity check: cost at convergence should be around or below 10
    # sum the input and output word vectors
    word_vectors = (word_vectors0[:num_words,:] + word_vectors0[num_words:,:])

    print "\n=== For autograder ==="
    check_words = ["the", "a", "an", "movie", "ordinary", "but", "and"]
    check_idx = [tokens[word] for word in check_words]
    check_vecs = word_vectors[check_idx, :]
    print check_vecs


    # Visualize the word vectors you trained
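The sgd call above passes postprocessing=normalize_rows, which is not defined in this snippet. A minimal sketch, assuming it rescales each row of the parameter matrix to unit L2 norm (the usual word2vec normalization), could look like this:

import numpy as np

def normalize_rows(x):
    # Divide each row by its L2 norm; guard against all-zero rows so the
    # zero-initialized output vectors above do not cause a division by zero.
    norms = np.sqrt(np.sum(x ** 2, axis=1, keepdims=True))
    norms[norms == 0.0] = 1.0
    return x / norms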
Example #2
    dim_vectors = word_vectors.shape[1]

    dataset = StanfordSentiment()
    train_features, train_labels, words = get_data(dataset, word_vectors, dtype='train')

    weights = np.random.randn(dim_vectors, 5)  # D x NUM_LABELS array
    # We will do batch optimization
    params = AttrDict({
        'sgd' : {'batch_size': 50, 'step': 3.0, 'iterations': iterations, 'tolerance': 0,
            'anneal_every': 10000, 'anneal_factor': 0.5},
        'dataset' : {}
    })

    print "Starting SGD..."
 
    weights = sgd(lambda weights: softmax_wrapper(train_features, train_labels, weights, regularization),
        weights, params, postprocessing=None, use_saved=False, print_every=500, save_params_every=1000)

    _, _, pred = softmax_regression(train_features, train_labels, weights)
    print "Train precision (%%): %f" % precision(train_labels, pred)
    save_data(words, train_labels, pred, 'data_train.txt')
     
    print "Testing on dev dataset"
    dev_features, dev_labels, dev_words = get_data(dataset, word_vectors, dtype='dev')

    print dev_features.shape, weights.shape
    _, _, pred = softmax_regression(dev_features, dev_labels, weights)
    print "Dev precision (%%): %f" % precision(dev_labels, pred)
    save_data(dev_words, dev_labels, pred, 'data_dev.txt')


    test_features, test_labels, test_words = get_data(dataset, word_vectors, dtype='test')
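precision and save_data are defined elsewhere in the project; a minimal sketch of precision, assuming it simply returns the percentage of predictions that match the gold labels (which is what the "Train precision (%)" message suggests), might be:

import numpy as np

def precision(labels, pred):
    # Percentage of predicted labels equal to the gold labels.
    # Assumes both arguments are 1-D integer arrays of the same length.
    return 100.0 * np.mean(np.asarray(labels) == np.asarray(pred))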
Example #3
File: test.py  Project: framr/ml
    # Context size
    context_size = 5

    print "Training word vectors"

    # Reset the random seed to make sure that everyone gets the same results
    random.seed(31415)
    np.random.seed(9265)
    word_vectors = np.concatenate((
        (np.random.rand(num_words, dim_vectors) - 0.5) / dim_vectors, 
        np.zeros((num_words, dim_vectors))), 
        axis=0
    )
    word_vectors0 = sgd(
        lambda vec: word2vec_sgd_wrapper(skipgram, tokens, vec, dataset, context_size, neg_sampling_cost_and_gradient), 
        word_vectors, 0.3, 40000, postprocessing=normalize_rows, use_saved=True, print_every=10, tolerance=1e-8)

    # sanity check: cost at convergence should be around or below 10

    # sum the input and output word vectors
    word_vectors = (word_vectors0[:num_words,:] + word_vectors0[num_words:,:])

    print "\n=== For autograder ==="
    check_words = ["the", "a", "an", "movie", "ordinary", "but", "and"]
    checkIdx = [tokens[word] for word in check_words]
    checkVecs = word_vectors[checkIdx, :]
    print checkVecs


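The sgd routine itself is not shown in any of these snippets. A minimal sketch, assuming it follows the pattern implied by the parameters used above (a fixed step size halved every anneal_every iterations, an optional row-normalization postprocessing step, progress printed every print_every iterations, and an early stop when the cost change falls below tolerance), might look like this; the project's real sgd() also handles the params dict, use_saved, and save_params_every, which are omitted here:

def sgd(cost_and_gradient, x0, step, iterations, postprocessing=None,
        anneal_every=20000, anneal_factor=0.5, tolerance=1e-8,
        print_every=100):
    # Hypothetical minimal SGD loop matching the hyperparameters above.
    x = x0.copy()
    prev_cost = None
    for it in xrange(1, iterations + 1):
        cost, grad = cost_and_gradient(x)
        x = x - step * grad
        if postprocessing is not None:
            x = postprocessing(x)
        if it % print_every == 0:
            print "iter %d: cost %f" % (it, cost)
        if it % anneal_every == 0:
            # anneal the learning rate periodically
            step *= anneal_factor
        if prev_cost is not None and abs(prev_cost - cost) < tolerance:
            break
        prev_cost = cost
    return x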