Example #1
def train(sc,
          batch_size,
          sequence_len, max_words, embedding_dim, training_split):
    print('Processing text dataset')
    texts = news20.get_news20()
    data_rdd = sc.parallelize(texts, 2)

    word_to_ic = analyze_texts(data_rdd)

    # Keep only words whose frequency rank falls in [10, max_words)
    word_to_ic = dict(word_to_ic[10: max_words])
    bword_to_ic = sc.broadcast(word_to_ic)

    w2v = news20.get_glove_w2v(dim=embedding_dim)
    filtered_w2v = {w: v for w, v in w2v.items() if w in word_to_ic}
    bfiltered_w2v = sc.broadcast(filtered_w2v)

    # Keep only in-vocabulary words, pad/truncate each token list to a
    # fixed length, then map every word to its GloVe vector.
    tokens_rdd = data_rdd.map(lambda text_label: (
        [w for w in text_to_words(text_label[0])
         if w in bword_to_ic.value], text_label[1]))
    padded_tokens_rdd = tokens_rdd.map(lambda tokens_label: (
        pad(tokens_label[0], "##", sequence_len), tokens_label[1]))
    vector_rdd = padded_tokens_rdd.map(lambda tokens_label: (
        [to_vec(w, bfiltered_w2v.value, embedding_dim)
         for w in tokens_label[0]], tokens_label[1]))
    sample_rdd = vector_rdd.map(lambda vectors_label: to_sample(
        vectors_label[0], vectors_label[1], embedding_dim))

    train_rdd, val_rdd = sample_rdd.randomSplit(
        [training_split, 1-training_split])

    state = {"batchSize": batch_size,
             "learningRate": 0.01,
             "learningRateDecay": 0.0002}

    optimizer = Optimizer(
        model=build_model(news20.CLASS_NUM),
        training_rdd=train_rdd,
        criterion=ClassNLLCriterion(),
        end_trigger=MaxEpoch(2),
        batch_size=batch_size,
        optim_method="Adagrad",
        state=state)

    optimizer.setvalidation(
        batch_size=batch_size,
        val_rdd=val_rdd,
        trigger=EveryEpoch(),
        val_method=["Top1Accuracy"]
    )
    train_model = optimizer.optimize()
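
The pipeline above leans on three helpers (pad, to_vec, to_sample) that the snippet never defines. A minimal sketch of plausible implementations, assuming to_sample wraps the padded word-vector matrix and label in BigDL's Sample (the names and exact shapes here are assumptions, not the original code):

import numpy as np
from util.common import Sample

def pad(tokens, fill_word, to_len):
    # Truncate long token lists; right-pad short ones with a filler token.
    return tokens[:to_len] + [fill_word] * max(0, to_len - len(tokens))

def to_vec(word, w2v, embedding_dim):
    # GloVe lookup; out-of-vocabulary words (including the filler) map to zeros.
    return w2v.get(word, np.zeros(embedding_dim, dtype="float32"))

def to_sample(vectors, label, embedding_dim):
    # Stack the per-word vectors into a (sequence_len, embedding_dim)
    # feature matrix and pair it with the label as a BigDL Sample.
    features = np.stack(vectors).astype("float32")
    return Sample.from_ndarray(features, np.array(label))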
Example #2
def train(sc,
          batch_size,
          sequence_len, max_words, embedding_dim, training_split,
          max_epoch):
    print('Processing text dataset')
    texts = news20.get_news20()
    data_rdd = sc.parallelize(texts, 2)

    word_to_ic = analyze_texts(data_rdd)

    # Keep only words whose frequency rank falls in [10, max_words)
    word_to_ic = dict(word_to_ic[10: max_words])
    bword_to_ic = sc.broadcast(word_to_ic)

    w2v = news20.get_glove_w2v(dim=embedding_dim)
    filtered_w2v = {w: v for w, v in w2v.items() if w in word_to_ic}
    bfiltered_w2v = sc.broadcast(filtered_w2v)

    tokens_rdd = data_rdd.map(lambda text_label: (
        [w for w in text_to_words(text_label[0])
         if w in bword_to_ic.value], text_label[1]))
    padded_tokens_rdd = tokens_rdd.map(lambda tokens_label: (
        pad(tokens_label[0], "##", sequence_len), tokens_label[1]))
    vector_rdd = padded_tokens_rdd.map(lambda tokens_label: (
        [to_vec(w, bfiltered_w2v.value, embedding_dim)
         for w in tokens_label[0]], tokens_label[1]))
    sample_rdd = vector_rdd.map(lambda vectors_label: to_sample(
        vectors_label[0], vectors_label[1], embedding_dim))

    train_rdd, val_rdd = sample_rdd.randomSplit(
        [training_split, 1-training_split])

    state = {"learningRate": 0.01,
             "learningRateDecay": 0.0002}

    optimizer = Optimizer(
        model=build_model(news20.CLASS_NUM),
        training_rdd=train_rdd,
        criterion=ClassNLLCriterion(),
        end_trigger=MaxEpoch(max_epoch),
        batch_size=batch_size,
        optim_method="Adagrad",
        state=state)

    optimizer.setvalidation(
        batch_size=batch_size,
        val_rdd=val_rdd,
        trigger=EveryEpoch(),
        val_method=["Top1Accuracy"]
    )
    train_model = optimizer.optimize()
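
Examples #1 and #2 use the older BigDL Python API, where the optimization method is named by a string and its hyperparameters travel in a state dictionary. A driver sketch for calling this train function, assuming create_spark_conf and init_engine from BigDL's util.common (parameter values mirror Example #4):

from pyspark import SparkContext
from util.common import create_spark_conf, init_engine

if __name__ == "__main__":
    # The Spark conf must carry BigDL's properties, and the engine must be
    # initialized on the driver before any model or RDD of Samples is built.
    sc = SparkContext(appName="text_classifier", conf=create_spark_conf())
    init_engine()
    train(sc, batch_size=128, sequence_len=50, max_words=1000,
          embedding_dim=50, training_split=0.8, max_epoch=2)
    sc.stop()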
Example #3
def train(sc, batch_size, sequence_len, max_words, embedding_dim,
          training_split, max_epoch):
    print('Processing text dataset')
    texts = news20.get_news20()
    data_rdd = sc.parallelize(texts, 2)

    word_to_ic = analyze_texts(data_rdd)

    # Keep only words whose frequency rank falls in [10, max_words)
    word_to_ic = dict(word_to_ic[10:max_words])
    bword_to_ic = sc.broadcast(word_to_ic)

    w2v = news20.get_glove_w2v(dim=embedding_dim)
    filtered_w2v = dict((w, v) for w, v in w2v.items() if w in word_to_ic)
    bfiltered_w2v = sc.broadcast(filtered_w2v)

    tokens_rdd = data_rdd.map(lambda text_label: (
        [w for w in text_to_words(text_label[0])
         if w in bword_to_ic.value], text_label[1]))
    padded_tokens_rdd = tokens_rdd.map(lambda tokens_label: (pad(
        tokens_label[0], "##", sequence_len), tokens_label[1]))
    vector_rdd = padded_tokens_rdd.map(lambda tokens_label: ([
        to_vec(w, bfiltered_w2v.value, embedding_dim) for w in tokens_label[0]
    ], tokens_label[1]))
    sample_rdd = vector_rdd.map(lambda vectors_label: to_sample(
        vectors_label[0], vectors_label[1], embedding_dim))

    train_rdd, val_rdd = sample_rdd.randomSplit(
        [training_split, 1 - training_split])

    optimizer = Optimizer(model=build_model(news20.CLASS_NUM),
                          training_rdd=train_rdd,
                          criterion=ClassNLLCriterion(),
                          end_trigger=MaxEpoch(max_epoch),
                          batch_size=batch_size,
                          optim_method=Adagrad(learningrate=0.01,
                                               learningrate_decay=0.0002))

    optimizer.set_validation(batch_size=batch_size,
                             val_rdd=val_rdd,
                             trigger=EveryEpoch(),
                             val_method=[Top1Accuracy()])
    train_model = optimizer.optimize()
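
None of the examples show build_model; because training uses ClassNLLCriterion, the network has to end in LogSoftMax. Below is a plausible sketch in BigDL's layer API of a small ConvNet over the (sequence_len, embedding_dim) input matrix; the layer sizes are assumptions chosen so the shapes work out for sequence_len = 50, not the original architecture:

from nn.layer import *

def build_model(class_num, sequence_len=50, embedding_dim=50):
    model = Sequential()
    # Treat each document as embedding_dim input planes of width sequence_len.
    model.add(Reshape([embedding_dim, 1, sequence_len]))
    model.add(SpatialConvolution(embedding_dim, 128, 5, 1))   # width 50 -> 46
    model.add(ReLU())
    model.add(SpatialMaxPooling(5, 1, 5, 1))                  # 46 -> 9
    model.add(SpatialConvolution(128, 128, 5, 1))             # 9 -> 5
    model.add(ReLU())
    model.add(SpatialMaxPooling(5, 1, 5, 1))                  # 5 -> 1
    model.add(Reshape([128]))
    model.add(Linear(128, class_num))
    model.add(LogSoftMax())   # pairs with ClassNLLCriterion
    return model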
Example #4
from optim.optimizer import *
from util.common import *
from util.common import Sample
from dataset import news20   # assumed location of the 20 Newsgroups helper
import random as rd
import matplotlib.pyplot as plt
from wordcloud import WordCloud

init_engine()

# Prepare the data
batch_size = 128
embedding_dim = 50
sequence_len = 50
max_words = 1000
training_split = 0.8

# Load the 20 Newsgroups texts and GloVe word vectors
texts = news20.get_news20('/tmp/news20data')
w2v = news20.get_glove_w2v(dim=embedding_dim)
len(texts), len(w2v)  # notebook cell output: corpus size and vocabulary size

## Show a word cloud for a randomly chosen document
rand_idx = rd.randrange(0, len(texts))
wordcloud = WordCloud(max_font_size=40,
                      background_color="white").generate(texts[rand_idx][0])
print("the newsgroup of the text is %d" % texts[rand_idx][1])
plt.imshow(wordcloud)
plt.axis("off");


## Use Spark to do word analysis
data_rdd = sc.parallelize(texts, 2)

# break the text corpus into tokens (words)
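
A sketch of what the tokenization and word-frequency analysis plausibly look like, assuming analyze_texts produces the (word, (rank, count)) ranking that the training functions slice with word_to_ic[10:max_words]:

import re

def text_to_words(review_text):
    # Keep letters only, lower-case, split on whitespace.
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    return letters_only.lower().split()

def analyze_texts(data_rdd):
    # Count each word across the corpus, sort by descending frequency,
    # and attach a 1-based rank, yielding [(word, (rank, count)), ...].
    return data_rdd.flatMap(lambda text_label: text_to_words(text_label[0])) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a + b) \
        .sortBy(lambda word_count: -word_count[1]) \
        .zipWithIndex() \
        .map(lambda wc_i: (wc_i[0][0], (wc_i[1] + 1, wc_i[0][1]))) \
        .collect()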