def train(sc, batch_size, sequence_len, max_words, embedding_dim, training_split):
    print('Processing text dataset')
    texts = news20.get_news20()
    data_rdd = sc.parallelize(texts, 2)

    # Count word frequencies across the corpus.
    word_to_ic = analyze_texts(data_rdd)

    # Only keep the words ranked between [10, max_words]: the 10 most frequent
    # words are dropped as stop words and the vocabulary is capped at max_words.
    word_to_ic = dict(word_to_ic[10:max_words])
    bword_to_ic = sc.broadcast(word_to_ic)

    # Load the GloVe embeddings and keep only the vectors for the vocabulary.
    w2v = news20.get_glove_w2v(dim=embedding_dim)
    filtered_w2v = dict((w, v) for w, v in w2v.items() if w in word_to_ic)
    bfiltered_w2v = sc.broadcast(filtered_w2v)

    # Tokenize, pad/truncate to sequence_len, map tokens to embedding vectors,
    # and wrap each (features, label) pair into a BigDL Sample.
    tokens_rdd = data_rdd.map(lambda text_label: (
        [w for w in text_to_words(text_label[0]) if w in bword_to_ic.value],
        text_label[1]))
    padded_tokens_rdd = tokens_rdd.map(lambda tokens_label: (
        pad(tokens_label[0], "##", sequence_len), tokens_label[1]))
    vector_rdd = padded_tokens_rdd.map(lambda tokens_label: (
        [to_vec(w, bfiltered_w2v.value, embedding_dim) for w in tokens_label[0]],
        tokens_label[1]))
    sample_rdd = vector_rdd.map(lambda vectors_label: to_sample(
        vectors_label[0], vectors_label[1], embedding_dim))

    train_rdd, val_rdd = sample_rdd.randomSplit(
        [training_split, 1 - training_split])

    # max_epoch is assumed to be defined at module scope
    # (e.g. parsed from the command-line options).
    optimizer = Optimizer(model=build_model(news20.CLASS_NUM),
                          training_rdd=train_rdd,
                          criterion=ClassNLLCriterion(),
                          end_trigger=MaxEpoch(max_epoch),
                          batch_size=batch_size,
                          optim_method=Adagrad(learningrate=0.01,
                                               learningrate_decay=0.0002))
    optimizer.set_validation(batch_size=batch_size,
                             val_rdd=val_rdd,
                             trigger=EveryEpoch(),
                             val_method=[Top1Accuracy()])
    train_model = optimizer.optimize()
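The train function above relies on a handful of preprocessing helpers (text_to_words, pad, to_vec, to_sample) that are defined elsewhere in the example. The following is only a minimal sketch of what they need to do, assuming the GloVe vectors are plain NumPy arrays (or lists of floats), unknown words and the "##" padding token fall back to zero vectors, and samples are built with BigDL's Sample.from_ndarray; the real implementations may differ in detail.

# Sketch of the preprocessing helpers used by train(); assumptions noted inline.
import itertools
import re

import numpy as np

from util.common import Sample


def text_to_words(text):
    # Keep only letters, lower-case, and split on whitespace.
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    return letters_only.lower().split()


def pad(tokens, fill_value, width):
    # Truncate or right-pad the token list to exactly `width` entries.
    if len(tokens) >= width:
        return tokens[:width]
    return tokens + [fill_value] * (width - len(tokens))


def to_vec(token, w2v, embedding_dim):
    # Look up the embedding; unknown words and the padding token fall back
    # to an all-zero vector (an assumption of this sketch).
    return w2v.get(token, np.zeros(embedding_dim))


def to_sample(vectors, label, embedding_dim):
    # Stack the per-token vectors into a (sequence_len, embedding_dim) tensor
    # and wrap it, together with the label, into a BigDL Sample.
    flat = list(itertools.chain.from_iterable(vectors))
    features = np.array(flat, dtype='float').reshape([len(vectors), embedding_dim])
    return Sample.from_ndarray(features, np.array(label))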
from optim.optimizer import *
from util.common import *
from util.common import Sample
import util.common

# The imports below are needed by this snippet but were not shown in the
# original listing; adjust the news20 import path to your BigDL installation.
import random as rd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from dataset import news20
from pyspark import SparkContext

# In the original notebook `sc` is already available; otherwise create it
# with the Spark configuration expected by BigDL, then start the engine.
sc = SparkContext(appName="text_classification", conf=create_spark_conf())
init_engine()

# Prepare the data
batch_size = 128
embedding_dim = 50
sequence_len = 50
max_words = 1000
training_split = 0.8

# Load the data
texts = news20.get_news20('/tmp/news20data')
w2v = news20.get_glove_w2v(dim=embedding_dim)
print(len(texts), len(w2v))

## Show a word cloud for a random article
rand_idx = rd.randrange(0, len(texts))
wordcloud = WordCloud(max_font_size=40, background_color="white").generate(texts[rand_idx][0])
print("the newsgroup of the text is %d" % texts[rand_idx][1])
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

## Use Spark to do the word analysis
data_rdd = sc.parallelize(texts, 2)

# Break the text corpus into tokens (words)
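The word analysis referred to here is essentially a distributed word count over the corpus. A minimal sketch of analyze_texts, assuming it returns (word, (index, count)) pairs sorted by descending frequency (the shape that train slices and converts into a dict), might look like this:

# Sketch of analyze_texts; the real example may order or index words differently.
def analyze_texts(data_rdd):
    # Word count over the whole corpus, sorted by descending frequency, so the
    # most frequent words receive the smallest indices (starting at 1).
    def index(word_count_idx):
        ((word, count), idx) = word_count_idx
        return (word, (idx + 1, count))

    return (data_rdd
            .flatMap(lambda text_label: text_to_words(text_label[0]))
            .map(lambda word: (word, 1))
            .reduceByKey(lambda a, b: a + b)
            .sortBy(lambda word_count: -word_count[1])
            .zipWithIndex()
            .map(index)
            .collect())

With these helpers defined and the hyper-parameters above, the whole pipeline can then be launched with train(sc, batch_size, sequence_len, max_words, embedding_dim, training_split); note that train also expects max_epoch to be defined at module scope.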