from typing import Dict, List

import numpy

# Helper imports assumed from the project's utils modules (matching the other
# scripts in this package); tokenize and save_tokenizer are assumed to live
# alongside their load counterparts.
from bix.twitter.base.utils import encode_embed_docs, load_pickle, save_pickle, tokenize
from bix.twitter.learn.tokenizer.tokenizer_utils import load_tokenizer, save_tokenizer


def tokenize_cleaned_tweets(tweets: Dict[str, List[str]], create_tokenizer=False):
    if create_tokenizer is False:
        # Cached path: reuse the fitted tokenizer and the cached padding length.
        max_tweet_word_count = load_pickle('max_tweet_word_count.pickle')
        tok = load_tokenizer()
        ret = {}
        for hashtag, tweet_list in tweets.items():  # loop variable renamed to avoid shadowing the parameter
            padded_x, _ = encode_embed_docs(tweet_list, tok, max_tweets=max_tweet_word_count)
            ret[hashtag] = padded_x
            numpy.save('hashtag_' + hashtag, padded_x)
        return ret
    else:
        # First pass: fit the tokenizer on all tweets across all hashtags,
        # derive and cache the padding length, then recurse into the cached path.
        flat_list = [tweet for sublist in tweets.values() for tweet in sublist]
        tok = tokenize(flat_list, verbose=False)
        temp_padded_x, _ = encode_embed_docs(flat_list, tok)
        max_tweet_word_count = len(temp_padded_x[0])
        save_pickle(max_tweet_word_count, 'max_tweet_word_count.pickle')
        save_tokenizer(tok)
        return tokenize_cleaned_tweets(tweets)
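# Usage sketch with hypothetical data: the first call fits and caches the
# tokenizer plus the padding length, then recurses into the cached path that
# encodes and saves one numpy array per hashtag.
if __name__ == '__main__':
    cleaned = {
        'love': ['i love this', 'best day ever'],
        'sad': ['this is awful', 'worst day'],
    }
    encoded = tokenize_cleaned_tweets(cleaned, create_tokenizer=True)
    print({tag: arr.shape for tag, arr in encoded.items()})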
# save_model_mat is assumed to live in bix.twitter.base.utils next to
# load_model_mat; the EmbeddingWord class is project-internal and not shown here.
from bix.twitter.base.utils import load_pickle, save_model_mat
from bix.twitter.learn.tokenizer.tokenizer_utils import load_tokenizer


def learn_embedding_word(x, y):
    padded_x = x
    tokenizer = load_tokenizer()
    max_tweet_word_count = load_pickle('max_tweet_word_count.pickle')
    print('learning word...')
    # Train a supervised embedding on the padded tweets and their labels.
    e = EmbeddingWord(tokenizer, padded_x, None, max_tweet_word_count,
                      tokenizer.num_words, y)
    e.create_embedding()
    weights = e.get_weights()
    save_model_mat(weights, 'embedding_word')
    return weights
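# Minimal sketch of what EmbeddingWord plausibly does (an assumption; the class
# itself is not shown): fit a trainable Embedding layer against the sentiment
# labels via the Embedding -> Flatten -> Dense(1, sigmoid) baseline noted in the
# training script, then return the learned weight matrix.
from keras import Sequential
from keras.layers import Dense, Embedding, Flatten


def fit_word_embedding(padded_x, y, vocab_size, seq_len, dim=100):
    model = Sequential()
    model.add(Embedding(vocab_size, dim, input_length=seq_len))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
    model.fit(padded_x, y, epochs=5, verbose=0)
    return model.layers[0].get_weights()  # list holding the (vocab_size, dim) matrix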
def learn_embedding_skip_gram(x, y, texts):
    padded_x = x
    tokenizer = load_tokenizer()
    max_tweet_word_count = load_pickle('max_tweet_word_count.pickle')
    print('learning skip_gram...')
    # Same flow as learn_embedding_word, but the gensim-backed skip-gram variant
    # additionally needs the raw texts; EmbeddingGensimSkipGram is project-internal.
    e = EmbeddingGensimSkipGram(tokenizer, padded_x, None, max_tweet_word_count,
                                tokenizer.num_words, y, texts)
    e.create_embedding()
    weights = e.get_weights()
    save_model_mat(weights, 'embedding_skip_gram')
    return weights
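# Sketch of the skip-gram step behind EmbeddingGensimSkipGram (an assumption;
# gensim 4.x API shown, older 3.x uses size= instead of vector_size=). Rows are
# aligned to the Keras tokenizer indices so the matrix can be handed to an
# Embedding layer via weights=[matrix].
import numpy as np
from gensim.models import Word2Vec


def build_skip_gram_matrix(texts, tokenizer, dim=100):
    sentences = [t.split() for t in texts]
    w2v = Word2Vec(sentences, vector_size=dim, sg=1, min_count=1)  # sg=1 selects skip-gram
    matrix = np.zeros((tokenizer.num_words, dim))
    for word, idx in tokenizer.word_index.items():
        if idx < tokenizer.num_words and word in w2v.wv:
            matrix[idx] = w2v.wv[word]
    return matrix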
import numpy
from sklearn.preprocessing import StandardScaler

from bix.twitter.base.utils import load_pickle

if __name__ == '__main__':
    print('loading saved state')
    # Same PCA-style diagnostic for each embedding: standardize the weight
    # matrix, take its covariance, and report the eigenvalues in descending order.
    base = '/home/jonas/Development/fh/hiwi_job/bix/bix/data/twitter/data/models/'
    for name in ('embedding_word', 'embedding_glove', 'embedding_skip_gram'):
        word = load_pickle(base + name + '.pickle')
        x_std = StandardScaler().fit_transform(word[0])
        cov = numpy.cov(x_std.T)
        ev, _ = numpy.linalg.eig(cov)
        print(f"eigenvalues: {sorted(ev, reverse=True)}")

    # a = eig.dot(x_std.T)
    # print(f"a: {a}")
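    # Follow-up sketch: raw eigenvalues are hard to compare across embeddings of
    # different widths; the explained-variance ratio (eigenvalue over eigenvalue
    # sum) is scale-free. Reuses the `ev` of the last embedding from the loop.
    ratios = numpy.sort(ev.real)[::-1] / numpy.sum(ev.real)
    print(f"explained variance ratios (top 10): {list(ratios[:10])}")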
import numpy as np
from keras import Model
from keras.models import load_model  # keras.engine.saving re-exports load_model in older Keras 2.x

from bix.twitter.base.utils import load_pickle

model: Model = load_model('models/sentiment_conv_ep100.h5')

love = load_pickle('preprocessed/hashtag_love.pickle')
sad = load_pickle('preprocessed/hashtag_sad.pickle')

# Score each hashtag batch; each prediction row holds the class probabilities,
# of which we keep the first column.
res_love = [e[0] for e in model.predict_on_batch(love)]
res_sad = [e[0] for e in model.predict_on_batch(sad)]

# Note: np.mean and np.average coincide here (no weights are given); 'acc' is
# the share of tweets whose first output rounds to 1, not accuracy against
# gold labels.
print(
    f"love - mean: {np.mean(res_love)}, avg: {np.average(res_love)}, "
    f"acc: {np.average([round(e) for e in res_love])}"
)
print(
    f"sad - mean: {np.mean(res_sad)}, avg: {np.average(res_sad)}, "
    f"acc: {np.average([round(e) for e in res_sad])}"
)
from sklearn.model_selection import train_test_split

from bix.twitter.base.utils import load_csv, encode_embed_docs, save_pickle, load_pickle, save_csv
from bix.twitter.learn.tokenizer.tokenizer_utils import load_tokenizer

if __name__ == '__main__':
    print('loading saved state')
    tokenizer = load_tokenizer('learn')
    x = load_csv('learn/tweets.csv')
    y = load_csv('learn/lables.csv')  # filename spelling as produced by the pipeline
    max_tweet_word_count = load_pickle('tokenized/learn/max_tweet_word_count.pickle')

    print('reducing learning data')
    # Keep only 0.5% of the corpus; 16k tweets are more than enough here.
    x_learn, _, y_learn, _ = train_test_split(x, y, test_size=0.995, random_state=4)

    print('encoding data')
    padded_x, unpadded_x = encode_embed_docs(x_learn, tokenizer, max_tweet_word_count)

    print('saving')
    save_pickle(padded_x, 'tokenized/learn/small_padded_x.pickle')
    save_pickle(unpadded_x, 'tokenized/learn/small_unpadded_x.pickle')
    save_pickle(y_learn, 'tokenized/learn/small_y.pickle')
    save_csv('learn/tweets_learn.csv', x_learn)
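    # Design note (an assumption, not part of the original pipeline): with such
    # an aggressive cut the sample's label balance can drift from the corpus;
    # train_test_split's stratify parameter would preserve it:
    # x_learn, _, y_learn, _ = train_test_split(x, y, test_size=0.995,
    #                                           random_state=4, stratify=y)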
from bix.twitter.base.utils import load_csv, load_pickle

if __name__ == '__main__':
    # Print each reduced tweet next to its label for a quick manual sanity check.
    y = load_pickle('tokenized/learn/small_y.pickle')
    x = load_csv('learn/tweets_learn.csv')

    for i, v in enumerate(x):
        print(f"{y[i]}: {v}")
from keras import Input, Model
from keras.callbacks import EarlyStopping
from keras.layers import (Conv1D, Dense, Dropout, Embedding, GlobalMaxPooling1D,
                          MaxPooling1D, concatenate)
from keras.utils import np_utils
from sklearn.model_selection import train_test_split

from bix.twitter.base.utils import load_pickle
from bix.twitter.learn.tokenizer.tokenizer_utils import load_tokenizer


def train_model_convolutional(x, y, embedding_mats):
    padded_x = x
    max_tweet_word_count = load_pickle('max_tweet_word_count.pickle')
    tok = load_tokenizer()
    vocab_size = tok.num_words

    # if 'test_all_data' in args:
    #     padded_x = load_pickle('tokenized/learn/padded_x.pickle')
    #     y = load_csv('learn/lables.csv')

    # One-hot encode the labels and hold out 20% for evaluation.
    enc_y = np_utils.to_categorical(y)
    x_train, x_test, y_train, y_test = train_test_split(padded_x, enc_y,
                                                        test_size=0.2, random_state=5)

    # The embedding matrices (word, glove, skip-gram) are now passed in as
    # embedding_mats; they were previously loaded here via load_model_mat(...).
    # Earlier baseline: Embedding -> Flatten -> Dense(1, sigmoid).

    # One frozen Embedding layer per pre-trained matrix, all fed from the same
    # token input and concatenated along the feature axis.
    input_layer = Input(shape=(max_tweet_word_count,))  # renamed from `input` (shadowed a builtin)
    xs = [Embedding(vocab_size, mat[0].shape[1], input_length=max_tweet_word_count,
                    weights=mat, trainable=False)(input_layer)
          for mat in embedding_mats]
    combined = concatenate(xs)

    # Two conv blocks, then global pooling and dropout before the softmax head;
    # roughly 20M parameters in total.
    z = Conv1D(100, 5, activation='relu')(combined)
    z = Conv1D(100, 5, activation='relu')(z)
    z = MaxPooling1D()(z)
    z = Conv1D(160, 5, activation='relu')(z)
    z = Conv1D(160, 5, activation='relu')(z)
    z = GlobalMaxPooling1D()(z)
    z = Dropout(0.5)(z)
    z = Dense(len(y_test[0]), activation="softmax")(z)

    model = Model(inputs=[input_layer], outputs=z)

    # run_opts = tensorflow.RunOptions(report_tensor_allocations_upon_oom=True)
    # Experiments to try: optimizer (adam vs rmsprop), activations (leaky_relu, elu).
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
    print(model.summary())

    # Fit with early stopping on the validation loss.
    es = EarlyStopping(monitor='val_loss')
    model.fit([x_train], y_train, epochs=50, verbose=1, batch_size=8000,
              validation_split=0.1, callbacks=[es])  # todo: use return value

    loss, accuracy = model.evaluate([x_test], y_test, verbose=1, batch_size=8000)
    print('Accuracy: %f' % (accuracy * 100))
    # small: Accuracy: 93.041664
    # all data (3 embedding layers): Accuracy: 79.458750

    model.save('sentiment_conv_ep100.h5')
    print('finished')
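# Usage sketch (assumed wiring, reusing paths and names from the other scripts
# in this package): load the reduced training set and the three pre-trained
# embedding matrices, then train the convolutional model.
if __name__ == '__main__':
    from bix.twitter.base.utils import load_model_mat

    padded_x = load_pickle('tokenized/learn/small_padded_x.pickle')
    y = load_pickle('tokenized/learn/small_y.pickle')
    mats = [load_model_mat(name) for name in
            ('embedding_word', 'embedding_glove', 'embedding_skip_gram')]
    train_model_convolutional(padded_x, y, mats)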
import sys  # needed for sys.argv below

from keras import Sequential, Input, Model
from keras.layers import Embedding, Flatten, Dense, concatenate
from sklearn.model_selection import train_test_split

from bix.twitter.base.utils import (load_model_mat, load_training_sentiment_data_small,
                                    load_pickle, load_csv)

if __name__ == '__main__':
    args = sys.argv

    # print(device_lib.list_local_devices())
    # exit(0)

    tok, y, padded_x, _, max_tweet_word_count, vocab_size = load_training_sentiment_data_small()

    if 'test_all_data' in args:
        padded_x = load_pickle('tokenized/learn/padded_x.pickle')
        y = load_csv('learn/lables.csv')

    x_train, x_test, y_train, y_test = train_test_split(padded_x, y, test_size=0.60)

    model_mat_word = load_model_mat('embedding_word')
    print(f'model_mat_word.shape: {model_mat_word[0].shape}')
    model_mat_glove = load_model_mat('embedding_glove')
    print(f'model_mat_glove.shape: {model_mat_glove[0].shape}')
    model_mat_skip_gram = load_model_mat('embedding_skip_gram')
    print(f'model_mat_skip_gram.shape: {model_mat_skip_gram[0].shape}')

    # model_mat_skip_gram = load_model_mat('embedding_skip_gram')
    # vocab_size = len(tok.word_index) + 1