Example #1
def tokenize_cleaned_tweets(tweets: Dict[str, List[str]],
                            create_tokenizer=False):

    if not create_tokenizer:
        # Reuse the previously fitted tokenizer and padding length.
        max_tweet_word_count = load_pickle('max_tweet_word_count.pickle')
        tok = load_tokenizer()
        ret = {}
        for hashtag, hashtag_tweets in tweets.items():
            padded_x, _ = encode_embed_docs(hashtag_tweets,
                                            tok,
                                            max_tweets=max_tweet_word_count)
            ret[hashtag] = padded_x
            dest = 'hashtag_' + hashtag
            numpy.save(dest, padded_x)
        return ret
    else:
        # Fit a new tokenizer on the tweets of all hashtags combined.
        all_tweets = tweets.values()
        flat_list = [tweet for sublist in all_tweets for tweet in sublist]
        tok = tokenize(flat_list, verbose=False)
        temp_padded_x, _ = encode_embed_docs(flat_list, tok)
        max_tweet_word_count = len(temp_padded_x[0])
        save_pickle(max_tweet_word_count, 'max_tweet_word_count.pickle')
        save_tokenizer(tok)

        # Recurse: encode everything with the tokenizer that was just saved.
        return tokenize_cleaned_tweets(tweets)
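A minimal usage sketch for the two-phase flow above (the hashtags and tweet texts are made up; the bix.twitter helpers are assumed to be importable as in the later examples):

cleaned = {'love': ['beautiful day', 'love this song'],
           'sad': ['worst monday ever']}

# First run: fits and saves the tokenizer plus max_tweet_word_count, then
# recurses to encode every hashtag with it.
encoded = tokenize_cleaned_tweets(cleaned, create_tokenizer=True)

# Later runs reuse the saved tokenizer and also write hashtag_<name>.npy files.
encoded_love = tokenize_cleaned_tweets({'love': ['another love tweet']})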
Example #2
def learn_embedding_word(x, y):
    padded_x = x
    tokenizer = load_tokenizer()
    max_tweet_word_count = load_pickle('max_tweet_word_count.pickle')

    print('learning word...')
    e = EmbeddingWord(tokenizer, padded_x, None, max_tweet_word_count,
                      tokenizer.num_words, y)
    e.create_embedding()
    weights = e.get_weights()
    save_model_mat(weights, 'embedding_word')
    return weights
Example #3
def learn_embedding_skip_gram(x, y, texts):
    padded_x = x
    tokenizer = load_tokenizer()
    max_tweet_word_count = load_pickle('max_tweet_word_count.pickle')

    print('learning skip_gram...')
    e = EmbeddingGensimSkipGram(tokenizer, padded_x, None,
                                max_tweet_word_count, tokenizer.num_words, y,
                                texts)
    e.create_embedding()
    weights = e.get_weights()
    save_model_mat(weights, 'embedding_skip_gram')
    return weights
Example #4
import numpy
from sklearn.preprocessing import StandardScaler

from bix.twitter.base.utils import load_pickle

if __name__ == '__main__':
    print('loading saved state')

    word = load_pickle('/home/jonas/Development/fh/hiwi_job/bix/bix/data/twitter/data/models/embedding_word.pickle')
    x_std = StandardScaler().fit_transform(word[0])
    cov = numpy.cov(x_std.T)
    ev, _ = numpy.linalg.eig(cov)
    print(f"eigenvalues: {list(reversed(sorted(ev)))}")

    word = load_pickle('/home/jonas/Development/fh/hiwi_job/bix/bix/data/twitter/data/models/embedding_glove.pickle')
    x_std = StandardScaler().fit_transform(word[0])
    cov = numpy.cov(x_std.T)
    ev, _ = numpy.linalg.eig(cov)
    print(f"eigenvalues: {list(reversed(sorted(ev)))}")

    word = load_pickle('/home/jonas/Development/fh/hiwi_job/bix/bix/data/twitter/data/models/embedding_skip_gram.pickle')
    x_std = StandardScaler().fit_transform(word[0])
    cov = numpy.cov(x_std.T)
    ev, _ = numpy.linalg.eig(cov)
    print(f"eigenvalues: {list(reversed(sorted(ev)))}")


    #a = eig.dot(x_std.T)
    #print(f"a: {a}")
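    # --- hedged follow-up sketch (not in the original script) ---
    # numpy.linalg.eigvalsh is the symmetric-matrix routine, so it returns the
    # spectrum of the last `cov` computed above as real values in ascending
    # order; reversing and normalising gives the explained-variance ratio per
    # principal component.
    ev_desc = numpy.linalg.eigvalsh(cov)[::-1]
    explained = ev_desc / ev_desc.sum()
    print(f"variance explained by the first 10 components: {explained[:10].sum():.3f}")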

Example #5
from keras import Model
from keras.engine.saving import load_model

from bix.twitter.base.utils import load_pickle
import numpy as np

model: Model = load_model('models/sentiment_conv_ep100.h5')
love = load_pickle('preprocessed/hashtag_love.pickle')
sad = load_pickle('preprocessed/hashtag_sad.pickle')
res_love = model.predict_on_batch(love)
res_love = [e[0] for e in res_love]
res_sad = model.predict_on_batch(sad)
res_sad = [e[0] for e in res_sad]

print(
    f"love - mean: {np.mean(res_love)}, avg: {np.average(res_love)}, acc: {np.average([round(e) for e in res_love])}"
)
print(
    f"sad - mean: {np.mean(res_sad)}, avg: {np.average(res_sad)}, acc: {np.average([round(e) for e in res_sad])}"
)
Example #6
from sklearn.model_selection import train_test_split

from bix.twitter.base.utils import load_csv, encode_embed_docs, save_pickle, load_pickle, save_csv
from bix.twitter.learn.tokenizer.tokenizer_utils import load_tokenizer

if __name__ == '__main__':
    print('loading saved state')

    tokenizer = load_tokenizer('learn')
    x = load_csv('learn/tweets.csv')
    y = load_csv('learn/lables.csv')
    max_tweet_word_count = load_pickle(
        'tokenized/learn/max_tweet_word_count.pickle')

    print('reducing learning data')
    x_learn, _, y_learn, _ = train_test_split(
        x, y, test_size=0.995, random_state=4)  # 16k are more than enough

    print('encoding data')

    padded_x, unpadded_x = encode_embed_docs(x_learn, tokenizer,
                                             max_tweet_word_count)

    print('saving')

    save_pickle(padded_x, 'tokenized/learn/small_padded_x.pickle')
    save_pickle(unpadded_x, 'tokenized/learn/small_unpadded_x.pickle')
    save_pickle(y_learn, 'tokenized/learn/small_y.pickle')
    save_csv('learn/tweets_learn.csv', x_learn)
Example #7
from bix.twitter.base.utils import load_csv, load_pickle

if __name__ == '__main__':
    y = load_pickle('tokenized/learn/small_y.pickle')
    x = load_csv('learn/tweets_learn.csv')
    for i, v in enumerate(x):
        print(f"{y[i]}: {v}")
Example #8
# Imports assumed from the surrounding examples (Keras 2.x project layout).
from keras import Input, Model
from keras.callbacks import EarlyStopping
from keras.layers import (Conv1D, Dense, Dropout, Embedding,
                          GlobalMaxPooling1D, MaxPooling1D, concatenate)
from keras.utils import np_utils
from sklearn.model_selection import train_test_split

from bix.twitter.base.utils import load_pickle
from bix.twitter.learn.tokenizer.tokenizer_utils import load_tokenizer


def train_model_convolutional(x, y, embedding_mats):
    #    print(device_lib.list_local_devices())
    #    exit(0)
    padded_x = x

    max_tweet_word_count = load_pickle('max_tweet_word_count.pickle')
    tok = load_tokenizer()
    vocab_size = tok.num_words

    #if 'test_all_data' in args:
    #    padded_x = load_pickle('tokenized/learn/padded_x.pickle')
    #    y = load_csv('learn/lables.csv')

    enc_y = np_utils.to_categorical(y)

    x_train, x_test, y_train, y_test = train_test_split(padded_x,
                                                        enc_y,
                                                        test_size=0.2,
                                                        random_state=5)

    #model_mat_word = load_model_mat('embedding_word')
    #print(f'model_mat_word.shape: {model_mat_word[0].shape}')
    #model_mat_glove = load_model_mat('embedding_glove')
    #print(f'model_mat_glove.shape: {model_mat_glove[0].shape}')
    #model_mat_skip_gram = load_model_mat('embedding_skip_gram')
    #print(f'model_mat_skip_gram.shape: {model_mat_skip_gram[0].shape}')
    # model_mat_skip_gram = load_model_mat('embedding_skip_gram')

    # vocab_size = len(tok.word_index) + 1

    # model = Sequential()
    # model.add(Embedding(vocab_size, model_mat_word[0].shape[1], input_length=max_tweet_word_count,
    #                    weights=model_mat_word))
    # model.add(Flatten())

    # model.add(Dense(1, activation='sigmoid'))

    # One shared input feeds a frozen Embedding branch per pre-trained matrix
    # (word, glove, skip-gram).
    input_layer = Input(shape=(max_tweet_word_count, ))
    xs = []
    for mat in embedding_mats:
        xs.append(
            Embedding(vocab_size,
                      mat[0].shape[1],
                      input_length=max_tweet_word_count,
                      weights=mat,
                      trainable=False)(input_layer))

    combined = concatenate(xs)
    z = Conv1D(100, 5, activation='relu')(combined)
    z = Conv1D(100, 5, activation='relu')(z)
    z = MaxPooling1D()(z)

    z = Conv1D(160, 5, activation='relu')(z)
    z = Conv1D(160, 5, activation='relu')(z)
    z = GlobalMaxPooling1D()(z)
    z = Dropout(0.5)(z)

    #f = Flatten()(z)
    # planned layout: conv -> pooling -> flatten -> dense (possibly residual conv)
    #z = Dense(10, activation="relu")(z)
    z = Dense(len(y_test[0]), activation="softmax")(z)
    # ca. 20M total parameters
    model = Model(inputs=[input_layer], outputs=z)

    # run_opts = tensorflow.RunOptions(report_tensor_allocations_upon_oom=True)
    # compile the model
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['acc'])  # , options=run_opts)
    # experiment with optimizers (adam vs. rmsprop)
    # experiment with activation functions (leaky_relu, elu)
    # summarize the model
    print(model.summary())
    # fit the model

    es = EarlyStopping(monitor='val_loss')
    model.fit([x_train],
              y_train,
              epochs=50,
              verbose=1,
              batch_size=8000,
              validation_split=0.1,
              callbacks=[es])
    # todo: use return value
    # evaluate the model
    loss, accuracy = model.evaluate([x_test],
                                    y_test,
                                    verbose=1,
                                    batch_size=8000)
    print('Accuracy: %f' % (accuracy * 100))
    # small: Accuracy: 93.041664
    # all data - Accuracy: 79.458750
    #3 embedding Layers:

    model.save('sentiment_conv_ep100.h5')

    print('finished')
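A hedged call-site sketch for train_model_convolutional, mirroring how Example #9 below loads the three embedding matrices and how Example #6 saves the reduced training set:

from bix.twitter.base.utils import load_model_mat, load_pickle

padded_x = load_pickle('tokenized/learn/small_padded_x.pickle')
y = load_pickle('tokenized/learn/small_y.pickle')

embedding_mats = [load_model_mat('embedding_word'),
                  load_model_mat('embedding_glove'),
                  load_model_mat('embedding_skip_gram')]

train_model_convolutional(padded_x, y, embedding_mats)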
Example #9
import sys

from keras import Sequential, Input, Model
from keras.layers import Embedding, Flatten, Dense, concatenate
from sklearn.model_selection import train_test_split

from bix.twitter.base.utils import load_model_mat, load_training_sentiment_data_small, load_pickle, load_csv

if __name__ == '__main__':
    args = sys.argv
    #    print(device_lib.list_local_devices())
    #    exit(0)

    tok, y, padded_x, _, max_tweet_word_count, vocab_size = load_training_sentiment_data_small(
    )

    if 'test_all_data' in args:
        padded_x = load_pickle('tokenized/learn/padded_x.pickle')
        y = load_csv('learn/lables.csv')

    x_train, x_test, y_train, y_test = train_test_split(padded_x,
                                                        y,
                                                        test_size=0.60)

    model_mat_word = load_model_mat('embedding_word')
    print(f'model_mat_word.shape: {model_mat_word[0].shape}')
    model_mat_glove = load_model_mat('embedding_glove')
    print(f'model_mat_glove.shape: {model_mat_glove[0].shape}')
    model_mat_skip_gram = load_model_mat('embedding_skip_gram')
    print(f'model_mat_skip_gram.shape: {model_mat_skip_gram[0].shape}')
    # model_mat_skip_gram = load_model_mat('embedding_skip_gram')

    # vocab_size = len(tok.word_index) + 1