Example #1
import os

def main():
    parser = build_parser()
    options = parser.parse_args()
    check_opts(options)

    train_path = options.train_path
    test_path = options.test_path
    optimizer = options.optimizer
    epochs = options.epochs
    batch_size = options.batch_size
    learning_rate = options.learning_rate

    # Load pretrained GloVe embeddings, then normalize the tweets and
    # convert each one into a sequence of word indices of length max_len.
    word_to_index, index_to_word, word_to_vec_map = read_glove_vecs(GLOVE_FILE)
    X_train, y_train = get_tweets(train_path)
    X_train_norm = normalize_tweets(X_train)
    max_len = get_max_len(X_train_norm)
    input_shape = (max_len,)
    X_train_indices = text_to_indices(X_train_norm)

    model = sentiment_model(input_shape, word_to_vec_map, word_to_index)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    print('Starting the training...')
    model.fit(X_train_indices, y_train, epochs=epochs, batch_size=batch_size,
              validation_split=0.1, shuffle=True)
    print('Training finished! Now saving model in model directory...')
    if not os.path.exists('model'):
        os.makedirs('model')
    model.save('model/sentiment_model.h5')
    print('Model saved!')
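
sentiment_model is not defined in this example. A minimal sketch of what it might look like, assuming TensorFlow/Keras with frozen GloVe-initialized embeddings (the LSTM size, dropout rate, and 1-based word indices are assumptions, not the author's code):

import numpy as np
from tensorflow.keras.layers import Input, Embedding, LSTM, Dropout, Dense
from tensorflow.keras.models import Model

def sentiment_model(input_shape, word_to_vec_map, word_to_index):
    # Build a (vocab + 1, emb_dim) matrix; row i holds the GloVe vector
    # of the word with index i (indices assumed to start at 1).
    emb_dim = len(next(iter(word_to_vec_map.values())))
    emb_matrix = np.zeros((len(word_to_index) + 1, emb_dim))
    for word, idx in word_to_index.items():
        emb_matrix[idx] = word_to_vec_map[word]

    indices = Input(shape=input_shape, dtype='int32')
    x = Embedding(emb_matrix.shape[0], emb_dim,
                  weights=[emb_matrix], trainable=False)(indices)
    x = LSTM(128)(x)                  # 128 units is an arbitrary choice
    x = Dropout(0.5)(x)
    out = Dense(1, activation='sigmoid')(x)  # matches binary_crossentropy above
    return Model(inputs=indices, outputs=out)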
Example #2
def emojis_ordered_dataset(amount):
    """Return the first `amount` tweets in corpus order with their emojis."""
    data = []
    target_multi = []
    target_single = []

    for i, single_tweet in enumerate(get_tweets()):
        if i >= amount:
            break
        tweet, emojis, raw_tweet = single_tweet  # raw_tweet is unused here

        data.append(linguistic_preprocess(tweet))
        target_multi.append(set(emojis))   # all emojis in the tweet
        target_single.append(emojis[0])    # first emoji as a single label

    return [data, target_multi, target_single]
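
A hypothetical call site for the single-label targets, assuming scikit-learn is available and that linguistic_preprocess returns a plain string per tweet (both are assumptions; the names below are illustrative):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

data, target_multi, target_single = emojis_ordered_dataset(10000)

# Predict the first emoji of each tweet from its text.
X = TfidfVectorizer().fit_transform(data)
X_train, X_test, y_train, y_test = train_test_split(X, target_single,
                                                    test_size=0.2)
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print(clf.score(X_test, y_test))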
Example #3
def emojis_balanced_dataset(amount=None,
                            lame_limit=5000,
                            lame_min_classes=100):
    emoji_tweet_map = {}
    data = []
    target = []

    for i, single_tweet in enumerate(get_tweets()):
        ### Only scan the first lame_limit tweets.
        if i >= lame_limit:
            break
        tweet, emojis, raw_tweet = single_tweet

        ### Group tweets by their first emoji.
        first_emoji = emojis[0]
        if first_emoji in emoji_tweet_map:
            emoji_tweet_map[first_emoji].append(tweet)
        else:
            emoji_tweet_map[first_emoji] = [tweet]

    emoji_name_count = [(e, len(tweets))
                        for e, tweets in emoji_tweet_map.items()]

    print(emoji_name_count)

    ### Drop emojis that occur fewer than lame_min_classes times; cap the
    ### rest at lame_min_classes tweets each so the classes are balanced.
    for emoji_name, count in emoji_name_count:
        if count < lame_min_classes:
            del emoji_tweet_map[emoji_name]
        else:
            # should probably be random...
            emoji_tweet_map[emoji_name] = emoji_tweet_map[
                emoji_name][:lame_min_classes]

    # Flatten the balanced map into parallel data/target lists.
    for emoji_name, tweets in emoji_tweet_map.items():
        for tweet in tweets:
            data.append(linguistic_preprocess(tweet))
            target.append(emoji_name)

    return [data, None, target]
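
A quick sanity check on the balancing (illustrative only): every emoji that survives the cut should contribute exactly lame_min_classes tweets.

from collections import Counter

data, _, target = emojis_balanced_dataset()
print(Counter(target))  # expect each remaining emoji to appear lame_min_classes times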
Example #4
# Module-level tallies: overall emoji counts and per-emoji co-occurrence counts.
emoji_stats = {}
usually_together = {}
MAX_TWEETS = 10000  # arbitrary cap on how many tweets to scan

def count_emojis(emojis):
    # Tally how often each emoji appears across all tweets.
    for emoji in emojis:
        if emoji in emoji_stats:
            emoji_stats[emoji] += 1
        else:
            emoji_stats[emoji] = 1

def count_together_emojis(emojis):
    # Tally, for each emoji, which other emojis appear in the same tweet;
    # an emoji's dict is initialized before counting so its first tweet counts too.
    emojis_set = set(emojis)
    for emoji in emojis_set:
        if emoji not in usually_together:
            usually_together[emoji] = {}
        for emoji_friend in (emojis_set - {emoji}):
            if emoji_friend in usually_together[emoji]:
                usually_together[emoji][emoji_friend] += 1
            else:
                usually_together[emoji][emoji_friend] = 1

for i, single_tweet in enumerate(get_tweets()):
    if i >= MAX_TWEETS:
        break
    print("{}...".format(i))
    tweet, emojis, raw_tweet = single_tweet
    count_emojis(emojis)
    count_together_emojis(emojis)

print("tip, run with -i")
print("`emoji_stats`, `usually_together`")