import os


def main():
    parser = build_parser()
    options = parser.parse_args()
    check_opts(options)

    train_path = options.train_path
    test_path = options.test_path          # parsed but unused during training
    optimizer = options.optimizer
    epochs = options.epochs
    batch_size = options.batch_size
    learning_rate = options.learning_rate  # parsed but not wired into the optimizer below

    # Pre-trained GloVe embeddings plus the raw training tweets.
    word_to_index, index_to_word, word_to_vec_map = read_glove_vecs(GLOVE_FILE)
    X_train, y_train = get_tweets(train_path)

    # Normalize the tweets, then convert them to index sequences.
    X_train_norm = normalize_tweets(X_train)
    max_len = get_max_len(X_train_norm)
    input_shape = (max_len,)
    X_train_indices = text_to_indices(X_train_norm)

    model = sentiment_model(input_shape, word_to_vec_map, word_to_index)
    model.compile(loss='binary_crossentropy', optimizer=optimizer,
                  metrics=['accuracy'])

    print('Starting the training...')
    model.fit(X_train_indices, y_train, epochs=epochs, batch_size=batch_size,
              validation_split=0.1, shuffle=True)

    print('Training finished! Now saving model in model directory...')
    if not os.path.exists('model'):
        os.makedirs('model')
    model.save('model/sentiment_model.h5')
    print('Model saved!')
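# `build_parser()` and `check_opts()` live elsewhere in the repo. Below is a
# minimal sketch of the parser that main() appears to expect; the flag names
# and defaults are assumptions, not the repo's actual CLI.
import argparse


def build_parser():
    parser = argparse.ArgumentParser(description='Train the tweet sentiment model.')
    parser.add_argument('--train-path', required=True,
                        help='path to the training tweets file')
    parser.add_argument('--test-path',
                        help='path to the test tweets file (currently unused)')
    parser.add_argument('--optimizer', default='adam',
                        help='Keras optimizer name, e.g. adam or rmsprop')
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--batch-size', type=int, default=32)
    parser.add_argument('--learning-rate', type=float, default=0.001)
    return parser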
def emojis_ordered_dataset(amount):
    """Build a dataset from the first `amount` tweets, in corpus order."""
    data = []
    target_multi = []
    target_single = []
    for i, single_tweet in enumerate(get_tweets()):
        if i >= amount:
            break
        tweet, emojis, raw_tweet = single_tweet
        data.append(linguistic_preprocess(tweet))
        target_multi.append(set(emojis))  # every emoji in the tweet
        target_single.append(emojis[0])   # first emoji only
    return [data, target_multi, target_single]
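# A hedged usage sketch: a bag-of-words Naive Bayes baseline over the
# single-emoji targets. The scikit-learn pipeline is an assumption for
# illustration, and it presumes linguistic_preprocess() returns a string
# (join the tokens first if it returns a list).
def _demo_ordered_baseline():
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.naive_bayes import MultinomialNB

    data, target_multi, target_single = emojis_ordered_dataset(10000)
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(data)
    clf = MultinomialNB().fit(X, target_single)
    return vectorizer, clf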
def emojis_balanced_dataset(amount=None, lame_limit=5000, lame_min_classes=100):
    """Build a class-balanced dataset keyed on each tweet's first emoji.

    `amount` is currently unused; `lame_limit` caps how many tweets are read,
    and every kept emoji class is truncated to `lame_min_classes` examples.
    """
    emoji_tweet_map = {}
    data = []
    target = []
    for i, single_tweet in enumerate(get_tweets()):
        # Stop once lame_limit tweets have been read.
        if i >= lame_limit:
            break
        tweet, emojis, raw_tweet = single_tweet
        # Tally each tweet under its first emoji.
        first_emoji = emojis[0]
        if first_emoji in emoji_tweet_map:
            emoji_tweet_map[first_emoji].append(tweet)
        else:
            emoji_tweet_map[first_emoji] = [tweet]

    emoji_names_in_dataset = emoji_tweet_map.keys()
    emoji_name_count = [(e, len(emoji_tweet_map[e]))
                        for e in emoji_names_in_dataset]
    print(emoji_name_count)

    # Drop rare emojis; truncate the rest so every class has the same size.
    for emoji_name, count in emoji_name_count:
        if count < lame_min_classes:
            del emoji_tweet_map[emoji_name]
        else:
            # should probably be random...
            emoji_tweet_map[emoji_name] = emoji_tweet_map[
                emoji_name][:lame_min_classes]

    for emoji_name, tweets in emoji_tweet_map.items():
        for tweet in tweets:
            data.append(linguistic_preprocess(tweet))
            target.append(emoji_name)
    return [data, None, target]
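# The balanced dataset drops into the same kind of pipeline; a sketch with a
# stratified held-out split (the scikit-learn usage is an assumption, not
# part of this repo).
def _demo_balanced_split():
    from sklearn.model_selection import train_test_split

    data, _, target = emojis_balanced_dataset(lame_limit=50000,
                                              lame_min_classes=100)
    # Stratify so every emoji class keeps its (now equal) share in each split.
    return train_test_split(data, target, test_size=0.2,
                            stratify=target, random_state=0)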
emoji_stats = {}       # emoji -> total occurrence count
usually_together = {}  # emoji -> {co-occurring emoji -> count}
MAX_TWEETS = 100_000   # scan cap; adjust as needed


def count_emojis(emojis):
    for emoji in emojis:
        if emoji in emoji_stats:
            emoji_stats[emoji] += 1
        else:
            emoji_stats[emoji] = 1


def count_together_emojis(emojis):
    emojis_set = set(emojis)
    for emoji in emojis_set:
        # Make sure a nested counter exists even on an emoji's first
        # sighting, so its co-occurring friends are not dropped.
        friends = usually_together.setdefault(emoji, {})
        for emoji_friend in emojis_set - {emoji}:
            if emoji_friend in friends:
                friends[emoji_friend] += 1
            else:
                friends[emoji_friend] = 1


for i, single_tweet in enumerate(get_tweets()):
    if i >= MAX_TWEETS:
        break
    print("{}...".format(i))
    tweet, emojis, raw_tweet = single_tweet
    count_emojis(emojis)
    count_together_emojis(emojis)

print("tip, run with -i")
print("`emoji_stats`, `usually_together`")
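# As the closing tip suggests, run the script with `python -i` and inspect the
# two dicts interactively; a hedged example of that inspection (the emoji
# picked is illustrative):
#
#     >>> sorted(emoji_stats.items(), key=lambda kv: kv[1], reverse=True)[:10]
#     >>> sorted(usually_together.get('😂', {}).items(),
#     ...        key=lambda kv: kv[1], reverse=True)[:5]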