def load_data():
    """
    Loads tweets data for binary sentiment classification.

    Uses cached .npy encodings when present, otherwise runs the full
    preprocessing pipeline and caches the results.

    :return: padded word encodings, list of class labels, number of classes,
        embeddings matrix
    :rtype: (numpy.array, list(int), int, numpy.array)
    """
    df = pd.read_csv('data/sentiment.csv')
    if os.path.exists('data/text_sentiment_w2vec.npy'):
        # Cached artifacts from a previous run; skip preprocessing entirely.
        word_encodings = np.load('data/text_sentiment_w2vec.npy')
        embeddings_matrix = np.load('data/embeddings_matrix.npy')
    else:
        print('Fix encoding...')
        df = fix_encoding(df)
        print('Split sentences...')
        df = split_tweet_sentences(df)
        print('Tokenize tweets...')
        df = tokenize_tweets(df)
        print('Fix negative verbs...')
        df = fix_negative_verbs(df)
        print('Encode tweets...')
        df, embeddings_matrix = get_word_encoding_and_embeddings(df, True)
        word_encodings = pad_sequences(df.encodings.values.tolist(),
                                       maxlen=150, padding='post')
        np.save('data/text_sentiment_w2vec', word_encodings)
        np.save('data/embeddings_matrix', embeddings_matrix)
    # BUG FIX: the original `df[df.sentiment == 4] = 1` overwrote EVERY column
    # of the matching rows with 1. Restrict the assignment to the sentiment
    # column so label 4 ("positive") is remapped to 1 without clobbering the
    # rest of the frame.
    df.loc[df.sentiment == 4, 'sentiment'] = 1
    classes = df['sentiment'].values.tolist()
    c = np.unique(classes).tolist()
    return word_encodings, classes, len(c), embeddings_matrix
def load_data():
    """
    Loads tweets data for emotion classification

    :return: list of word embeddings for each tweet, list of classes, number
        of classes, embeddings matrix
    :rtype: (list(numpy.array), list(int), int, numpy.array)
    """
    df = pd.read_csv('data/text_emotion.csv')
    cached = os.path.exists('data/text_emotion_w2vec.npy')
    if cached:
        # Reuse the encodings and embeddings produced by an earlier run.
        word_encodings = np.load('data/text_emotion_w2vec.npy')
        embeddings_matrix = np.load('data/embeddings_matrix2.npy')
    else:
        # The downstream helpers expect the text column to be named 'tweet';
        # rename the last column of the raw CSV accordingly.
        renamed = df.columns.values
        renamed[len(renamed) - 1] = 'tweet'
        df.columns = renamed
        print('Fix encoding...')
        df = fix_encoding(df)
        print('Split sentences...')
        df = split_tweet_sentences(df)
        print('Tokenize tweets...')
        df = tokenize_tweets(df)
        print('Fix negative verbs...')
        df = fix_negative_verbs(df)
        print('Encode tweets...')
        df, embeddings_matrix = get_word_encoding_and_embeddings(df, True)
        word_encodings = pad_sequences(df.encodings.values.tolist(),
                                       maxlen=150, padding='post')
        np.save('data/text_emotion_w2vec', word_encodings)
        np.save('data/embeddings_matrix2', embeddings_matrix)
    # Map the string emotion labels onto dense integer ids (0..n_classes-1).
    classes = df['sentiment'].values.tolist()
    c = np.unique(classes).tolist()
    label_to_id = {label: idx for idx, label in enumerate(c)}
    classes = np.array([label_to_id[label] for label in classes])
    return word_encodings, classes, len(c), embeddings_matrix
def load_sentiment_data():
    """
    Loads lexicon features for the SemEval-2018 E-c (multi-label emotion)
    train/dev/test splits, caching the padded features as .npy files.

    :return: lexicon features for the train, validation and test splits, and
        the lexicon matrix
    :rtype: (numpy.array, numpy.array, numpy.array, numpy.array)
    """
    if os.path.exists('data_multi_class/train_lexicon.npy'):
        lexicon_features_train = np.load('data_multi_class/train_lexicon.npy')
        lexicon_features_val = np.load('data_multi_class/val_lexicon.npy')
        lexicon_features_test = np.load('data_multi_class/test_lexicon.npy')
        lexicon_matrix = np.load('data_multi_class/lexicon_matrix2.npy')
    else:
        def _read_split(path):
            # Read one TSV split and rename its second column to 'tweet',
            # which the preprocessing helpers expect.
            frame = pd.read_csv(path, sep='\t')
            cols = frame.columns.values
            cols[1] = 'tweet'
            frame.columns = cols
            return frame

        df_train = _read_split('data_multi_class/2018-E-c-En-train.txt')
        df_val = _read_split('data_multi_class/2018-E-c-En-dev.txt')
        df_test = _read_split('data_multi_class/2018-E-c-En-test-gold.txt')
        # BUG FIX: the original loop rebound the loop variable
        # (`for df in [...]: df = fix_encoding(df)`) and discarded the
        # returned frames, so the preprocessed results never replaced
        # df_train/df_val/df_test. Collect the results and reassign.
        processed = []
        for df in (df_train, df_val, df_test):
            print('Fix encoding...')
            df = fix_encoding(df)
            print('Split sentences...')
            df = split_tweet_sentences(df)
            print('Tokenize tweets...')
            df = tokenize_tweets(df)
            print('Lematize tweets...')
            df = get_lemmas(df)
            processed.append(df)
        df_train, df_val, df_test = processed
        print('Lexicon encoding...')
        df_train, lexicon_matrix = get_lexicon_values(
            df_train, lexicon_type=2, lexicon_name='w2v-dp-CC-Lex.csv')
        lexicon_features_train = pad_sequences(
            df_train.lexicon.values.tolist(), maxlen=150, padding='post')
        df_val, lexicon_matrix = get_lexicon_values(
            df_val, lexicon_type=2, lexicon_name='w2v-dp-CC-Lex.csv')
        lexicon_features_val = pad_sequences(df_val.lexicon.values.tolist(),
                                             maxlen=150, padding='post')
        df_test, lexicon_matrix = get_lexicon_values(
            df_test, lexicon_type=2, lexicon_name='w2v-dp-CC-Lex.csv')
        lexicon_features_test = pad_sequences(df_test.lexicon.values.tolist(),
                                              maxlen=150, padding='post')
        np.save('data_multi_class/train_lexicon', lexicon_features_train)
        np.save('data_multi_class/val_lexicon', lexicon_features_val)
        np.save('data_multi_class/test_lexicon', lexicon_features_test)
        np.save('data_multi_class/lexicon_matrix2', lexicon_matrix)
    return (lexicon_features_train, lexicon_features_val,
            lexicon_features_test, lexicon_matrix)
def load_sentiment_data():
    """
    Loads lexicon features for the binary sentiment tweets dataset.

    Returns cached .npy features when available; otherwise preprocesses the
    raw CSV, computes the lexicon encodings, and caches them.

    :return: padded lexicon features and the lexicon matrix
    :rtype: (numpy.array, numpy.array)
    """
    cache_path = 'data/text_sentiment_lexicon.npy'
    if os.path.exists(cache_path):
        lexicon_features = np.load(cache_path)
        lexicon_matrix = np.load('data/lexicon_matrix.npy')
        return lexicon_features, lexicon_matrix
    df = pd.read_csv('data/sentiment.csv')
    print('Fix encoding...')
    df = fix_encoding(df)
    print('Split sentences...')
    df = split_tweet_sentences(df)
    print('Tokenize tweets...')
    df = tokenize_tweets(df)
    print('Lematize tweets...')
    df = get_lemmas(df)
    print('Lexicon encoding...')
    df, lexicon_matrix = get_lexicon_values(df)
    lexicon_features = pad_sequences(df.lexicon.values.tolist(),
                                     maxlen=150, padding='post')
    # Persist both artifacts so the next call takes the fast path above.
    np.save('data/text_sentiment_lexicon', lexicon_features)
    np.save('data/lexicon_matrix', lexicon_matrix)
    return lexicon_features, lexicon_matrix
def load_sentiment_data():
    """
    Loads lexicon features for the emotion tweets dataset.

    Returns cached .npy features when available; otherwise renames the text
    column, runs the preprocessing pipeline, computes lexicon encodings with
    the w2v-dp-BCC lexicon, and caches the results.

    :return: padded lexicon features and the lexicon matrix
    :rtype: (numpy.array, numpy.array)
    """
    cache_path = 'data/text_emotion_lexicon.npy'
    if os.path.exists(cache_path):
        lexicon_features = np.load(cache_path)
        lexicon_matrix = np.load('data/lexicon_matrix2.npy')
        return lexicon_features, lexicon_matrix
    df = pd.read_csv('data/text_emotion.csv')
    # Downstream helpers look for a 'tweet' column; the raw CSV keeps the
    # text in its last column.
    renamed = df.columns.values
    renamed[len(renamed) - 1] = 'tweet'
    df.columns = renamed
    print('Fix encoding...')
    df = fix_encoding(df)
    print('Split sentences...')
    df = split_tweet_sentences(df)
    print('Tokenize tweets...')
    df = tokenize_tweets(df)
    print('Lematize tweets...')
    df = get_lemmas(df)
    print('Lexicon encoding...')
    df, lexicon_matrix = get_lexicon_values(df, lexicon_type=2,
                                            lexicon_name='w2v-dp-BCC-Lex.csv')
    lexicon_features = pad_sequences(df.lexicon.values.tolist(),
                                     maxlen=150, padding='post')
    np.save('data/text_emotion_lexicon', lexicon_features)
    np.save('data/lexicon_matrix2', lexicon_matrix)
    return lexicon_features, lexicon_matrix
def load_dataset(split):
    """
    Loads the emotion dataset with valence-shifted, feature-selected
    features for classical ML models.

    :param split: number of leading rows used to fit the feature selector
        (the selected mask is then applied to every row)
    :type split: int
    :return: feature matrix, integer class labels, number of classes
    :rtype: (numpy.array, numpy.array, int)
    """
    df = pd.read_csv('data/text_emotion.csv')
    df.columns = ['id', 'class', 'author', 'tweet']
    if os.path.exists('data_ml/text_emotion_features.npy'):
        # Cached feature matrix from a previous run.
        X = np.load('data_ml/text_emotion_features.npy')
    else:
        print('Fix encoding...')
        df = fix_encoding(df)
        print('Split sentences...')
        df = split_tweet_sentences(df)
        print('Tokenize tweets...')
        df = tokenize_tweets(df)
        print('Lematize tweets...')
        df = get_lemmas(df)
        lexicon = pd.read_csv('lexicons/Ratings_Warriner_et_al.csv',
                              usecols=[0, 1, 2, 5], index_col=0)
        lexicon.columns = ['word', 'valence', 'arousal']
        path_to_jar = 'stanford_parser/stanford-parser.jar'
        path_to_models_jar = 'stanford_parser/stanford-parser-3.9.1-models.jar'
        valence_shifter = FeatureExtractionContextValenceShifting(
            path_to_jar, path_to_models_jar, lexicon)
        df = valence_shifter.get_initial_valences(df)
        featured_dataset, vocab = generate_initial_features(df)
        # Fit feature selection on the first `split` rows only, then apply
        # the resulting mask to the whole dataset.
        X = featured_dataset['valences'].values.tolist()[:split]
        y = featured_dataset['class'].values.tolist()[:split]
        selected, mask = feature_selection(X, y, vocab)
        for index, row in featured_dataset.iterrows():
            valences = np.array(row.valences[mask])
            # FIX: DataFrame.set_value was deprecated in pandas 0.21 and
            # removed in 1.0; .at is the supported scalar setter.
            featured_dataset.at[index, 'valences'] = valences
        X = np.vstack(featured_dataset.valences.values)
        np.save('data_ml/text_emotion_features', X)
    # Map string emotion labels onto dense integer ids.
    classes = df['class'].values.tolist()
    c = np.unique(classes).tolist()
    d = dict([(y, x) for x, y in enumerate(c)])
    classes = np.array([d[x] for x in classes])
    return X, classes, len(c)
def load_data():
    """
    Loads tweets data for multi-label emotion classification
    (SemEval-2018 Task E-c train/dev/test splits).

    :return: word encodings and label matrices for the train, validation and
        test splits, plus the embeddings matrix
    :rtype: (numpy.array, numpy.array, numpy.array, numpy.array,
        numpy.array, numpy.array, numpy.array)
    """
    def _read_split(path):
        # Read one TSV split and rename its second column to 'tweet', which
        # the preprocessing helpers expect.
        frame = pd.read_csv(path, sep='\t')
        cols = frame.columns.values
        cols[1] = 'tweet'
        frame.columns = cols
        return frame

    df_train = _read_split('data_multi_class/2018-E-c-En-train.txt')
    df_val = _read_split('data_multi_class/2018-E-c-En-dev.txt')
    df_test = _read_split('data_multi_class/2018-E-c-En-test-gold.txt')
    if os.path.exists('data_multi_class/train_w2vec.npy'):
        word_encodings_train = np.load('data_multi_class/train_w2vec.npy')
        word_encodings_val = np.load('data_multi_class/val_w2vec.npy')
        word_encodings_test = np.load('data_multi_class/test_w2vec.npy')
        embeddings_matrix = np.load('data_multi_class/embeddings_matrix2.npy')
    else:
        # BUG FIX: the original loop rebound the loop variable
        # (`for df in [...]: df = fix_encoding(df)`) and discarded the
        # returned frames, so the preprocessed results never replaced
        # df_train/df_val/df_test. Collect the results and reassign.
        processed = []
        for df in (df_train, df_val, df_test):
            print('Fix encoding...')
            df = fix_encoding(df)
            print('Split sentences...')
            df = split_tweet_sentences(df)
            print('Tokenize tweets...')
            df = tokenize_tweets(df)
            print('Fix negative verbs...')
            df = fix_negative_verbs(df)
            processed.append(df)
        df_train, df_val, df_test = processed
        print('Encode tweets...')
        df_train, embeddings_matrix = get_word_encoding_and_embeddings(
            df_train, True)
        word_encodings_train = pad_sequences(
            df_train.encodings.values.tolist(), maxlen=150, padding='post')
        print('Encode tweets...')
        df_val, embeddings_matrix = get_word_encoding_and_embeddings(
            df_val, True)
        word_encodings_val = pad_sequences(df_val.encodings.values.tolist(),
                                           maxlen=150, padding='post')
        print('Encode tweets...')
        df_test, embeddings_matrix = get_word_encoding_and_embeddings(
            df_test, True)
        word_encodings_test = pad_sequences(df_test.encodings.values.tolist(),
                                            maxlen=150, padding='post')
        np.save('data_multi_class/train_w2vec', word_encodings_train)
        np.save('data_multi_class/val_w2vec', word_encodings_val)
        np.save('data_multi_class/test_w2vec', word_encodings_test)
        np.save('data_multi_class/embeddings_matrix2', embeddings_matrix)
    classes = [
        'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love',
        'optimism', 'pessimism', 'sadness', 'surprise', 'trust'
    ]
    y_train = df_train[classes].values
    y_val = df_val[classes].values
    y_test = df_test[classes].values
    return (word_encodings_train, y_train, word_encodings_val, y_val,
            word_encodings_test, y_test, embeddings_matrix)