def load_data():
    """ Loads tweets data for binary sentiment classification.

    Uses cached .npy files when present; otherwise runs the full
    preprocessing pipeline and caches the results.

    :return: padded word encodings, list of class labels, number of classes, embeddings matrix
    :rtype: (numpy.ndarray, list(int), int, numpy.ndarray)
    """
    df = pd.read_csv('data/sentiment.csv')

    if os.path.exists('data/text_sentiment_w2vec.npy'):
        # Cached encodings exist — skip the expensive preprocessing.
        word_encodings = np.load('data/text_sentiment_w2vec.npy')
        embeddings_matrix = np.load('data/embeddings_matrix.npy')
    else:
        print('Fix encoding...')
        df = fix_encoding(df)
        print('Split sentences...')
        df = split_tweet_sentences(df)
        print('Tokenize tweets...')
        df = tokenize_tweets(df)
        print('Fix negative verbs...')
        df = fix_negative_verbs(df)
        print('Encode tweets...')
        df, embeddings_matrix = get_word_encoding_and_embeddings(df, True)
        word_encodings = pad_sequences(df.encodings.values.tolist(),
                                       maxlen=150,
                                       padding='post')
        np.save('data/text_sentiment_w2vec', word_encodings)
        np.save('data/embeddings_matrix', embeddings_matrix)

    # Remap the positive label 4 -> 1. Restrict the assignment to the
    # 'sentiment' column: the original `df[df.sentiment == 4] = 1`
    # overwrote EVERY column of the matching rows.
    df.loc[df.sentiment == 4, 'sentiment'] = 1
    classes = df['sentiment'].values.tolist()
    c = np.unique(classes).tolist()

    return word_encodings, classes, len(c), embeddings_matrix
def load_data():
    """ Loads tweets data for emotion classification

    :return: list of word embeddings for each tweet, list of classes, number of classes, embeddings matrix
    :rtype: (list(numpy.array), list(int), int, numpy.array)
    """
    df = pd.read_csv('data/text_emotion.csv')

    encodings_cache = 'data/text_emotion_w2vec.npy'
    if os.path.exists(encodings_cache):
        # Cached artifacts are present — reuse them.
        word_encodings = np.load(encodings_cache)
        embeddings_matrix = np.load('data/embeddings_matrix2.npy')
    else:
        # Rename the last column to 'tweet' so the pipeline helpers find it.
        headers = df.columns.values
        headers[-1] = 'tweet'
        df.columns = headers
        # Run each preprocessing stage in order, announcing it first.
        pipeline = (
            ('Fix encoding...', fix_encoding),
            ('Split sentences...', split_tweet_sentences),
            ('Tokenize tweets...', tokenize_tweets),
            ('Fix negative verbs...', fix_negative_verbs),
        )
        for message, stage in pipeline:
            print(message)
            df = stage(df)
        print('Encode tweets...')
        df, embeddings_matrix = get_word_encoding_and_embeddings(df, True)
        word_encodings = pad_sequences(df.encodings.values.tolist(), maxlen=150, padding='post')
        np.save('data/text_emotion_w2vec', word_encodings)
        np.save('data/embeddings_matrix2', embeddings_matrix)

    # Convert string emotion labels to dense integer ids.
    labels = df['sentiment'].values.tolist()
    unique_labels = np.unique(labels).tolist()
    label_to_id = {label: idx for idx, label in enumerate(unique_labels)}
    classes = np.array([label_to_id[label] for label in labels])

    return word_encodings, classes, len(unique_labels), embeddings_matrix
# Example #3  (scrape-separator artifact "예제 #3" / "0" converted to a comment)
def load_sentiment_data():
    """ Loads lexicon features for the multi-label emotion dataset.

    Uses cached .npy files when present; otherwise preprocesses the
    train/dev/test splits and caches the computed lexicon features.

    :return: lexicon features for train, val and test splits, and the lexicon matrix
    :rtype: (numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray)
    """
    if os.path.exists('data_multi_class/train_lexicon.npy'):
        lexicon_features_train = np.load('data_multi_class/train_lexicon.npy')
        lexicon_features_val = np.load('data_multi_class/val_lexicon.npy')
        lexicon_features_test = np.load('data_multi_class/test_lexicon.npy')
        lexicon_matrix = np.load('data_multi_class/lexicon_matrix2.npy')
    else:
        df_train = pd.read_csv('data_multi_class/2018-E-c-En-train.txt',
                               sep='\t')
        col_names = df_train.columns.values
        col_names[1] = 'tweet'
        df_train.columns = col_names
        df_val = pd.read_csv('data_multi_class/2018-E-c-En-dev.txt', sep='\t')
        col_names = df_val.columns.values
        col_names[1] = 'tweet'
        df_val.columns = col_names
        df_test = pd.read_csv('data_multi_class/2018-E-c-En-test-gold.txt',
                              sep='\t')
        col_names = df_test.columns.values
        col_names[1] = 'tweet'
        df_test.columns = col_names

        # BUG FIX: the original loop rebound the loop variable
        # (`df = fix_encoding(df)`) and threw the processed frames away,
        # so df_train/df_val/df_test stayed unprocessed. Collect the
        # results and unpack them back into the three split variables.
        processed = []
        for df in [df_train, df_val, df_test]:
            print('Fix encoding...')
            df = fix_encoding(df)
            print('Split sentences...')
            df = split_tweet_sentences(df)
            print('Tokenize tweets...')
            df = tokenize_tweets(df)
            print('Lematize tweets...')
            df = get_lemmas(df)
            processed.append(df)
        df_train, df_val, df_test = processed

        print('Lexicon encoding...')
        df_train, lexicon_matrix = get_lexicon_values(
            df_train, lexicon_type=2, lexicon_name='w2v-dp-CC-Lex.csv')
        lexicon_features_train = pad_sequences(
            df_train.lexicon.values.tolist(), maxlen=150, padding='post')
        df_val, lexicon_matrix = get_lexicon_values(
            df_val, lexicon_type=2, lexicon_name='w2v-dp-CC-Lex.csv')
        lexicon_features_val = pad_sequences(df_val.lexicon.values.tolist(),
                                             maxlen=150,
                                             padding='post')
        df_test, lexicon_matrix = get_lexicon_values(
            df_test, lexicon_type=2, lexicon_name='w2v-dp-CC-Lex.csv')
        lexicon_features_test = pad_sequences(df_test.lexicon.values.tolist(),
                                              maxlen=150,
                                              padding='post')
        np.save('data_multi_class/train_lexicon', lexicon_features_train)
        np.save('data_multi_class/val_lexicon', lexicon_features_val)
        np.save('data_multi_class/test_lexicon', lexicon_features_test)
        np.save('data_multi_class/lexicon_matrix2', lexicon_matrix)
    return lexicon_features_train, lexicon_features_val, lexicon_features_test, lexicon_matrix
def load_sentiment_data():
    """ Loads lexicon features for the binary sentiment dataset.

    :return: padded lexicon features and the lexicon matrix
    :rtype: (numpy.ndarray, numpy.ndarray)
    """
    features_cache = 'data/text_sentiment_lexicon.npy'
    if os.path.exists(features_cache):
        # Fast path: reuse previously computed features.
        return np.load(features_cache), np.load('data/lexicon_matrix.npy')

    df = pd.read_csv('data/sentiment.csv')
    # Apply each preprocessing stage in sequence, logging as we go.
    stages = (
        ('Fix encoding...', fix_encoding),
        ('Split sentences...', split_tweet_sentences),
        ('Tokenize tweets...', tokenize_tweets),
        ('Lematize tweets...', get_lemmas),
    )
    for message, stage in stages:
        print(message)
        df = stage(df)
    print('Lexicon encoding...')
    df, lexicon_matrix = get_lexicon_values(df)
    lexicon_features = pad_sequences(df.lexicon.values.tolist(),
                                     maxlen=150,
                                     padding='post')
    np.save('data/text_sentiment_lexicon', lexicon_features)
    np.save('data/lexicon_matrix', lexicon_matrix)
    return lexicon_features, lexicon_matrix
def load_sentiment_data():
    """ Loads lexicon features for the emotion dataset.

    :return: padded lexicon features and the lexicon matrix
    :rtype: (numpy.ndarray, numpy.ndarray)
    """
    features_cache = 'data/text_emotion_lexicon.npy'
    if os.path.exists(features_cache):
        # Fast path: reuse previously computed features.
        return np.load(features_cache), np.load('data/lexicon_matrix2.npy')

    df = pd.read_csv('data/text_emotion.csv')
    # Rename the last column to 'tweet' so the pipeline helpers find it.
    headers = df.columns.values
    headers[-1] = 'tweet'
    df.columns = headers
    # Apply each preprocessing stage in sequence, logging as we go.
    stages = (
        ('Fix encoding...', fix_encoding),
        ('Split sentences...', split_tweet_sentences),
        ('Tokenize tweets...', tokenize_tweets),
        ('Lematize tweets...', get_lemmas),
    )
    for message, stage in stages:
        print(message)
        df = stage(df)
    print('Lexicon encoding...')
    df, lexicon_matrix = get_lexicon_values(df, lexicon_type=2, lexicon_name='w2v-dp-BCC-Lex.csv')
    lexicon_features = pad_sequences(df.lexicon.values.tolist(), maxlen=150, padding='post')
    np.save('data/text_emotion_lexicon', lexicon_features)
    np.save('data/lexicon_matrix2', lexicon_matrix)
    return lexicon_features, lexicon_matrix
# Example #6  (scrape-separator artifact "예제 #6" / "0" converted to a comment)
def load_dataset(split):
    """ Loads the emotion dataset with hand-crafted valence features.

    :param split: number of leading rows used to fit the feature selector
    :type split: int
    :return: feature matrix, integer class labels, number of classes
    :rtype: (numpy.ndarray, numpy.ndarray, int)
    """
    df = pd.read_csv('data/text_emotion.csv')
    df.columns = ['id', 'class', 'author', 'tweet']

    if os.path.exists('data_ml/text_emotion_features.npy'):
        # Cached features exist — skip the expensive preprocessing.
        X = np.load('data_ml/text_emotion_features.npy')
    else:
        print('Fix encoding...')
        df = fix_encoding(df)
        print('Split sentences...')
        df = split_tweet_sentences(df)
        print('Tokenize tweets...')
        df = tokenize_tweets(df)
        print('Lematize tweets...')
        df = get_lemmas(df)
        lexicon = pd.read_csv('lexicons/Ratings_Warriner_et_al.csv', usecols=[0, 1, 2, 5], index_col=0)
        lexicon.columns = ['word', 'valence', 'arousal']
        path_to_jar = 'stanford_parser/stanford-parser.jar'
        path_to_models_jar = 'stanford_parser/stanford-parser-3.9.1-models.jar'
        valence_shifter = FeatureExtractionContextValenceShifting(path_to_jar, path_to_models_jar, lexicon)
        df = valence_shifter.get_initial_valences(df)
        featured_dataset, vocab = generate_initial_features(df)
        # Fit feature selection on the first `split` rows only.
        X = featured_dataset['valences'].values.tolist()[:split]
        y = featured_dataset['class'].values.tolist()[:split]
        selected, mask = feature_selection(X, y, vocab)
        for index, row in featured_dataset.iterrows():
            valences = np.array(row.valences[mask])
            # DataFrame.set_value was removed in pandas 1.0; .at is the
            # supported scalar setter.
            featured_dataset.at[index, 'valences'] = valences
        X = np.vstack(featured_dataset.valences.values)
        np.save('data_ml/text_emotion_features', X)

    # Convert string class labels to dense integer ids.
    classes = df['class'].values.tolist()
    c = np.unique(classes).tolist()
    d = dict([(y, x) for x, y in enumerate(c)])
    classes = np.array([d[x] for x in classes])

    return X, classes, len(c)
# Example #7  (scrape-separator artifact "예제 #7" / "0" converted to a comment)
def load_data():
    """ Loads tweets data for multi-label emotion classification.

    Uses cached .npy encodings when present; otherwise preprocesses the
    train/dev/test splits and caches the word encodings.

    :return: word encodings and label matrices for train, val and test splits, and the embeddings matrix
    :rtype: (numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray)
    """
    df_train = pd.read_csv('data_multi_class/2018-E-c-En-train.txt', sep='\t')
    col_names = df_train.columns.values
    col_names[1] = 'tweet'
    df_train.columns = col_names
    df_val = pd.read_csv('data_multi_class/2018-E-c-En-dev.txt', sep='\t')
    col_names = df_val.columns.values
    col_names[1] = 'tweet'
    df_val.columns = col_names
    df_test = pd.read_csv('data_multi_class/2018-E-c-En-test-gold.txt',
                          sep='\t')
    col_names = df_test.columns.values
    col_names[1] = 'tweet'
    df_test.columns = col_names

    if os.path.exists('data_multi_class/train_w2vec.npy'):
        word_encodings_train = np.load('data_multi_class/train_w2vec.npy')
        word_encodings_val = np.load('data_multi_class/val_w2vec.npy')
        word_encodings_test = np.load('data_multi_class/test_w2vec.npy')
        embeddings_matrix = np.load('data_multi_class/embeddings_matrix2.npy')
    else:
        # BUG FIX: the original loop rebound the loop variable
        # (`df = fix_encoding(df)`) and threw the processed frames away,
        # so df_train/df_val/df_test stayed unprocessed. Collect the
        # results and unpack them back into the three split variables.
        processed = []
        for df in [df_train, df_val, df_test]:
            print('Fix encoding...')
            df = fix_encoding(df)
            print('Split sentences...')
            df = split_tweet_sentences(df)
            print('Tokenize tweets...')
            df = tokenize_tweets(df)
            print('Fix negative verbs...')
            df = fix_negative_verbs(df)
            processed.append(df)
        df_train, df_val, df_test = processed
        print('Encode tweets...')
        df_train, embeddings_matrix = get_word_encoding_and_embeddings(
            df_train, True)
        word_encodings_train = pad_sequences(
            df_train.encodings.values.tolist(), maxlen=150, padding='post')
        print('Encode tweets...')
        df_val, embeddings_matrix = get_word_encoding_and_embeddings(
            df_val, True)
        word_encodings_val = pad_sequences(df_val.encodings.values.tolist(),
                                           maxlen=150,
                                           padding='post')
        print('Encode tweets...')
        df_test, embeddings_matrix = get_word_encoding_and_embeddings(
            df_test, True)
        word_encodings_test = pad_sequences(df_test.encodings.values.tolist(),
                                            maxlen=150,
                                            padding='post')
        np.save('data_multi_class/train_w2vec', word_encodings_train)
        np.save('data_multi_class/val_w2vec', word_encodings_val)
        np.save('data_multi_class/test_w2vec', word_encodings_test)
        np.save('data_multi_class/embeddings_matrix2', embeddings_matrix)

    # The eleven SemEval-2018 Task 1 emotion label columns.
    classes = [
        'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism',
        'pessimism', 'sadness', 'surprise', 'trust'
    ]
    y_train = df_train[classes].values
    y_val = df_val[classes].values
    y_test = df_test[classes].values

    return word_encodings_train, y_train, word_encodings_val, y_val, word_encodings_test, y_test, embeddings_matrix