Example #1
def get_sentiment_data():
    locations = lc.get_locations()
    ref_data_path = locations['REF_DATA_PATH']
    sentiment_file_path = os.path.join(ref_data_path, 'sentiment_data/sentiment_lexicon.txt')

    sentiment_dict = defaultdict(dict)

    ## Read the emoticon sentiment lexicon
    with open(sentiment_file_path, 'r') as f:
        emoticon_data = f.readlines()

    ## Build the sentiment dictionary: token -> [mean, variance]
    for line in emoticon_data:
        line_split = line.split('\t')
        emoticon = line_split[0]
        mean_variance = [float(line_split[1]), float(line_split[2])]
        sentiment_dict[emoticon] = mean_variance

    return sentiment_dict
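
The lexicon file itself is not shown above. A minimal, self-contained sketch of the tab-separated layout that get_sentiment_data() parses (one token, mean and variance per line; the sample values below are hypothetical, not from the real lexicon):

# Hypothetical sample rows in the "token<TAB>mean<TAB>variance" layout parsed above
sample_lines = ["happy\t1.8\t0.6\n", ":-(\t-1.5\t0.4\n"]

sentiment_dict = {}
for line in sample_lines:
    token, mean, variance = line.split('\t')
    sentiment_dict[token] = [float(mean), float(variance)]

print(sentiment_dict)  # {'happy': [1.8, 0.6], ':-(': [-1.5, 0.4]}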
Example #2
def get_mpqa_data():
    locations = lc.get_locations()
    ref_data_path = locations['REF_DATA_PATH']
    mpqa_file_path = os.path.join(ref_data_path, 'subjectivity_clues_hltemnlp05/subjclueslen1-HLTEMNLP05.tff')

    sentiment_map = lc.get_sentiment_map()

    with open(mpqa_file_path, 'r') as f:
        data = f.readlines()

    mpqa_dict = {}

    # Each MPQA line is a list of key=value fields; keep the word and encode
    # its prior polarity as a one-hot vector over the sentiment map.
    for line in data:
        elem_bag = line.strip('\n').split(' ')
        for elem in elem_bag:
            item = elem.split('=')
            if item[0] == 'word1':
                word = item[1]
            elif item[0] == 'priorpolarity':
                binary_output = [0] * len(sentiment_map)
                if sentiment_map.get(item[1]) is not None:
                    binary_output[sentiment_map.get(item[1])] = 1
                    mpqa_dict[word] = binary_output

    return mpqa_dict
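
For reference, each line of the MPQA subjectivity clues file is a space-separated list of key=value fields. A minimal sketch of how one such line becomes a one-hot entry, using an illustrative line and a hypothetical sentiment_map (the real map comes from lc.get_sentiment_map()):

# One illustrative MPQA-style line and a hypothetical sentiment map
line = 'type=weaksubj len=1 word1=abandon pos1=verb stemmed1=y priorpolarity=negative\n'
sentiment_map = {'positive': 0, 'negative': 1, 'neutral': 2}

mpqa_dict = {}
for elem in line.strip('\n').split(' '):
    item = elem.split('=')
    if item[0] == 'word1':
        word = item[1]
    elif item[0] == 'priorpolarity':
        binary_output = [0] * len(sentiment_map)
        if sentiment_map.get(item[1]) is not None:
            binary_output[sentiment_map[item[1]]] = 1
            mpqa_dict[word] = binary_output

print(mpqa_dict)  # {'abandon': [0, 1, 0]}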
Example #3
def main():
    locations = lrf_config.get_locations()
    INTERMED_DATA_PATH = locations['INTERMED_DATA_PATH']

    intermedJsonPath = os.path.join(INTERMED_DATA_PATH, 'intermed_dict.json')
    tweets_classified_path = os.path.join(INTERMED_DATA_PATH, 'tweets_classified.txt')
    refRiskCatPath = os.path.join(INTERMED_DATA_PATH, 'risk_category_file.json')

    with open(intermedJsonPath, 'r') as f:
        intermed_data = json.load(f)

    with open(refRiskCatPath, 'r') as f:
        risk_data = json.load(f)

    ## reading data into the dictionaries again
    tweet_dict = dict(intermed_data['tweet_dict'])
    tweet_cmplt = dict(intermed_data['tweet_cmplt'])

    processTweets(tweet_dict,
                  risk_data,
                  tweet_cmplt,
                  tweets_classified_path,
                  vector_type='word_embeddings')

    print('DONE')
Example #4
def main():
    locations = lrf_config.get_locations()
    ref_data_dir = locations['REF_DATA_PATH'] + 'sentiment_data'
    intermed_data_dir = locations['INTERMED_DATA_PATH']

    x_filename = 'tweets.txt'

    ## load and process samples
    print('start loading and processing samples...')

    tweets = []
    more_features = []

    with open(os.path.join(ref_data_dir, x_filename)) as f:
        for i, line in enumerate(f):
            tweet_meta_features = {}
            # json.loads() no longer accepts an `encoding` argument (removed in Python 3.9)
            tweet_obj = json.loads(line.strip())

            # Tweet text contents
            content = tweet_obj['text'].replace("\n", " ")

            postprocessed_tweet, microblogging_features, lexicon_features = pre_process(content)

            tweets.append(postprocessed_tweet)
            tweet_meta_features['microblogging_features'] = microblogging_features
            tweet_meta_features['lexicon_features'] = lexicon_features
            more_features.append(tweet_meta_features)

    # Write processed tweet text to file
    with open(os.path.join(ref_data_dir, 'tweets_processed.txt'), 'w') as f:
        for tweet in tweets:
            f.write('%s\n' % tweet)

    # Write additional tweet features to file
    with open(os.path.join(ref_data_dir, 'more_tweet_features.txt'), 'w', encoding='utf-8') as f:
        f.write(json.dumps(more_features, ensure_ascii=False))

    print("Preprocessing is completed")
Example #5
def get_news_data(folder_name, file_name):
    locations = lc.get_locations()
    NEWS_DATA_PATH = os.path.join(locations['REF_DATA_PATH'], folder_name, file_name)

    news_data = pd.read_csv(NEWS_DATA_PATH)
    news_data = news_data.drop(["Unnamed: 0"], axis=1).set_index('Unnamed: 0.1')

    news_data['category'] = prepare_data(news_data['category'], 'y_data', list_type=int)
    news_data['text'] = prepare_data(news_data['text'], 'x_data')

    # Drop rows whose text could not be prepared
    news_data = news_data[news_data.text != 'None']

    # NOTE: the result of to_dict('index') is discarded; the DataFrame itself is returned
    news_data.to_dict('index')

    return news_data
Example #6
def get_twitter_abbreviations_data():
    locations = lc.get_locations()
    ref_data_path = locations['REF_DATA_PATH']
    abbr_file_path = os.path.join(ref_data_path, 'twitter_slang/twitter_slang_data.txt')

    # Pipe-separated file: abbreviation|meaning
    data = pd.read_csv(abbr_file_path, sep='|', names=['abbr', 'meaning'])
    data_len = len(data['abbr'])

    abbr_list = data['abbr'].values
    meaning_list = data['meaning'].values
    slang_dict = {}

    # Map each abbreviation to its expanded meaning
    for ind in range(data_len):
        slang_dict[abbr_list[ind]] = meaning_list[ind]

    return slang_dict
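
A self-contained sketch of the pipe-separated slang file this loader expects and of how the resulting dictionary might be used to expand abbreviations (the sample rows are illustrative, not taken from the real file):

import io
import pandas as pd

# Illustrative "abbr|meaning" rows in the same layout as twitter_slang_data.txt
sample = io.StringIO("idk|i do not know\nbrb|be right back\n")
data = pd.read_csv(sample, sep='|', names=['abbr', 'meaning'])
slang_dict = dict(zip(data['abbr'], data['meaning']))

tokens = ['brb', 'in', '5']
print([slang_dict.get(tok, tok) for tok in tokens])  # ['be right back', 'in', '5']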
Example #7
def get_tweet_data(file_type, file_name):
    locations = lc.get_locations()

    if file_type == 'json':
        TWEETS_DATA_PATH = os.path.join(locations['INTERMED_DATA_PATH'], file_name)
        tweet_data = pd.read_json(TWEETS_DATA_PATH, orient='records', convert_axes=False)
        tweet_data['tweet_cmplt'] = prepare_data(tweet_data['tweet_cmplt'], 'x_data')
        return tweet_data

    elif file_type == 'txt':
        TWEETS_DATA_PATH = os.path.join(locations['INTERMED_DATA_PATH'], file_name)

        if file_name == 'tweets_classified.txt':
            tweet_data = pd.read_csv(
                TWEETS_DATA_PATH, sep='|',
                names=['tweet_id', 'class_pos', 'class_both', 'tweet_word_list', 'tweet_cmplt']
            ).drop_duplicates().set_index('tweet_id')

        elif file_name == 'tweet_truth.txt':
            tweet_data = pd.read_csv(TWEETS_DATA_PATH, sep='|').drop_duplicates(subset='tweet_id').set_index('tweet_id')
            tweet_data['class_annotated'] = prepare_data(tweet_data['class_annotated'], 'y_data', list_type=str)

        tweet_data['tweet_cmplt'] = prepare_data(tweet_data['tweet_cmplt'], 'x_data')
        tweet_data['tweet_word_list'] = prepare_data(tweet_data['tweet_word_list'], 'word_bag')
        tweet_data['class_pos'] = prepare_data(tweet_data['class_pos'], 'y_data')
        tweet_data['class_both'] = prepare_data(tweet_data['class_both'], 'y_data')

        # NOTE: the result of to_dict('index') is discarded; the DataFrame itself is returned
        tweet_data.to_dict('index')

        return tweet_data
Example #8
def main():
    locations = lrf_config.get_locations()
    glove_data_dict = get_glove_dict(locations['INTERMED_DATA_PATH'] +
                                     'glove_key_subset.json')
Example #9
import argparse
import os
import json
import numpy as np
from lrf import utility, sentiment_ml_classifier
from lrf import lrf_config
from sklearn.preprocessing import scale
from scipy.sparse import coo_matrix, hstack
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
############################ GLOBAL Variables
########################## data paths

locations = lrf_config.get_locations()
data_dir = locations['REF_DATA_PATH'] + 'sentiment_data'
tweets_data = 'tweets_processed.txt'
labels_data = 'labels.txt'
more_tweet_data = 'more_tweet_features.txt'
MORE_FEAT_FLAG = True


######################## select_top_k_features(data,labels,n_components=1700)
def select_top_k_features(data, labels, n_components=1700):
    data = SelectKBest(chi2, k=n_components).fit_transform(data, labels)
    return data


############################### load_and_process
def load_and_process(data_file, label_file):
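
A minimal, self-contained illustration of the chi-squared feature selection that select_top_k_features() wraps, on toy count data with a much smaller k:

import numpy as np
from sklearn.feature_selection import SelectKBest, chi2

# Toy non-negative count features (chi2 requires non-negative input) and labels
X = np.array([[1, 0, 3, 0],
              [0, 2, 1, 1],
              [2, 0, 4, 0],
              [0, 3, 0, 2]])
y = np.array([0, 1, 0, 1])

# Keep the 2 features most associated with the labels
X_reduced = SelectKBest(chi2, k=2).fit_transform(X, y)
print(X_reduced.shape)  # (4, 2)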
Example #10
def main():
    locations = lrf_config.get_locations()
    ref_data_dir = locations['REF_DATA_PATH']

    x_filename = 'sentiment_data/tweets.txt'
    y_filename = 'sentiment_data/labels.txt'

    ## load and process samples
    print('start loading and processing samples...')

    tweets = []
    microblog_features = []
    lexicon_features = []
    tweets_lst = []

    with open(os.path.join(ref_data_dir, x_filename)) as f:
        for i, line in enumerate(f):
            # json.loads() no longer accepts an `encoding` argument (removed in Python 3.9)
            tweet_obj = json.loads(line.strip())

            # Tweet text contents
            content = tweet_obj['text'].replace("\n", " ")

            tweets_lst.append(pre_process_lst(content))

            postprocessed_tweet, microblogging_features, mpqa_sentiment_score = pre_process(content)

            tweets.append(postprocessed_tweet)
            microblog_features.append(microblogging_features)
            lexicon_features.append(mpqa_sentiment_score)

    lexicon_features = np.asarray(lexicon_features)
    microblog_features = np.asarray(microblog_features)

    tf_idf_vectorizer = utility.get_tf_idf_vectorizer(tweets_lst, ngram_range=2)
    transformed_data_rahul = tf_idf_vectorizer.fit_transform(tweets_lst)
    # tf_idf_vectorizer = utility.get_tf_idf_vectorizer(tweets, ngram_range=2)
    # transformed_data_mine = tf_idf_vectorizer.fit_transform(tweets)

    with open(os.path.join(ref_data_dir, y_filename)) as f:
        y_data = f.readlines()

    y_data = [y.strip('\n') for y in y_data]
    y_data = np.asarray(y_data)

    num_of_features = 50
    accuracy_in_each_turn = []

    while num_of_features <= 3000:
        X_new = SelectKBest(chi2, k=num_of_features).fit_transform(transformed_data_rahul, y_data)

        extended_features_1 = np.append(X_new.toarray(), lexicon_features, axis=1)
        extended_features_2 = np.append(extended_features_1, microblog_features, axis=1)

        sentiment_map = lrf_config.get_sentiment_map()
        inv_sentiment_map = {str(v): k for k, v in sentiment_map.items()}

        X_data = X_new.toarray()

        kf = KFold(n_splits=5)
        kf.get_n_splits(X_data)
        train_list = []
        test_list = []

        for train_index, test_index in kf.split(X_data):
            X_train = X_data[train_index]
            Y_train = y_data[train_index]
            X_test = X_data[test_index]
            Y_test = y_data[test_index]

            Y_pred, train_acc, test_acc = classifier.classify(
                'svc',
                X_train=X_train,
                Y_train=Y_train,
                X_test=X_test,
                Y_test=Y_test,
                class_map=inv_sentiment_map,
                is_X_text=False)

            train_list.append(train_acc)
            test_list.append(test_acc)

        # Average accuracy across the five folds (the original averaged only the
        # last fold's scalar scores).
        accuracy_in_each_turn.append([np.mean(train_list), np.mean(test_list)])

        # Step size assumed for illustration; the original snippet never
        # incremented this counter, which left the while loop running forever.
        num_of_features += 50

    for elem in accuracy_in_each_turn:
        print(elem)
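
The cross-validation loop above relies on the project's classifier.classify() helper. A self-contained sketch of the same KFold averaging pattern, using a plain scikit-learn LinearSVC as a stand-in for that helper:

import numpy as np
from sklearn.model_selection import KFold
from sklearn.svm import LinearSVC

# Toy data standing in for the selected tf-idf features and labels
X = np.random.RandomState(0).rand(20, 5)
y = np.array([0, 1] * 10)

train_list, test_list = [], []
for train_index, test_index in KFold(n_splits=5).split(X):
    model = LinearSVC().fit(X[train_index], y[train_index])
    train_list.append(model.score(X[train_index], y[train_index]))
    test_list.append(model.score(X[test_index], y[test_index]))

# Mean accuracy across folds, as collected in accuracy_in_each_turn above
print(np.mean(train_list), np.mean(test_list))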
Example #11
    def glove_classification(self, data_dict, keywords, keyword_type,
                             glove_data_file, glove_key_file):

        print('ENTERED GLOVE_CLASSIFICATION')

        locations = lrf_config.get_locations()

        glove_data_dict = utility.get_glove_dict(locations['INTERMED_DATA_PATH'] + glove_data_file)
        glove_key_dict = utility.get_glove_dict(locations['INTERMED_DATA_PATH'] + glove_key_file)

        # Map the positive keywords to their GloVe vectors and risk categories
        glove_crux_pos = utility.getWordToCategMap(keywords, glove_key_dict, 'pos')
        pos_key_glove_arr = glove_crux_pos['key_glove_arr']
        inv_pos_key_index = glove_crux_pos['inv_key_index']
        pos_risk_dict = glove_crux_pos['risk_dict']

        if keyword_type == 'both':
            glove_crux_neg = utility.getWordToCategMap(keywords, glove_key_dict, 'neg')
            neg_key_glove_arr = glove_crux_neg['key_glove_arr']
            inv_neg_key_index = glove_crux_neg['inv_key_index']
            neg_risk_dict = glove_crux_neg['risk_dict']

        pos_predictions = {}
        both_predictions = {}

        for id in data_dict.keys():
            data_lst = []

            # Series.get_value() was removed in pandas 1.0; .loc is the
            # equivalent label-based lookup.
            for word in data_dict.loc[id]:
                if word in glove_data_dict:
                    data_lst.append(glove_data_dict[word][0])

            ## Preparing the tweet array
            data_arr = np.asarray(data_lst)

            if len(data_arr) != 0:
                ## Cosine similarity against the positive keyword vectors
                pos_cos_similarity = cosine_similarity(data_arr, pos_key_glove_arr)

                # Ten most similar keyword columns for each word in the tweet
                pos_nearest_neighbors = np.argsort(pos_cos_similarity, axis=1)[:, -10:]
                pos_tweet_neighbors = [item for sublist in pos_nearest_neighbors for item in sublist]

                membership_count = {}
                membership_count_pos = utility.getMembershipCount(
                    pos_tweet_neighbors, inv_pos_key_index, pos_risk_dict, membership_count)

                v_pos = list(membership_count_pos.values())
                k_pos = list(membership_count_pos.keys())
                output_pos = k_pos[v_pos.index(max(v_pos))]

                if keyword_type == 'both':
                    neg_cos_similarity = cosine_similarity(data_arr, neg_key_glove_arr)

                    neg_nearest_neighbors = np.argsort(neg_cos_similarity, axis=1)[:, :10]
                    neg_tweet_neighbors = [item for sublist in neg_nearest_neighbors for item in sublist]

                    membership_count_both = utility.getMembershipCount(
                        neg_tweet_neighbors, inv_neg_key_index, neg_risk_dict,
                        membership_count_pos.copy())

                    v_both = list(membership_count_both.values())
                    k_both = list(membership_count_both.keys())
                    output_both = k_both[v_both.index(max(v_both))]

                pos_predictions[id] = [output_pos]
                both_predictions[id] = [output_both] if keyword_type == 'both' else None

        return pos_predictions, both_predictions
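
A self-contained sketch of the cosine-similarity and argsort neighbour lookup at the heart of glove_classification(), using tiny made-up vectors in place of real GloVe embeddings:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

tweet_vectors = np.array([[0.9, 0.1], [0.2, 0.8]])                 # one row per tweet word
keyword_vectors = np.array([[1.0, 0.0], [0.0, 1.0], [0.7, 0.7]])   # one row per keyword

similarity = cosine_similarity(tweet_vectors, keyword_vectors)

# Indices of the 2 most similar keywords for each tweet word
# (argsort is ascending, so the last columns are the nearest neighbours)
nearest = np.argsort(similarity, axis=1)[:, -2:]
print(nearest)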