from collections import defaultdict
import string

from nltk.stem import PorterStemmer

# FeatureExtractionUtilities is a project-local module assumed importable;
# the examples below also assume a module-level stemmer instance.
stemmer = PorterStemmer()


def loadData(f_path):
    '''
        Given a path to a tab-separated data set, load it into a dict of
        parallel lists keyed by column name (id, synsets, clusters, text,
        unstemmed_text, class).
    '''
    loaded_data_set = defaultdict(list)
    infile = open(f_path)
    for line in infile:
        line = line.decode('utf8', 'ignore').encode('ascii', 'ignore')
        try:
            items = line.split('\t')
            if len(items) > 3:
                tweet_id = items[0]
                user_id = items[1]
                text = string.lower(string.strip(items[-1]))
                class_ = items[2]

                senttokens = text.split()  # alternative: nltk.word_tokenize(text)
                stemmed_text = ''
                for t in senttokens:
                    stemmed_text += ' ' + stemmer.stem(t)

                loaded_data_set['id'].append(tweet_id + '-' + user_id)
                loaded_data_set['synsets'].append(
                    FeatureExtractionUtilities.getSynsetString(text, None))
                loaded_data_set['clusters'].append(
                    FeatureExtractionUtilities.getclusterfeatures(text))
                loaded_data_set['text'].append(stemmed_text)
                loaded_data_set['unstemmed_text'].append(text)
                loaded_data_set['class'].append(class_)

        except UnicodeDecodeError:
            print 'Please convert the file to the correct encoding.'

    infile.close()
    return loaded_data_set
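

# Usage sketch (illustrative, not part of the original example): loadData
# expects a tab-separated file with at least four columns laid out as
# tweet_id, user_id, class label, ..., tweet text (text must be last).
# 'sample.tsv' and the row below are made up, and
# FeatureExtractionUtilities.loadItems() must run first so the synset and
# cluster lookups work.
FeatureExtractionUtilities.loadItems()
with open('sample.tsv', 'w') as out:
    out.write('123\t456\t1\tthis drug gave me an awful headache\n')
data = loadData('sample.tsv')
print data['id']      # ['123-456']
print data['class']   # ['1']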
Example #2
def generateSentementFeatures(x_text, max_length):
    # Build sentiment features for each raw text, using max_length as the
    # fixed sequence length.
    sequence_length = max_length  # alternative: max(len(x) for x in x_text)
    stemmer = PorterStemmer()
    unstemmed_texts = []
    for line in x_text:
        senttokens = nltk.word_tokenize(line)
        # build a stemmed copy of the text; note that only the unstemmed
        # text is actually used below
        stemmed_text = ''
        for t in senttokens:
            stemmed_text += ' ' + stemmer.stem(t)
        unstemmed_texts.append(line)

    print 'Generating training set sentiment features .. '
    sentiments = FeatureExtractionUtilities.getsentimentfeatures(
        unstemmed_texts, sequence_length)
    return sentiments
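

# Usage sketch (illustrative): one sentiment feature row per input text is
# assumed to come back from getsentimentfeatures, sized by max_length. The
# texts are made up; NLTK's 'punkt' tokenizer data and the project's
# sentiment resources (FeatureExtractionUtilities.loadItems()) must be
# available.
FeatureExtractionUtilities.loadItems()
texts = ['this drug gave me a terrible headache', 'feeling great today']
features = generateSentementFeatures(texts, max_length=32)
print len(features)   # expected: 2, one row per text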
Example #3
        arousalScore = numpy.load("dump/" + data_set + "/arousal_feature_list.npy")
        dominanceScore = numpy.load("dump/" + data_set + "/dominance_feature_list.npy")
        # print sentimentScore.shape
    else:
        # adeExact = numpy.load("dump/" + data_set + "/ade-exact.npy")
        adeNoStopWords = numpy.load("dump/" + data_set + "/ade-no-stopwords.npy")

    # x_text, Y = data_helpers.load_data_and_labels("./data/rt-polaritydata/sts-gold.pos",
    #                                                 "./data/rt-polaritydata/sts-gold.neg")

    print Y.shape
else:
    # file = open("testfile.txt", "wb")
    x_text, Y = data_helpers.load_data_and_y_labels("./data/MR/rt-polarity.pos",
                                                    "./data/MR/rt-polarity.neg")
    FeatureExtractionUtilities.loadItems()

    max_document_length = 32

    (s, negScore, posScore, adeScore, subjScore, pposScore, nnegScore,
     moreGoodScore, moreBadScore, lessBadScore, lessGoodScore,
     sentence_cluster, sentence_cluster2, wordLength, wordOrder) = \
        FeatureExtractionUtilities.generateSemVec(
            x_text, max_document_length, embed_size=1)

    s = numpy.array(s)
    negScore = numpy.array(negScore)
    posScore = numpy.array(posScore)
    adeScore = numpy.array(adeScore)
    subjScore = numpy.array(subjScore)
    pposScore = numpy.array(pposScore)
    nnegScore = numpy.array(nnegScore)
    moreGoodScore = numpy.array(moreGoodScore)
    moreBadScore = numpy.array(moreBadScore)
def loadFeatureExtractionModuleItems():
    '''
        Load the various feature extraction resources
    '''
    FeatureExtractionUtilities.loadItems()


if __name__ == '__main__':
    #LOAD THE FEATURE EXTRACTION RESOURCES
    loadFeatureExtractionModuleItems()

    #LOAD THE DATA -- *SAMPLE SCRIPT USES THE SAME DATA FOR TRAINING AND TESTING*
    data_set_filename = 'binary_downloaded.tsv'
    training_data = loadData(data_set_filename)
    testing_data = loadData(data_set_filename)

    #GENERATE THE TRAINING SET FEATURES
    print 'GENERATING TRAINING SET FEATURES.. '
    training_data['sentiments'] = \
        FeatureExtractionUtilities.getsentimentfeatures(
            training_data['unstemmed_text'])
    training_data['structuralfeatures'] = \
        FeatureExtractionUtilities.getstructuralfeatures(
            training_data['unstemmed_text'])
    training_data['adrlexicon'] = \
        FeatureExtractionUtilities.getlexiconfeatures(
            training_data['unstemmed_text'])
    training_data['topictexts'], training_data['topics'] = \
        FeatureExtractionUtilities.gettopicscores(training_data['text'])
    training_data['goodbad'] = FeatureExtractionUtilities.goodbadFeatures(
        training_data['text'])

    #SCALE THE STRUCTURAL FEATURES
    scaler1 = preprocessing.StandardScaler().fit(
        training_data['structuralfeatures'])
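
    # A likely next step (an assumption here, mirroring the commented-out
    # lines in Example #7 and standard scikit-learn practice): apply the
    # scaler fitted on the training split to both splits so the test
    # structural features share the training feature scale.
    train_structural_features = scaler1.transform(
        training_data['structuralfeatures'])
    testing_data['structuralfeatures'] = \
        FeatureExtractionUtilities.getstructuralfeatures(
            testing_data['unstemmed_text'])
    test_structural_features = scaler1.transform(
        testing_data['structuralfeatures'])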
Example #6
def loadFeatureExtractionModuleItems():
    FeatureExtractionUtilities.loadItems()
Example #7
                training_set['text'].append(stemmed_text)
                training_set['class'].append(_class)
        except UnicodeDecodeError:
            print 'Please convert the file to the correct encoding.'

    infile.close()

    print 'Generating training set sentiment features .. '
    training_set['sentiments'] = \
        FeatureExtractionUtilities.getsentimentfeatures(unstemmed_texts)

    # Initialize the vectorizers
    print 'Initialize the vectorizers..'
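
    # A minimal sketch of the vectorizer setup (an assumption: the original
    # likely builds scikit-learn n-gram vectorizers over the stemmed text;
    # the parameters below are illustrative, not from the source).
    from sklearn.feature_extraction.text import CountVectorizer
    text_vectorizer = CountVectorizer(ngram_range=(1, 3), max_features=5000)
    text_vectors = text_vectorizer.fit_transform(training_set['text'])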
Example #8
            if FeatureExtractionUtilities.sentiposscores.has_key(
                (string.lower(str(pos_tags[i][0])), 'n')):
                posscore = float(
                    FeatureExtractionUtilities.sentiposscores[(string.lower(
                        str(pos_tags[i][0])), 'n')])
                score = 200 * posscore
                sentence_pos[i] = score

            if FeatureExtractionUtilities.sentinegscores.has_key(
                (string.lower(str(pos_tags[i][0])), 'n')):
                negscore = float(
                    FeatureExtractionUtilities.sentinegscores[(string.lower(
                        str(pos_tags[i][0])), 'n')])
                score = 100 * negscore
                sentence_neg[i] = score
    return (sentence_pos, sentence_neg)


FeatureExtractionUtilities.loadItems()

x_text, Y = load_semeval_and_y_labels(
    "data/SemEval2015-task10-test-B-input.txt")

pos_feature_list = []
neg_feature_list = []

p = Pool(4)
for (pos_list, neg_list) in p.map(generate_senti_features, x_text):
    pos_feature_list.append(pos_list)
    neg_feature_list.append(neg_list)
p.close()
p.join()

pos_feature_list = numpy.expand_dims(pos_feature_list, axis=2)
neg_feature_list = numpy.expand_dims(neg_feature_list, axis=2)
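
# After expand_dims, each feature array has shape (num_texts, seq_len, 1).
# A plausible follow-up (an assumption, not shown in the original) is to
# stack the positive and negative channels into one tensor, e.g. as an
# auxiliary CNN input:
senti_features = numpy.concatenate(
    (pos_feature_list, neg_feature_list), axis=2)
print senti_features.shape   # (num_texts, seq_len, 2)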