Example #1
def main():
    # read the reviews and tokenize each one into words
    text = read_file_2(3)
    review_tokens = [get_words(review) for review in text]
    # remove stopwords from every tokenized review
    stopped_sent = [stopword_rem(sentence) for sentence in review_tokens]

    # rebuild each review as a space-separated string of its remaining tokens
    sents = [' '.join(tokens) for tokens in stopped_sent]

    sid = SentimentIntensityAnalyzer()
    sentiment_scores = [sid.polarity_scores(sent) for sent in sents]
    # pair each review's positive score with its negated negative score
    pos_neg = [[score['pos'], -1 * score['neg']] for score in sentiment_scores]
    pos_neg.sort()
    axis = range(len(pos_neg))
    data = [go.Surface(z=pos_neg, x=axis, y=axis)]
    layout = go.Layout(title='Sentiment Analysis',
                       autosize=False,
                       width=500,
                       height=500,
                       margin=dict(l=65, r=50, b=65, t=90))
    fig = go.Figure(data=data, layout=layout)
    py.plot(fig, filename='Sentiment Analysis')
Example #2
def demo_vader_instance(text):
    """
    Output polarity scores for a text using the Vader approach.

    :param text: a text whose polarity has to be evaluated.
    """
    from vader import SentimentIntensityAnalyzer
    vader_analyzer = SentimentIntensityAnalyzer()
    print(vader_analyzer.polarity_scores(text))
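A minimal usage sketch for the function above. It assumes the local vader module exposes the same SentimentIntensityAnalyzer class as NLTK's nltk.sentiment.vader (in which case the vader_lexicon resource must be downloaded first); the sample sentence is purely illustrative.

# hypothetical call; polarity_scores returns the keys 'neg', 'neu', 'pos' and 'compound'
demo_vader_instance("VADER is smart, handsome, and funny!")
# prints a dict like {'neg': ..., 'neu': ..., 'pos': ..., 'compound': ...}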
Example #3
def main():
    text = read_file_1()
    para_tokens = get_words(text)
    sents = get_sentences(text)
    sent_tokens = [get_words(tokens) for tokens in sents]
    stopped_sent = [stopword_rem(sentence) for sentence in sent_tokens]
    text_score = score_keyphrases_by_textrank(text)

    freq = []
    for words, score in text_score:
        for w in words.split():
            freq.append(w)

    freq_uniq = get_frequency(freq, freq)

    aspect_list = [
        'laptop', 'keyboard', 'keys', 'screen', 'graphics', 'processor',
        'display', 'body', 'size', 'mouse', 'trackpad', 'track', 'battery',
        'sensors'
    ]

    scores = []
    words = []
    for word1 in aspect_list:
        for word2 in freq_uniq.keys():
            if word1 == word2:
                words.append(word1)
                scores.append(freq_uniq[word1])

    top10 = sorted(zip(words, scores), key=itemgetter(1), reverse=True)

    sents = read_file_1()
    sid = SentimentIntensityAnalyzer()
    sentiment_scores = [sid.polarity_scores(sent) for sent in sents]
    pos_neg = [[score['pos'], -1 * score['neg']] for score in sentiment_scores]
    pos_neg.sort()
    axis = range(len(pos_neg))
    data = [go.Surface(z=pos_neg, x=axis, y=axis)]
    layout = go.Layout(title='Canon G3',
                       autosize=False,
                       width=500,
                       height=500,
                       margin=dict(l=65, r=50, b=65, t=90))
    fig = go.Figure(data=data, layout=layout)
    py.plot(fig, filename='elevations-3d-surface')
def collectAllSentiments(bbcNewFldr):
    sentimentScoresForAnArticle = []
    for filename in os.listdir(bbcNewFldr):
        text = readFromSingleFile(bbcNewFldr, filename)
        newsTokens = [getWords(line.lower()) for line in text]
        stopWordRmdLst = [stopwordRemove(sentence) for sentence in newsTokens]
        # rebuild each line as a space-separated string of its remaining tokens
        sents = [' '.join(tokens) for tokens in stopWordRmdLst]
        sid = SentimentIntensityAnalyzer()
        allReviews = ' '.join(sent for sent in sents)
        sentiment_scores = sid.polarity_scores(allReviews)
        sentimentScoresForAnArticle.append(sentiment_scores)
    return sentimentScoresForAnArticle
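A hedged usage sketch for collectAllSentiments; readFromSingleFile, getWords and stopwordRemove are assumed to come from the surrounding project, and the folder path below is only illustrative.

# hypothetical folder of plain-text news articles
articleScores = collectAllSentiments('data/bbc/tech')
for score in articleScores[:3]:
    # polarity_scores returns 'neg', 'neu', 'pos' and 'compound' per article
    print(score['compound'], score['pos'], score['neg'])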
def full_labeled(texts, keywords, outfile=None):
    import csv

    labeled_tweets = []
    sia = SentimentIntensityAnalyzer()

    # keep texts that match at least one keyword label, together with their
    # VADER compound sentiment score
    for text in texts:
        labels = tweet_classify(text, keywords)
        sentiment = sia.polarity_scores(text)['compound']
        if len(labels) > 0:
            labeled_tweets.append([labels, text, sentiment])

    if outfile is not None:
        num = len(labeled_tweets)
        with open(outfile, 'w', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerows(labeled_tweets)
            print('Wrote {} lines to {}'.format(num, outfile))
    else:
        return labeled_tweets
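A hedged usage sketch for full_labeled; tweet_classify is not shown above, so it is simply assumed to return the subset of keywords that occur in a tweet, and the tweets and keywords below are illustrative.

tweets = ['the battery on this laptop is fantastic',
          'terrible keyboard, the keys keep sticking']
keywords = ['battery', 'keyboard', 'screen']
# with no outfile, the labeled rows are returned instead of written to CSV
rows = full_labeled(tweets, keywords)
# each row has the form [labels, text, compound_score]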
Example #6
def main():
    # read input text from json file
    # text = read_file(0)
    # tokenize the paragraph
    # para_tokens = get_words(text)
    # get sentences from the paragraph
    # sents = get_sentences(text)
    for i in range(len(files)):
        sents = read_file_1(i)
        sid = SentimentIntensityAnalyzer()
        sentiment_scores = [sid.polarity_scores(sent) for sent in sents]
        pos_neg = [[score['pos'], -1 * score['neg']] for score in sentiment_scores]
        pos_neg.sort()
        axis = range(len(pos_neg))
        data = [go.Surface(z=pos_neg, x=axis, y=axis)]
        layout = go.Layout(title=str(files[i]),
                           autosize=False,
                           width=500,
                           height=500,
                           margin=dict(l=65, r=50, b=65, t=90))
        fig = go.Figure(data=data, layout=layout)
        py.plot(fig, filename=files[i])
Example #7
    lineIn = '\n'.join(text)

    tagged = getTagsForWords(lineIn)
    NNP = getNounPositions('NNP', tagged)
    top3Noun = getTop3NounAndFreq(NNP)
    # perform the summarization multiple times and compute sentiment information each time
    for pi in percentageOfSummary:
        reducedSummaryWithReplc = mainSummaryCalling(lineIn, pi)
        review_tokens = [getWords(line.lower()) for line in reducedSummaryWithReplc]
        stopped_sent = [stopwordRemove(sentence) for sentence in review_tokens]
        # rebuild each sentence as a space-separated string of its remaining tokens
        sents = [' '.join(tokens) for tokens in stopped_sent]
        sid = SentimentIntensityAnalyzer()
        allReviews = ' '.join(sent for sent in sents)
        sentiment_scores = sid.polarity_scores(allReviews)
        positiveScores.append(sentiment_scores['posScore'])
        negativeScores.append(sentiment_scores['negScore'])
        compoundScores.append(sentiment_scores['compoundScore'])
        sentiment_scoresForAnArticle.append([top3Noun, sentiment_scores])
        print(sentiment_scores['posScore'])
#plot the 3D column chart
plotMatrix3DColumnNounPosiNegiScore(sentiment_scoresForAnArticle)
#plot the 3D column chart
plotMatrix3DColumnNounCompundScoreOccurence(sentiment_scoresForAnArticle)
filenames = collectAllFileNms(amazonRevFldr)

reviewContent = collectAllReviews(filenames, amazonRevFldr)
# preprocess the customer reviews
reviewTokens = [getWords(review) for review in reviewContent if review is not None]
stopWordRmdLst = [stopwordRemove(sentence) for sentence in reviewTokens]

# rebuild each review as a space-separated string of its remaining tokens
sents = [' '.join(tokens) for tokens in stopWordRmdLst]

sid = SentimentIntensityAnalyzer()

sentimentScores = []
# score only the non-empty sentences
for sent in sents:
    if len(sent) > 0:
        sentimentScores.append(sid.polarity_scores(sent))

# assign each review to a sentiment class: 1 = positive, 0 = negative
documents = []
for si in sentimentScores:
    if si['compoundScore'] >= 0:
        category = 1  # 'pos'
    else:
        category = 0  # 'neg'
    documents.append((list(si['wordsWithEmotion']), category))
Example #9
    print(filenm)
    filenames.append(filenm)
product = []
reviewTitle = []
reviewContent = []
for i in range(len(filenames)):
    print(filenames[i])
    with open(path + '/' + filenames[i]) as dataFile:
        data = json.load(dataFile)
        product.append(data['ProductInfo'])
        for reviews in data['Reviews']:
            reviewTitle.append(reviews['Title'])
            reviewContent.append(reviews['Content'])

text = reviewContent  #text = read_file_2(3)
reviewTokens = [getWords(review) for review in text if review is not None]
stoppWrdRmdSentence = [stopwordsRemove(sentence) for sentence in reviewTokens]
# rebuild each review as a space-separated string of its remaining tokens
sents = [' '.join(tokens) for tokens in stoppWrdRmdSentence]
#perform sentiment analysis on reviews
sid = SentimentIntensityAnalyzer()

sentiment_scores = [sid.polarity_scores(sent) for sent in sents]
# represent the sentiment scores as a matrix
(matrix, numOfEle) = sentimentScoresToMatrix(sentiment_scores)
#perform 3D surface plotting of matrix of sentiments
plotMatrix3DSurfByMatPlotLib(matrix, numOfEle, numOfEle)
Example #10
        product.append(data['ProductInfo'])
        for reviews in data['Reviews']:
            reviewTitle.append(reviews['Title'])
            reviewContent.append(reviews['Content'])

text = reviewContent  #text = read_file_2(3)
reviewTokens = [getWords(review) for review in text if review is not None]
stoppWrdRmdSentence = [stopwordsRemove(sentence) for sentence in reviewTokens]
# rebuild each review as a space-separated string of its remaining tokens
sents = [' '.join(tokens) for tokens in stoppWrdRmdSentence]
#perform sentiment analysis on reviews
sid = SentimentIntensityAnalyzer()

sentiment_scores = [sid.polarity_scores(sent) for sent in sents]
# represent the sentiment scores as a matrix
(matrix, numOfEle) = sentimentScoresToMatrix(sentiment_scores)
# write the reviews and their sentiment statistics to Excel
writeToExclReviewSentiStat(sents, sentiment_scores)

#perform sentiment analysis on all review comments
allReviews = ' '.join(sent for sent in sents)
sentiment_scoresAllRevs = sid.polarity_scores(allReviews)  # for all reviews
#compute word frequency
wordsAndFreqs = Counter(sentiment_scoresAllRevs['wordsWithEmotion'])
#compute word frequency and score
wordsFreqsScore = getWordFreqAndScore(sentiment_scoresAllRevs, wordsAndFreqs)
# sort the words by sentiment score in descending order
Example #11
def demo_vader_tweets(n_instances=None, output=None):
    """
    Classify 10000 positive and negative tweets using the Vader approach.

    :param n_instances: the number of total tweets that have to be classified.
    :param output: the output file where results have to be reported.
    """
    from collections import defaultdict
    from nltk.corpus import twitter_samples
    from vader import SentimentIntensityAnalyzer
    from nltk.metrics import (accuracy as eval_accuracy, precision as eval_precision,
        recall as eval_recall, f_measure as eval_f_measure)

    if n_instances is not None:
        n_instances = int(n_instances/2)

    fields = ['id', 'text']
    positive_json = twitter_samples.abspath("positive_tweets.json")
    positive_csv = 'positive_tweets.csv'
    json2csv_preprocess(positive_json, positive_csv, fields, strip_off_emoticons=False,
                        limit=n_instances)

    negative_json = twitter_samples.abspath("negative_tweets.json")
    negative_csv = 'negative_tweets.csv'
    json2csv_preprocess(negative_json, negative_csv, fields, strip_off_emoticons=False,
                        limit=n_instances)

    pos_docs = parse_tweets_set(positive_csv, label='pos')
    neg_docs = parse_tweets_set(negative_csv, label='neg')

    # We separately split positive and negative instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
    train_neg_docs, test_neg_docs = split_train_test(neg_docs)

    training_tweets = train_pos_docs+train_neg_docs
    testing_tweets = test_pos_docs+test_neg_docs

    vader_analyzer = SentimentIntensityAnalyzer()

    gold_results = defaultdict(set)
    test_results = defaultdict(set)
    acc_gold_results = []
    acc_test_results = []
    labels = set()
    num = 0
    for i, (text, label) in enumerate(testing_tweets):
        labels.add(label)
        gold_results[label].add(i)
        acc_gold_results.append(label)
        score = vader_analyzer.polarity_scores(text)['compound']
        if score > 0:
            observed = 'pos'
        else:
            observed = 'neg'
        num += 1
        acc_test_results.append(observed)
        test_results[observed].add(i)
    metrics_results = {}
    for label in labels:
        accuracy_score = eval_accuracy(acc_gold_results,
            acc_test_results)
        metrics_results['Accuracy'] = accuracy_score
        precision_score = eval_precision(gold_results[label],
            test_results[label])
        metrics_results['Precision [{0}]'.format(label)] = precision_score
        recall_score = eval_recall(gold_results[label],
            test_results[label])
        metrics_results['Recall [{0}]'.format(label)] = recall_score
        f_measure_score = eval_f_measure(gold_results[label],
            test_results[label])
        metrics_results['F-measure [{0}]'.format(label)] = f_measure_score

    for result in sorted(metrics_results):
        print('{0}: {1}'.format(result, metrics_results[result]))

    if output:
        output_markdown(output, Approach='Vader', Dataset='labeled_tweets',
            Instances=n_instances, Results=metrics_results)
def main():
    #read input text from json file
    text = read_file(0)

    #tokenize the paragraph
    para_tokens = get_words(text)

    #get sentences from the paragraph
    sents = get_sentences(text)

    #tokenize each sentence
    sent_tokens = [get_words(tokens) for tokens in sents]

    #remove stopwords from each sentence
    stopped_sent = [stopword_rem(sentence) for sentence in sent_tokens]

    #Extract aspects from the sample text
    text_score = score_keyphrases_by_textrank(text)

    #Calculate the frequency of the aspects
    freq = []
    for words, score in text_score:
        for w in words.split():
            freq.append(w)

    #Calculate the frequency of each word
    freq_uniq = get_frequency(freq, freq)

    # candidate aspect terms to search for in the reviews
    aspect_list = [
        'laptop', 'keyboard', 'keys', 'screen', 'graphics', 'processor',
        'display', 'body', 'size', 'mouse', 'trackpad', 'track', 'battery',
        'sensors'
    ]

    #get top 10 aspects based on frequency
    scores = []
    words = []
    for word1 in aspect_list:
        for word2 in freq_uniq.keys():
            if word1 == word2:
                words.append(word1)
                scores.append(freq_uniq[word1])

    top10 = sorted(zip(words, scores), key=itemgetter(1), reverse=True)

    # record the index of every sentence that mentions a top-10 aspect
    aspect_sent = []
    aspect_topic = []
    for i, sent in enumerate(sents):
        for top in top10:
            for word in sent.split():
                if top[0] == word:
                    aspect_sent.append(i)
                    aspect_topic.append(top[0])

    # aspect_sent = zip(aspect_topic,aspect_sent)

    # aspect1 = ['graphics', 'screen', 'size']
    # aspect2 = ['keyboard', 'keys', 'key']

    aspect_sent_uniq = list(set(aspect_sent))

    sid = SentimentIntensityAnalyzer()

    sentiment_scores = [
        sid.polarity_scores(sents[i]) for i in aspect_sent_uniq
    ]

    pos_sents = ""
    neg_sents = ""

    # print up to 10 sentences where the positive score dominates
    j = 0
    print("\n\n Positive: \n")
    for i in range(0, len(aspect_sent_uniq)):
        if sentiment_scores[i]['pos'] > sentiment_scores[i]['neg'] and j < 10:
            print(sents[aspect_sent_uniq[i]])
            pos_sents += sents[aspect_sent_uniq[i]] + " "
            j = j + 1

    print("\n\nPositive Sentences Polarity:")
    print(sid.polarity_scores(pos_sents))

    # print up to 10 sentences where the negative score dominates
    j = 0
    print("\n\n Negative: \n")
    for i in range(0, len(aspect_sent_uniq)):
        if sentiment_scores[i]['neg'] > sentiment_scores[i]['pos'] and j < 10:
            print(sents[aspect_sent_uniq[i]])
            neg_sents += sents[aspect_sent_uniq[i]] + " "
            j = j + 1

    print("\n\nNegative Sentences Polarity:")
    print(sid.polarity_scores(neg_sents))