예제 #1
0
 def test_sentiment(self):
     """Sanity-check en.sentiment() polarity signs and benchmark accuracy
     on Pang & Lee's polarity datasets v2.0 (documents) and v1.0 (sentences).
     """
     # Assert < 0 for negative adjectives and > 0 for positive adjectives.
     self.assertTrue(en.sentiment("wonderful")[0] > 0)
     self.assertTrue(en.sentiment("horrible")[0] < 0)
     # sentiment() also accepts wordnet Synsets and parsed Text objects.
     self.assertTrue(en.sentiment(en.wordnet.synsets("horrible", pos="JJ")[0])[0] < 0)
     self.assertTrue(en.sentiment(en.Text(en.parse("A bad book. Really horrible.")))[0] < 0)
     # Assert that :) and :( are recognized.
     self.assertTrue(en.sentiment(":)")[0] > 0)
     self.assertTrue(en.sentiment(":(")[0] < 0)
     # Assert the accuracy of the sentiment analysis (for the positive class).
     # Given are the scores for Pang & Lee's polarity dataset v2.0:
     # http://www.cs.cornell.edu/people/pabo/movie-review-data/
     # The baseline should increase (not decrease) when the algorithm is modified.
     from pattern.db import Datasheet
     from pattern.metrics import test
     reviews = [(review, int(score) > 0)
                for score, review in Datasheet.load(os.path.join(PATH, "corpora", "polarity-en-pang&lee1.csv"))]
     A, P, R, F = test(lambda review: en.positive(review), reviews)
     self.assertTrue(A > 0.755)
     self.assertTrue(P > 0.760)
     self.assertTrue(R > 0.747)
     self.assertTrue(F > 0.754)
     # Assert the accuracy of the sentiment analysis on short text (for the positive class).
     # Given are the scores for Pang & Lee's sentence polarity dataset v1.0:
     # http://www.cs.cornell.edu/people/pabo/movie-review-data/
     reviews = [(review, int(score) > 0)
                for score, review in Datasheet.load(os.path.join(PATH, "corpora", "polarity-en-pang&lee2.csv"))]
     A, P, R, F = test(lambda review: en.positive(review), reviews)
     self.assertTrue(A > 0.642)
     self.assertTrue(P > 0.653)
     self.assertTrue(R > 0.607)
     self.assertTrue(F > 0.629)
     # Fixed: Python 2 print statement -> print() function (valid syntax on 2 and 3).
     print("pattern.en.sentiment()")
예제 #2
0
 def test_modality(self):
     """Check the sign of en.modality() on certain vs. uncertain sentences
     and benchmark its accuracy on the CoNLL-2010 uncertainty data.
     """
     # modality() ranges -1.0 => +1.0, representing the degree of certainty.
     # An uncertain wish => negative modality.
     self.assertTrue(en.modality(en.Sentence(en.parse("I wish it would stop raining."))) < 0)
     # A certain prediction => positive modality.
     self.assertTrue(en.modality(en.Sentence(en.parse("It will surely stop raining soon."))) > 0)
     # Assert the accuracy of the modality algorithm.
     # Given are the scores for the CoNLL-2010 Shared Task 1 Wikipedia uncertainty data:
     # http://www.inf.u-szeged.hu/rgai/conll2010st/tasks.html#task1
     # The baseline should increase (not decrease) when the algorithm is modified.
     from pattern.db import Datasheet
     from pattern.metrics import test
     corpus = os.path.join(PATH, "corpora", "uncertainty-conll2010.csv")
     samples = [(en.Sentence(en.parse(text, chunks=False, light=True)), int(label) > 0)
                for label, text in Datasheet.load(corpus)]
     A, P, R, F = test(lambda s: en.modality(s) > 0.5, samples)
     #print(A, P, R, F)
     self.assertTrue(A > 0.69)
     self.assertTrue(P > 0.72)
     self.assertTrue(R > 0.64)
     self.assertTrue(F > 0.68)
     print("pattern.en.modality()")
예제 #3
0
 def test_sentiment_twitter(self):
     """Benchmark en.positive() on the Sanders Twitter Sentiment Corpus,
     skipping silently if the corpus file is not present.
     """
     corpus = os.path.join(PATH, "corpora", "polarity-en-sanders.csv")
     if not os.path.exists(corpus):
         return
     # Given are the scores for Sanders Twitter Sentiment Corpus:
     # http://www.sananalytics.com/lab/twitter-sentiment/
     # Positive + neutral is taken as polarity >= 0.0, negative as polarity < 0.0.
     # Since there are a lot of neutral cases and the algorithm predicts 0.0 by
     # default (i.e., majority class), the results are good; distinguishing
     # negative from neutral from positive is a much harder task.
     from pattern.db import Datasheet
     from pattern.metrics import test
     tweets = [(tweet, label in ("positive", "neutral"))
               for _i, _id, _date, tweet, label, _topic in Datasheet.load(corpus)
               if label != "irrelevant"]
     A, P, R, F = test(lambda tweet: en.positive(tweet, threshold=0.0), tweets)
     self.assertTrue(A > 0.824)
     self.assertTrue(P > 0.879)
     self.assertTrue(R > 0.911)
     self.assertTrue(F > 0.895)
예제 #4
0
 def test_sentiment_twitter(self):
     """Benchmark en.positive() on the Sanders Twitter Sentiment Corpus
     (the test is a no-op when the corpus file is missing).
     """
     path = os.path.join(PATH, "corpora", "polarity-en-sanders.csv")
     if os.path.exists(path):
         # Given are the scores for Sanders Twitter Sentiment Corpus:
         # http://www.sananalytics.com/lab/twitter-sentiment/
         # Positive + neutral is taken as polarity >= 0.0, negative as < 0.0.
         # There are many neutral cases and the algorithm predicts 0.0 by
         # default (the majority class), so these scores look good;
         # separating negative / neutral / positive is a much harder task.
         from pattern.db import Datasheet
         from pattern.metrics import test
         data = []
         for row in Datasheet.load(path):
             tweet, label = row[3], row[4]
             if label != "irrelevant":
                 data.append((tweet, label in ("positive", "neutral")))
         A, P, R, F = test(lambda t: en.positive(t, threshold=0.0), data)
         #print A, P, R, F
         self.assertTrue(A > 0.824)
         self.assertTrue(P > 0.879)
         self.assertTrue(R > 0.911)
         self.assertTrue(F > 0.895)
예제 #5
0
 def test_modality(self):
     """Check the sign of en.modality() and benchmark its accuracy on the
     CoNLL-2010 Shared Task 1 Wikipedia uncertainty data.
     """
     # Assert -1.0 => +1.0 representing the degree of certainty.
     v = en.modality(en.Sentence(en.parse("I wish it would stop raining.")))
     self.assertTrue(v < 0)
     v = en.modality(en.Sentence(en.parse("It will surely stop raining soon.")))
     self.assertTrue(v > 0)
     # Assert the accuracy of the modality algorithm.
     # Given are the scores for the CoNLL-2010 Shared Task 1 Wikipedia uncertainty data:
     # http://www.inf.u-szeged.hu/rgai/conll2010st/tasks.html#task1
     # The baseline should increase (not decrease) when the algorithm is modified.
     from pattern.db import Datasheet
     from pattern.metrics import test
     sentences = []
     for certain, sentence in Datasheet.load(os.path.join(PATH, "corpora", "uncertainty-conll2010.csv")):
         s = en.Sentence(en.parse(sentence, chunks=False, light=True))
         sentences.append((s, int(certain) > 0))
     A, P, R, F = test(lambda s: en.modality(s) > 0.5, sentences)
     #print A, P, R, F
     self.assertTrue(A > 0.69)
     self.assertTrue(P > 0.71)
     self.assertTrue(R > 0.64)
     self.assertTrue(F > 0.67)
     # Fixed: Python 2 print statement -> print() function (valid syntax on 2 and 3).
     print("pattern.en.modality()")
예제 #6
0
File: test_en.py — Project: teloon/pattern
 def test_sentiment(self):
     """Sanity-check en.sentiment() polarity signs and benchmark accuracy
     on Pang & Lee's polarity dataset v2.0.
     """
     # Assert < 0 for negative adjectives and > 0 for positive adjectives.
     self.assertTrue(en.sentiment("wonderful")[0] > 0)
     self.assertTrue(en.sentiment("horrible")[0] < 0)
     # sentiment() also accepts wordnet Synsets and parsed Text objects.
     self.assertTrue(en.sentiment(en.wordnet.synsets("horrible", pos="JJ")[0])[0] < 0)
     self.assertTrue(en.sentiment(en.Text(en.parse("A bad book. Really horrible.")))[0] < 0)
     # Assert the accuracy of the sentiment analysis.
     # Given are the scores for Pang & Lee's polarity dataset v2.0:
     # http://www.cs.cornell.edu/people/pabo/movie-review-data/
     # The baseline should increase (not decrease) when the algorithm is modified.
     from pattern.db import Datasheet
     from pattern.metrics import test
     reviews = [(review, int(score) > 0)
                for score, review in Datasheet.load(os.path.join("corpora", "pang&lee-polarity.txt"))]
     A, P, R, F = test(lambda review: en.positive(review), reviews)
     self.assertTrue(A > 0.71)
     self.assertTrue(P > 0.72)
     self.assertTrue(R > 0.70)
     self.assertTrue(F > 0.71)
     # Fixed: Python 2 print statement -> print() function (valid syntax on 2 and 3).
     print("pattern.en.sentiment()")
예제 #7
0
 def test_intertextuality(self):
     """Evaluate precision and recall of plagiarism detection via
     metrics.intertextuality() on the Clough & Stevenson corpus.
     """
     from pattern.db import Datasheet
     rows = Datasheet.load(os.path.join(PATH, "corpora", "plagiarism-clough&stevenson.csv"))
     samples = [((txt, src), int(label) > 0) for txt, src, label in rows]

     def is_plagiarism(txt, src):
         # Flag pairs whose trigram (n=3) intertextuality score exceeds 0.05.
         return metrics.intertextuality([txt, src], n=3)[0, 1] > 0.05

     A, P, R, F = metrics.test(lambda pair: is_plagiarism(*pair), samples)
     self.assertTrue(P > 0.96)
     self.assertTrue(R > 0.94)
     print("pattern.metrics.intertextuality()")
예제 #8
0
 def test_intertextuality(self):
     """Evaluate precision and recall of plagiarism detection with
     metrics.intertextuality() on the Clough & Stevenson corpus.
     """
     from pattern.db import Datasheet
     data = Datasheet.load(os.path.join(PATH, "corpora", "plagiarism-clough&stevenson.csv"))
     data = [((txt, src), int(plagiarism) > 0) for txt, src, plagiarism in data]

     def plagiarism(txt, src):
         # Flag pairs whose trigram (n=3) intertextuality score exceeds 0.05.
         return metrics.intertextuality([txt, src], n=3)[0, 1] > 0.05

     A, P, R, F = metrics.test(lambda x: plagiarism(*x), data)
     self.assertTrue(P > 0.96)
     self.assertTrue(R > 0.94)
     # Fixed: Python 2 print statement -> print() function (valid syntax on 2 and 3).
     print("pattern.metrics.intertextuality()")
예제 #9
0
File: test_nl.py — Project: aburan28/pattern
 def test_sentiment(self):
     """Sanity-check nl.sentiment() polarity signs and benchmark accuracy
     on 3,000 Dutch (bol.com) book reviews.
     """
     # Assert < 0 for negative adjectives and > 0 for positive adjectives.
     self.assertTrue(nl.sentiment("geweldig")[0] > 0)
     self.assertTrue(nl.sentiment("verschrikkelijk")[0] < 0)
     # Assert the accuracy of the sentiment analysis.
     # Given are the scores for 3,000 book reviews.
     # The baseline should increase (not decrease) when the algorithm is modified.
     from pattern.db import Datasheet
     from pattern.metrics import test
     reviews = [(review, int(score) > 0)
                for score, review in Datasheet.load(os.path.join(PATH, "corpora", "polarity-nl-bol.com.csv"))]
     A, P, R, F = test(lambda review: nl.positive(review), reviews)
     self.assertTrue(A > 0.80)
     self.assertTrue(P > 0.77)
     self.assertTrue(R > 0.85)
     self.assertTrue(F > 0.81)
     # Fixed: Python 2 print statement -> print() function (valid syntax on 2 and 3).
     print("pattern.nl.sentiment()")
예제 #10
0
 def test_sentiment(self):
     """Benchmark nl.sentiment() polarity and nl.positive() accuracy
     on 3,000 Dutch (bol.com) book reviews.
     """
     # Assert < 0 for negative adjectives and > 0 for positive adjectives.
     self.assertTrue(nl.sentiment("geweldig")[0] > 0)
     self.assertTrue(nl.sentiment("verschrikkelijk")[0] < 0)
     # Assert the accuracy of the sentiment analysis.
     # Given are the scores for 3,000 book reviews.
     # The baseline should increase (not decrease) when the algorithm is modified.
     from pattern.db import Datasheet
     from pattern.metrics import test
     corpus = os.path.join(PATH, "corpora", "polarity-nl-bol.com.csv")
     reviews = [(review, int(score) > 0) for score, review in Datasheet.load(corpus)]
     A, P, R, F = test(lambda review: nl.positive(review), reviews)
     self.assertTrue(A > 0.80)
     self.assertTrue(P > 0.77)
     self.assertTrue(R > 0.85)
     self.assertTrue(F > 0.81)
     # Fixed: Python 2 print statement -> print() function (valid syntax on 2 and 3).
     print("pattern.nl.sentiment()")
예제 #11
0
 def test_sentiment(self):
     """Sanity-check fr.sentiment() polarity signs and benchmark accuracy
     on 1,500 French (Amazon) book reviews.
     """
     # Assert < 0 for negative adjectives and > 0 for positive adjectives.
     self.assertTrue(fr.sentiment("fabuleux")[0] > 0)
     self.assertTrue(fr.sentiment("terrible")[0] < 0)
     # Assert the accuracy of the sentiment analysis.
     # Given are the scores for 1,500 book reviews.
     # The baseline should increase (not decrease) when the algorithm is modified.
     from pattern.db import Datasheet
     from pattern.metrics import test
     # NOTE: this corpus stores (review, score), the reverse column order
     # of the en/nl polarity corpora.
     reviews = [(review, int(score) > 0)
                for review, score in Datasheet.load(os.path.join(PATH, "corpora", "polarity-fr-amazon.csv"))]
     A, P, R, F = test(lambda review: fr.positive(review), reviews)
     self.assertTrue(A > 0.75)
     self.assertTrue(P > 0.76)
     self.assertTrue(R > 0.73)
     self.assertTrue(F > 0.75)
     # Fixed: Python 2 print statement -> print() function (valid syntax on 2 and 3).
     print("pattern.fr.sentiment()")
예제 #12
0
 def test_sentiment(self):
     """Benchmark fr.sentiment() polarity and fr.positive() accuracy
     on 1,500 French (Amazon) book reviews.
     """
     # Assert < 0 for negative adjectives and > 0 for positive adjectives.
     self.assertTrue(fr.sentiment("fabuleux")[0] > 0)
     self.assertTrue(fr.sentiment("terrible")[0] < 0)
     # Assert the accuracy of the sentiment analysis.
     # Given are the scores for 1,500 book reviews.
     # The baseline should increase (not decrease) when the algorithm is modified.
     from pattern.db import Datasheet
     from pattern.metrics import test
     corpus = os.path.join(PATH, "corpora", "polarity-fr-amazon.csv")
     # NOTE: this corpus stores (review, score), unlike the en/nl corpora.
     reviews = [(review, int(score) > 0) for review, score in Datasheet.load(corpus)]
     A, P, R, F = test(lambda review: fr.positive(review), reviews)
     self.assertTrue(A > 0.75)
     self.assertTrue(P > 0.76)
     self.assertTrue(R > 0.73)
     self.assertTrue(F > 0.75)
     # Fixed: Python 2 print statement -> print() function (valid syntax on 2 and 3).
     print("pattern.fr.sentiment()")
예제 #13
0
File: test_en.py — Project: teloon/pattern
 def test_sentiment(self):
     """Check en.sentiment() polarity on words, Synsets and parsed Text,
     then benchmark accuracy on Pang & Lee's polarity dataset v2.0.
     """
     # Assert < 0 for negative adjectives and > 0 for positive adjectives.
     self.assertTrue(en.sentiment("wonderful")[0] > 0)
     self.assertTrue(en.sentiment("horrible")[0] < 0)
     self.assertTrue(en.sentiment(en.wordnet.synsets("horrible", pos="JJ")[0])[0] < 0)
     self.assertTrue(en.sentiment(en.Text(en.parse("A bad book. Really horrible.")))[0] < 0)
     # Assert the accuracy of the sentiment analysis.
     # Given are the scores for Pang & Lee's polarity dataset v2.0:
     # http://www.cs.cornell.edu/people/pabo/movie-review-data/
     # The baseline should increase (not decrease) when the algorithm is modified.
     from pattern.db import Datasheet
     from pattern.metrics import test
     reviews = [(review, int(score) > 0)
                for score, review in Datasheet.load(os.path.join("corpora", "pang&lee-polarity.txt"))]
     A, P, R, F = test(lambda review: en.positive(review), reviews)
     self.assertTrue(A > 0.71)
     self.assertTrue(P > 0.72)
     self.assertTrue(R > 0.70)
     self.assertTrue(F > 0.71)
     # Fixed: Python 2 print statement -> print() function (valid syntax on 2 and 3).
     print("pattern.en.sentiment()")
예제 #14
0
        w = w.lower()
        w = w.strip(",.!?")
        if w in sentiment:
            score += sentiment[w]
            n += 1
    return score / (n or 1) > threshold


# Load the testing data.
data = Datasheet.load("books-fr.test.csv")
# NOTE(review): assumes Column.map() mutates column 1 in place
# ("True"/"False" strings -> booleans); the result is not reassigned.
data.columns[1].map(lambda v: v == "True")

# I quickly annotated the top 50 adjectives and got
# P 0.56 and R 0.78, which approximates the performance of the SVM.
# We can probably get better scores by annotating more adjectives.
# Fixed: Python 2 print statements -> print() calls (Python 3 compatible).
print(test(lambda review: positive(review), data))
print()

# We can also calculate kappa on the manual annotation scores.
# Kappa is a measurement of agreement or consensus.
# We want to know the general agreement of positive (+1) vs. negative (-1).
# If the agreement is low, that means the sentiment lexicon is biased,
# since the annotators did not agree on all scores.
scores = Datasheet.load("sentiment.csv - Sheet 1.csv", headers=True)
# 1) Cut off the first three columns.
scores = scores[:, 3:]
# 2) Remove empty fields (= annotator did not enter a score for this adjective).
scores = [[float(x) for x in row if x != ""] for row in scores]
# 3) Calculate the maximum number of different annotators.
n = max(len(row) for row in scores)
# 4) Keep only rows for which each annotator entered a score.
예제 #15
0
        w = w.lower()
        w = w.strip(",.!?")
        if w in sentiment:
            score += sentiment[w]
            n += 1
    return score / (n or 1) > threshold


# Load the testing data.
data = Datasheet.load("books-fr.test.csv")
# NOTE(review): assumes Column.map() mutates column 1 in place
# ("True"/"False" strings -> booleans); the result is not reassigned.
data.columns[1].map(lambda v: v == "True")

# I quickly annotated the top 50 adjectives and got
# P 0.56 and R 0.78, which approximates the performance of the SVM.
# We can probably get better scores by annotating more adjectives.
# Fixed: Python 2 print statements -> print() calls (Python 3 compatible).
print(test(lambda review: positive(review), data))
print()

# We can also calculate kappa on the manual annotation scores.
# Kappa is a measurement of agreement or consensus.
# We want to know the general agreement of positive (+1) vs. negative (-1).
# If the agreement is low, that means the sentiment lexicon is biased,
# since the annotators did not agree on all scores.
scores = Datasheet.load("sentiment.csv - Sheet 1.csv", headers=True)
# 1) Cut off the first three columns.
scores = scores[:, 3:]
# 2) Remove empty fields (= annotator did not enter a score for this adjective).
scores = [[float(x) for x in row if x != ""] for row in scores]
# 3) Calculate the maximum number of different annotators.
n = max(len(row) for row in scores)
# 4) Keep only rows for which each annotator entered a score.