Example #1
import logging

from polyglot.text import Text
from pattern.nl import sentiment
# Boukinator and DANEW are project-specific classifiers; their import paths
# follow Example #6 below
from scripts.DANEW import DANEW
from scripts.boukinator import Boukinator


def add_sentiments(data,
                   field='headline',
                   boukinator_path=None,
                   danew_path=None):
    """Add sentiments to data frame"""
    text = data[field]

    if boukinator_path:
        logging.info('adding Boukes et al')
        boukinator = Boukinator(boukinator_path)
        data['boukes'] = text.map(lambda t: boukinator.classify(t)['score'])

    logging.info('adding "recessie" classifier')
    # classify messages containing the word 'recessie' (EN: recession) as negative (-1)
    data['recessie'] = text.str.contains('recessie').map(float) * -1

    def polygloter(t):
        try:
            return Text(t, hint_language_code='NL').polarity
        except ZeroDivisionError:
            logging.warning("Polgyglot failed: Divide by zero")
            return 0

    logging.info('adding Polyglot')
    data['polyglot'] = text.map(polygloter)

    logging.info('adding Pattern')
    data['pattern'] = text.map(lambda t: sentiment(t)[0])

    if danew_path:
        logging.info('adding DANEW')
        danew = DANEW(danew_path)
        data['DANEW'] = text.map(lambda t: danew.classify(t)['score'])
Example #3
File: test_nl.py Project: aburan28/pattern
 def test_sentiment(self):
     # Assert < 0 for negative adjectives and > 0 for positive adjectives.
     self.assertTrue(nl.sentiment("geweldig")[0] > 0)
     self.assertTrue(nl.sentiment("verschrikkelijk")[0] < 0)
     # Assert the accuracy of the sentiment analysis.
     # Given are the scores for 3,000 book reviews.
     # The baseline should increase (not decrease) when the algorithm is modified.
     from pattern.db import Datasheet
     from pattern.metrics import test
     reviews = []
     for score, review in Datasheet.load(os.path.join(PATH, "corpora", "polarity-nl-bol.com.csv")):
         reviews.append((review, int(score) > 0))
     A, P, R, F = test(lambda review: nl.positive(review), reviews)
     self.assertTrue(A > 0.80)
     self.assertTrue(P > 0.77)
     self.assertTrue(R > 0.85)
     self.assertTrue(F > 0.81)
     print "pattern.nl.sentiment()"
Example #4
def FindOpinion(message, spellnl, nlnlp):
    tokenlist = wordtokenizer(message, nlnlp)
    tokenlist = [spellnl.correction(x) for x in tokenlist]
    message = ' '.join(tokenlist)
    sentimenttuple = sentiment(message)
    compoundsentiment = sentimenttuple[0]
    if compoundsentiment > 0.3:
        return 'Positive'
    elif compoundsentiment < -0.3:
        return 'Negative'
    else:
        return 'Neutral'
Example #5
def repr_stats(st):
    #print u"Input object: %s" % st.text
    print u"Nr of paragraphs: %i" % st.p_count
    print u"Nr of whitespaces: %i" % st.nr_of_whitespaces
    print u"Nr of alphanumeric characters: %i" % st.nr_of_alpanumeric
    print u"Tokenized words: %s" % \
    (u",".join(st.tokenized_text_extended[0:20]) + u'....')
    print u"Tokenized (clean)words: %s" % \
    (u",".join(st.tokenized_text_short[0:20]) + u'....')
    print "Tokenized (nostop)words: %s" % \
    (u",".join(st.tokenized_text_nostop[0:20]) + u'....')
    print u"Nr of total tokens: %i" % len(st.tokenized_text_extended)
    print u"Nr of total (clean)tokens: %i" % len(st.tokenized_text_short)
    print u"Nr of total (nostop)tokens: %i" % len(st.tokenized_text_nostop)
    from collections import Counter
    c = Counter(st.tokenized_text_nostop)
    print u"Top ten nostop tokens:"
    print u"".join(
        [u"\t" + str(i) + u" freq: %i\n" % j for i, j in c.most_common(10)])
    print u"Usefull to 'useless' tokens ratio: %2.2f%%" % st.usefull_tokens_ratio
    print u"Avg word length (based on nostop): %2.2f" % st.avg_wordlen
    print u"Whitespace to text ratio: %2.2f%%" % st.whitespace_to_text_ratio
    print u"Upcase-starting words ratio (based on nostop): %2.2f%%" % st.upcase_ratio
    print u"Noise to text ratio: %2.2f%%" % st.noise_to_text_ratio
    print u"Nr of sentences: %i" % st.nr_of_sentences
    print u"Ratio nr of (nostop)tokens in dutch dict: %2.2f%%" % st.dict_check

    polarity_total = 0
    subjectivity_total = 0
    total_sentences = 0
    for s in st.sentences:
        polarity, subjectivity = sentiment(s)
        if not polarity == 0:
            polarity_total += polarity
            subjectivity_total += subjectivity
            total_sentences += 1
    if total_sentences > 0:
        polarity_total = polarity_total / (total_sentences)
        subjectivity_total = subjectivity_total / (total_sentences)

    print "Avg sentiment (polarity/subjectivity): %2.2f, %2.2f" % \
    (polarity_total, subjectivity_total)

    print "Clusters (nostop)tokens:"
    from pprint import pprint
    pprint(cluster.get_clusters(st.sentences))
    print "Ngrams:"
    from nltk.util import ngrams
    pprint(list(ngrams(st.tokenized_text_nostop, 3))[:10])
    print "==========================================\n\n"
Example #6
def add_sentiments(data, field):
    from polyglot.text import Text
    from pattern.nl import sentiment
    from scripts.DANEW import DANEW
    from scripts.boukinator import Boukinator
    try:
        from resources.sentistrength.senti_client import multisent
    except ImportError:
        logger.warning("Cannot import sentistrenght, skipping!")
        multisent = None

    def polygloter(t):
        try:
            return Text(t, hint_language_code='NL').polarity
        except Exception:
            logger.exception("Polyglot failed")
            return 0

    if not '%s_polyglot' %field in data.columns:
        logger.info('adding Polyglot')
        data['%s_polyglot' %field] = data[field].map(polygloter)

    if not '%s_pattern' %field in data.columns:
        logger.info('adding Pattern')
        patterner = lambda t: sentiment(t)[0]
        data['%s_pattern' %field] = data[field].map(patterner)

    if not '%s_DANEW' %field in data.columns:
        logger.info('adding DANEW')
        danew = DANEW()
        danewer = lambda t: danew.classify(t)['score']
        data['%s_DANEW' %field] = data[field].map(danewer)

    if not '%s_boukes' %field in data.columns:
        logger.info('adding Boukes et al')
        boukinator = Boukinator()
        boukinatorer = lambda t: boukinator.classify(t)['score']
        data['%s_boukes' %field] = data[field].map(boukinatorer)

    if multisent and not '%s_sentistrength' %field in data.columns:
        logger.info('adding Sentistrength')
        sentistrength = multisent(language='NL')
        data['%s_sentistrength' %field] = [int(s['positive']) + int(s['negative']) for s in sentistrength.run_batch(data[field])]

    if not '%s_recessie' %field in data.columns:
        logger.info('adding "recessie" classifier')
        # classify messages containing the word 'recessie' (EN: recession) as negative (-1)
        data['%s_recessie' %field ] = data[field].str.contains('recessie').map(float)*-1

    return data
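Most of the classifiers wired into add_sentiments() (DANEW, Boukinator, SentiStrength) are project-specific resources. A minimal standalone sketch of the same column-adding pattern that needs only pandas and the pattern library; the DataFrame and the 'headline' field are illustrative:

import pandas as pd
from pattern.nl import sentiment

df = pd.DataFrame({'headline': ['Een onwijs spannend goed boek!',
                                'De recessie duurt voort.']})

field = 'headline'
if '%s_pattern' % field not in df.columns:
    df['%s_pattern' % field] = df[field].map(lambda t: sentiment(t)[0])
if '%s_recessie' % field not in df.columns:
    # messages containing the word 'recessie' (EN: recession) are scored -1
    df['%s_recessie' % field] = df[field].str.contains('recessie').map(float) * -1

print(df)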
Example #7
    def process(self, document_field, *args, **kwargs):
        '''Added sentiment based on Pattern'''
        try:
            language = kwargs['language']
        except KeyError:
            raise Exception(
                "Specify a language, for example language='nl'. We support nl, en, fr, and it"
            )

        if language == 'nl':
            try:
                from pattern.nl import sentiment
            except ImportError:
                raise Exception(
                    "Unavailable because you don't have the pattern library installed"
                )
        elif language == 'en':
            try:
                from pattern.en import sentiment
            except ImportError:
                raise Exception(
                    "Unavailable because you don't have the pattern library installed"
                )
        elif language == 'fr':
            try:
                from pattern.fr import sentiment
            except ImportError:
                raise Exception(
                    "Unavailable because you don't have the pattern library installed"
                )
        elif language == 'it':
            try:
                from pattern.it import sentiment
            except ImportError:
                raise Exception(
                    "Unavailable because you don't have the pattern library installed"
                )
        else:
            raise Exception(
                "Specify a language, for example language='nl'. We support nl, en, fr, and it"
            )

        sent = sentiment(document_field)
        return ({'polarity': sent[0], 'subjectivity': sent[1]})
Example #8
    def process(self, document_field, *args, **kwargs):
        """Added sentiment based on Pattern"""
        try:
            language = kwargs["language"]
        except KeyError:
            raise Exception(
                "Specify a language, for example language='nl'. We support nl, en, fr, and it"
            )

        if language == "nl":
            try:
                from pattern.nl import sentiment
            except ImportError:
                raise Exception(
                    "Unavailable because you don't have the pattern library installed"
                )
        elif language == "en":
            try:
                from pattern.en import sentiment
            except ImportError:
                raise Exception(
                    "Unavailable because you don't have the pattern library installed"
                )
        elif language == "fr":
            try:
                from pattern.fr import sentiment
            except ImportError:
                raise Exception(
                    "Unavailable because you don't have the pattern library installed"
                )
        elif language == "it":
            try:
                from pattern.it import sentiment
            except ImportError:
                raise Exception(
                    "Unavailable because you don't have the pattern library installed"
                )
        else:
            raise Exception(
                "Specify a language, for example language='nl'. We support nl, en, fr, and it"
            )

        sent = sentiment(document_field)
        return {"polarity": sent[0], "subjectivity": sent[1]}
Example #9
cursor.execute(query)

weeks = []

# Now let's process these rows
for (w, aanmeldingen, opmerking_array, titel_array) in cursor:
    pos, neg, neutraal = 0, 0, 0

    if opmerking_array is not None:
        opmerkingen = opmerking_array.split(",")
        titels = titel_array.split(",")
        titels.extend(["", "", "",
                       ""])  # there are some empty cells in the database
        for i in range(len(opmerkingen)):
            # concatenate the TITLE and the COMMENT and determine the sentiment of the result
            sentiment_analysis = sentiment(titels[i] + " " + opmerkingen[i])[0]
            if sentiment_analysis > 0.1:  # sentiment is positive
                pos += 1
            elif sentiment_analysis < -0.1:  # sentiment is negative
                neg += 1
            else:
                neutraal += 1  # It is VERY hard to determine sentiment in Dutch, so I also added neutral as a possible outcome
    else:
        opmerkingen = []

    weeks.append([w, aanmeldingen, len(opmerkingen), pos, neg, neutraal])

# Table
print("WEEK, \t AANME \t NIEUWS\t POS \t NEG \t NEUTRAAL")
for week in weeks:
    print "{0} \t {1} \t {2} \t {3} \t {4} \t {5}".format(
Example #10
    def get_sentiment(text):
        # use a separate name for the result dict: calling it 'sentiment' would
        # shadow pattern's sentiment() and raise UnboundLocalError on the call below
        scores = {
            "pattern"  : sentiment(text)[0],
            "polyglot" : Text(text, hint_language_code='nl').polarity
        }
        return scores
Example #11
# Import the sentiment analyse module from the pattern module
from pattern.nl import sentiment

neutralText = "Het is een vrij rustige dag."
positiveText = "Het product is mij zeer goed bevallen."
# negativeText = "Het product voldeed niet aan mijn verwachtingen."
# negativeText = "Je reageert niet op een manier die mij gelukkig stemt"
negativeText = "Diepbedroeft over de zeer gebrekkige service van deze onervaren werknemer, het ontslag van deze persoon zou mij zeer vrolijk maken"
# negativeText = "Ik ben niet te spreken over de service. Ik verwacht dan ook als compensatie mijn geld terug. Dit is stom"

contradictingText = "Dit is een voorbeeld van een overwegend positieve text over een negatief verhaal"

# Classify the text. The function returns 2 values.
# sentiment(text) returns (polarity, subjectivity).
sentimentAnalyse = sentiment(negativeText)

print sentimentAnalyse

# A text is quickly classified as negative. Because of that, we need to
# use a threshold to classify the text as positive, negative, or neutral.
# predictedSentiment = 'positive#' if sentimentAnalyse[0] > 0 else 'negative'
predictedSentiment = 'neutral'
if sentimentAnalyse[0] > 0.4:
    predictedSentiment = 'positive'
elif sentimentAnalyse[0] < -0.2:
    predictedSentiment = 'negative'

predictionSummary = "The provided text is classified as " + str(
    predictedSentiment) + ' which a score of ' + str(sentimentAnalyse[0])
Example #12
import codecs
import re
from collections import defaultdict

import numpy as np
from pattern.nl import sentiment

# note: 'args' (used below for the input file) comes from an argparse setup not shown in this snippet


def sentimentScore(text):
    return (sentiment(text)[0])

party_sentiment = defaultdict(lambda : defaultdict(lambda : defaultdict(list)))

infile = codecs.open(args.i,"r","utf-8")
for line in infile.readlines():
    tokens = line.strip().split("\t")
    try:
        day = tokens[0]
        party = tokens[1]
        text = tokens[2]
        senti = sentiment(text)
        party_sentiment[party][day]["reg"].append(senti)
        if not re.search(' rt ',text,re.IGNORECASE) and not re.search('^rt',text,re.IGNORECASE):
            # print text
            party_sentiment[party][day]["rt"].append(senti)            

    except IndexError:
        continue

for party in party_sentiment.keys():
    for day in sorted(party_sentiment[party].keys()):
        sentiments = party_sentiment[party][day]["reg"]
        sentiments_filt = party_sentiment[party][day]["rt"]
        # print len(sentiments),len(sentiments_filt)
        # print sentiments
        print party, day, round(sum([ float(i[0]) for i in sentiments ])/float(len(sentiments)),2), round(np.std(sentiments),2), round(sum([ float(i[0]) for i in sentiments_filt ])/float(len(sentiments_filt)),2), round(np.std(sentiments_filt),2)
Example #14
from pattern.nl import sentiment

print(sentiment('Een onwijs spannend goed boek!'))
Example #15
def sentimentScore(text, debug=False):
    return (sentiment(text)[0])
Example #16
 def get_polarity(self):
     lemmas = [t.lemma for t in self.tokens.elements()]
     polarity, subjectivity = sentiment(" ".join(lemmas))
     return polarity
Example #17
def process_details(prod, params, force_refresh=False, cache_time=CACHE_TIME):
    tweets = cache(tweety.get_keyword,
                   prod,
                   force_refresh=force_refresh,
                   cache_time=cache_time,
                   **params)

    tweetList = []
    unique_tweets = {}
    interaction_tweets = []
    retweets = {}
    imagesList = []
    URLList = []
    word_cloud_dict = Counter()
    tsDict = Counter()
    mapLocations = []
    spam_list = []
    image_tweet_id = {}
    nodes = {}
    edges = []

    for i in range(len(tweets)):
        tw = tweets[i]
        tweet = tw["tweet"]
        lemmas = [t["lemma"] for t in tw["tokens"]]
        texts = [t["text"].lower() for t in tw["tokens"]]  # unlemmatized words
        words = list(set(lemmas + texts))  # to check for obscene words

        dt = datetime.strptime(tweet["created_at"],
                               "%a %b %d %H:%M:%S +0000 %Y")
        tsDict.update([(dt.year, dt.month, dt.day, dt.hour)])
        tweets[i]["tweet"]["datetime"] = datetime(
            dt.year, dt.month, dt.day,
            dt.hour)  # round to hour for peak detection

        # check for spam
        if any(obscene_words.get(t) for t in words):
            spam_list.append(tweet["id_str"])
            continue

        tweetList.append(tweet["id_str"])
        word_cloud_dict.update(lemmas)

        text = " ".join(texts)
        if text not in unique_tweets:
            unique_tweets[text] = tweet["id_str"]

        # track retweets and their retweet counts
        if "retweeted_status" in tweet:
            rt = tweet["retweeted_status"]
            id_str = rt["id_str"]
            retweet_count = rt["retweet_count"]
            if id_str not in retweets or retweet_count > retweets[id_str]:
                retweets[id_str] = retweet_count

        user_id_str = tweet["user"]["id_str"]
        if "retweeted_status" in tweet:
            rt_user_id_str = tweet["retweeted_status"]["user"]["id_str"]

            if rt_user_id_str not in nodes:
                nodes[rt_user_id_str] = tweet["retweeted_status"]["user"][
                    "screen_name"]
            if user_id_str not in nodes:
                nodes[user_id_str] = tweet["user"]["screen_name"]

            edges.append({
                "source": rt_user_id_str,
                "target": user_id_str,
                "value": "retweet"
            })

        if "user_mentions" in tweet["entities"]:
            if tweet["entities"]["user_mentions"]:
                interaction_tweets.append(tweet["id_str"])

            for obj in tweet["entities"]["user_mentions"]:
                if obj["id_str"] not in nodes:
                    nodes[obj["id_str"]] = obj["screen_name"]
                if user_id_str not in nodes:
                    nodes[user_id_str] = tweet["user"]["screen_name"]

                edges.append({
                    "source": user_id_str,
                    "target": obj["id_str"],
                    "value": "mention"
                })

        if tweet["in_reply_to_user_id_str"]:
            interaction_tweets.append(tweet["id_str"])

            if tweet["in_reply_to_user_id_str"] not in nodes:
                nodes[tweet["in_reply_to_user_id_str"]] = tweet[
                    "in_reply_to_screen_name"]
            if user_id_str not in nodes:
                nodes[user_id_str] = tweet["user"]["screen_name"]

            edges.append({
                "source": user_id_str,
                "target": tweet["in_reply_to_user_id_str"],
                "value": "reply"
            })

        try:
            for obj in tweet["entities"]["media"]:
                image_url = obj["media_url_https"]
                image_tweet_id[image_url] = tweet["id_str"]
                imagesList.append(image_url)
        except KeyError:
            pass

        try:
            for obj in tweet["entities"]["urls"]:
                url = obj["expanded_url"]
                if url is not None:
                    URLList.append(url)
        except KeyError:
            pass

        try:
            if tweet["coordinates"] is not None:
                if tweet["coordinates"]["type"] == "Point":
                    coords = tweet["coordinates"]["coordinates"]
                    mapLocations.append({"lng": coords[0], "lat": coords[1]})
        except KeyError:
            pass

    mark_as_spam.apply_async((spam_list, ), queue="web")

    def is_stop_word(token):
        t = token.lower()
        return (len(t) <= 1) or (t.startswith("https://") or
                                 t.startswith("http://")) or (t in stop_words)

    word_cloud = []
    for (token, count) in word_cloud_dict.most_common():
        if not is_stop_word(token):
            word_cloud.append({"text": token, "count": count})

    # sentiment analysis on wordcloud
    polarity, subjectivity = sentiment(" ".join(word_cloud_dict.elements()))

    ts = []
    try:
        tsStart = sorted(tsDict)[0]
        tsEnd = sorted(tsDict)[-1]
        temp = datetime(tsStart[0], tsStart[1], tsStart[2], tsStart[3], 0, 0)
        while temp <= datetime(tsEnd[0], tsEnd[1], tsEnd[2], tsEnd[3], 0, 0):
            if (temp.year, temp.month, temp.day, temp.hour) in tsDict:
                ts.append({
                    "year":
                    temp.year,
                    "month":
                    temp.month,
                    "day":
                    temp.day,
                    "hour":
                    temp.hour,
                    "count":
                    tsDict[(temp.year, temp.month, temp.day, temp.hour)]
                })
            else:
                ts.append({
                    "year": temp.year,
                    "month": temp.month,
                    "day": temp.day,
                    "hour": temp.hour,
                    "count": 0
                })

            temp += timedelta(hours=1)
    except IndexError:  # when there are 0 tweets
        pass

    # peak detection on time series
    y = np.array([t["count"] for t in ts])
    # returns a list with the indexes of the peaks in ts
    peaks = peakutils.indexes(y, thres=0.6, min_dist=1).tolist()

    # peak explanation: the most used words in tweets in the peak
    # the peak indices are sorted in ascending order
    if peaks:
        peak_index = 0
        new_peak = True
        peak_data = {}
        for tw in tweets:
            tweet = tw["tweet"]
            if new_peak:
                p = ts[peaks[peak_index]]
                dt = datetime(p["year"], p["month"], p["day"], p["hour"])
                peak_data[peak_index] = Counter()
                new_peak = False
            if tweet["datetime"] < dt:
                continue
            elif tweet["datetime"] == dt:
                lemmas = [token["lemma"] for token in tw["tokens"]]
                peak_data[peak_index].update(lemmas)
            else:
                new_peak = True
                peak_index += 1
                if peak_index == len(peaks):
                    break

        peaks = [(p, ", ".join(
            islice(
                filter(lambda x: not is_stop_word(x),
                       map(lambda x: x[0], peak_data[i].most_common())), 7)))
                 for (i, p) in enumerate(peaks)]

    lng = 0
    lat = 0
    if mapLocations:
        for loc in mapLocations:
            lng += loc["lng"]
            lat += loc["lat"]
        avLoc = {
            "lng": lng / len(mapLocations),
            "lat": lat / len(mapLocations)
        }
    else:
        avLoc = {"lng": 5, "lat": 52}

    images = []
    nsfw_list = []
    for (url, count) in Counter(imagesList).most_common():
        if len(images) >= 16:
            break
        nsfw_prob, status = get_nsfw_prob(url)
        if status == 200 and nsfw_prob > 0.8:
            nsfw_list.append(image_tweet_id[url])
        elif status == 200:
            images.append({"link": url, "occ": count})
    mark_as_spam.apply_async((nsfw_list, ), queue="web")

    urls = []
    for (url, count) in Counter(URLList).most_common():
        urls.append({"link": url, "occ": count})

    # limit number of nodes/edges
    edges = random.sample(edges, min(len(edges), 250))
    connected_nodes = set([e["source"]
                           for e in edges] + [e["target"] for e in edges])

    graph = {"nodes": [], "edges": []}
    for node in connected_nodes:
        graph["nodes"].append({"id": nodes[node]})

    for edge in edges:
        source = edge["source"]
        target = edge["target"]
        graph["edges"].append({
            "source": nodes[source],
            "target": nodes[target],
            "value": edge["value"]
        })

    unique_ids = list(unique_tweets.values())

    # retweet ids sorted from most to least tweeted
    if retweets:
        retweet_ids, _ = zip(
            *sorted(filter(lambda x: x[1] > 0, retweets.items()),
                    key=lambda x: x[1],
                    reverse=True))
    else:
        retweet_ids = []

    start = datetime.strptime(params["start"], time_format)
    end = datetime.strptime(params["end"], time_format)

    items = newsdb.find(
        {
            "keywords": prod,
            "pubdate": {
                "$gte": start,
                "$lt": end
            }
        },
        projection={
            "title": True,
            "pubdate": True,
            "description": True,
            "flag": True,
            "source": True,
            "link": True,
            "nid": True,
            "_id": False
        })
    news = sorted([it for it in items],
                  key=lambda x: x["pubdate"],
                  reverse=True)

    data = {
        "tweets": unique_ids,
        "retweets": retweet_ids,
        "interaction_tweets": interaction_tweets,
        "num_tweets": len(tweetList),
        "timeSeries": ts,
        "peaks": peaks,
        "URLs": urls,
        "photos": images,
        "tagCloud": word_cloud,
        "locations": mapLocations,
        "centerloc": avLoc,
        "graph": graph,
        "news": news,
        "polarity": polarity
    }
    return data
Example #18
def determine_sentiment(body):
	""" This function determines and returns the average sentiment of sentences in the provided input string.
	It uses the pattern module to do so. Sentiment values may range from -1 to 1. """
	return np.mean([sentiment(sentence)[0] for sentence in sent_tokenize(body)])
Example #19
def determine_subjectivity(body):
	""" This function determines and returns the average subjectivity of sentences in the provided input string.
	It uses the pattern module to do so. Subjectivity values may range from 0 to 1. """
	return np.mean([sentiment(sentence)[1] for sentence in sent_tokenize(body)]) 
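determine_sentiment() and determine_subjectivity() each tokenize and score the text separately. A sketch that computes both averages in one pass, assuming numpy, nltk (with its punkt data) and pattern are installed; the sample text is illustrative:

import numpy as np
from nltk.tokenize import sent_tokenize
from pattern.nl import sentiment

body = "Het product is mij zeer goed bevallen. De service was echter traag."
scores = [sentiment(s) for s in sent_tokenize(body)]
avg_polarity = np.mean([polarity for polarity, _ in scores])
avg_subjectivity = np.mean([subjectivity for _, subjectivity in scores])
print(avg_polarity, avg_subjectivity)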
Example #20
import mysql.connector
from pattern.nl import sentiment

xstr = lambda s: s or ""

conn = mysql.connector.connect(user='******', password='******', host='____', database='____', port=12345)
conn2 = mysql.connector.connect(user='******', password='******', host='____', database='____', port=12345)

cursor = conn.cursor()
cursor2 = conn2.cursor()

query = ("SELECT id, titel, tekst  FROM ponydb.nieuws;")
cursor.execute(query)

# let's create the classes
for (id, titel, tekst) in cursor:
    # sentiment(text) returns (polarity, subjectivity).
    sentimentAnalyse = sentiment(str(xstr(titel).encode('ascii', 'ignore')) +" "+ str(xstr(tekst).encode('ascii', 'ignore')))
    # print [titel, tekst, sentimentAnalyse[0], sentimentAnalyse[1]]
    query2 = ("UPDATE nieuws SET sentiment={1}, subjectiviteit={2} WHERE id={0}".format(id, sentimentAnalyse[0], sentimentAnalyse[1]))
    cursor2.execute(query2)


conn2.commit()

# close connections
cursor2.close(); conn2.close()
cursor.close(); conn.close()


Example #21
File: datamanager.py Project: uvacw/infra
def insert_lexisnexis(pathwithlnfiles, recursive):
    """
    Usage: insert_lexisnexis(pathwithlnfiles,recursive)
    pathwithlnfiles = path to a directory where lexis nexis output is stored
    recursive: TRUE = search recursively all subdirectories, but include only files ending on .txt
               FALSE = take ALL files from directory supplied, but do not include subdirectories
    """
    tekst = {}
    byline = {}
    section = {}
    length = {}
    loaddate = {}
    language = {}
    pubtype = {}
    journal = {}
    journal2={}
    pubdate_day = {}
    pubdate_month = {}
    pubdate_year = {}
    pubdate_dayofweek = {}

    if recursive:
        alleinputbestanden = []
        for path, subFolders, files in walk(pathwithlnfiles):
            for f in files:
                if isfile(join(path, f)) and splitext(f)[1].lower() == ".txt":
                    alleinputbestanden.append(join(path, f))
    else:
        # print  listdir(pathwithlnfiles)
        alleinputbestanden = [join(pathwithlnfiles, f) for f in listdir(pathwithlnfiles) if
                              isfile(join(pathwithlnfiles, f)) and splitext(f)[1].lower() == ".txt"]
        print alleinputbestanden
    artikel = 0
    for bestand in alleinputbestanden:
        print "Now processing", bestand
        with open(bestand, "r", encoding="utf-8", errors="replace") as f:
            i = 0
            for line in f:
                i = i + 1
                # print "Regel",i,": ", line
                line = line.replace("\r", " ")
                if line == "\n":
                    continue
                matchObj = re.match(r"\s+(\d+) of (\d+) DOCUMENTS", line)
                matchObj2 = re.match(r"\s+(\d{1,2}) (januari|februari|maart|april|mei|juni|juli|augustus|september|oktober|november|december) (\d{4}) (maandag|dinsdag|woensdag|donderdag|vrijdag|zaterdag|zondag)", line)
                matchObj3 = re.match(r"\s+(January|February|March|April|May|June|July|August|September|October|November|December) (\d{1,2}), (\d{4})", line)
                matchObj4 = re.match(r"\s+(\d{1,2}) (January|February|March|April|May|June|July|August|September|October|November|December) (\d{4}) (Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)", line)
                if matchObj:
                    artikel += 1
                    tekst[artikel] = ""
                    while True:
                        nextline=f.next()
                        if nextline.strip()!="":
                            journal2[artikel]=nextline.strip()
                            break
                    continue
                if line.startswith("BYLINE"):
                    byline[artikel] = line.replace("BYLINE: ", "").rstrip("\n")
                elif line.startswith("SECTION"):
                    section[artikel] = line.replace("SECTION: ", "").rstrip("\n")
                elif line.startswith("LENGTH"):
                    length[artikel] = line.replace("LENGTH: ", "").rstrip("\n").rstrip(" woorden")
                elif line.startswith("LOAD-DATE"):
                    loaddate[artikel] = line.replace("LOAD-DATE: ", "").rstrip("\n")
                elif matchObj2:
                    # print matchObj2.string
                    pubdate_day[artikel]=matchObj2.group(1)
                    pubdate_month[artikel]=str(MAAND[matchObj2.group(2)])
                    pubdate_year[artikel]=matchObj2.group(3)
                    pubdate_dayofweek[artikel]=matchObj2.group(4)
                elif matchObj3:
                    pubdate_day[artikel]=matchObj3.group(2)
                    pubdate_month[artikel]=str(MAAND[matchObj3.group(1)])
                    pubdate_year[artikel]=matchObj3.group(3)
                    pubdate_dayofweek[artikel]="NA"
                elif matchObj4:
                    pubdate_day[artikel]=matchObj4.group(1)
                    pubdate_month[artikel]=str(MAAND[matchObj4.group(2)])
                    pubdate_year[artikel]=matchObj4.group(3)
                    pubdate_dayofweek[artikel]=matchObj4.group(4)
                elif line.startswith("LANGUAGE"):
                    language[artikel] = line.replace("LANGUAGE: ", "").rstrip("\n")
                elif line.startswith("PUBLICATION-TYPE"):
                    pubtype[artikel] = line.replace("PUBLICATION-TYPE: ", "").rstrip("\n")
                elif line.startswith("JOURNAL-CODE"):
                    journal[artikel] = line.replace("JOURNAL-CODE: ", "").rstrip("\n")
                elif line.lstrip().startswith("Copyright ") or line.lstrip().startswith("All Rights Reserved"):
                    pass
                elif line.lstrip().startswith("AD/Algemeen Dagblad") or line.lstrip().startswith(
                        "De Telegraaf") or line.lstrip().startswith("Trouw") or line.lstrip().startswith(
                        "de Volkskrant") or line.lstrip().startswith("NRC Handelsblad") or line.lstrip().startswith(
                        "Metro") or line.lstrip().startswith("Spits"):
                    pass
                else:
                    tekst[artikel] = tekst[artikel] + " " + line.rstrip("\n")
    print "Done!", artikel, "articles added."

    if not len(journal) == len(journal2) == len(loaddate) == len(section) == len(language) == len(byline) == len(length) == len(tekst) == len(pubdate_year) == len(pubdate_dayofweek) ==len(pubdate_day) ==len(pubdate_month):
        print "!!!!!!!!!!!!!!!!!!!!!!!!!"
        print "Ooooops! Not all articles seem to have data for each field. These are the numbers of fields that where correctly coded (and, of course, they should be equal to the number of articles, which they aren't in all cases."
        print "journal", len(journal)
        print "journal2", len(journal2)
        print "loaddate", len(loaddate)
        print "pubdate_day",len(pubdate_day)
        print "pubdate_month",len(pubdate_month)
        print "pubdate_year",len(pubdate_year)
        print "pubdate_dayofweek",len(pubdate_dayofweek)
        print "section", len(section)
        print "language", len(language)
        print "byline", len(byline)
        print "length", len(length)
        print "tekst", len(tekst)
        print "!!!!!!!!!!!!!!!!!!!!!!!!!"
        print
        print "Anyhow, we're gonna proceed and set those invalid fields to 'NA'. However, you should be aware of this when analyzing your data!"


    else:
        print "No missing values encountered."

    suspicious=0
    for i in range(artikel):
        try:
            art_source = journal[i + 1]
        except:
            art_source = "NA"
        try:
            art_source2 = journal2[i + 1]
        except:
            art_source2 = "NA"

        try:
            art_loaddate = loaddate[i + 1]
        except:
            art_loaddate = "NA"
        try:
            art_pubdate_day = pubdate_day[i + 1]
        except:
            art_pubdate_day = "NA"
        try:
            art_pubdate_month = pubdate_month[i + 1]
        except:
            art_pubdate_month = "NA"
        try:
            art_pubdate_year = pubdate_year[i + 1]
        except:
            art_pubdate_year = "NA"
        try:
            art_pubdate_dayofweek = pubdate_dayofweek[i + 1]
        except:
            art_pubdate_dayofweek = "NA"
        try:
            art_section = section[i + 1]
        except:
            art_section = "NA"
        try:
            art_language = language[i + 1]
        except:
            art_language = "NA"
        try:
            art_length = length[i + 1]
        except:
            art_length = "NA"
        try:
            art_text = tekst[i + 1]
        except:
            art_text = "NA"
        try:
            tone=sentiment(art_text)
            art_polarity=str(tone[0])
            art_subjectivity=str(tone[1])
        except:
            art_polarity="NA"
            art_subjectivity="NA"
        try:
            art_byline = byline[i + 1]
        except:
            art_byline = "NA"

        # here, we are going to add an extra field for texts that probably are not "real" articles
        # first criterion: stock exchange notations and similar lists:
        ii=0
        jj=0
        for token in art_text.replace(",","").replace(".","").split():
            ii+=1
            if token.isdigit():
                jj+=1
        # if more than 16% of the tokens are numbers, then suspicious = True.
        art_suspicious = jj > .16 * ii
        if art_suspicious: suspicious+=1

        art = {"source": art_source.lower(), 'source2': art_source2.lower(), "loaddate": art_loaddate, "pubdate_day":art_pubdate_day, "pubdate_month":art_pubdate_month, "pubdate_year":art_pubdate_year, "pubdate_dayofweek":art_pubdate_dayofweek, "section": art_section.lower(),
               "language": art_language.lower(), 'length': art_length, "text": art_text, "byline": art_byline,
               "from-database": "lexisnexis", "suspicious":art_suspicious,"polarity":art_polarity,"subjectivity":art_subjectivity}
        article_id = collection.insert(art)
    print '\nInserted',len(tekst),"articles, out of which",suspicious,"might not be real articles, but, e.g., lists of stock shares. "
Example #22
# -*- coding: utf-8 -*-
# Some of the strings that get scraped require utf-8

# pip install pattern
from pattern.nl import sentiment

neutralText = "Het is een vrij rustige dag."
positiveText = "Het product is mij zeer goed bevallen."
negativeText = "Diepbedroeft over de zeer gebrekkige service van deze onervaren werknemer."

text = "De Indiase diplomate die het middelpunt vormde van een hoogoplopende ruzie tussen de VS en India, heeft de VS verlaten. De Amerikaanse autoriteiten vroegen Devyani Khobragade, plaatsvervangend consul in New York, te vertrekken nadat ze officieel in staat van beschuldiging was gesteld wegens visumfraude. Het Indiase ministerie van Buitenlandse Zaken maakte kort daarop bekend dat de diplomate wordt overgeplaatst naar het departement in New Delhi. OnderbetaaldEen rechtbank in New York oordeelde gisteren dat Khobragade valse informatie had gegeven bij de aanvraag van een visum voor haar Indiase huishoudelijke hulp. Ook zou ze hebben verdoezeld dat deze hulp minder betaald kreeg dan het Amerikaanse wettelijk minimumloon. Khobragade's arrestatie en tijdelijke opsluiting, vorige week, wekten grote verontwaardiging in India. Boze Indiërs eisten een boycot van Amerikaanse producten in India."

# Classify the text. The function returns 2 values.
# sentiment(text) returns (polarity, subjectivity).
sentimentAnalyse = sentiment(" 10 Verschillen")

print sentimentAnalyse

# A text is quickly classified as negative. Because of that, we need to
# use a threshold to classify the text as positive, negative, or neutral.
# predictedSentiment = 'positive#' if sentimentAnalyse[0] > 0 else 'negative'
predictedSentiment = 'neutral'
if sentimentAnalyse[0] > 0.4:
    predictedSentiment = 'positive'
elif sentimentAnalyse[0] < -0.2:
    predictedSentiment = 'negative'

predictionSummary = "The provided text is classified as " + str(predictedSentiment) + ' which a score of ' + str(sentimentAnalyse[0])

print predictionSummary
Example #23
def getFeatureValues(indexDictKeywords, indexDictWords, indexDictUser, indexDictDateTweet, indexDictDateEvent, maxList, perDict, indexDictTypes, otherFeatures, missing_value):
	finalList = []
	featureList = maxList * [0]			## create list with zeros
	categories = ['Sport','Politiek','Uitzending','Publieksevenement','Software','Bijzondere dag','Sociale actie','Celebrity nieuws','Reclame','Overig']
	total_db_list = []
	
	anchors = buildAnchorHash() ## get the anchors for DBpedia extraction
	
	## add the actual feature values
	
	for idx,line in enumerate(data):	
		if idx % 100 == 0:
			print idx, 'of', len(data)
			
		splitLine = line.split('\t')
		## check if we skip the event (annotated as non-event, only possible for labeled data)
	
		if labeled_data:
			if splitLine[8] == 'NA':
				skip_event = True
			else:
				skip_event = False	
		else:
			splitLine.insert(0,'ID')	## unlabeled data does not have an ID, add something irrelevant
			skip_event = False
		
		if not skip_event:			## check if it actually was an event (and not a non-event)
			featureList = maxList * [0]	
			
			## get all event information
			
			dateEvent, eventScore, dateEventString, keywords, keywordsFixed, keywordScores, allTweets = get_event_information(splitLine, categories)
			
			## add 1 (positive) at the right place in the feature-file using the dictionary
			
			featureList[0] = eventScore									## first feature is event score
			featureList = add_to_feature_dict(indexDictDateEvent, dateEventString, featureList)	
	
			for keyword in keywordsFixed:
				keyword = unicode(keyword, 'utf-8')
				featureList = add_to_feature_dict(indexDictKeywords, keyword, featureList)	
				
			allTweetsText = ''
			
			## loop over all tweets to obtain the right values for the tweet-features
			
			for tweet in allTweets:
				splitTweet = tweet.strip().split(',')
				if len(splitTweet) > 2:
					user = splitTweet[0].strip()				## add user information
					dateTweetString = splitTweet[1].strip()
					
					if len(user) < 16 and '-' in dateTweetString:
						neededTweet = ",".join(splitTweet[2:]).split()
						finalTweet = [x.strip() for x in neededTweet if len(x) > 1 or x.isalpha() or x.isdigit()]	## delete everything that is only 1 character and non-letter/digit
						allTweetsText += ' ' + (" ".join(finalTweet))
						keywordsInTweet = 0
						
						for word in finalTweet:
							add_word = unicode(word, 'utf-8')
							featureList = add_to_feature_dict(indexDictWords, add_word, featureList)	## add information about the word (note: can be binary or total number of occurrences, right now it is total)
							if word in keywordsFixed:
								keywordsInTweet += 1				
						
						featureList[1] += keywordsInTweet			## add how often a keyword occured
						
						featureList[2] += len(finalTweet)			## keep track of total number of words as a feature
						
						user = unicode(user, 'utf-8')
						featureList = add_to_feature_dict(indexDictUser, user, featureList)	 ## add username as same way as bag-of-words
						
						## add date information to the featurelist
						
						featureList = add_to_feature_dict(indexDictDateTweet, dateTweetString, featureList)
						dateTweet = datetime.datetime.strptime(splitTweet[1].strip(),"%Y-%m-%d")
						beforeAfter, diff, diffTotal, absDiffTotal = getDateInformation(dateTweet, dateEvent)
						
			totalTweets = len(allTweets)
			featureList[3] = round(float(featureList[2]) / float(totalTweets),1)								## add average number of words per tweet
			featureList = addDateInformation(featureList, beforeAfter, diffTotal, absDiffTotal, totalTweets) 	## add all information regarding dates

			senti = sentiment(allTweetsText)						## add sentiment and subjective information
			featureList[12] = senti[0] + 1							## add +1 due to some classifiers not able to handle negative numbers (polarity)
			featureList[13] = senti[1]								## subjectivity
			
			featureList[14] = len(allTweets)						## add number of tweets
			
			per = getPeriodicityFeatures(keywordsFixed, keywordScores, dateEvent, perDict, missing_value)		## periodicity features
			for x in range(0, len(per)):		## add the features
				featureList[x+15] = per[x]
			
			featureList = getDbpediaFeatures(keywords, indexDictTypes, featureList, anchors)				## DBpedia features
			
			if labeled_data:
				featureList.append(int(splitLine[8]) -1)				## add the label as number
			
			finalList.append(featureList)								## keep track of the final featureList over all events
			
	return finalList