Example #1
 def translate_de(self):
     # avoid shadowing the built-in str
     text = ''
     for row in self.feedbackcomment:
         blob = TextBlobDE(row)
         if blob.detect_language() == "en":
             blob_en = blob
         else:
             blob_en = blob.translate(to="en")
         text = text + blob_en.string
     return text
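A usage note on this snippet: detect_language() and translate() call the Google Translate web endpoint and were deprecated and later removed from textblob, so the method only works with older textblob/textblob-de releases. A minimal sketch of the same translation step on a single comment (requires network access):

from textblob_de import TextBlobDE

blob = TextBlobDE("Das Essen war hervorragend.")
print(blob.translate(to="en"))  # remote Google Translate call; older textblob releases only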
Example #2
def data_prep_to_predict(eintrag, freiab, freibis, mitgliedseit, miete, groesse, area, text):
	
	keywords = ['möbliert','unmöbliert','bitte','leider','Skype','besichtigung','xx',':\)']
	# features to prepare:
	extra_sentiments = ['polarity_de', 'polarity_de_min', 'polarity_de_max', 'polarity_de_median']
	column_list = ['miete_delta','groesse','days_to_freiab','days_to_rent',\
	'popular_area','new_user'] + keywords + extra_sentiments

	# popular_area and miete_delta
	areas = ['kreuzberg', 'wedding','neukoelln','charlottenburg','mitte','friedrichshain','prenzlauerberg','moabit']
	popular_area = 0
	miete_delta = miete-436
	if clean_text(area) in areas:
		popular_area = 1
		miete_delta = miete-470
		
	#days_to_freiab
	days_to_freiab = abs((freiab - eintrag).days)
	
	#days_to_rent
	days_to_rent = abs((freibis - freiab).days)
	
	#polarity_de
	tb_obj = TextBlobDE(text)
	polarity_de = tb_obj.polarity
	sentences_polarity_de=[]
	for sentence in tb_obj.sentences:
		sentences_polarity_de.append(TextBlobDE(str(sentence)).polarity)
	polarity_de_median = np.median(sentences_polarity_de)
	polarity_de_min = np.min(sentences_polarity_de)
	polarity_de_max = np.max(sentences_polarity_de)
	sentiment_features = [polarity_de,polarity_de_min,polarity_de_max,polarity_de_median]
    
	#new_user
	new_user = 0
	if abs((eintrag - mitgliedseit).days) < 30:
		new_user = 1
	#keyword features
	keyword_features = []
	for word in keywords:
		if word in text:
			keyword_features.append(1)
		else:
			keyword_features.append(0)
			
	feature_list = [miete_delta, groesse, days_to_freiab, days_to_rent, \
	popular_area, new_user] + keyword_features + sentiment_features
	
	features = pd.DataFrame(feature_list).T
	features.columns = column_list
	
	return features
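A hedged usage sketch for the feature builder above; it assumes the surrounding module already defines clean_text() and imports numpy as np, pandas as pd and TextBlobDE, and the listing values below are made up:

from datetime import date

features = data_prep_to_predict(
    eintrag=date(2020, 3, 1),        # date the listing was posted
    freiab=date(2020, 4, 1),         # available from
    freibis=date(2020, 6, 30),       # available until
    mitgliedseit=date(2020, 2, 25),  # member since
    miete=500, groesse=18,
    area="Kreuzberg",
    text="Möbliertes Zimmer, Besichtigung bitte per Skype :)")
print(features)  # one-row DataFrame with the columns from column_list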
Example #3
def word_translate(inputtext, language):
    Aufgabe = {
        "Kopfzeile": "Name: 				Klasse: 				Datum:  \n ",
        "Titel": "",
        "1. Aufgabe": "Übersetze!\n",
        "Hinweise": "Hier ist die Wortliste: \n",
        "Rätselwörter": "Hier ein paar Rätselwörter aus dem Text: \n",
    }
    doc = docx.Document()  # initializing python-docx
    save_path = docxprint.docx_print(Doc=doc, save='word-translate')

    docxprint.docx_print(printText=Aufgabe["Kopfzeile"], Bold=True, Doc=doc)
    docxprint.docx_print(printText=Aufgabe["1. Aufgabe"], Bold=True, Doc=doc)

    nlp = languageload.language_load(language)
    docnlp = nlp(inputtext)  #load to spacy

    #prepare the word list for checking: tokens only, still unsorted and unfiltered
    inputtext_prepared = []
    for token in docnlp:
        if str(token).isalpha():
            inputtext_prepared.append(str(token))
    inputtext_prepared = " \n ".join(inputtext_prepared)

    print(inputtext_prepared)
    blob = TextBlobDE(inputtext_prepared)
    translation = blob.translate(
        from_lang='en',
        to="de")  # bg - bulgarisch, de - deutsch, en - englisch
    print(translation)

    wordlist_translated = translation.split("\n")
    inputtext_prepared = inputtext_prepared.split("\n")

    print(len(inputtext_prepared), len(wordlist_translated))
    print(inputtext_prepared)
    print(wordlist_translated)

    result = []
    for i in range(len(wordlist_translated)):
        result.append(inputtext_prepared[i].lower() + "\t-" +
                      wordlist_translated[i])
    result = list(set(result))
    result.sort()

    docxprint.docx_print(printText=inputtext, Doc=doc)
    docxprint.docx_print(printText=Aufgabe["Hinweise"], Bold=True, Doc=doc)

    for translation in result:
        print(translation)
        docxprint.docx_print(printText=translation, Doc=doc)
    doc.save(save_path)
Example #4
def test_word_lists_de():
    animals = TextBlobDE("katze hund octopus ocropus aktienführer stammaktien syndikus anwälte ")
    pluralized_words = animals.words.pluralize()
    lemmatized_words = animals.words.lemmatize()


    blob = TextBlobDE("das ist ein deutscher Text mit asbjaskfbjjn als fremdwort salut! space")
    # this doesn't detect foreign words as such
    # link to see meaning of tags: http://blog.thedigitalgroup.com/sagarg/wp-content/uploads/sites/12/2015/06/POS-Tags.png
    tags = blob.tags
    for word in blob.words:
        print(word, "language is: ", word.detect_language()) # this takes google translator api requests


    print("done")
Example #5
def analyse(comments):

    allcomments = []
    polarity = []
    for comment in comments:
        try:
            allcomments.append(comment)
            try:
                if detect(comment) == 'de':
                    text = TextBlobDE(comment)
                    x = text.sentiment.polarity
                    polarity.append(x)
                elif detect(comment) == 'fr':
                    blob = TextBlob(comment,
                                    pos_tagger=PatternTagger(),
                                    analyzer=PatternAnalyzer())
                    x = blob.sentiment[0]
                    polarity.append(x)
                else:
                    text = TextBlob(comment)
                    x = text.sentiment.polarity
                    polarity.append(x)
            except Exception:
                text = TextBlob(comment)
                x = text.sentiment.polarity
                polarity.append(x)
        except Exception:
            pass

    return allcomments, polarity
def german_semantic(text):
    from nltk.corpus import stopwords
    from nltk.stem.cistem import Cistem
    stopwords = set(stopwords.words("german"))

    liste = []
    stemmer = Cistem()
    wordlist = []

    # clean up the text
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)  # str.replace() cannot apply a regex pattern
    text = re.sub(r"\s+", " ", text)
    # delete stopwords
    for word in text.split():
        if word not in stopwords:
            liste.append(word)
    text = " ".join(liste)
    # stemmer
    for word in text.split():
        word = stemmer.segment(word)[0]
        wordlist.append(word)
    text = " ".join(wordlist)

    # sentiment
    blob = TextBlobDE(text)
    sentiment_polarity = blob.sentiment.polarity
    sentiment_subjectivity = blob.sentiment.subjectivity

    return sentiment_polarity, sentiment_subjectivity
Example #7
 def _determine_polarity_textblob(self, text_series: Series) -> Series:
     """
     for each paragraph (row in a series) the polarity is calculated with textblob
     :param text_series: series, containing the text where the polarity needs to be determined
     :return: series, containing rows with the polarity for the corresponding text
     """
     tqdm.pandas(desc="Determine sentiment polarity with TextBlob")
     return text_series.progress_apply(lambda doc: TextBlobDE(doc).sentiment[0])
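A standalone sketch of the same step outside the class, assuming pandas, tqdm and textblob-de are installed:

import pandas as pd
from tqdm import tqdm
from textblob_de import TextBlobDE

paragraphs = pd.Series(["Das war ein wunderbarer Tag.", "Der Service war schrecklich."])
tqdm.pandas(desc="Determine sentiment polarity with TextBlob")
print(paragraphs.progress_apply(lambda doc: TextBlobDE(doc).sentiment[0]))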
Example #8
def blob_classify(text, id, collection):
    blob = TextBlobDE(text)
    client.spiegel[collection].update(
        {'_id': id},
        {"$set": {
            'blobPolarity': float(blob.sentiment.polarity)
        }})
    print(blob.sentiment.polarity)
def complex_terms_satz(satz):
    word_list = TextBlobDE(satz).words
    for word in word_list:
        lemma = lemmatize_word(word)
        if find_word_status(lemma):
            print('Leichtes Wort: ' + word)
        elif lemma in basic_german.keys():
            easy_variant = basic_german[lemma]
            print(word + " hat eine leichte Alternative: " + str(easy_variant))
def lex_vereinfache(satz):
    word_list = TextBlobDE(satz).words
    for word in word_list:
        #        print(word)
        lemma = lemmatize_word(word)
        #        print(lemma)
        syns = synonyms(lemma)
        if syns is not None:
            find_easy_syn(lemma, syns)
Example #11
def get_de_tweet_sentiment(tweet):
    '''
    Utility function to classify sentiment of passed tweet
    using textblob's sentiment method
    '''
    # create TextBlob object of passed tweet text
    analysis = TextBlobDE(clean_tweet(tweet))
    # set sentiment
    return analysis.sentiment.polarity
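A hedged usage sketch; clean_tweet() is not shown in this example, so a trivial stand-in is used here just to make the call runnable:

def clean_tweet(tweet):
    # hypothetical stand-in for the project's real clean_tweet()
    return tweet.strip()

print(get_de_tweet_sentiment("Das neue Update ist wirklich großartig!"))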
Example #12
 def sentiment_textblobde(self):
     textblobde_score = [
         round(TextBlobDE(article).sentiment.polarity, 3)
         for article in self.feedbackcomment
     ]
     textblobde_category = [
         'positive' if score > 0 else 'negative' if score < 0 else 'neutral'
         for score in textblobde_score
     ]
     return textblobde_score, textblobde_category
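The same scoring and labelling step, sketched on a plain list of comments instead of self.feedbackcomment:

from textblob_de import TextBlobDE

comments = ["Der Kurs war ausgezeichnet.", "Leider völlig unverständlich."]
scores = [round(TextBlobDE(c).sentiment.polarity, 3) for c in comments]
categories = ['positive' if s > 0 else 'negative' if s < 0 else 'neutral' for s in scores]
print(list(zip(comments, scores, categories)))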
Example #13
 def analyze(self, text):
     # Analyze the polarity of each text in the appropriate language.
     # Uses Textblob mainly because of its ease of implementation in multiple languages.
     # Dutch Textblob uses the same engine as the English one, but with special Pattern tagger and analyzer.
     if self.language == 'dutch':
         blob = TextBlob(text, pos_tagger=PatternTagger(), analyzer=PatternAnalyzer())
     elif self.language == 'english':
         blob = TextBlob(text)
     elif self.language == 'german':
         blob = TextBlobDE(text)
     else:
         # avoid returning an undefined name for unsupported languages
         raise ValueError("Unsupported language: " + self.language)
     return blob
Example #14
def build_naive():
    with open('raw_num_labeled', 'rb') as fs:
        training_list = pickle.load(fs)
    cl = NaiveBayesClassifier(training_list[:3000])
    #cl.classify('das ist echt toll')
    print(cl.classify('das ist echt toll'))
    blob = TextBlobDE("Das ist super schade. Das tut mir so leid.")

    for s in blob.sentences:
        print(s)
        print(cl.classify(s))
    print(cl.accuracy(training_list[3000:]))
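A hedged sketch of the same classifier setup with a tiny made-up training set (the real example loads thousands of labelled rows from the 'raw_num_labeled' pickle):

from textblob.classifiers import NaiveBayesClassifier
from textblob_de import TextBlobDE

train = [("das ist echt toll", "pos"), ("ich bin begeistert", "pos"),
         ("das ist super schade", "neg"), ("das tut mir so leid", "neg")]
cl = NaiveBayesClassifier(train)
blob = TextBlobDE("Das ist super schade. Das tut mir so leid.")
for s in blob.sentences:
    print(s, cl.classify(str(s)))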
Example #15
def get_sentiment(text, language):
    if isinstance(text, str):
        if language == 'DE':
            blob = TextBlobDE(text)
            return [blob.sentiment.polarity, blob.sentiment.subjectivity]
        elif language == 'FR':
            tb = Blobber(pos_tagger=PatternTaggerFR(),
                         analyzer=PatternAnalyzerFR())
            blob = tb(text)
            return blob.sentiment
        else:
            blob = TextBlob(text)
            return [blob.sentiment.polarity, blob.sentiment.subjectivity]
def get_de_tweet_sentiment(tweet):
    '''
    Utility function to classify sentiment of passed tweet
    using textblob's sentiment method
    '''
    # create TextBlob object of passed tweet text
    analysis = TextBlobDE(clean_tweet(tweet))
    # set sentiment
    if analysis.sentiment.polarity > 0:
        return 'positive'
    elif analysis.sentiment.polarity == 0:
        return 'neutral'
    else:
        return 'negative'
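A hedged usage sketch for the two helpers above; it assumes the module imports TextBlob, TextBlobDE, Blobber and the French PatternTaggerFR/PatternAnalyzerFR aliases used in get_sentiment, plus a clean_tweet() helper for the tweet variant:

print(get_sentiment("Das Hotel war sehr sauber und ruhig.", "DE"))      # [polarity, subjectivity]
print(get_sentiment("The hotel was clean and quiet.", "EN"))
print(get_de_tweet_sentiment("Der Service war leider enttäuschend."))   # 'positive' / 'neutral' / 'negative'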
Example #17
def get_article_sentiment(article):
    """
    Extracts sentiment analysis for article.
    @param article: article dictionary (retrieved from the Data Lake)
    @returns: (article_level_polarity, article_level_subjectivity)
    """
    if language_dict[article['media']] == 'DE':
        blob = TextBlobDE(article['text'])
        polarity, subjectivity = (blob.sentiment.polarity, blob.sentiment.subjectivity)
    elif language_dict[article['media']] == 'FR':
        tb = Blobber(pos_tagger=PatternTaggerFR(), analyzer=PatternAnalyzerFR())
        blob = tb(article['text'])
        polarity, subjectivity = blob.sentiment
    else:  # for now defaults to the plain English TextBlob (just for PoC)
        blob = TextBlob(article['text'])
        polarity, subjectivity = (blob.sentiment.polarity, blob.sentiment.subjectivity)
    return polarity, subjectivity
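A hedged usage sketch; language_dict is defined elsewhere in the project, so a stand-in mapping and a made-up article dict are used here:

language_dict = {"Tages-Anzeiger": "DE"}  # hypothetical stand-in
article = {"media": "Tages-Anzeiger", "text": "Die Lage hat sich deutlich verbessert."}
print(get_article_sentiment(article))  # (polarity, subjectivity)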
Example #18
    def create_sen_indices(word2idx, data):
        """
        function to convert a sentence to a list of indices
        :param word2idx: word to index mapping dictionary
        :param data: data(which needs to be transformed:list of sentences)
        :return list of list of sentences where each word is replaces by its index values from word2idx.
        """
        idx = []
        sen_indices = []
        for sen in data:
            blob = TextBlobDE(sen)

            for w in blob.tokens:
                if w not in word2idx:
                    idx.append(word2idx['<UNK>'])
                else:
                    idx.append(word2idx[w])
            sen_indices.append(idx)
            idx = []
        return sen_indices
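A hedged usage sketch with a tiny hand-built vocabulary; '<UNK>' must be present because unknown tokens fall back to it, and the helper is called here as a plain function:

word2idx = {'<UNK>': 0, 'das': 1, 'Wetter': 2, 'ist': 3, 'schön': 4}
data = ["das Wetter ist schön", "das Essen ist gut"]
print(create_sen_indices(word2idx, data))  # e.g. [[1, 2, 3, 4], [1, 0, 3, 0]]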
Example #19
def nltk_parser(txt):
    myblob = TextBlobDE(txt)
    sent = [x[1] for x in myblob.tags]
    sent_text = [x[0] for x in myblob.tags]
    cfg_grammar = nltk.CFG.fromstring("""
    S -> NP VP | S CC S
    NP -> 'DT' N | 'DT' N PP | 'PRP' | N | 'PRP$'
    VP -> V NP | V NP PP | V ADJP
    ADJP -> 'RB' 'JJ' | 'JJ'
    PP -> P NP
    N -> 'NN' | 'NNP' | 'NNS' | 'FW'
    V -> 'VBN' | 'VB' | 'MD'
    P -> 'IN' | 'TO'
    CC -> 'CC'
    O -> 'RP' | 'WDT' | 'TRUNC' | 'CD'
    """)
    
    parser = nltk.parse.ChartParser(cfg_grammar)
    for tree in parser.parse(sent):
        print(tree)
        tree.draw()
Example #20
 def from_dict(details):
     post = details["details"]
     blob = TextBlobDE(post["message"])
     return Post(_id=post["post_id"],
                 timestamp=dateutil.parser.parse(post["created_at"]),
                 message=post["message"],
                 tags=extract_tags(post["message"]),
                 image_url=post.get("image_url"),
                 thumbnail_url=post.get("thumbnail_url"),
                 child_count=post["child_count"],
                 banned=details["banned"],
                 deleted=False,
                 from_home=details.get("from_home"),
                 color=post["color"],
                 distance=post["distance"],
                 location_name=post["location"]["name"],
                 pin_count=post["pin_count"],
                 share_count=post["share_count"],
                 vote_count=post["vote_count"],
                 readonly=details["readonly"],
                 polarity=blob.polarity)
Example #21
def text_analytics(analysis_request):
    """
    Customer Service Text Analytics
    The Analytics endpoint returns both the sentiment and suggested response for a customer service text. 
    :param analysis_request: The customer's service text in base64 encoding
    :type analysis_request: dict | bytes

    :rtype: AnalysisResponse
    """
    if connexion.request.is_json:
        analysis_request = AnalysisRequest.from_dict(
            connexion.request.get_json())
        response = AnalysisResponse()

        if analysis_request.language_code.upper() == "DE":
            blob = TextBlobDE(analysis_request.customer_text)
            response.sentiment_score = (blob.sentiment.polarity + 1) / 2
        else:
            blob = TextBlob(analysis_request.customer_text)
            response.sentiment_score = (blob.sentiment.polarity + 1) / 2

    return response
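The (blob.sentiment.polarity + 1) / 2 step simply rescales TextBlob's polarity from [-1, 1] to the [0, 1] range used for sentiment_score; a minimal sketch of the mapping:

def to_unit_interval(polarity):
    # -1.0 -> 0.0, 0.0 -> 0.5, +1.0 -> 1.0
    return (polarity + 1) / 2

assert to_unit_interval(0.0) == 0.5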
Example #22
 def from_dict(reply, post):
     blob = TextBlobDE(reply["message"])
     return Reply(_id=reply["post_id"],
                  timestamp=dateutil.parser.parse(reply["created_at"]),
                  post_id=reply["parent_id"],
                  post_timestamp=post.timestamp,
                  message=reply["message"],
                  tags=extract_tags(reply["message"]),
                  post_message=post.message,
                  post_tags=post.tags,
                  color=reply["color"],
                  post_color=post.color,
                  distance=reply["distance"],
                  got_thanks=reply["got_thanks"],
                  location_name=reply["location"]["name"],
                  from_home=reply.get("from_home"),
                  vote_count=reply["vote_count"],
                  replier=reply["replier"],
                  polarity=blob.polarity,
                  post_pin_count=post.pin_count,
                  post_share_count=post.share_count,
                  post_vote_count=post.vote_count,
                  post_polarity=post.polarity)
Example #23
def termex(txt):
    # collects noun tokens (tags starting with 'N') into the module-level `terms` list
    myblob = TextBlobDE(txt)
    pos = myblob.tags
    for item in pos:
        if item[1].startswith('N'):
            terms.append(item)
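A hedged usage sketch; termex() appends to a module-level terms list, so that list has to exist before the call:

terms = []
termex("Die Katze jagt den Hund durch den Garten.")
print(terms)  # noun tokens with their POS tags, e.g. [('Katze', 'NN'), ...]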
Example #24
text_long = adidas_text.read()


#### Regular Expressions ####

#all words ending in "heit"
heitwords = re.findall(r"\w+heit",text_long)

##print ("HEIT:", heitwords)
    
#### Tokenizer ####

#first with TextBlob

blob = TextBlobDE(text)

print("SENTENCE TOKENIZER (TextBlobDE)")

sentences = blob.sentences
print (sentences)

print("WORD TOKENIZER (TextBlobDE)")

tokens = blob.tokens
print(tokens)

#then with NLTK

print ("SENTENCE TOKENIZER (NLTK)")
sent_detector = nltk.data.load('tokenizers/punkt/german.pickle')
Example #25
# -*- coding: utf-8 -*-

from textblob import TextBlob
from textblob_de import TextBlobDE


text = TextBlob("Markus is angry because he never gets the biggest chocolate.")
print(text.tags)
print(text.sentiment.polarity)
print(text.sentiment)

text = TextBlobDE("Markus ist wütend weil er nie die grösste Schokolade erhält.")  # .decode('utf-8') dropped: not needed (or valid) on Python 3 strings
print(text.tags)
print(text.sentiment.polarity)
print(text.sentiment)

text = TextBlobDE("Markus ist glücklich weil er immer die grösste Schokolade erhält.")
print(text.tags)
print(text.sentiment.polarity)
print(text.sentiment)
Example #26
# WIN: spacy.load('en') does not work - but spacy.load('en_core_web_sm') does...
# so always use nlp = spacy.load('en_core_web_sm')
# if you have no permissions: pip install de_core_news_sm-2.0.0.tar (must be in the same directory) ==> manual installation
# pip install .tar.gz archive from path or URL
# ! pip install /Users/you/en_core_web_sm-2.0.0.tar.gz
# import de_core_news_sm works

import de_core_news_sm

from textblob_de import TextBlobDE
import pandas as pd

f1 = open("DHB.txt", mode="r", encoding="UTF8")

text1 = f1.read()
blob2 = TextBlobDE(text1)
f1.close()
blob2.sentences
sens = pd.DataFrame(blob2.sentences)
tgs = pd.DataFrame(blob2.tags)
print(sens)

#blob = TextBlobDE(text)

blob2.tags  # [('Der', 'DT'), ('Blob', 'NN'), ('macht', 'VB'),
#  ('in', 'IN'), ('seiner', 'PRP$'), ...]

blob2.noun_phrases  # WordList(['Der Blob', 'seiner unbekümmert-naiven Weise',
#           'den gewissen Charme', 'hölzerne Regie',
#           'konfuse Drehbuch'])
"""
Example #27
consumer_secret = 'dvy2S7enpM4dpRCCr3LS7wvGRmZyvvV0Al6W1y1okJiiAQj3lK'

access_token = '612923328-OgEMn0JdVBsV3bkPMMol7sv6eSC3l2Uh8rYo0uRe'
access_token_secret = 'djVUXlaomK5AxQpr1ePr1gcIV5fIFgsKBvN1b9MBsITrk'

print('Starting...')

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth)
print('Authenticated.')

#Step 3 - Retrieve Tweets
# Twitter API https://developer.twitter.com/en/docs/tweets/search/api-reference/get-search-tweets
query = '@uwyss'
public_tweets = api.search(query, lang='de', count=1, result_type='recent')

#CHALLENGE - Instead of printing out each tweet, save each Tweet to a CSV file
#and label each one as either 'positive' or 'negative', depending on the sentiment
#You can decide the sentiment polarity threshold yourself

for tweet in public_tweets:

    #Step 4 Perform Sentiment Analysis on Tweets
    analysis = TextBlobDE(tweet.text)

    print(tweet.id, tweet.text)
    print(analysis.sentiment)
    print("")
Example #28
def lemmatize_sentence(sentence):
    sblob = TextBlobDE(str(sentence))
    for w in sblob.words:
        w_new = lemmatize_word(w)
        sentence = sentence.replace(w, w_new)
    return (sentence)
Example #29
#allianz_text = open('allianz_JA_2012_Text.txt','w', encoding='utf-8')
#Axel_Springer_JA_2012
#todo_file = open('Vulcanic_Triatherm_JA_2012.htm','r', encoding='utf-8')

#todo_data = todo_file.read()

#todo_text = open('Vulcanic_Triatherm_JA_2012_Text.txt','w', encoding='utf-8')

#### HTML-Parser ####


class MyHTMLParser(HTMLParser):
    def handle_data(self, data):
        todo_text.write(data)


parser = MyHTMLParser()

##I only had to do this once:
#parser.feed(todo_data)

#text_long = adidas_text.read()

#### Initialize TextBlob ####
blob = TextBlobDE(text)

### whatever you open, you also have to close again ####

#todo_file.close()
#todo_text.close()
Example #30
def preprocess_tweet(data):
    try:
        created_at = datetime.strptime(data['created_at'],
                                       '%a %b %d %H:%M:%S +0000 %Y')
        # detect the language of the tweet or use predefined language
        lang = classify(
            data['text'])[0] if 'lang' not in data else data['lang']
        # remove urls using Imme Emosol regex: https://mathiasbynens.be/demo/url-regex
        text = re.sub(r"http\S+", "", data['text'], flags=re.MULTILINE)
        # tokenize the text dependent on the language
        blob = None
        if lang == 'en':
            blob = TextBlobEN(text)
        elif lang == 'de':
            blob = TextBlobDE(text)
        else:  # avoid unknown languages
            raise UnknownLanguageException('Unknown language: ' + data['text'])
        # get the polarity of the tweet sentences and summarize them
        # NOTE: TextBlobDE is not as great as the english analyzer and is fairly barebone.
        #       If the resulting polarity is inaccurate, one possibility to solve this is to
        #       only process english tweets
        polarity = 0
        polarity_count = 0
        for sentence in blob.sentences:
            # ignore neutral sentences: these are mostly failed detections or hashtag fragments
            if sentence.sentiment.polarity != 0.0:
                polarity += sentence.sentiment.polarity
                polarity_count += 1
        if polarity_count > 0:
            polarity /= polarity_count
        # extract _important_ words from the word tokens
        words = []
        is_hashtag = False
        is_tagged_user = False
        for tag in blob.tags:
            word = tag[0]
            kind = tag[1]
            # TODO: special behaviour for hashtag is possibly also necessary for @
            if word[0] == '#':  # special case means next word is a hashtag
                is_hashtag = True
            elif word[0] == '@':
                is_tagged_user = True
            else:
                if is_hashtag:  # previous word was a hashtag, so remerge with # and save
                    words.append("#" + word)
                    is_hashtag = False
                elif is_tagged_user:
                    words.append("@" + word)
                    is_tagged_user = False
                elif any(word == s for s in BLACKLIST):
                    continue
                else:  # just normal word of the tweet
                    # check the word is of an allowed grammatical type
                    if kind[0] in ALLOWED_WORD_TOKENS:
                        words.append(word)
        # find out where the tweet came from by either taking existing coordinates
        # or center of place
        # TODO: check if coordinates exist before using place
        # TODO: verify structure of place coordinates
        coords = []
        if data['geo']:
            coords.append(data['geo']['coordinates'])
        elif data['coordinates']:
            coords.append(data['coordinates']['coordinates'])
        else:
            coords = data['place']['bounding_box']['coordinates'][0]
        loc = [0.0, 0.0]
        for coord in coords:
            loc[0] += coord[0]
            loc[1] += coord[1]
        loc[0] /= len(coords)
        loc[1] /= len(coords)
        # create tweet object
        tweet = {
            "_id": data['_id'],  # use same id
            "user": {
                "name": data['user']['name'],
                "screen_name": data['user']['screen_name'],
                "followers_count": data['user']['followers_count'],
                "friends_count": data['user']['friends_count'],
                "listed_count": data['user']['listed_count'],
                "statuses_count": data['user']['statuses_count'],
                "following": data['user']['following']
            },
            "created_at": created_at,
            "words": words,
            "loc": loc,
            "polarity": polarity,
            "retweet_count": data['retweet_count'],
            "favorite_count": data['favorite_count']
        }
        return tweet
    except UnknownLanguageException as error:  # catch exceptions, usually failed language detection
        logging.warning(repr(error))