def translate_de(self):
    # Concatenate an English version of every feedback comment.
    # NOTE: detect_language() and translate() call the Google Translate API.
    result = ''  # renamed from `str`, which shadowed the builtin
    for row in self.feedbackcomment:
        blob = TextBlobDE(row)
        if blob.detect_language() == "en":
            blob_en = blob
        else:
            blob_en = blob.translate(to="en")
        result = result + blob_en.string
    return result
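# Hedged usage sketch: the Feedback class below is a hypothetical stand-in for
# whatever object carries `feedbackcomment`; detect_language() and translate()
# need network access and were removed in newer textblob releases.
from textblob_de import TextBlobDE

class Feedback:
    def __init__(self, comments):
        self.feedbackcomment = comments

Feedback.translate_de = translate_de  # attach the function above as a method

fb = Feedback(["Das Zimmer war sehr schön.", "The service was great."])
print(fb.translate_de())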
def data_prep_to_predict(eintrag, freiab, freibis, mitgliedseit, miete, groesse, area, text):
    # NOTE: ':\)' is a regex-escaped smiley, but it is only used in a plain
    # substring check below, so it will rarely match literal text.
    keywords = ['möbliert', 'unmöbliert', 'bitte', 'leider', 'Skype',
                'besichtigung', 'xx', ':\)']
    # features to prepare:
    extra_sentiments = ['polarity_de', 'polarity_de_min', 'polarity_de_max',
                        'polarity_de_median']
    column_list = ['miete_delta', 'groesse', 'days_to_freiab', 'days_to_rent',
                   'popular_area', 'new_user'] + keywords + extra_sentiments

    # popular_area and miete_delta
    areas = ['kreuzberg', 'wedding', 'neukoelln', 'charlottenburg', 'mitte',
             'friedrichshain', 'prenzlauerberg', 'moabit']
    popular_area = 0
    miete_delta = miete - 436
    if clean_text(area) in areas:
        popular_area = 1
        miete_delta = miete - 470

    # days_to_freiab
    days_to_freiab = abs((freiab - eintrag).days)
    # days_to_rent
    days_to_rent = abs((freibis - freiab).days)

    # polarity_de: document-level polarity plus per-sentence statistics
    tb_obj = TextBlobDE(text)
    polarity_de = tb_obj.polarity
    sentences_polarity_de = []
    for sentence in tb_obj.sentences:
        sentences_polarity_de.append(TextBlobDE(str(sentence)).polarity)
    polarity_de_median = np.median(sentences_polarity_de)
    polarity_de_min = np.min(sentences_polarity_de)
    polarity_de_max = np.max(sentences_polarity_de)
    sentiment_features = [polarity_de, polarity_de_min, polarity_de_max,
                          polarity_de_median]

    # new_user
    new_user = 0
    if abs((eintrag - mitgliedseit).days) < 30:
        new_user = 1

    # keyword features
    keyword_features = []
    for word in keywords:
        if word in text:
            keyword_features.append(1)
        else:
            keyword_features.append(0)

    feature_list = [miete_delta, groesse, days_to_freiab, days_to_rent,
                    popular_area, new_user] + keyword_features + sentiment_features
    features = pd.DataFrame(feature_list).T
    features.columns = column_list
    return features
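# Hedged usage sketch. clean_text() is a project helper not shown here; the
# stand-in below (lowercasing) is a hypothetical placeholder.
from datetime import date
import numpy as np
import pandas as pd

def clean_text(s):  # hypothetical stand-in for the real helper
    return s.lower().replace(" ", "")

features = data_prep_to_predict(
    eintrag=date(2020, 1, 1),         # listing posted
    freiab=date(2020, 2, 1),          # available from
    freibis=date(2020, 8, 1),         # available until
    mitgliedseit=date(2019, 12, 20),  # member since
    miete=500, groesse=20, area='Kreuzberg',
    text='Schönes Zimmer. Bitte keine Anrufe.')
print(features.iloc[0])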
def word_translate(inputtext, language):
    Aufgabe = {
        "Kopfzeile": "Name: Klasse: Datum: \n ",
        "Titel": "",
        "1. Aufgabe": "Übersetze!\n",
        "Hinweise": "Hier ist die Wortliste: \n",
        "Rätselwörter": "Hier ein paar Rätselwörter aus dem Text: \n",
    }
    doc = docx.Document()  # initializing python-docx
    save_path = docxprint.docx_print(Doc=doc, save='word-translate')
    docxprint.docx_print(printText=Aufgabe["Kopfzeile"], Bold=True, Doc=doc)
    docxprint.docx_print(printText=Aufgabe["1. Aufgabe"], Bold=True, Doc=doc)

    nlp = languageload.language_load(language)
    docnlp = nlp(inputtext)  # load into spaCy

    # prepare for checking; still unsorted and unfiltered
    inputtext_prepared = []
    for token in docnlp:
        if str(token).isalpha():
            inputtext_prepared.append(str(token))
    inputtext_prepared = " \n ".join(inputtext_prepared)
    print(inputtext_prepared)

    blob = TextBlobDE(inputtext_prepared)
    translation = blob.translate(from_lang='en', to="de")  # bg - Bulgarian, de - German, en - English
    print(translation)

    wordlist_translated = translation.split("\n")
    inputtext_prepared = inputtext_prepared.split("\n")
    print(len(inputtext_prepared), len(wordlist_translated))
    print(inputtext_prepared)
    print(wordlist_translated)

    result = []
    for i in range(len(wordlist_translated)):
        result.append(inputtext_prepared[i].lower() + "\t-" + wordlist_translated[i])
    result = list(set(result))
    result.sort()

    docxprint.docx_print(printText=inputtext, Doc=doc)
    docxprint.docx_print(printText=Aufgabe["Hinweise"], Bold=True, Doc=doc)
    for pair in result:  # renamed from `translation`, which shadowed the blob result
        print(pair)
        docxprint.docx_print(printText=pair, Doc=doc)
    doc.save(save_path)
def test_word_lists_de():
    animals = TextBlobDE("katze hund octopus ocropus aktienführer stammaktien syndikus anwälte ")
    pluralized_words = animals.words.pluralize()
    lemmatized_words = animals.words.lemmatize()

    blob = TextBlobDE("das ist ein deutscher Text mit asbjaskfbjjn als fremdwort salut! space")
    # this does not detect foreign words as such
    # tag meanings: http://blog.thedigitalgroup.com/sagarg/wp-content/uploads/sites/12/2015/06/POS-Tags.png
    tags = blob.tags
    for word in blob.words:
        # each detect_language() call issues a Google Translate API request
        print(word, "language is: ", word.detect_language())
    print("done")
def analyse(comments):
    allcomments = []
    polarity = []
    for comment in comments:
        try:
            allcomments.append(comment)
            try:
                if detect(comment) == 'de':
                    text = TextBlobDE(comment)
                    polarity.append(text.sentiment.polarity)
                elif detect(comment) == 'fr':
                    blob = TextBlob(comment, pos_tagger=PatternTagger(),
                                    analyzer=PatternAnalyzer())
                    polarity.append(blob.sentiment[0])
                else:
                    text = TextBlob(comment)
                    polarity.append(text.sentiment.polarity)
            except Exception:
                # language detection failed; fall back to the English analyzer
                text = TextBlob(comment)
                polarity.append(text.sentiment.polarity)
        except Exception:
            pass
    return allcomments, polarity
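# Hedged usage sketch: the snippet above assumes these imports; detect() is
# most likely langdetect.detect, and the French tagger/analyzer come from
# the textblob-fr package.
from langdetect import detect
from textblob import TextBlob
from textblob_de import TextBlobDE
from textblob_fr import PatternTagger, PatternAnalyzer

comments = ["Das ist großartig!", "C'est terrible.", "This is fine."]
texts, scores = analyse(comments)
for t, s in zip(texts, scores):
    print(f"{s:+.2f}  {t}")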
def german_semantic(text):
    import re
    from nltk.corpus import stopwords
    from nltk.stem.cistem import Cistem

    stop_words = set(stopwords.words("german"))  # renamed to avoid shadowing the module
    stemmer = Cistem()
    liste = []
    wordlist = []

    # clean up the text
    # (str.replace() does not take a regex, so punctuation is stripped with re.sub)
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\s+", " ", text)

    # delete stopwords
    for word in text.split():
        if word not in stop_words:
            liste.append(word)
    text = " ".join(liste)

    # stemming
    for word in text.split():
        wordlist.append(stemmer.segment(word)[0])
    text = " ".join(wordlist)

    # sentiment
    blob = TextBlobDE(text)
    return blob.sentiment.polarity, blob.sentiment.subjectivity
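# Minimal usage sketch; requires the NLTK German stopword corpus
# (nltk.download('stopwords')) and the textblob-de package.
from textblob_de import TextBlobDE

polarity, subjectivity = german_semantic(
    "Der Film war wirklich großartig, nur das Ende hat mich enttäuscht.")
print(polarity, subjectivity)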
def _determine_polarity_textblob(self, text_series: Series) -> Series:
    """
    For each paragraph (row in a series), the polarity is calculated with TextBlob.

    :param text_series: series containing the text whose polarity is to be determined
    :return: series containing the polarity for the corresponding text
    """
    tqdm.pandas(desc="Determine sentiment polarity with TextBlob")
    return text_series.progress_apply(lambda doc: TextBlobDE(doc).sentiment[0])
def blob_classify(text, id, collection):
    blob = TextBlobDE(text)
    # write the polarity back to the MongoDB document
    client.spiegel[collection].update(
        {'_id': id},
        {"$set": {'blobPolarity': float(blob.sentiment.polarity)}})
    print(blob.sentiment.polarity)
def complex_terms_satz(satz):
    word_list = TextBlobDE(satz).words
    for word in word_list:
        lemma = lemmatize_word(word)
        if find_word_status(lemma):  # the lemma is already an easy word
            print('Leichtes Wort: ' + word)
        elif lemma in basic_german:  # an easy alternative is known
            easy_variant = basic_german[lemma]
            print(word + " hat eine leichte Alternative: " + str(easy_variant))
def lex_vereinfache(satz):
    word_list = TextBlobDE(satz).words
    for word in word_list:
        lemma = lemmatize_word(word)
        syns = synonyms(lemma)
        if syns is not None:
            find_easy_syn(lemma, syns)
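# Hedged usage sketch for the two simplification helpers above:
# lemmatize_word(), find_word_status(), synonyms(), find_easy_syn() and the
# basic_german dictionary are project-specific pieces assumed to be defined
# elsewhere in this codebase.
satz = "Die Konferenz wurde wegen unvorhergesehener Umstände verschoben."
complex_terms_satz(satz)   # flags easy words / easy alternatives
lex_vereinfache(satz)      # proposes easy synonyms where available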
def get_de_tweet_sentiment(tweet):
    '''
    Utility function to score the sentiment of the passed tweet
    using TextBlob's sentiment method.
    '''
    # create a TextBlobDE object from the cleaned tweet text
    analysis = TextBlobDE(clean_tweet(tweet))
    # return the sentiment polarity
    return analysis.sentiment.polarity
def sentiment_textblobde(self):
    textblobde_score = [
        round(TextBlobDE(article).sentiment.polarity, 3)
        for article in self.feedbackcomment
    ]
    textblobde_category = [
        'positive' if score > 0 else 'negative' if score < 0 else 'neutral'
        for score in textblobde_score
    ]
    return textblobde_score, textblobde_category
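# Hedged usage sketch: sentiment_textblobde() only needs an object with a
# `feedbackcomment` iterable, so a simple namespace stands in here.
from types import SimpleNamespace

holder = SimpleNamespace(feedbackcomment=[
    "Das Produkt ist hervorragend.",
    "Leider eine große Enttäuschung.",
])
scores, categories = sentiment_textblobde(holder)
print(list(zip(scores, categories)))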
def analyze(self, text):
    # Analyze the polarity of each text in the appropriate language.
    # TextBlob is used mainly for its ease of implementation across languages.
    # The Dutch TextBlob uses the same engine as the English one, but with a
    # dedicated Pattern tagger and analyzer.
    if self.language == 'dutch':
        blob = TextBlob(text, pos_tagger=PatternTagger(), analyzer=PatternAnalyzer())
    elif self.language == 'english':
        blob = TextBlob(text)
    elif self.language == 'german':
        blob = TextBlobDE(text)
    else:
        # guard added: the original fell through here and raised
        # UnboundLocalError for unsupported languages
        raise ValueError("Unsupported language: " + self.language)
    return blob
def build_naive():
    with open('raw_num_labeled', 'rb') as fs:
        training_list = pickle.load(fs)
    cl = NaiveBayesClassifier(training_list[:3000])
    print(cl.classify('das ist echt toll'))
    blob = TextBlobDE("Das ist super schade. Das tut mir so leid.")
    for s in blob.sentences:
        print(s)
        print(cl.classify(str(s)))  # classify() expects a string, not a Sentence
    print(cl.accuracy(training_list[3000:]))
def get_sentiment(text, language):
    if isinstance(text, str):
        if language == 'DE':
            blob = TextBlobDE(text)
            return [blob.sentiment.polarity, blob.sentiment.subjectivity]
        elif language == 'FR':
            tb = Blobber(pos_tagger=PatternTaggerFR(), analyzer=PatternAnalyzerFR())
            blob = tb(text)
            return blob.sentiment
        else:
            blob = TextBlob(text)
            return [blob.sentiment.polarity, blob.sentiment.subjectivity]
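# Hedged usage sketch: Blobber comes from textblob, and PatternTaggerFR /
# PatternAnalyzerFR are presumably the textblob-fr classes under aliased
# names, e.g.:
# from textblob import Blobber
# from textblob_fr import PatternTagger as PatternTaggerFR, PatternAnalyzer as PatternAnalyzerFR
print(get_sentiment("Das Wetter ist herrlich.", "DE"))
print(get_sentiment("Ce film est magnifique.", "FR"))
print(get_sentiment("The weather is lovely.", "EN"))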
def get_de_tweet_sentiment(tweet):
    '''
    Utility function to classify the sentiment of the passed tweet
    using TextBlob's sentiment method.
    '''
    # create a TextBlobDE object from the cleaned tweet text
    analysis = TextBlobDE(clean_tweet(tweet))
    # map the polarity onto a three-way label
    if analysis.sentiment.polarity > 0:
        return 'positive'
    elif analysis.sentiment.polarity == 0:
        return 'neutral'
    else:
        return 'negative'
def get_article_sentiment(article):
    """
    Extracts sentiment analysis for an article.

    @param article: article dictionary (retrieved from the Data Lake)
    @returns: (article_level_polarity, article_level_subjectivity)
    """
    if language_dict[article['media']] == 'DE':
        blob = TextBlobDE(article['text'])
        polarity, subjectivity = (blob.sentiment.polarity, blob.sentiment.subjectivity)
    elif language_dict[article['media']] == 'FR':
        tb = Blobber(pos_tagger=PatternTaggerFR(), analyzer=PatternAnalyzerFR())
        blob = tb(article['text'])
        polarity, subjectivity = blob.sentiment
    else:
        # anything else defaults to the English analyzer (just for the PoC)
        blob = TextBlob(article['text'])
        polarity, subjectivity = (blob.sentiment.polarity, blob.sentiment.subjectivity)
    return polarity, subjectivity
def create_sen_indices(word2idx, data):
    """
    Convert each sentence to a list of indices.

    :param word2idx: word-to-index mapping dictionary
    :param data: list of sentences to be transformed
    :return: list of lists, one per sentence, where each token is replaced by
             its index from word2idx ('<UNK>' for out-of-vocabulary tokens)
    """
    sen_indices = []
    for sen in data:
        idx = []
        blob = TextBlobDE(sen)
        for w in blob.tokens:
            if w not in word2idx:
                idx.append(word2idx['<UNK>'])
            else:
                idx.append(word2idx[w])
        sen_indices.append(idx)
    return sen_indices
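# Minimal usage sketch with a toy vocabulary; '<UNK>' must be present in
# word2idx because the function falls back to it for out-of-vocabulary tokens.
word2idx = {'<UNK>': 0, 'Das': 1, 'ist': 2, 'gut': 3, '.': 4}
print(create_sen_indices(word2idx, ["Das ist gut.", "Das ist schlecht."]))
# expected (tokenizer-dependent): [[1, 2, 3, 4], [1, 2, 0, 4]]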
def nltk_parser(txt):
    myblob = TextBlobDE(txt)
    sent = [x[1] for x in myblob.tags]       # POS tag sequence
    sent_text = [x[0] for x in myblob.tags]  # corresponding tokens
    cfg_grammar = nltk.CFG.fromstring("""
    S -> NP VP | S CC S
    NP -> 'DT' N | 'DT' N PP | 'PRP' | N | 'PRP$'
    VP -> V NP | V NP PP | V ADJP
    ADJP -> 'RB' 'JJ' | 'JJ'
    PP -> P NP
    N -> 'NN' | 'NNP' | 'NNS' | 'FW'
    V -> 'VBN' | 'VB' | 'MD'
    P -> 'IN' | 'TO'
    CC -> 'CC'
    O -> 'RP' | 'WDT' | 'TRUNC' | 'CD'
    """)
    parser = nltk.parse.ChartParser(cfg_grammar)
    # parse the tag sequence (the grammar's terminals are POS tags, not words)
    for tree in parser.parse(sent):
        print(tree)
        tree.draw()
def from_dict(details):
    post = details["details"]
    blob = TextBlobDE(post["message"])
    return Post(_id=post["post_id"],
                timestamp=dateutil.parser.parse(post["created_at"]),
                message=post["message"],
                tags=extract_tags(post["message"]),
                image_url=post.get("image_url"),
                thumbnail_url=post.get("thumbnail_url"),
                child_count=post["child_count"],
                banned=details["banned"],
                deleted=False,
                from_home=details.get("from_home"),
                color=post["color"],
                distance=post["distance"],
                location_name=post["location"]["name"],
                pin_count=post["pin_count"],
                share_count=post["share_count"],
                vote_count=post["vote_count"],
                readonly=details["readonly"],
                polarity=blob.polarity)
def text_analytics(analysis_request):
    """
    Customer Service Text Analytics

    The Analytics endpoint returns both the sentiment and a suggested response
    for a customer service text.

    :param analysis_request: the customer service text in base64 encoding
    :type analysis_request: dict | bytes
    :rtype: AnalysisResponse
    """
    if connexion.request.is_json:
        analysis_request = AnalysisRequest.from_dict(connexion.request.get_json())
    response = AnalysisResponse()
    if analysis_request.language_code.upper() == "DE":
        blob = TextBlobDE(analysis_request.customer_text)
    else:
        blob = TextBlob(analysis_request.customer_text)
    # rescale polarity from [-1, 1] to [0, 1]
    response.sentiment_score = (blob.sentiment.polarity + 1) / 2
    return response
def from_dict(reply, post):
    blob = TextBlobDE(reply["message"])
    return Reply(_id=reply["post_id"],
                 timestamp=dateutil.parser.parse(reply["created_at"]),
                 post_id=reply["parent_id"],
                 post_timestamp=post.timestamp,
                 message=reply["message"],
                 tags=extract_tags(reply["message"]),
                 post_message=post.message,
                 post_tags=post.tags,
                 color=reply["color"],
                 post_color=post.color,
                 distance=reply["distance"],
                 got_thanks=reply["got_thanks"],
                 location_name=reply["location"]["name"],
                 from_home=reply.get("from_home"),
                 vote_count=reply["vote_count"],
                 replier=reply["replier"],
                 polarity=blob.polarity,
                 post_pin_count=post.pin_count,
                 post_share_count=post.share_count,
                 post_vote_count=post.vote_count,
                 post_polarity=post.polarity)
def termex(txt):
    terms = []  # was an undefined module-level list; made local and returned
    myblob = TextBlobDE(txt)
    pos = myblob.tags
    for item in pos:
        if item[1].startswith('N'):  # keep nouns (NN, NNS, NNP, ...)
            terms.append(item)
    return terms
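# Usage sketch for the noun extractor above (now returning its result instead
# of appending to an undefined module-level list):
print(termex("Die Katze jagt die Maus durch den Garten."))
# -> e.g. [('Katze', 'NN'), ('Maus', 'NN'), ('Garten', 'NN')]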
text_long = adidas_text.read()

#### Regular Expressions ####
# all words ending in "heit"
heitwords = re.findall(r"\w+heit", text_long)
##print ("HEIT:", heitwords)

#### Tokenizer ####
# first with TextBlob
blob = TextBlobDE(text_long)  # was TextBlobDE(text); `text` is undefined here
print("SENTENCE TOKENIZER (TextBlobDE)")
sentences = blob.sentences
print(sentences)
print("WORD TOKENIZER (TextBlobDE)")
tokens = blob.tokens
print(tokens)

# then with NLTK
print("SENTENCE TOKENIZER (NLTK)")
sent_detector = nltk.data.load('tokenizers/punkt/german.pickle')
# -*- coding: utf-8 -*-
from textblob import TextBlob
from textblob_de import TextBlobDE

text = TextBlob("Markus is angry because he never gets the biggest chocolate.")
print(text.tags)
print(text.sentiment.polarity)
print(text.sentiment)

# In Python 3, string literals are already Unicode, so the original Python 2
# style .decode('utf-8') calls are unnecessary (and would fail on str).
text = TextBlobDE("Markus ist wütend weil er nie die grösste Schokolade erhält.")
print(text.tags)
print(text.sentiment.polarity)
print(text.sentiment)

text = TextBlobDE("Markus ist glücklich weil er immer die grösste Schokolade erhält.")
print(text.tags)
print(text.sentiment.polarity)
print(text.sentiment)
# WIN: spacy.load('en') does not work, whereas spacy.load('en_core_web_sm') does...
# so always use nlp = spacy.load('en_core_web_sm')
# if you lack permissions: pip install de_core_news_sm-2.0.0.tar
# (must be in the same directory) => manual installation
# pip install a .tar.gz archive from a path or URL:
# ! pip install /Users/you/en_core_web_sm-2.0.0.tar.gz
# `import de_core_news_sm` works
import de_core_news_sm
from textblob_de import TextBlobDE
import pandas as pd

f1 = open("DHB.txt", mode="r", encoding="UTF8")
text1 = f1.read()
blob2 = TextBlobDE(text1)
f1.close()

blob2.sentences
sens = pd.DataFrame(blob2.sentences)
tgs = pd.DataFrame(blob2.tags)
print(sens)
#blob = TextBlobDE(text)

blob2.tags
# [('Der', 'DT'), ('Blob', 'NN'), ('macht', 'VB'),
#  ('in', 'IN'), ('seiner', 'PRP$'), ...]

blob2.noun_phrases
# WordList(['Der Blob', 'seiner unbekümmert-naiven Weise',
#           'den gewissen Charme', 'hölzerne Regie',
#           'konfuse Drehbuch'])
# NOTE: real credentials must never be committed; placeholders used here.
# (consumer_key is defined above this excerpt.)
consumer_secret = 'YOUR_CONSUMER_SECRET'
access_token = 'YOUR_ACCESS_TOKEN'
access_token_secret = 'YOUR_ACCESS_TOKEN_SECRET'

print('Starting...')
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)
print('Authenticated.')

# Step 3 - Retrieve Tweets
# Twitter API: https://developer.twitter.com/en/docs/tweets/search/api-reference/get-search-tweets
query = '@uwyss'
public_tweets = api.search(query, lang='de', count=1, result_type='recent')

# CHALLENGE - Instead of printing out each tweet, save each tweet to a CSV file
# and label each one as either 'positive' or 'negative', depending on the sentiment.
# You can decide the sentiment polarity threshold yourself.
for tweet in public_tweets:
    # Step 4 - Perform Sentiment Analysis on Tweets
    analysis = TextBlobDE(tweet.text)
    print(tweet.id, tweet.text)
    print(analysis.sentiment)
    print("")
def lemmatize_sentence(sentence):
    sblob = TextBlobDE(str(sentence))
    for w in sblob.words:
        w_new = lemmatize_word(w)
        sentence = sentence.replace(w, w_new)
    return sentence
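# Hedged usage sketch: lemmatize_word() is a project helper assumed to map an
# inflected German form to its lemma; the stand-in below is hypothetical.
def lemmatize_word(w):  # hypothetical stand-in
    return {"ging": "gehen", "Häusern": "Haus"}.get(str(w), str(w))

print(lemmatize_sentence("Er ging zu den Häusern ."))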
#allianz_text = open('allianz_JA_2012_Text.txt', 'w', encoding='utf-8')
#Axel_Springer_JA_2012
#todo_file = open('Vulcanic_Triatherm_JA_2012.htm', 'r', encoding='utf-8')
#todo_data = todo_file.read()
#todo_text = open('Vulcanic_Triatherm_JA_2012_Text.txt', 'w', encoding='utf-8')

#### HTML parser ####
class MyHTMLParser(HTMLParser):
    def handle_data(self, data):
        todo_text.write(data)

parser = MyHTMLParser()
## I only had to do this once:
#parser.feed(todo_data)

#text_long = adidas_text.read()

#### Initialize TextBlob ####
blob = TextBlobDE(text)

#### whatever you open, you also have to close again ####
#todo_file.close()
#todo_text.close()
def preprocess_tweet(data):
    try:
        created_at = datetime.strptime(data['created_at'], '%a %b %d %H:%M:%S +0000 %Y')

        # detect the language of the tweet or use the predefined language
        lang = classify(data['text'])[0] if 'lang' not in data else data['lang']

        # remove URLs using the Imme Emosol regex: https://mathiasbynens.be/demo/url-regex
        text = re.sub(r"http\S+", "", data['text'], flags=re.MULTILINE)

        # tokenize the text depending on the language
        if lang == 'en':
            blob = TextBlobEN(text)
        elif lang == 'de':
            blob = TextBlobDE(text)
        else:
            # avoid unknown languages
            raise UnknownLanguageException('Unknown language: ' + data['text'])

        # get the polarity of the tweet sentences and average them
        # NOTE: TextBlobDE is not as good as the English analyzer and is fairly
        # barebones. If the resulting polarity is inaccurate, one option is to
        # process English tweets only.
        polarity = 0
        polarity_count = 0
        for sentence in blob.sentences:
            # ignore zero sentiment: in most cases a failed detection or hashtag fragments
            if sentence.sentiment.polarity != 0.0:
                # accumulate (the original assigned here, which would have kept
                # only the last sentence before dividing by polarity_count)
                polarity += sentence.sentiment.polarity
                polarity_count += 1
        if polarity_count > 0:
            polarity /= polarity_count

        # extract _important_ words from the word tokens
        words = []
        is_hashtag = False
        is_tagged_user = False
        for tag in blob.tags:
            word = tag[0]
            kind = tag[1]
            # TODO: special behaviour for hashtags is possibly also necessary for @
            if word[0] == '#':
                # special case: the next word is a hashtag
                is_hashtag = True
            elif word[0] == '@':
                is_tagged_user = True
            else:
                if is_hashtag:
                    # the previous token was '#', so re-merge and save
                    words.append("#" + word)
                    is_hashtag = False
                elif is_tagged_user:
                    words.append("@" + word)
                    is_tagged_user = False
                elif any(word == s for s in BLACKLIST):
                    continue
                else:
                    # just a normal word of the tweet:
                    # check that the word is of an allowed grammatical type
                    if kind[0] in ALLOWED_WORD_TOKENS:
                        words.append(word)

        # find out where the tweet came from by either taking existing
        # coordinates or the center of the place
        # TODO: check if coordinates exist before using place
        # TODO: verify the structure of place coordinates
        coords = []
        if data['geo']:
            coords.append(data['geo']['coordinates'])
        elif data['coordinates']:
            coords.append(data['coordinates']['coordinates'])
        else:
            coords = data['place']['bounding_box']['coordinates'][0]
        loc = [0.0, 0.0]
        for coord in coords:
            loc[0] += coord[0]
            loc[1] += coord[1]
        loc[0] /= len(coords)
        loc[1] /= len(coords)

        # create the tweet object
        tweet = {
            "_id": data['_id'],  # reuse the same id
            "user": {
                "name": data['user']['name'],
                "screen_name": data['user']['screen_name'],
                "followers_count": data['user']['followers_count'],
                "friends_count": data['user']['friends_count'],
                "listed_count": data['user']['listed_count'],
                "statuses_count": data['user']['statuses_count'],
                "following": data['user']['following']
            },
            "created_at": created_at,
            "words": words,
            "loc": loc,
            "polarity": polarity,
            "retweet_count": data['retweet_count'],
            "favorite_count": data['favorite_count']
        }
        return tweet
    except UnknownLanguageException as error:
        # catch exceptions, usually a failed language detection
        logging.warning(repr(error))
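# Hedged usage sketch. The stand-ins below are hypothetical: the real module
# is assumed to define BLACKLIST, ALLOWED_WORD_TOKENS and
# UnknownLanguageException, and to import classify (probably langid.classify)
# alongside the TextBlobEN/TextBlobDE aliases.
import re
import logging
from datetime import datetime
from textblob import TextBlob as TextBlobEN
from textblob_de import TextBlobDE

class UnknownLanguageException(Exception):
    pass

BLACKLIST = {'rt'}                     # hypothetical stop-list
ALLOWED_WORD_TOKENS = {'N', 'V', 'J'}  # hypothetical POS initials to keep

sample = {
    '_id': 1,
    'created_at': 'Mon Jan 06 20:00:00 +0000 2020',
    'text': 'Der Schnee in Berlin ist wunderschön! http://t.co/x',
    'lang': 'de',
    'geo': {'coordinates': [52.52, 13.405]},
    'coordinates': None,
    'place': None,
    'retweet_count': 0,
    'favorite_count': 2,
    'user': {'name': 'demo', 'screen_name': 'demo', 'followers_count': 1,
             'friends_count': 1, 'listed_count': 0, 'statuses_count': 1,
             'following': False},
}
print(preprocess_tweet(sample))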