def split_sentence_based_on_verbs(reviewText):
    """Split a review into clause-like chunks, cutting at boundaries near ROOT verbs.

    The text is first normalised to English: a fully non-English review is
    translated wholesale; an English review containing Romanian words gets
    those words translated individually. The normalised text is then re-parsed
    and split at stop-word boundaries (conjunctions, selected determiners,
    commas/semicolons) following each ROOT verb.

    NOTE(review): relies on module-level `nlp` (spaCy) and the POS constants
    VERB/CONJ/DET/PUNCT imported elsewhere in the file. Every
    detect_language()/translate() call hits the Google Translate web API.
    """
    review_spacy = nlp(reviewText)
    review_textblob = TextBlob(reviewText)
    if not review_textblob.detect_language() == 'en':
        # Whole review is non-English: translate it all at once.
        review_textblob = review_textblob.translate(to='en')
        review_spacy = nlp(review_textblob.string)
    else:
        # Detected as English overall, but it may still contain Romanian words.
        contains_romanian_words = 0
        for word in review_textblob.words:
            word_textblob = TextBlob(word)
            # Words shorter than 3 chars are too ambiguous for detection.
            if len(word_textblob.string) >= 3 and word_textblob.detect_language() == 'ro':
                contains_romanian_words = 1
                break
        if contains_romanian_words == 1:
            # Rebuild the text word by word, translating Romanian tokens.
            new_reviewText = ''
            for word in review_spacy:
                word_textblob = TextBlob(word.orth_)
                # Title-cased words (likely proper nouns) are kept verbatim.
                if not word.is_title and len(word_textblob.string) >= 3:
                    if word_textblob.detect_language() != 'ro':
                        new_reviewText = new_reviewText + ' ' + word_textblob.string
                    else:
                        new_word = word_textblob.translate(to='en')
                        new_reviewText = new_reviewText + ' ' + new_word.string
                else :
                    new_reviewText = new_reviewText + ' ' + word_textblob.string
            review_textblob = TextBlob(new_reviewText)
            review_spacy = nlp(review_textblob.string)
    new_sentences = []
    verbs_positions = []
    # Collect indices of ROOT verbs — each marks a candidate clause.
    for k in range(0, len(review_spacy)):
        if review_spacy[k].pos == VERB and review_spacy[k].dep_ == 'ROOT':
            verbs_positions.append(k)
    start = 0
    if len(verbs_positions) > 0:
        for p in range(0, len(verbs_positions)):
            if p == len(verbs_positions) - 1:
                # Last ROOT verb: the remainder of the doc is the final chunk.
                new_sentences.append(review_spacy[start:len(review_spacy)].text)
            else:
                # Scan forward from the verb for a stop-word split point.
                q = verbs_positions[p] + 1
                while q < len(review_spacy):
                    if review_spacy[q].is_stop and ((review_spacy[q].pos == CONJ and (q < len(review_spacy)-1 and review_spacy[q-1].pos != review_spacy[q+1].pos)) or (review_spacy[q].pos == DET and review_spacy[q].lower_ in ['the', 'this', 'those', 'which', 'other', 'another']) or (review_spacy[q].pos == PUNCT and review_spacy[q] in [',', ';'])):
                        new_sentences.append(review_spacy[start:q].text)
                        start = q
                        break
                    q += 1
    else:
        # No ROOT verb found: return the original text unsplit.
        new_sentences.append(reviewText)
    return new_sentences
def translate_msg(message):
    """Telegram bot handler: translate a Russian message to English and vice versa.

    Replies in the same chat with the translation. Messages of 3 characters
    or fewer are ignored. Python 2 code (`unicode`, `e.message`).

    NOTE(review): detect_language() is called twice (two network round
    trips); the second `if` could be an `elif` on the first result.
    """
    try:
        if (len(message.text) > 3):
            b = TextBlob(unicode(message.text))
            if (b.detect_language() == "ru"):
                tr_text = unicode(b.translate(to="en"))
                bot.send_message(message.chat.id, tr_text)
            if (b.detect_language() == "en"):
                tr_text = unicode(b.translate(to="ru"))
                bot.send_message(message.chat.id, tr_text)
    except Exception as e:
        # e.message is Python 2 only; apologise to the user on any failure.
        print (e.message)
        bot.send_message(message.chat.id, "Sorry Boss,can't translate :(" " Try another message, please " + telegram.Emoji.KISSING_FACE)
def findLanguage(reducedList3):
    """Sample up to 5000 shuffled items and tally their detected languages.

    Each item is a sequence whose first element is the text to classify;
    texts of 5 characters or fewer are skipped. Prints a progress counter,
    the 5 most common languages, and the raw tally. Python 2 code.

    NOTE(review): detection failures are silently ignored (bare except), and
    the list expansion before Counter is redundant —
    collections.Counter(languageMap) would give the same most_common().
    """
    languageMap = {}
    currentNumber = 0
    shuffle(reducedList3)  # in-place shuffle so the 5000-sample is random
    for i in reducedList3:
        if currentNumber < 5000:
            if len(i[0]) > 5:
                try:
                    b = TextBlob(unicode(i[0]))
                    currentLanguage = b.detect_language()
                    if currentLanguage in languageMap:
                        languageMap[currentLanguage] += 1
                    else:
                        languageMap[currentLanguage] = 1
                except:
                    pass
        # assumes the counter/progress print sit at loop level — TODO confirm
        currentNumber += 1
        print currentNumber
    # Expand the tally into a flat list so Counter can rank languages.
    listOfWords = []
    for i in languageMap:
        for x in range(0, languageMap[i]):
            listOfWords.append(i)
    listOfWordsCounter = collections.Counter(listOfWords)
    print 'Best Languages:', listOfWordsCounter.most_common(5)
    print languageMap
def update_book(book):
    """(Re)index a book document in the GAE search index.

    For English descriptions, the indexed description is rebuilt from the
    nouns (NN/NNP tags) with a Korean translation appended after each noun
    longer than 2 characters; otherwise the raw description is indexed.
    """
    blob = TextBlob(book.description)
    if blob.detect_language() == 'en':
        description = ''
        # Keep only singular and proper nouns from the POS-tagged text.
        nouns = filter(lambda x: x[1] == 'NN' or x[1] == 'NNP', blob.tags)
        for noun, tag in nouns:
            description += noun + " "
            if len(noun) > 2:
                # Append the Korean translation so searches work in both languages.
                description += TextBlob(noun).translate(to='ko').string + " "
    else:
        description = book.description
    book_document = search.Document(
        doc_id=book.ISBN,
        fields=[
            search.TextField(name='title', value=remove_punc(book.title)),
            search.TextField(name='author', value=remove_punc(book.author)),
            search.TextField(name='description', value=remove_punc(description))
        ]
    )
    index = get_book_index()
    index.put(book_document)
def scrape(self,links=[],ads=True,translator=False): responses = [] values = {} data = [] if ads: for link in links: r = requests.get(link) responses.append(r) else: for link in links: r = requests.get(link) text = unidecode(r.text) html = lxml.html.fromstring(text) links = html.xpath("//div[@class='cat']/a/@href") for link in links: if len(self.base_urls) > 1 or len(self.base_urls[0]) > 3: time.sleep(random.randint(5,27)) try: responses.append(requests.get(link)) print link except requests.exceptions.ConnectionError: print "hitting connection error" continue for r in responses: text = r.text html = lxml.html.fromstring(text) values["title"] = html.xpath("//div[@id='postingTitle']/a/h1")[0].text_content() values["link"] = unidecode(r.url) values["new_keywords"] = [] try: values["images"] = html.xpath("//img/@src") except IndexError: values["images"] = "weird index error" pre_decode_text = html.xpath("//div[@class='postingBody']")[0].text_content().replace("\n","").replace("\r","") values["text_body"] = pre_decode_text try: values["posted_at"] = html.xpath("//div[class='adInfo']")[0].text_content().replace("\n"," ").replace("\r","") except IndexError: values["posted_at"] = "not given" values["scraped_at"] = str(datetime.datetime.now()) body_blob = TextBlob(values["text_body"]) title_blob = TextBlob(values["title"]) values["language"] = body_blob.detect_language() #requires the internet - makes use of google translate api values["polarity"] = body_blob.polarity values["subjectivity"] = body_blob.sentiment[1] if values["language"] != "en" and not translator: values["translated_body"] = body_blob.translate(from_lang="es") values["translated_title"] = title_blob.translate(from_lang="es") else: values["translated_body"] = "none" values["translated_title"] = "none" text_body = values["text_body"] title = values["title"] values["phone_numbers"] = self.phone_number_parse(values) data.append(values) return data
def answer(question):
    """Process a user question: detect its language, persist its words, respond.

    English questions are tokenised, filtered through BadWords, appended to
    the global word list, and rewritten to 'newwords.txt'; non-English input
    just records the detected language. Both paths end in randomthought().

    NOTE(review): depends on module globals `words`, and helpers `BadWords`,
    `randomthought`; `writeout` leaks an open file handle (never closed).
    """
    global IsAnswer,detected,u
    IsAnswer = True
    DetectLang = TextBlob(question)
    detected = DetectLang.detect_language()
    if detected == 'en':
        print("language detected: en")
        u = 'en'
        print(len(words),"len(words)")
        low = question.lower()
        # Strip punctuation and split into word tokens.
        questions = re.sub('[^\w]',' ',low).split() #list
        BadWords(questions)
        print(questions)
        def writeout(words,question,IsAnswer):
            # Persist the running vocabulary, evicting random words past 3000.
            r = []
            if len(words) > 3000:
                a1 = len(questions)
                for x in range(0,a1):
                    words.remove(random.choice(words))
                print(len(words),"len(words)")
            else:
                pass
            os.remove('newwords.txt')
            file = open('newwords.txt','w')
            words.extend(questions)
            r.extend(words)
            s = ' '.join(r)
            file.write(s)
        writeout(words,question,IsAnswer)
        randomthought()
    else:
        u = detected
        print("language detected:",u)
        randomthought()
def review_features_romanian(reviewText, type):
    """Build per-word feature vectors for a (possibly Romanian) review.

    The review is first normalised to English exactly as in
    split_sentence_based_on_verbs (wholesale translation, or word-by-word
    translation of Romanian tokens inside otherwise-English text). Then,
    for each content word (noun, positive-sentiment verb, adjective or
    adverb), a feature vector is appended, chosen by `type`
    (labelType.Label.aspect/attribute/polarity/emotion).

    :return: list of feature vectors, one per selected word.
    NOTE(review): `type` shadows the builtin; renaming would change the
    keyword interface, so it is kept.
    """
    review_spacy = nlp(reviewText)
    review_textblob = TextBlob(reviewText)
    review_spacy_ents = review_spacy.ents
    word_features_array = []
    if not review_textblob.detect_language() == 'en':
        # Entire review non-English: translate wholesale and re-parse.
        review_textblob = review_textblob.translate(to='en')
        review_spacy = nlp(review_textblob.string)
    else:
        # English overall — check for embedded Romanian words.
        contains_romanian_words = 0
        for word in review_textblob.words:
            word_textblob = TextBlob(word)
            if len(word_textblob.string) >= 3 and word_textblob.detect_language() == 'ro':
                contains_romanian_words = 1
                break
        if contains_romanian_words == 1:
            # Rebuild the text, translating Romanian tokens individually.
            new_reviewText = ''
            for word in review_spacy:
                word_textblob = TextBlob(word.orth_)
                if not word.is_title and len(word_textblob.string) >= 3:
                    if word_textblob.detect_language() != 'ro':
                        new_reviewText = new_reviewText + ' ' + word_textblob.string
                    else:
                        new_word = word_textblob.translate(to='en')
                        new_reviewText = new_reviewText + ' ' + new_word.string
                else :
                    new_reviewText = new_reviewText + ' ' + word_textblob.string
            review_textblob = TextBlob(new_reviewText)
            review_spacy = nlp(review_textblob.string)
    for i in range(len(review_spacy)):
        word = review_spacy[i]
        # Content words only: nouns, adjectives, adverbs, and verbs whose
        # own sentiment polarity is positive; punctuation excluded.
        if (word.pos == NOUN or (word.pos == VERB and TextBlob(word.orth_).sentiment.polarity > 0) or word.pos == ADJ or word.pos == ADV) and not word.is_punct:
            if type == labelType.Label.aspect:
                word_features_array.append(word_aspect_features(review_spacy, review_textblob, review_spacy_ents, i))
            elif type == labelType.Label.attribute:
                word_features_array.append(word_attribute_features(review_spacy, review_textblob, review_spacy_ents, i))
            elif type == labelType.Label.polarity:
                word_features_array.append(word_polarity_features(review_spacy, review_textblob, review_spacy_ents, i))
            elif type == labelType.Label.emotion:
                word_features_array.append(word_emotion_features(review_spacy, review_textblob, review_spacy_ents, i))
    return word_features_array
def translate_this(jenni, msg):
    """IRC command: reply with an English translation of the captured text.

    Does nothing when the text is already detected as English.
    """
    blob = TextBlob(msg.groups()[0])
    source_lang = blob.detect_language()
    if source_lang == 'en':
        return
    result = blob.translate(from_lang=source_lang, to='en')
    jenni.reply("{}".format(result))
def scanForMultipleLanguages(target, words):
    """Return the set of detected languages (other than the target's) among words.

    :param target: key into the prefix map giving the expected language code.
    :param words: iterable of word strings to classify individually.
    :return: set of language codes that differ from the target's.

    Bug fixes versus the previous revision:
    - each iteration ran TextBlob(words) on the whole collection instead of
      TextBlob(word), so every word produced the same detection result;
    - the comparison used `is not` (identity) on strings, which is
      unreliable — replaced with `!=` (equality).
    """
    langmap = makeLangPrefixMapping()
    langprefs = set()
    for word in words:
        blob = TextBlob(word)
        detect = blob.detect_language()
        if detect != langmap[target]:
            langprefs.add(detect)
    return langprefs
def find_loc(p1):
    """Map a free-text location string to a known city name.

    Lookup order: exact substring against the `cities_l` list, then the
    `cities_nn` alias map, then a heuristic mapping "la" in English text to
    "los angeles". Returns the string "None" when nothing matches or the
    input is the literal string "None". Python 2 code (`iteritems`).
    """
    if p1 != "None":
        for city in cities_l:
            if city in p1.lower():
                return city
        for k, v in cities_nn.iteritems():
            if k in p1.lower():
                return v
        t1 = TextBlob(p1.lower())
        # Heuristic: "la" inside English text is assumed to mean Los Angeles.
        if "la" in p1.lower() and t1.detect_language() == "en":
            return "los angeles"
    return "None"
def rating(self):
    """Return the stored rating, or derive a 1-5 rating from sentiment.

    A truthy cached ``self._rating`` wins. Otherwise, for English text
    longer than 3 characters, the sentiment polarity is clamped to
    [-0.5, 0.5] and mapped linearly onto 1..5. Returns None when the text
    is too short, non-English, or language detection fails with an
    HTTPError (logged as a warning).
    """
    if self._rating:
        return self._rating
    if len(self.text) <= 3:
        return None
    blob = TextBlob(self.text)
    try:
        if blob.detect_language() == 'en':
            clamped = min(max(blob.sentiment.polarity, -0.5), 0.5)
            return round(clamped * 4 + 3)
    except urllib.error.HTTPError:
        LOG.warning("Rating detection failed: HTTPError")
    return None
def getEngTag(self, tag):
    """Return the tag singularized and translated to English (UTF-8 bytes).

    Takes a UTF-8 encoded byte string, keeps only the first word,
    singularizes it, and translates it to English when it is at least
    3 characters long and not already English. Python 2 code
    (bytes decode in / encode out).
    """
    tagName = TextBlob(tag.decode('utf-8'))
    tagName = tagName.words[0].singularize()
    if len(tagName) >= 3:
        lang = tagName.detect_language()
        if lang != 'en':
            tagName = tagName.translate(from_lang=lang, to='en')
    return tagName.encode('utf-8')
def handle(request):
    """aiohttp-style coroutine: fetch a page and return JSON text statistics.

    Responds with sentence count, count of distinct words longer than
    4 characters, detected language, and the sorted word list itself.
    NOTE(review): pre-async/await coroutine (`yield from`); presumably
    decorated with @asyncio.coroutine elsewhere — confirm at call site.
    """
    page = request.match_info.get('page')
    content = yield from fetch_page(page)
    text = strip(content)
    blob = TextBlob(text.decode('utf-8'))
    # Deduplicate via a set comprehension, then sort for stable output.
    words = list({ w for w in blob.words if len(w) > 4})
    words.sort()
    body = {
        'sentences': len(blob.sentences),
        'words': len(words),
        'language': blob.detect_language(),
        'blob': words
    }
    return web.Response(body=json.dumps(body).encode('utf-8'), content_type="application/json; charset=utf-8")
def echo(word, word_eol, userdata):
    """HexChat hook: print a translation of a message not in my_language.

    Decodes the message text, detects its language, and when it differs
    from the global `my_language` prints the translated text annotated
    with the sender's nick and source language. Always returns
    hexchat.EAT_NONE so the original message is still displayed.

    Bug fix: the bare `except:` also swallowed KeyboardInterrupt and
    SystemExit; narrowed to `except Exception` (behaviour for ordinary
    errors — e.g. decode or translation failures — is unchanged).
    """
    global my_language
    try:
        original = TextBlob(word_eol[3][1:].decode("utf-8"))
        lang = original.detect_language()
        # Nick is the first token up to '!', with the leading ':' stripped.
        nick = word[0].split("!")[0].replace(":","")
        if lang != my_language:
            res = original.translate(from_lang=lang, to=my_language)
            if len(res) > 0:
                print("\037\00312" + nick + " said: " + str(res).replace( \
                    "\n","") + " (From lang=%s)" % str(lang))
        return hexchat.EAT_NONE
    except Exception:
        # Best-effort: on any ordinary failure, let the message through untouched.
        return hexchat.EAT_NONE
def parse_text_meta_data(self,html,values): if self.debug: print "Processing textual information - language, polarity, subjectivity.." body_blob = TextBlob(values["text_body"]) title_blob = TextBlob(values["title"]) values["language"] = body_blob.detect_language() #requires the internet - makes use of google translate api values["polarity"] = body_blob.polarity values["subjectivity"] = body_blob.sentiment[1] if values["language"] != "en" and not translator: values["translated_body"] = body_blob.translate(from_lang="es") values["translated_title"] = title_blob.translate(from_lang="es") else: values["translated_body"] = "none" values["translated_title"] = "none" return values
def filter_lang(texts, lang):
    """
    Keep only texts identified as written in lang

    :texts: A list of texts to process
    :lang: The language we want to retain texts for
    :returns: list of texts classified as written in lang
    """
    kept = []
    for candidate in texts:
        # Texts of 3 characters or fewer are too short to classify reliably.
        if len(candidate) <= 3:
            continue
        if TextBlob(candidate).detect_language() == lang:
            kept.append(candidate)
    return kept
def on_status(self, status):
    """Stream callback: bucket an English tweet as positive or negative.

    URLs are stripped first; tweets shorter than 4 characters or not
    detected as English are ignored. New tweets whose polarity crosses
    POS_PARAM / NEG_PARAM are appended to the module-level pos_tweets /
    neg_tweets lists and echoed to stdout.
    """
    blob = TextBlob(re.sub(r"http\S+", "", status.text))
    if len(blob) < 4:
        return
    if blob.detect_language() != 'en':
        return
    summary = str(blob + " [" + str(blob.polarity) + "]")
    text = str(blob)
    if blob.polarity >= POS_PARAM and text not in pos_tweets:
        pos_tweets.append(text)
        print("POSITIVE: " + summary)
    if blob.polarity <= NEG_PARAM and text not in neg_tweets:
        neg_tweets.append(text)
        print("NEGATIVE: " + summary)
def onButtonPressed(self, button):
    """GTK handler: translate the text view's contents, with a Redis cache.

    Reads source text and the from/to language combo boxes, serves a cached
    translation from Redis when present, otherwise translates via TextBlob
    (auto-detecting the source when 'detect' is selected) and caches the
    result under the key "<text>:<language_to>".

    NOTE(review): if `language_to` is None, `value` is never assigned and
    the trailing `if value is None:` raises NameError (and translate() is
    called with to=None) — confirm the combo box always has a selection.
    """
    # Clear the output view.
    textbuffer = tview_translate.get_buffer()
    start = textbuffer.get_start_iter()
    end = textbuffer.get_end_iter()
    textbuffer.delete(start, end)
    # Read the source text.
    textbuffer = tview_text.get_buffer()
    start = textbuffer.get_start_iter()
    end = textbuffer.get_end_iter()
    text = u"{0}".format(textbuffer.get_text(start, end, False))
    # Selected target language.
    tree_iter = comboboxtext_to.get_active_iter()
    language_to = None
    if tree_iter is not None:
        model = comboboxtext_to.get_model()
        key, language_to = model[tree_iter][:2]
    # Selected source language (may be the sentinel 'detect').
    tree_iter = comboboxtext_from.get_active_iter()
    language_from = None
    if tree_iter is not None:
        model = comboboxtext_from.get_model()
        key, language_from = model[tree_iter][:2]
    # Cache lookup: Redis hash keyed by text+target, field = source language.
    if language_to is not None:
        value = r.hget(text + ":" + language_to, language_from)
        if value:
            textbuffer = tview_translate.get_buffer()
            textbuffer.set_text(value)
            return
    blob = TextBlob(text)
    if language_from == 'detect':
        language_from = blob.detect_language()
    if language_from is None:
        translate = u"{0}".format(blob.translate(to=language_to))
    else:
        translate = u"{0}".format(blob.translate(from_lang=language_from, to=language_to))
    textbuffer = tview_translate.get_buffer()
    textbuffer.set_text(translate)
    # Cache the fresh translation for next time.
    if value is None:
        mapping = {language_from: translate}
        r.hmset(text + ":" + language_to, mapping)
def filter_data(line):
    """Return True when a raw tweet JSON line matches any relevant term.

    Non-Portuguese tweets are translated to Portuguese and accent-stripped
    before matching; both the normalised and original text are checked.
    Delete events and any parse/translation failure yield False
    (deliberate best-effort via the bare except). Python 2 code (`unicode`).
    """
    global relevant_terms
    try:
        tweet = json.loads(line)
        if not 'delete' in tweet.keys():
            blob = TextBlob(tweet['text'])
            if blob.detect_language() != 'pt':
                text_pt = unicode(str(blob.translate(to="pt")),'utf-8')
            else:
                text_pt = tweet['text']
            text_pt = remove_accents(text_pt)
            for term in relevant_terms:
                if term in text_pt or term in tweet['text']:
                    return True
    except:
        # Best-effort filter: any failure means "not relevant".
        pass
    return False
def translate(text, from_language, to_language):
    """
    translate: Translate from/to language. Uses Google Translate.
    Params:
        - text: The text that will be translated (bytes; decoded with the
          module-level ENCODING, undecodable characters ignored).
        - from_language: The source language code; falsy means auto-detect.
        - to_language: The language to translate into.
    Results:
        - text translated in string format.
    """
    textBlob = TextBlob(text.decode(ENCODING, 'ignore'))
    if not from_language:
        # No source given: let Google Translate detect it (network call).
        from_language = textBlob.detect_language()
    return textBlob.translate(from_lang=from_language, to=to_language)
def returnEntryVersusTarget(self, datalist):
    '''Some users write in a language that is different from their target
    language (i.e. if they are practicing a language that they didn't
    specify that they were learning, or if they are writing an entry in
    their native language asking someone to translate something for them).
    This function counts how many of these instances exist in the
    specified dataset and prints the count plus elapsed time.

    NOTE(review): the counter increments once per non-matching *studied
    language*, so an entry with several studied languages can be counted
    multiple times; `islang` is assigned but never used — confirm intent.
    '''
    t0 = time()
    prefmap = makePrefixLangMapping()
    not_orig_lang = 0
    for data in datalist:
        blob = TextBlob(data[self.ENTRY])
        entrylang = blob.detect_language()  # network call per entry
        islang = True
        for d in data[self.STUDYING].split():
            if entrylang not in prefmap:
                continue
            if prefmap[entrylang] == d:
                continue
            not_orig_lang += 1
    print("Took %s seconds" % (time() - t0))
    print("Of %s entries, there are %s entries written in a different language than specified" % (len(datalist), not_orig_lang))
def _get_detailed_stats(no_code_text):
    """
    Returns detailed stats on text
    :param no_code_text: String to analyse
    :return: list of details
    """
    group_by = 'Detailed Text Statistics'
    tb = TextBlob(no_code_text)
    # Spell check here...it's very slow
    stats = [
        ('Number of sentences', textstat.sentence_count(no_code_text)),
        ('Number of sentences (again)', len(tb.sentences)),
        ('Number of words', len(tb.words)),
        ('Sentiment Polarity', tb.sentiment.polarity),
        ('Sentiment Subjectivity', tb.sentiment.subjectivity),
        ('Detected Language', tb.detect_language()),
        ('Number of important phrases', len(tb.noun_phrases)),
        ('Number of word bi-grams', len(tb.ngrams(2))),
        ('Number of word tri-grams', len(tb.ngrams(3))),
        ('Number of word 4-grams', len(tb.ngrams(4))),
    ]
    return [TextFeature(name, value, group_by) for name, value in stats]
def compute_relevance(line):
    """Score a raw tweet JSON line for relevance.

    Translates non-Portuguese text to Portuguese, strips accents, counts
    matches against `relevant_terms` ('socialbasebr' weighs 10x), and
    combines retweets, favorites, mentions and term frequency into a
    weighted score. Returns (original_text, score). Python 2 code.
    """
    global relevant_terms
    tweet = json.loads(line)
    blob = TextBlob(tweet['text'])
    if blob.detect_language() != 'pt':
        text_pt = unicode(str(blob.translate(to="pt")),'utf-8')
    else:
        text_pt = tweet['text']
    text_pt = remove_accents(text_pt)
    frequency = 0.0
    for term in relevant_terms:
        if term in text_pt or term in tweet['text']:
            if term == 'socialbasebr':
                # The brand's own handle dominates the frequency signal.
                frequency += 10.0
            else:
                frequency += 1.0
    #mutual information
    value = 0.2*int(tweet['retweet_count']) + 0.3*int(tweet['favorite_count']) + 0.1*len(tweet['entities']['user_mentions']) + 0.4*frequency
    return (tweet['text'], value)
def detectLangauge(string):
    """Return the language code detected for a UTF-8 byte string.

    Note: language codes can be found here:
    https://cloud.google.com/translate/v2/using_rest#language-params
    (Function name typo kept — callers depend on it. Python 2 `unicode`.)
    """
    blob = TextBlob(unicode(string, 'utf-8'))
    return blob.detect_language()
class NLP(object):
    '''
    NLP tools
    required : corpus_path

    Wraps a text in a TextBlob plus NLTK-based keyword extraction and
    POS-tag filters. Python 2 code (print statements, `tree.node`).
    Depends on module-level: nltk, chunker, stemmer, lemmatizer, stopwords.
    '''
    def __init__(self, _text, *args, **kwargs):
        # Original debug scaffolding kept for reference:
        # print "init NLP"
        # try:
        #     _text.decode("utf-8")
        # except UnicodeDecodeError :
        #     print "ok"
        #     pass
        self.text = _text
        self.blob = TextBlob(_text)
        self.sentences = self.blob.sentences

    def words(self):
        # Tokenized words of the whole text.
        return self.blob.words

    def count_words(self):
        # Word -> occurrence-count mapping.
        return dict(self.blob.word_counts)

    def get_language(self):
        # Detected language code (network call via Google Translate).
        return self.blob.detect_language()

    def keywords(self):
        """Extract normalised keyword terms from NP chunks of the text."""
        t0 = time()
        # Used when tokenizing words
        sentence_re = r'''(?x)    # set flag to allow verbose regexps
            ([A-Z])(\.[A-Z])+\.?  # abbreviations, e.g. U.S.A.
          | \w+(-\w+)*            # words with optional internal hyphens
          | \$?\d+(\.\d+)?%?      # currency and percentages, e.g. $12.40, 82%
          | \.\.\.                # ellipsis
          | [][.,;"'?():-_`]      # these are separate tokens
        '''
        toks = nltk.regexp_tokenize(self.text, sentence_re)
        postoks = nltk.tag.pos_tag(toks)
        # postoks = self.blob.pos_tags
        tree = chunker.parse(postoks)

        def leaves(tree):
            """Finds NP (nounphrase) leaf nodes of a chunk tree."""
            for subtree in tree.subtrees(filter=lambda t: t.node == 'NP'):
                yield subtree.leaves()

        def normalise(word):
            """Normalises words to lowercase and stems and lemmatizes it."""
            word = word.lower()
            word = stemmer.stem_word(word)
            word = lemmatizer.lemmatize(word)
            return word

        def acceptable_word(word):
            """Checks conditions for acceptable word: length, stopword."""
            accepted = bool(2 <= len(word) <= 40 and word.lower() not in stopwords)
            return accepted

        def get_terms(tree):
            # Yield each NP chunk as a list of normalised, accepted words.
            for leaf in leaves(tree):
                term = [normalise(w) for w, t in leaf if acceptable_word(w)]
                yield term

        terms = get_terms(tree)
        words = []
        for term in terms:
            for word in term:
                words.append(word)
        print "Done in %fs" % (time() - t0)
        return words

    def analyze_sentiment(self):
        # One {polarity, subjectivity} dict per sentence.
        sentiments = []
        for sentence in self.sentences:
            sentiments.append({
                "polarity": sentence.sentiment.polarity,
                "subjectivity": sentence.sentiment.subjectivity
            })
        return sentiments

    def get_adjectives(self):
        # Distinct words tagged JJ (adjective), sorted.
        # Tag set: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
        adj = []
        for word, POStag in sorted(set(self.blob.tags)):
            if POStag == "JJ":
                adj.append(str(word))
        return adj

    def start_with_number(self, s):
        # True when the string contains any digit or the characters X/x.
        data = [c for c in s if c in '0123456789Xx']
        if len(data) != 0:
            return True
        else:
            return False

    def filter_out_nastyness(self):
        '''Filter out proper nouns (NNP & NNPS), numbers (CD) and symbols (SYM)'''
        to_filter_out = ["ed.", '"']
        for word, tag in self.blob.tags:
            if tag == "NNP" \
                or tag == "NNPS" \
                or tag == "CD" \
                or tag == "SYM" \
                or word[0:4] == "doi:" or word[0:4] == "isbn" or word[0:4] == "ISBN" \
                or self.start_with_number(word) == True \
                or any(x.isupper() for x in word[2:]) == True \
                or tag == "FW":
                to_filter_out.append(word)
        # Remove each collected word as a whole-word match, then strip ()# chars.
        regex = re.compile('\\b(%s)\\W'%('|'.join(map(re.escape,to_filter_out))),re.UNICODE)
        clean = regex.sub(" ", self.text)
        # clean.decode("utf-8")
        clean_ok = ''.join([i for i in clean if i not in '()#'])
        clean_ok=clean_ok.replace("R.sub","")
        # clean_ok.decode("utf-8")
        return clean_ok

    def get_clean_text(self):
        # Convenience alias for filter_out_nastyness().
        txt=self.filter_out_nastyness()
        return txt

    def get_verbs(self):
        # Distinct words tagged VB (base-form verb), sorted.
        verbs = []
        for word, POStag in sorted(set(self.blob.tags)):
            if POStag == "VB":
                verbs.append(str(word))
        return verbs

    def get_nouns(self):
        # Distinct words tagged NN (singular noun), sorted.
        nouns = []
        for word, POStag in sorted(set(self.blob.tags)):
            if POStag == "NN":
                nouns.append(str(word))
        return nouns

    def get_noun_phrases(self):
        # Noun phrases detected by TextBlob, as plain strings.
        nouns = []
        for word in self.blob.noun_phrases:
            nouns.append(str(word))
        return nouns

    def translate_to(self, _language):
        # Translate the whole text (network call via Google Translate).
        return self.blob.translate(to=_language)
business_info = lookup.business_dict(YELP_BIZ_FILE) # Organize reviews into Business - User - Review(s) tree structure n, review_tree = 0, {} with open(YELP_DATA_FILE) as data_file: for line in data_file: review_data = json.loads(line) business_id = review_data['business_id'] user_id = review_data['user_id'] review_text = review_data['text'].replace('\n', ' ') review_rate = int(review_data['stars']) if business_info[business_id][0] not in US_STATES: if len(review_text) < 3: continue blob_review = TextBlob(review_text) if blob_review.detect_language() != 'en': continue if n % 50000 == 0: print n n += 1 if review_tree.get(business_id) is None: review_tree[business_id] = {user_id: [(review_text, review_rate)]} elif review_tree[business_id].get(user_id) is None: review_tree[business_id][user_id] = [(review_text, review_rate)] else: review_tree[business_id][user_id].append((review_text, review_rate)) # Truncate on two conditions: 1 user-business has 5+ reviews; 1 business has 500+ reviews def append_review(review_per_business): n_per_business = 0 for user_id, reviews in review_per_business.iteritems(): n_per_user = min(len(reviews), REVIEW_LIMIT[0])
def is_target_language(text):
    """Return True when text is long enough and detected as a target language.

    Texts shorter than 3 characters are rejected without a detection call.
    """
    from textblob import TextBlob
    blob = TextBlob(text)
    long_enough = len(text) >= 3
    return bool(long_enough and blob.detect_language() in TARGET_LANGUAGE)
def findlanguage(inputs):
    """Return the language code TextBlob detects for the given text."""
    blob = TextBlob(inputs)
    return blob.detect_language()
# Record Spanish speech from the microphone, transcribe it with Google
# Speech Recognition, detect/translate via TextBlob, and report the
# sentiment polarity of the English text.
import speech_recognition as sr
from textblob import TextBlob

r = sr.Recognizer()
mic = sr.Microphone()
with mic as source:
    # Calibrate against background noise before listening.
    r.adjust_for_ambient_noise(source)
    print('Recording... Please speak now.')
    audio = r.listen(source)
# Transcribe assuming Spanish speech (es-ES).
trans = r.recognize_google(audio, language='es-ES')
print(trans)
blob2 = TextBlob(trans)
lang = blob2.detect_language()
newline = '\n'
print(f'Detected language: {lang}. {newline}Getting sentiment polarity...')
# Sentiment analysis only works on English text, so translate if needed.
if lang == 'en':
    blob2_ready = blob2
else:
    blob2_ready = blob2.translate(to='en')
sentiment = blob2_ready.sentiment.polarity
print(f'{newline}Sentiment polarity: {sentiment}. {newline}This means:')
if sentiment == 0:
    print('Customer was neutral.')
# Authentification auth = tweepy.OAuthHandler(consumer_key, consumer_secret) auth.set_access_token(access_key, access_secret) # Get API api = tweepy.API(auth) # Get user user = api.get_user(args.user) # Get statuses for status in api.user_timeline(screen_name=args.user, count=200): # Analyse tweet tweet = TextBlob(status.text) # Show sentiment analysis print(u"Tweet \"{}\"".format(status.text)) print(u"Polarity {}, Subjectivity {}".format(tweet.sentiment.polarity, tweet.sentiment.subjectivity)) print(u"Language : {}".format(tweet.detect_language())) try: print(u"French : {}".format(tweet.translate(from_lang="en-US", to='fr'))) except textblob.exceptions.NotTranslated: pass # end try print(u"Tokens : {}".format(tweet.words)) print(u"") # end for
Лапочкин Д. 40%
"""
from textblob import TextBlob

# Compute a Flesch reading-ease (FRE) score for Russian or English input,
# then (for Russian) translate to English before sentiment analysis.
text = input('Введите текст: ')
blob = TextBlob(text)
syllables = 0
sentence = 0
fre = 0
# Count sentence terminators as a proxy for sentence count.
# NOTE(review): text with no '.', '!' or '?' leaves sentence == 0 and the
# `asl` division below raises ZeroDivisionError — confirm inputs.
for b in ['.', '!', '?']:
    if text.count(b) > 0:
        sentence += text.count(b)
# Syllable proxy: count vowels (Russian vs English vowel sets).
if blob.detect_language() == 'ru':
    syllables = sum(1 for x in text.lower() if x in 'уеоаыяиюэ')
else:
    syllables = sum(1 for x in text.lower() if x in 'aeiouy')
asl = (text.count(' ') + 1) / sentence  # average sentence length (words)
asw = syllables / (text.count(' ') + 1)  # average syllables per word
# Language-specific Flesch coefficients (each detect_language() call is a
# network round trip; the result could be cached from the call above).
if blob.detect_language() == 'ru':
    fre = 206.835 - (1.3 * asl) - (60.1 * asw)
else:
    fre = 206.835 - (1.015 * asl) - (84.6 * asw)
# Sentiment analysis requires English text.
if blob.detect_language() == 'ru':
    blob = blob.translate(to="en")
if blob.sentiment.polarity > 0.33:
    tonality = 'положительный'
def main():
    """Text Analysis App

    Streamlit UI: detect the language of pasted text and translate it either
    into all supported languages at once or into user-selected ones, with
    optional text-to-speech. Depends on module-level helpers: play,
    lista_idiomas, lista_idiomas_full, get_key, get_value.

    NOTE(review): the blank-string comparisons below look like they were
    meant to cover 1-4 spaces but appear here as identical one-space
    literals (likely collapsed by whitespace mangling) — confirm against
    the original file.
    """
    st.title("Language Detector & Translator")
    image = Image.open("people_speaking.jpg")
    st.sidebar.image(image, caption="Different languages", use_column_width=True)
    activities = ["Detector & Translator", "About"]
    choice = st.sidebar.selectbox("Menu", activities)
    if choice == 'Detector & Translator':
        st.subheader("Text Area")
        lista_modos = ("For 23 languages", "For selected languages")
        modo = st.sidebar.radio("Choose", lista_modos)
        texto_default = 'Text'
        raw_text = st.text_area("Copy&Paste -> Ctrl+Enter", texto_default)
        blob = TextBlob(raw_text)
        # Audioplay
        #if st.button("Audio"):
        #    play(raw_text)
        if modo == "For selected languages":
            # Translate only into languages the user picks.
            #texto_default = 'Texto'
            #raw_text = st.text_area("Copy&Paste -> Ctrl+Enter",texto_default)
            #blob = TextBlob(raw_text)
            try:
                if (raw_text == " " or raw_text == " " or raw_text == " " or raw_text == " "):
                    st.error("Please write something in the text area")
                elif (raw_text != texto_default) and len(raw_text) > 0 and (
                        raw_text != " " or raw_text != " " or raw_text != " " or raw_text != " "):
                    dict_idioma_full = lista_idiomas_full()
                    idioma_original = get_value(blob.detect_language(), dict_idioma_full)
                    original_key = get_key(idioma_original, dict_idioma_full)
                    st.success("Original Language" + ": " + idioma_original + " (" + original_key + ")")
                    dict_idioma = lista_idiomas(idioma_original)
                    options = st.multiselect("Choose a language", tuple(dict_idioma.values()))
                    idioma_final = get_key(idioma_original, dict_idioma)
                    #st.write("Original language:",idioma_original)
                    for i in range(len(options)):
                        value = options[i]
                        idioma_final = get_key(value, dict_idioma)
                        if (idioma_original != idioma_final):
                            texto_convertido = blob.translate(to=idioma_final)
                            st.success("Language" + ": " + value + " (" + idioma_final + ")")
                            st.text(texto_convertido)
                            #play(texto_convertido,idioma_final)
            except:
                # Any detection/translation failure surfaces as one user message.
                st.error(
                    "ERROR: text must be at least 3 letters and the word must exist in the formal language"
                )
        else:
            # Translate into every supported language.
            try:
                flag = False
                if (raw_text == " " or raw_text == " " or raw_text == " " or raw_text == " "):
                    st.error("Please write something in the text area")
                elif (raw_text != texto_default) and len(raw_text) > 0 and (
                        raw_text != " " or raw_text != " " or raw_text != " " or raw_text != " "):
                    dict_idioma_full = lista_idiomas_full()
                    idioma_original = get_value(blob.detect_language(), dict_idioma_full)
                    original_key = get_key(idioma_original, dict_idioma_full)
                    st.success("Original Language" + ": " + idioma_original + " (" + original_key + ")")
                    dict_idioma = lista_idiomas(idioma_original)
                    options = dict_idioma.values()
                    st.write("Original Language:", idioma_original)
                    idioma_lista = list(options)
                    for i in range(len(idioma_lista)):
                        value = idioma_lista[i]
                        #st.text(value)
                        idioma_final = get_key(value, dict_idioma)
                        if (idioma_original != idioma_final):
                            texto_convertido = blob.translate(to=idioma_final)
                            st.success("Language" + ": " + value + " (" + idioma_final + ")")
                            st.text(texto_convertido)
                            flag = True
            except:
                # Only report an error if nothing was translated at all.
                if flag != True:
                    st.error(
                        "ERROR: text must be at least 3 letters and the word must exist in the formal language"
                    )
    elif choice == 'About':
        st.subheader("I hope you enjoy it and use to learn something")
        st.subheader("Built with Streamlit and Textblob")
        #st.write("Problems:")
        #st.write(" - sometimes the original language can't be correctly detected")
        #st.write(" - sometimes the sound will fail.")
        st.subheader("by Silvio Lima")
        if st.button("Linkedin"):
            js = "window.open('https://www.linkedin.com/in/silviocesarlima/')"
            html = '<img src onerror="{}">'.format(js)
            div = Div(text=html)
            st.bokeh_chart(div)
    else:
        # Audioplay
        st.subheader("Text Area")
        texto_default = 'Text'
        raw_text = st.text_area("Copy&Paste -> Ctrl+Enter", texto_default)
        blob = TextBlob(raw_text)
        try:
            if (raw_text == texto_default or raw_text == " " or raw_text == " " or raw_text == " " or raw_text == " "):
                st.error("Please write something in the text area")
            else:
                dict_idioma_full = lista_idiomas_full()
                idioma_original = get_value(blob.detect_language(), dict_idioma_full)
                original_key = get_key(idioma_original, dict_idioma_full)
                st.success("Original Language" + ": " + idioma_original + " (" + original_key + ")")
                play(raw_text, original_key)
                dict_idioma = lista_idiomas(idioma_original)
                options = st.multiselect("Choose a language", tuple(dict_idioma.values()))
                for i in range(len(options)):
                    value = options[i]
                    idioma_final_key = get_key(value, dict_idioma)
                    try:
                        if (idioma_original != idioma_final_key):
                            texto_convertido = str(
                                blob.translate(to=idioma_final_key))
                            st.success("Language" + ": " + value + " (" + idioma_final_key + ")")
                            st.text(texto_convertido)
                            play(texto_convertido, idioma_final_key)
                    except:
                        st.error(
                            "ERROR: some languages will fail to play the sound."
                        )
        except:
            st.error(
                "ERROR: text must be at least 3 letters and the word must exist in the formal language"
            )
# Interactive translator: detect the input's language, then translate it
# into one of five languages chosen from a numbered menu.
from textblob import TextBlob

text = input("Enter text here=> ")
obj = TextBlob(text)
print("Detecting language\n", obj.detect_language())
print("Translate to")
print("1. bengali\t 2.chinese \t 3. german\t 4. gujrati\t 5. japanese:")
to = int(input("Enter your choice=> "))
# Bug fix: the original guard was `if 5 < to < 1:`, which can never be
# true, so out-of-range choices fell through to the Japanese branch.
if to < 1 or to > 5:
    print("Wrong choice")
    exit()
elif to == 1:
    to = 'bn'
elif to == 2:
    to = 'zh'
elif to == 3:
    to = 'de'
elif to == 4:
    to = 'gu'
else:
    # Bug fix: the menu offers Japanese, but 'jv' is Javanese; 'ja' is
    # the Japanese language code.
    to = 'ja'
print(obj.translate(to=to))
# TextBlob feature tour: spelling, frequencies, translation, parsing,
# string operations and n-grams.

# Spelling correction on a whole blob, and suggestions for one word.
sent = TextBlob("I haawve goood speling")
correct_sent = sent.correct()
w = Word("haave")
spellcheck = w.spellcheck()

#Get Word and Noun Phrase Frequencies
words = TextBlob('We are no longer together. We are enemies now.')
word_counts = words.word_counts
#You can specify whether or not the search should be case-sensitive (default is False).

#Translation and Language Detection
en_blob = TextBlob("You are my best friend")
pl_blob = en_blob.translate(to='pl')
blob = TextBlob("Mam na imię Piotr")
detected_lang = blob.detect_language()

#Parsing
text = TextBlob('I know You')
text_parse = text.parse()

#string — TextBlob supports common str methods directly.
text = TextBlob("Hello World")
upper_text = text.upper()
find_world = text.find("World")

#ngrams
blob = TextBlob("Now is better than never.")
ngram = blob.ngrams(n=3)
# Search tweets for a keyword, translate non-English tweets to English,
# and print a coarse sentiment verdict per tweet.
public_tweets = api.search(key_word)
for tweet in public_tweets:
    print("\n\n")
    tweet_text = clean_tweet(tweet.text)
    tType = tweetType(tweet_text)
    if(tType == 'Retweet'):
        tweet_text = tweet_text.replace("RT ","")
    analysis = TextBlob(tweet_text)
    # Detect once instead of twice (each call is a network round trip).
    lang = analysis.detect_language()
    if(lang != 'en' and len(tweet_text) > 3):
        try:
            # Bug fix: the translated blob was previously discarded, so the
            # sentiment below ran on the untranslated text; keep the result.
            analysis = analysis.translate(from_lang=lang, to='en')
        except Exception:
            # Best-effort: fall back to the untranslated text on failure.
            pass
    print("Tweet : ", tweet_text)
    print()
    print("Result of sentiment analysis : ", end="")
    if(analysis.sentiment.polarity > 0):
        print("Happy")
    elif(analysis.sentiment.polarity == 0):
        print("Neutral")
# Exploratory analysis of live-chat messages: most active users, TF-IDF
# terms for one user, then language detection/translation of one message.
import pandas as pd

df = pd.read_csv("liveChatData.csv")
df.describe()

## Top 50 Most active users
top = df.Author.value_counts().head(50)

# All messages from one specific user.
a = df[df.Author == "trevor wasike"].reset_index(drop=True)["Message"]

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=20, stop_words="english")
X = vectorizer.fit_transform(a)
print(vectorizer.get_feature_names())

from textblob import TextBlob
from textblob.exceptions import TranslatorError
# Detect and translate a single sample message (network calls).
t1 = TextBlob(a[1])
t1.detect_language()
t1.translate(to="en")
from langdetect import detect
def call_from_here(text):
    """Route a wit.ai-parsed utterance to the matching assistant action.

    Sends `text` to the wit.ai client, takes the FIRST detected entity and
    its first candidate value, and dispatches on the entity name
    (read-aloud, translation to Tamil, reminders, app/window control,
    weather, search). When no entity is found (IndexError), falls back to
    plain-text handling via call_correct().
    """
    resp = client.message(text)  # wit.ai NLU response (dict-like)
    # print(resp)
    # print('entity value is {}'.format(list(resp['entities'])[0]))
    entity = None
    value = None
    timeit = 0
    subject = ''
    try:
        # Only the first entity / first value is considered.
        entity = list(resp['entities'])[0]
        value = resp['entities'][entity][0]['value']
        # print(entity,value)
        # time.sleep(.10)
        if entity != None and value != None:
            if entity == 'niku':
                # print('in niku')
                if value == 'read':
                    print('reading')
                    data = startread()
                    speakit(data)
            if entity == 'translate':
                # print(entity,value)
                if value == 'tamil':
                    # startread() returns the currently selected text.
                    data = startread()
                    if data == '':
                        os.system('aplay notification_sound/error.wav')
                        speakit('sorry,no data selected')
                        print('/n/n sorry,no data selected')
                    else:
                        speakit('trying to translate selected text in tamil')
                        translate = TextBlob(data)
                        try:
                            # Network call: TextBlob's Google backend.
                            translate_lang = translate.detect_language()
                            # print('deteced language is ',translate_lang)
                            if translate_lang == 'ta':
                                speakit(
                                    'the selected language is already in tamil'
                                )
                            else:
                                speakit('translating data from {}'.format(
                                    translate_lang))
                                translated_data = translate.translate(
                                    from_lang=translate_lang, to='ta')
                                translated_data = str(translated_data)
                                # print(translated_data)
                                # Persist the result for the UI to pick up.
                                with open('translated.txt', 'w') as result_file:
                                    result_file.write(translated_data)
                                os.system(
                                    'aplay notification_sound/translating.wav'
                                )
                                speakit(
                                    'the data was translated sucessfully')
                                print(
                                    '/n /n the data was translated sucessfully and saved in translated.txt /n/n'
                                )
                                # niku.trans_text.setText(translated_data)
                        except Exception as e:
                            os.system('aplay notification_sound/error.wav')
                            speakit(
                                'sorry some error happend while translating')
                            print(
                                'sorry some error happend while translating the error is', e)
            elif entity == 'datetime' or entity == 'subject':
                timeit, subject = set_reminder(resp)
            elif entity == 'open':
                open_software(value)
            elif entity == 'show':
                window_movement(value)
            elif entity == 'weather':
                data = weather_api(value)
                speakit('the weather in {} is {}'.format(value, data))
                print('the weather in {} is {}'.format(value, data))
            elif entity == 'search_niku':
                ddgs(value)
        # # check_reminder(timeit,subject)
        # else:
        #     print(text)
        #     call_correct(text)
    except IndexError:
        # No entities detected: treat the utterance as free text.
        # print(resp)
        # print(text)
        text = text.lower()
        call_correct(text)
# -*- coding: utf-8 -*-
# BUG FIX: the original cookie was "# - *- coding: utf- 8 - *-", which does
# not name a valid codec ("utf- 8"); per PEP 263 the declaration must be
# exact and on the first or second line of the file.
from textblob import TextBlob

# Detect the language of an English phrase and translate it to Hindi.
word1 = TextBlob("thank you for using this")
lang = word1.detect_language()
z = word1.translate(from_lang='en', to='hi')
print(lang)
print(z)
# first install textblob library for using this
# pip install textblob
# encoding format should be exact and placed at beginning of file
def detect_language(text):
    """Return the ISO-639-1 language code detected for `text`.

    Uses TextBlob's (deprecated) Google Translate backend, so this makes a
    network request and may raise textblob.exceptions.TranslatorError.
    """
    # FIX: the original bound the blob to a variable named `input`,
    # shadowing the builtin of the same name.
    blob = TextBlob(text)
    return blob.detect_language()
import requests
import json
import textblob
from textblob import TextBlob

# Fetch current weather for a user-supplied city from OpenWeatherMap and
# print the condition translated to Portuguese plus the temperature in °C.
# NOTE(review): the API key is hard-coded in the URL — move it to config.
cidade = input('Informe sua cidade: ')
req = requests.get('http://api.openweathermap.org/data/2.5/weather?q=' + cidade + '&APPID=574708452380626a25e411bfeab9dd7a')
#print(req.text)

#Transforma em dicionário python (parse the JSON payload)
tempo = json.loads(req.text)

#Traduzindo retorno (translate the condition string)
condicao_us = TextBlob(tempo['weather'][0]['main'])
# BUG FIX: Google Translate expects 'pt-br' (hyphen); the original 'pt_br'
# is not a valid code and raises a translator error. The unused
# detect_language() round-trip was also dropped (its result was discarded).
condicao_traduzida = condicao_us.translate(to="pt-br")
print('Condição do tempo:', condicao_traduzida)

#Convertendo de Kelvin para Celsius
print('Temperatura: ', float(tempo['main']['temp']) - 273.15, '°C')
india_trends = api.trends_place(2282863, )[0]['trends'][:25] text_list = [] hashtags_list = [] message = '' try: for i in india_trends: hashtag = str(i['name']) if i['name'][0] == '#': temp = str(i['name'])[1:] else: temp = str(i['name']) lang = TextBlob(temp) if lang.detect_language() == 'en': driver.get( "https://news.google.com/topstories?hl=en-IN&gl=IN&ceid=IN:en") time.sleep(5) google_news = driver.find_element_by_xpath("//a[@title='News']") action = webdriver.common.action_chains.ActionChains(driver) action.move_to_element_with_offset(google_news, 500, 0) action.click() action.send_keys(temp) action.send_keys(Keys.ENTER) action.perform() time.sleep(5) bodyText = driver.find_element_by_tag_name("body").text if 'No results found.' not in bodyText: try: link_element = driver.find_element_by_xpath(
def main():
    """Ouça e Fale (Listen & Speak) App.

    Streamlit reader: upload a PDF or TXT, display it, detect its language
    and (via convert(), plus the commented-out sections) translate it and
    play it back as audio.
    """
    st.title("Reader & Voice")
    activities = ["Home","PDF","TXT","About"]
    choice = st.sidebar.radio("Home",activities)

    # Landing page: accepted formats and supported target languages.
    if choice == 'Home':
        st.write("Only files:")
        st.markdown("### PDF or TXT")
        st.write("After uploading you can convert to 7 languages")
        st.markdown("### English, Spanish, French, Italian, Japanese, Russian and Chinese")
        #st.write("Definitions")
        #st.write("PCA is not a statistical method to infer parameters or test hypotheses. Instead, it provides a method to reduce a complex dataset to lower dimension to reveal sometimes hidden, simplified structure that often underlie it.")
        #st.write("")
        #st.write("PCA is a statistical method routinely used to analyze interrelationships among large numbers of objects.")
        #st.write("")
        #st.write("Principal component analysis (PCA) is a mathematical algorithm that reduces the dimensionality of the data while retaining most of the variation in the data set.")

    # PDF branch: extract text and show the detected language.
    if choice == 'PDF':
        file = carregar_texto('pdf')
        pdf = pdftotext.PDF(file)
        #for page in pdf:
        #    st.text(page)
        blob = TextBlob(pdf[0])  # NOTE(review): only page 0 is analyzed
        st.text(blob)
        st.write(blob.detect_language())
        #dict_idioma_full = lista_idiomas_full()
        #idioma_original = get_value(blob.detect_language(),dict_idioma_full)
        #original_key = get_key(idioma_original, dict_idioma_full)
        #st.success("Original Language"+": "+ idioma_original + " ("+original_key+")")
        # Original sound
        #play(raw_text,original_key)
        #dict_idioma = lista_idiomas(idioma_original)
        #options = st.multiselect("Choose a language", tuple(dict_idioma.values()))
        #for i in range(len(options)):
        #    value = options[i]
        #    idioma_final_key = get_key(value, dict_idioma)
        #    try:
        #        if (idioma_original != idioma_final_key):
        #            texto_convertido = str(blob.translate(to=idioma_final_key))
        #            st.success("Language"+": "+ value + " ("+idioma_final_key+")")
        #            st.write(texto_convertido)
        #            #st.text(idioma_final_key)
        #            play(texto_convertido,idioma_final_key)
        #
        #    except:
        #        st.error("ERROR: some languages will fail to play the sound.")
        #dict_idioma_full = lista_idiomas_full()
        #idioma_original = get_value(blob.detect_language(),dict_idioma_full)
        #original_key = get_key(idioma_original, dict_idioma_full)
        #st.success("Original Language"+": "+ idioma_original + " ("+original_key+")")
        # Original sound
        #play(blob,original_key)
        #convert(blob)
        #except:
        #    st.warning("PDF please")

    # TXT branch: read the upload and hand it to convert().
    if choice == 'TXT':
        try:
            file = carregar_texto('txt')
            blob= TextBlob(file.getvalue())
            st.markdown(blob)
            #dict_idioma_full = lista_idiomas_full()
            #idioma_original = get_value(blob.detect_language(),dict_idioma_full)
            #original_key = get_key(idioma_original, dict_idioma_full)
            #st.success("Original Language"+": "+ idioma_original + " ("+original_key+")")
            # Original sound
            #play(file.getvalue(),original_key)
            #st.write(blob.detect_language())
            #st.subheader(blob)
            convert(file, blob)
            #dict_idioma = lista_idiomas(idioma_original)
            #options = st.multiselect("Choose a language", tuple(dict_idioma.values()))
            #for i in range(len(options)):
            #    value = options[i]
            #    idioma_final_key = get_key(value, dict_idioma)
            #    try:
            #        if (idioma_original != idioma_final_key):
            #            texto_convertido = str(blob.translate(to=idioma_final_key))
            #            st.success("Language"+": "+ value + " ("+idioma_final_key+")")
            #            st.write(texto_convertido)
            #            #st.text(idioma_final_key)
            #            play(texto_convertido,idioma_final_key)
            #
            #    except:
            #        st.error("ERROR: some languages will fail to play the sound.")
        except:
            st.warning("TXT please")
def recognize_languages(self, files):
    """Group file indices in self.languages keyed by detected language."""
    for index, content in enumerate(files):
        detected = TextBlob(content).detect_language()
        self.languages[detected] += [index]
# Notebook shell magic: download the corpora TextBlob depends on (NLTK data).
!python -m textblob.download_corpora
from textblob import TextBlob
#import nltk
#nltk.download()

# Exercise TextBlob's main features on the first tweet's full text.
tx = df.loc[0,'full_text']
blob = TextBlob(tx)
blob.tags
blob.sentences[0].words
blob.noun_phrases
blob.ngrams(3)
blob.correct()
blob.words[3].spellcheck()
blob.detect_language()   # network call (Google Translate backend)
blob.translate(to= 'ar')

# Collect lemmatized base-form verbs (POS tag 'VB').
verbs = list()
for word, tag in blob.tags:
    if tag == 'VB':
        verbs.append(word.lemmatize())

# Collect lemmatized singular nouns (POS tag 'NN').
nouns = list()
for word, tag in blob.tags:
    if tag == 'NN':
        nouns.append(word.lemmatize())

blob.sentiment.polarity
blob.sentiment.subjectivity
@author: akansal2 """ #importing libraies from textblob import TextBlob #TextBlob Strings Str1 = TextBlob('Amazing') Str2 = TextBlob('Spider Man') #Textblob string operations Str1.lower() Str1.upper() Str1[1:4] Str1 + " " + Str2 Str1.detect_language() #Paragraph and sentence operations para = TextBlob("My name is aditya. \n I live is Modinagar.\n My apples id is [email protected]") para.sentences # distinguish sentences with combination of . and \n para.sentences[0] para.sentences[1] para.sentences[2] para.sentences[0].words for n in para.sentences[1].noun_phrases: print(n) for t in para.sentences[1].tags: print(t)
def main():
    """NLP App with Streamlit and TextBlob.

    Four sidebar pages: Text Analysis (stats, stopwords, wordcloud,
    tokens/lemmas, summary), Translation (six target languages),
    Sentiment Analysis (translate-to-English then score), and About.
    """
    #st.title("NLP Simple Examples")
    title_templ = """
    <div style="background-color:blue;padding:8px;">
    <h1 style="color:cyan">NLP Simple Examples</h1>
    </div>
    """
    st.markdown(title_templ,unsafe_allow_html=True)

    subheader_templ = """
    <div style="background-color:cyan;padding:8px;">
    <h3 style="color:blue">Natural Language Processing On the Go...</h3>
    </div>
    """
    st.markdown(subheader_templ,unsafe_allow_html=True)

    st.sidebar.image("https://www.centreofexcellence.com/app/uploads/2016/09/nlp-diploma-course.jpg", use_column_width=True)

    activity = ["Text Analysis", "Translation", "Sentiment Analysis", "About"]
    choice = st.sidebar.selectbox("Menu",activity)

    # Text Analysis CHOICE
    if choice == 'Text Analysis':
        st.subheader("Text Analysis")
        st.write("")
        st.write("")
        raw_text = st.text_area("Write something","Enter a Text in English...",height=250)
        if st.button("Analyze"):
            if len(raw_text) == 0:
                st.warning("Enter a Text...")
            else:
                blob = TextBlob(raw_text)
                st.write("")
                # Only English text is analyzed; anything else is rejected.
                if blob.detect_language() != 'en':
                    st.warning("Enter a Text in English...")
                else:
                    st.info("Basic Functions")
                    col1, col2 = st.beta_columns(2)
                    with col1:
                        with st.beta_expander("Basic Info"):
                            st.success("Text Stats")
                            word_desc = nt.TextFrame(raw_text).word_stats()
                            result_desc = {"Length of Text":word_desc['Length of Text'],
                                           "Num of Vowels":word_desc['Num of Vowels'],
                                           "Num of Consonants":word_desc['Num of Consonants'],
                                           "Num of Stopwords":word_desc['Num of Stopwords']}
                            st.write(result_desc)
                        with st.beta_expander("Stopwords"):
                            st.success("Stop Words List")
                            stop_w = nt.TextExtractor(raw_text).extract_stopwords()
                            st.error(stop_w)
                    with col2:
                        with st.beta_expander("Processed Text"):
                            st.success("Stopwords Excluded Text")
                            processed_text = str(nt.TextFrame(raw_text).remove_stopwords())
                            st.write(processed_text)
                        with st.beta_expander("Plot Wordcloud"):
                            st.success("Wordcloud")
                            plot_wordcloud(raw_text)
                    st.write("")
                    st.write("")
                    st.info("Advanced Features")
                    col3, col4 = st.beta_columns(2)
                    with col3:
                        with st.beta_expander("Tokens&Lemmas"):
                            st.write("T&L")
                            # Strip stopwords, punctuation and specials
                            # before tokenizing/lemmatizing.
                            processed_text_mid = str(nt.TextFrame(raw_text).remove_stopwords())
                            processed_text_mid = str(nt.TextFrame(processed_text_mid).remove_puncts())
                            processed_text_fin = str(nt.TextFrame(processed_text_mid).remove_special_characters())
                            tandl = text_analyzer(processed_text_fin)
                            st.json(tandl)
                    with col4:
                        with st.beta_expander("Summarize"):
                            st.success("Summarize")
                            summary_text = summarize(raw_text,ratio=0.4)
                            if summary_text != "":
                                st.success(summary_text)
                            else:
                                st.warning("Please insert a Longer Text")

    # Translation CHOICE
    elif choice == 'Translation':
        st.subheader("Text Translation")
        st.write("")
        st.write("")
        raw_text = st.text_area("","Write something to be translated...")
        if len(raw_text) < 3:
            st.warning("Please provide a string with at least 3 characters...")
        else:
            blob = TextBlob(raw_text)
            lang = blob.detect_language()
            #st.write(lang)
            tran_options = st.selectbox("Select translation language",['Chinese', 'English', 'German', 'Italian', 'Russian', 'Spanish'])
            if st.button("Translate"):
                # Translate only when the target differs from the detected
                # source language; otherwise report it unchanged.
                if tran_options == 'Italian' and lang != 'it':
                    st.text("Translating to Italian...")
                    tran_result = blob.translate(from_lang=lang, to='it')
                elif tran_options == 'Spanish' and lang != 'es':
                    st.text("Translating to Spanish...")
                    tran_result = blob.translate(from_lang=lang, to='es')
                elif tran_options == 'Chinese' and lang != 'zh-CN':
                    st.text("Translating to Chinese...")
                    tran_result = blob.translate(from_lang=lang, to='zh-CN')
                elif tran_options == 'Russian' and lang != 'ru':
                    st.text("Translating to Russian...")
                    tran_result = blob.translate(from_lang=lang, to='ru')
                elif tran_options == 'German' and lang != 'de':
                    st.text("Translating to German...")
                    tran_result = blob.translate(from_lang=lang, to='de')
                elif tran_options == 'English' and lang != 'en':
                    st.text("Translating to English...")
                    tran_result = blob.translate(from_lang=lang, to='en')
                else:
                    tran_result = "Text is already in " + "'" + lang + "'"
                st.success(tran_result)

    # Sentiment Analysis CHOICE
    elif choice == 'Sentiment Analysis':
        st.subheader("Sentiment Analysis")
        st.write("")
        st.write("")
        raw_text = st.text_area("", "Enter a Text...")
        if st.button("Evaluate"):
            if len(raw_text) == 0:
                st.warning("Enter a Text...")
            else:
                blob = TextBlob(raw_text)
                lang = blob.detect_language()
                # Non-English input is translated to English before scoring,
                # since TextBlob's sentiment lexicon is English-only.
                if lang != 'en':
                    tran_result = blob.translate(from_lang=lang, to='en')
                    blob = TextBlob(str(tran_result))
                result_sentiment = blob.sentiment
                st.info("Sentiment Polarity: {}".format(result_sentiment.polarity))
                st.info("Sentiment Subjectivity: {}".format(result_sentiment.subjectivity))

    # About CHOICE
    else:# choice == 'About':
        st.subheader("About")
        st.write("")
        st.write("")
        st.markdown("""
        ### NLP Simple Examples (App with Streamlit and TextBlob)
        ##### By
        + **[Rosario Moscato LAB](https://www.youtube.com/channel/UCDn-FahQNJQOekLrOcR7-7Q)**
        + [[email protected]](mailto:[email protected])
        """)
def instaBot(self, c_id, c_secret, file, duration):
    "Retrieve the Instagram posts and analyze them"
    # Poll the 'food' tag for `duration` minutes. For each new post,
    # normalize every hashtag (first word, singularized), detect its
    # language, translate non-English tags to English, classify the tag
    # as food-related or not, and save posts with at least one food tag.
    # (Python 2 code: print statements, implicit unicode handling.)
    api = InstagramAPI(client_id=c_id, client_secret=c_secret)
    posts, next = api.tag_recent_media(tag_name='food', count=30)
    # `next` is a pagination URL; keep only the max_tag_id cursor.
    temp, max_tag = next.split('max_tag_id=')
    max_tag = str(max_tag)
    stop = time.time() + duration * 60
    while time.time() < stop:
        print "[*] " + str(len(posts)) + " posts retrieved."
        for post in posts:
            if self.isNewPost("log/posts.log", post.id):
                count = 0    # number of food-related tags in this post
                langs = {}   # histogram of detected tag languages
                print post.id
                for tag in post.tags:
                    tagName = TextBlob(tag.name)
                    tagName = tagName.words[0].singularize()
                    if len(tagName) >= 3 and tagName != 'food':
                        try:
                            # Network call (Google Translate backend).
                            lang = tagName.detect_language()
                        except:
                            print "[-] Fail to detect the language."
                            continue
                        print "[*] " + tagName, '->', lang
                        langs.setdefault(lang, 0)
                        langs[lang] += 1
                        if lang != 'en':
                            try:
                                tagName = tagName.translate(from_lang=lang, to='en')
                            except:
                                print "[-] Fail to translate the tag."
                                continue
                            print "[*] Traduction: ", tagName
                        # Three-state classification: True / False / None
                        # (None means the tag has not been seen before).
                        tagRelatedToFood = self.isTagRelatedToFood(tagName)
                        if tagRelatedToFood:
                            count += 1
                            print "[+] Tag related to food."
                        elif tagRelatedToFood == False:
                            print "[-] Tag not related to food."
                        else:  # tagRelatedToFood == None
                            # Unknown tag: classify against the food word
                            # list and persist the decision for next time.
                            if self.isRelatedTo(tagName, self.foodWords):
                                count += 1
                                self.updateTags(self.foodTagsFile, tagName)
                                self.writeTagLog("log/newTags.log", tag, True)
                                print "[+] Tag related to food."
                            else:
                                self.updateTags(self.noFoodTagsFile, tagName)
                                self.writeTagLog("log/newTags.log", tag, False)
                                print "[-] Tag not related to food."
                if count > 0:
                    self.savePost(file, post)
                    self.writePostLog("log/posts.log", post, langs, True)
                    print "[+] Post saved."
                else:
                    self.writePostLog("log/posts.log", post, langs, False)
                    print "[-] Post forget."
                print '-------------------'
        # Fetch the next page of posts using the saved cursor.
        posts, next = api.tag_recent_media(tag_name='food', max_tag_id=max_tag)
        temp, max_tag = next.split('max_tag_id=')
        max_tag = str(max_tag)
        if not next:
            break
def simple_identification():
    """Scan raw tweets from MongoDB and run the Spanish special-character /
    emoji identification pipeline over tweets from Colombia that are
    written in (or detected as) Spanish, logging timings to a report file.
    """
    client_from = MongoClient()
    db_from = client_from["SSD"]
    coll_from = db_from["raw_data"]
    start_time = time()
    date = datetime.today().strftime("%Y_%m_%d-%H_%M_%S")
    path_to_file = date + " - DetectEmojisWithSpacymoji_Performance.txt"
    p_file = codecs.open(path_to_file, encoding='utf-8', mode='a')
    p_file.write(date + " Detecting Emojis with Spacymoji Test - Local Execution" + "\n")
    p_file.flush()
    # II. Prepare data
    p_file.write("Preparing initial data ... " + "\n")
    path_to_configuration = food_detection_root.ROOT_DIR + os.path.sep + 'configuration' + os.path.sep \
        + 'configuration.ini'
    config = ConfigParser(interpolation=ExtendedInterpolation())
    config.read_file(codecs.open(path_to_configuration, "r", "utf8"))
    path = food_detection_root.ROOT_DIR + os.path.sep + 'data' + os.path.sep
    # Read complementary characters (tab-separated lookup file).
    complementary_characters_list_file = codecs.open(
        path + "list - complementary_characters.txt", encoding='utf-8')
    complementary_characters_list = complementary_characters_list_file.read(
    ).splitlines()
    complementary_characters_list_file.close()
    complementary_characters_dict = {}
    for aux in complementary_characters_list:
        aux_char = aux.split('\t')
        # key: the character itself; value: [name, category] — TODO confirm
        complementary_characters_dict[aux_char[2]] = [aux_char[1], aux_char[3]]
    # print(complementary_characters_dict)
    # 3. Configure Spanish POS tagger
    spanish_pipeline = spacy.load('es')
    emoji = Emoji(spanish_pipeline)
    spanish_pipeline.add_pipe(emoji, first=True)
    tag_map = spacy.es.TAG_MAP
    # start
    all_from_tweets = coll_from.find()
    count = 0
    stop = 1000  # process at most this many qualifying tweets
    p_file.write("Total data to process: " + str(stop) + "\n")
    for raw_data in all_from_tweets:
        if 'text' in raw_data.keys() and 'lang' in raw_data.keys():
            if "place" in raw_data.keys():
                place = raw_data["place"]
                if place is not None:
                    if "country_code" in place.keys():
                        raw_data_country_code = raw_data["place"][
                            "country_code"]
                        if raw_data_country_code in ["CO"]:
                            lang = raw_data["lang"]
                            text = raw_data['text']
                            if lang == 'es':
                                identify_special_characters(
                                    text, spanish_pipeline, tag_map, p_file)
                                count += 1
                            else:
                                if len(text) >= 3:
                                    blob = TextBlob(text)
                                    detection = True
                                    detected_language = ''
                                    # NOTE(review): this retries forever on a
                                    # persistent API failure (bare except in
                                    # an unbounded loop) — consider bounding
                                    # the retry count.
                                    while detection:
                                        try:
                                            detected_language = blob.detect_language(
                                            )
                                            detection = False
                                        except:
                                            print(
                                                'error while getting detected language'
                                            )
                                    if detected_language == 'es':
                                        identify_special_characters(
                                            text, spanish_pipeline, tag_map, p_file)
                                        count += 1
                            print(count)
        if count == stop:
            break
    all_from_tweets.close()
    client_from.close()
    p_file.write("Total elements in new list: " + str(count) + "\n")
    execution_time = time() - start_time
    p_file.write("Execution time: " + str(timedelta(seconds=execution_time)) + "\n")
    p_file.flush()
    p_file.close()
os.system('say "{}"'.format(textLower)) wav = gTTS(text=textLower, lang='en') wav.save("KYC.wav") translator = Translator() print(translator.translate(text, dest='zh-CN').text) ''' “了解您的客户”表格是投资行业的标准表格,可确保投资顾问了解客户的风险承受能力,投资知识和财务状况的详细信息。 KYC表格保护客户和投资顾问。 ''' text = TextBlob(text) print(text.detect_language()) print(text.sentiment) print(text.translate(to='ja')) ''' en Sentiment(polarity=0.13333333333333333, subjectivity=0.25) 「あなたのクライアントを知る」フォームは、投資顧問会社が顧客のリスク許容度、投資知識および財政状態に関する詳細な情報を確実に把握する、投資業界の標準的なフォームです。 KYCフォームは、クライアントと投資顧問の両方を保護します。 ''' # Convert to WAV of FLAC with format check, then load and recognize for compatibility assurance try: wav = wave.open("KYC.wav", 'r')
# Demo of TextBlob word-level operations: singularize, lemmatize,
# definitions, translation, language detection and spell checking.
sentence = TextBlob('Use 4 spaces per indentation level.')
print(sentence.words[2].singularize())    #similarly you can use pluralize
print()

#word lemmatization
w = Word("octopi")
print("octopi -> ", w.lemmatize())
w = Word("went")
print("went -> ", w.lemmatize("v"))    # lemmatize as a verb
print()

#definition
print("Octopus : ", Word("octopus").definitions)
print()

#translation and language detection
en_blob = TextBlob(u'Simple is better than complex.')
print('Simple is better than complex.')
print("SPANISH : ", en_blob.translate(to='es'))
en_blob = TextBlob(u'Comment allez vous?')
print('Comment allez vous?')
print("language : ", en_blob.detect_language())
print()

#spell-check
w = Word("banama")
print("banama")
print("correction : ", w.correct())
print("suggestions : ", w.spellcheck())
print()
adj_ctr += 1 adj.append(pair[0]) elif tag == 'NN' or tag == 'NNS': noun_ctr += 1 noun.append(pair[0]) print('\nTotal number of adjectives in tweet collection= ', adj_ctr) print("List of adjectives in tweet collection: ", adj) print('\nTotal number of nouns in tweet collection= ', noun_ctr, noun) print("List of nouns in tweet collection: ", noun) #working on string entered manually st = "I amm an ostrich and nobody can see me, not even I myself." st2 = "It's so sad that turtles can only walk slow." st3 = "Not being smart and amazing is not worst thing in the world." blob = TextBlob(st) blob2 = TextBlob(st2) blob3 = TextBlob(st3) print('\nOriginal string: ', st) print('Spell checked string: ', blob.correct()) print("Detecting language in above sentence...", blob.detect_language()) print(blob.translate(to='hi')) if blob2.sentiment.polarity > 0: print("\n'", st2, "'", 'is positive') else: print("\n'", st2, ",", 'is negative') if blob3.sentiment.polarity > 0: print("\n'", st3, "'", 'is positive') else: print("\n'", st3, "'", 'is negative')
#message = 'Persona Natural' #context = [1, 'Jubilacion Patronal', -0.024999999999999994, 'cuanto cuesta un estudio actuarial para una empresa pequena'] context = [0, None, None, None, None, 0] out_message, context = proc_message(message, context) out_message context out_message, context = proc_message(message, context) out_message message = 'No' blob = TextBlob(message) if blob.detect_language() != 'en': blob = blob.translate(to='en').lower() #def set_greeting_text(self, text): # data = {"setting_type": "greeting", "greeting": {"text": text}} # fmt = self.graph_url + "me/thread_settings?access_token={token}" # return requests.post(fmt.format(token=self.access_token), # headers={"Content-Type": "application/json"}, # data=json.dumps(data)) topics = ['jubilacion patronal', 'consultoria', 'recursos humanos', 'IESS'] rep = 'su tema {0} es el que tratamos' print(rep.format(topics[0]))
from textblob import TextBlob

# Print the detected language of a Portuguese sentence
# (network call via TextBlob's Google Translate backend).
eb = TextBlob('Meu coraçao bate feliz quando te ve.')
print(eb.detect_language())
def readfile(country, language):
    """Build a date-sorted pickle of COVID/lockdown-related tweets for
    `country`, translating non-English tweets to English.

    `language` is the country's language code; the English keyword list is
    translated into it so matching also works on local-language trends.
    """
    df = pd.read_json('/home/saad/Data/Twitter/country/' + country + '/stats/data.json')
    words = [
        'flat', 'curv', 'distance', 'lockdown', 'lock', 'pandamic', 'safe',
        'quaran', 'social distan', 'social_distan', 'distancing', 'stay',
        'remote', 'home', 'indoor'
    ]
    # Translate each keyword into the target language (best-effort).
    trans_word = []
    for j in range(0, len(words)):
        try:
            t_word = translator.translate([words[j]], dest=language)
            for translation in t_word:
                # BUG FIX: the original called trans_word.append() with NO
                # argument; the resulting TypeError was swallowed by the
                # bare except, so trans_word always stayed empty and
                # local-language matching never happened.
                trans_word.append(translation.text)
        except:
            continue
    tweets = list(df['tweet'])
    countries = list(df['country'])
    trends = list(df['trend'])
    retweets = list(df['retweets'])
    favorites = list(df['favorites'])
    dates = list(df['date'])
    dfObj = {}
    dfObj['Date'] = []
    dfObj['Tweet'] = []
    for i in range(0, len(tweets)):
        a = 0  # trend matched a keyword
        b = 0  # tweet text matched a keyword
        c = 0
        if trends[i] != 'roadsafety' and (any(word in trends[i] for word in words)
                                          or any(word in trends[i] for word in trans_word)):
            a = 1
            try:
                blob = TextBlob(tweets[i])
                lang = blob.detect_language()
                if (lang != 'en'):
                    print("doing")
                    tweets[i] = blob.translate(from_lang=lang, to='en')
                dfObj['Date'].append(dates[i])
                dfObj['Tweet'].append(tweets[i])
            except:
                continue
        # dfObj['Date'].append(dates[i])
        # dfObj['Tweet'].append(tweets[i])
        # NOTE(review): a tweet matching both trend and text is appended
        # twice — preserved from the original logic.
        if any(word in tweets[i] for word in words) or any(word in tweets[i] for word in trans_word):
            b = 1
            try:
                blob = TextBlob(tweets[i])
                lang = blob.detect_language()
                if (lang != 'en'):
                    print("doing")
                    tweets[i] = blob.translate(from_lang=lang, to='en')
                dfObj['Date'].append(dates[i])
                dfObj['Tweet'].append(tweets[i])
            except:
                continue
        if (a == 1) and (b == 0):
            try:
                blob = TextBlob(tweets[i])
                lang = blob.detect_language()
                if (lang != 'en'):
                    print("doing")
                    tweets[i] = blob.translate(from_lang=lang, to='en')
                dfObj['Date'].append(dates[i])
                dfObj['Tweet'].append(tweets[i])
            except:
                continue
    # Pad unequal columns via Series, then sort chronologically and persist.
    ds = dict(Date=np.array(dfObj['Date']), Tweet=np.array(dfObj['Tweet']))
    ds = pd.DataFrame({key: pd.Series(value) for key, value in ds.items()})
    ds['Date'] = pd.to_datetime(ds['Date'])
    ds = ds.sort_values(by='Date')
    ds.to_pickle("/home/saad/Data/Twitter/country/" + country + "/stats/nlpmood.pkl")
#partnerfunds.com for i in range(0, len(test)): siteurl = str(test[i][0]) text = str(test[i][1]) #Cortical.io termKeyWords = client.extractKeywords(text) termBitmap = client.getTextBitmap(text)['fingerprint']['positions'] #TextBlob blob = TextBlob(text) MySqlKeyWordDat = (','.join(termKeyWords), siteurl) MySqlBitMapDat = (str(termBitmap), siteurl) MySqlTextBlobDat = (str(blob.sentiment), siteurl) MySqLangDat = (str(blob.detect_language()), siteurl) print "---For "+siteurl+" keywords = " + ",".join(termKeyWords) + " sentiment = " + MySqlTextBlobDat[0] + " lang:" + MySqLangDat[0] MySqlKeyWordDatQ = """UPDATE """+dbtable+""" SET cortical_io_keywords = %s WHERE siteurl = %s""" MySqlBitMapDatQ = """UPDATE """+dbtable+""" SET cortical_io = %s WHERE siteurl = %s""" MySqBlobDatQ = """UPDATE """+dbtable+""" SET opencalais = %s WHERE siteurl = %s""" MySqLangDatQ = """UPDATE """+dbtable+""" SET watson = %s WHERE siteurl = %s""" #upload keywords and bitmap to database cur.execute(MySqlKeyWordDatQ, MySqlKeyWordDat) cur.execute(MySqlBitMapDatQ, MySqlBitMapDat) cur.execute(MySqBlobDatQ, MySqlTextBlobDat) cur.execute(MySqLangDatQ, MySqLangDat) con.commit()
def detect():
    """Show the detected language of the input Text widget, or clear the label."""
    word = TextBlob(text_in.get('1.0', 'end'))
    if len(word) > 2:
        # Map the ISO code back to a display name and upper-case it.
        display_name = lang_dict_rev[word.detect_language()].upper()
        label_detected_lang.configure(text=display_name)
    else:
        label_detected_lang.configure(text='')
# text=parsed["content"]
# print(text)
# print("\n")
#London is the capital. And most populous city. England.

# Take the input text from the command line, round-trip it through
# output.txt, translate it to English if needed, then feed it to spaCy
# with a few extra stop words registered.
text = sys.argv[1] #"Parlez-vous anglais?"
outfile = "output.txt"
# BUG FIX: the original opened output.txt twice without ever closing it
# (the read handle was even rebound to its own .read() result, leaking the
# file descriptor). Context managers close both handles deterministically.
with open(outfile, "w") as f:
    f.write(str(text))
with open("output.txt") as infile:
    text1 = infile.read()

translation = TextBlob(text1)
if translation.detect_language() != 'en':
    en_blob = translation.translate(to='en')
    text1 = en_blob
text1 = str(text1)

#text="Parlez-vous anglais? London is the capital and most populous city of England and the United Kingdom. Today Machine learning (ML) is the scientific study of algorithms in statistical models that computer systems use to progressively improve their performance on a specific task. Machine learning algorithms build a mathematical model of sample data in India, known as training data, in order to make predictions or decisions for Google without being explicitly programmed to perform the task. Machine learning algorithms are used in the applications of google email filtering, detection of network intruders, and computer vision, where it is infeasible to develop an algorithm of specific instructions for performing the task. Machine learning is closely related to computational statistics, which focuses on making predictions using computers. The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning. Data mining is a field of study within machine learning, and focuses on exploratory data analysis through unsupervised learning.In its application across business problems, machine learning is also referred to as predictive analytics. Manchester United is a very famous football club."
doc = nlp(text1)

# Add some Stop Words
my_stop_words = [':', '.', ',', '-', '(', ')', '"', ' ']
for stopword in my_stop_words:
    add_word = nlp.vocab[stopword]
    add_word.is_stop = True

#Printing After Removing StopWords
stopwords = list(STOP_WORDS)
from textblob import TextBlob
from textblob.exceptions import TranslatorError
import numpy as np
import pandas as pd

# Detect the language of every chat message and write the result out as a
# new 'Language' column; failed detections are recorded as NaN.
df = pd.read_csv("liveChatData.csv")

language = []
msgs = df.Message
for i, msg in enumerate(msgs):
    try:
        language.append(TextBlob(msg).detect_language())
    except (TypeError, TranslatorError):
        # Non-string cells or messages too short/ambiguous to detect.
        language.append(np.nan)
        print("Translator Error")
        print(i)

df["Language"] = language
df.to_csv("langDf.csv")
def is_english(t):
    """Return True when `t` (hashes stripped) is detected as English."""
    stripped = t.replace("#", "")
    return TextBlob(stripped).detect_language() == 'en'
from textblob import TextBlob

# Detect the language of a user-supplied string, then translate a second
# string into Arabic, Chinese, French, Greek and Hindi in turn.
input_str = input('\n enter the string:')
textblob_obj = TextBlob(input_str)
detectLanguage = textblob_obj.detect_language()
print('\n detected language', detectLanguage)

input_string = input('\n enter the text')
textblob_obj_2 = TextBlob(input_string)

arabic_op = textblob_obj_2.translate(to='ar')
print('\n ip string is converted into arabic_op:', arabic_op)

china_corona = textblob_obj_2.translate(to='zh-CN')
print('\n ip string is converted into china_corona:', china_corona)

french_op = textblob_obj_2.translate(to='fr')
print('\n ip string in french:', french_op)

greekop = textblob_obj_2.translate(to='el')
print('\n ip is converted to greek', greekop)

hindi_op = textblob_obj_2.translate(to='hi')
print('\n ip is converted into hindi', hindi_op)