def detectLanguage():
    """! Detects the language (Croatian or similar) of a newly created commentsFile."""
    print "Analyzing " + str(len(cFiles) + len(pyFiles)) + " files. This may take a while."
    failed = []
    with codecs.open(commentsFile, "r", encoding='utf8') as f:
        lineNum = 1
        for line in f:
            lineNum += 1
            if len(line) < 10 or "author" in line:
                pass
            else:
                try:
                    lang = detect(line.encode('utf-8'))
                    if lang == 'hr' or lang == 'sl':
                        failed.append(line)
                except:
                    pass
    print "Found " + str(len(failed)) + " suspicious comments: "
    for fail in failed:
        print fail
    print "Finding suspicious comments...done."
    # erase contents of the comments file?
    # maybe delete the file?
    open(commentsFile, 'w').close()
def get_youtube_comments(link):
    o = urlparse(link)
    query = o.query.split('&')
    videoID = query[0].replace('v=', '')
    youtube = get_authenticated_service(videoID)
    # All the available methods are used in sequence just for the sake of an example.
    text = ''
    video_comment_threads = get_comment_threads(youtube, videoID)
    for thread in video_comment_threads:
        topComment = thread["snippet"]["topLevelComment"]
        cmt = topComment["snippet"]["textDisplay"] + '\n'
        if detect(cmt) == 'en':
            text += cmt
        parent_id = thread["id"]
        video_comments = get_comments(youtube, parent_id)
        for child_comments in video_comments:
            cmt = child_comments["snippet"]["textDisplay"] + '\n'
            try:
                if detect(cmt) == 'en':
                    text += cmt
            except:
                text += ''
    return text
def get_lang(article_str):
    lang = "not_detected"
    try:
        lang = detect(article_str)
    except UnicodeDecodeError:
        lang = detect(article_str.decode("UTF-8"))
    except:
        print("Not Detected = " + article_str)
    return lang
def on_success(self, data):
    if ('text' in data) and (detect(data['text']) == 'en'):
        rmLinkRegex = re.sub(r"(?:\@|https?\://)\S+", '', data['text'], flags=re.MULTILINE)
        rmNonAscii = re.sub(r'[^\x00-\x7F]+', ' ', rmLinkRegex)
        print(rmNonAscii)
        print('\n')
        print('*****', detect(data['text']), "\n")
        getSpeech(rmNonAscii)
        time.sleep(1)
def detectEmailLanguage(m):
    b = m['body'].decode("utf-8", "ignore")
    s = m['subject']
    try:
        return detect(b)
    except:
        try:
            return detect(s)
        except:
            return "en"
def _language(self, item):
    """Returns the language of the extracted article by analyzing metatags and
    inspecting the visible text with langdetect"""
    response = item['spider_response'].body
    root = html.fromstring(response)

    # Check for lang-attributes
    lang = root.get('lang')
    if lang is None:
        lang = root.get('xml:lang')

    # Check for general meta tags
    if lang is None:
        meta = root.cssselect('meta[name="language"]')
        if len(meta) > 0:
            lang = meta[0].get('content')

    # Check for open graph tags
    if lang is None:
        meta = root.cssselect('meta[property="og:locale"]')
        if len(meta) > 0:
            lang = meta[0].get('content')

    # Look for <article> elements and inspect the one with the largest payload with langdetect
    if lang is None:
        article_list = []
        for article in root.xpath('//article'):
            article_list.append(re.sub(r'\s+', ' ', article.text_content().strip()))
        if len(article_list) > 0:
            lang = detect(max(article_list, key=len))

    # Analyze the whole body with langdetect
    if lang is None:
        try:
            lang = detect(root.text_content().strip())
        except LangDetectException:
            pass

    # Try to normalize output
    if lang is not None:
        # First search for suitable locale in the original output
        matches = self.langcode_pattern.search(lang)
        if matches is not None:
            lang = matches.group(0)
        else:
            # If no match was found, normalize the original output and search again
            normalized = locale.normalize(re.split(r'\s|;|,', lang.strip())[0])
            matches = self.langcode_pattern.search(normalized)
            if matches is not None:
                lang = matches.group(0)

    return lang
def check(wb, tb):
    if len(wb[0]) <= 1 or len(wb[1]) <= 2:
        return False
    try:
        if detect(wb[0]) != "ar" or detect(wb[1]) != "ar":
            return False
    except:
        return False
    if tb in [("NN", "NN"), ("NN", "DTNN"), ("NNP", "NNP")]:
        return True
    return False
def get_words(text):
    if not text.decode('utf-8'):
        return None
    if not (detect(text.decode('utf-8')) == u'en'):
        print detect(text.decode('utf-8'))
        return None
    text = text.replace(r',./\“”!@#$%^&*()-\'"+=`~:;?><', ' ')
    words = [porter.stem(word) for word in word_tokenize(text)
             if word.isalpha() and len(word) >= 3 and word not in stopwords.words('english')]
    return words
def detect_langs(corpus):
    global langs
    import langdetect
    langs = []
    for doc in corpus.view_contexts(corpus.context_types[-1], as_strings=True):
        langs.append(langdetect.detect(' '.join(doc)))
    return langs
def findPosts(user):
    posts = []

    # TWITTER
    if 'twitterId' in user:
        CONSUMER_KEY = 'p05WZVs4JivX4a0WSwFyMXXCo'
        CONSUMER_SECRET = 'DghsY9Dxn2X8xAjdQKEvwBLtqsHNJabFz361pz2ZvRmAgXiPHB'
        ACCESS_KEY = '167813147-LwEOQAqO6RCnK0GfIEXNeVOng93QHkW1iFuVjBUV'
        ACCESS_SECRET = 'kpzp3quxTmVpSfWdgcyN5qbrPTmyoFArdvJeUC4Dfjtg1'
        twitter = Twython(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_KEY, ACCESS_SECRET)
        user_timeline = twitter.get_user_timeline(screen_name=user['twitterId'], count=100, include_retweets=False)
        for tweet in user_timeline:
            tweet_utf = removeAccents(unicode(tweet['text'].encode('utf-8')))
            if detect(tweet_utf) == 'pt':
                tweet_id = unicode(str(tweet['id']))
                if tweet_id and tweet_utf:
                    posts.append((tweet_id, tweet_utf, u'Post', u'Twitter'))

    # FACEBOOK
    if 'facebookId' in user:
        app_id = "704203256284579"
        app_secret = "9a75ef350e4f9b24d8be454abf29ae68"
        access_token = facebook.GraphAPI().get_app_access_token(app_id, app_secret)
        graph = facebook.GraphAPI(access_token)
        profile = graph.get_object(user['facebookId'])
        f_posts = graph.get_connections(profile['id'], 'posts')
        for f_post in f_posts['data']:
            if 'message' in f_post and len(f_post['message']) > 10:
                posts.append((f_post['id'], f_post['message'], u'Post', u'Facebook'))
            if 'description' in f_post and len(f_post['description']) > 10:
                posts.append((f_post['id'], f_post['description'], u'Post', u'Facebook'))

    return posts
def get_description_language(content):
    """
    Parameters
    -------------
    content: bs4.element.Tag
        element that contains the description data.

    Returns
    -------------
    str: document description and document language.
    """
    # There might be other ways they store descriptions, might need to add symbols
    possibilities = ['blockquote', 'p']
    description = None
    for tag in possibilities:
        description = content.find(tag)
        if description is not None:
            break
    if description is None:
        return 'No description', 'None'
    description = description.getText()
    if description == '':
        return 'No description', 'None'
    else:
        try:
            return description, detect(description)
        except LangDetectException:
            return 'No description', 'None'
def analyze_font(self, fontid, samples):
    sampletext = ""
    # very involved way of getting a representative sample, since
    # an encoded font can be partially unencoded...
    for textbox in samples:
        decode_all = not ('i' in [getattr(x, 'tag', None) for x in textbox])
        if decode_all:
            sampletext += etree.tostring(textbox, method="text", encoding="utf-8").decode("utf-8")
        else:
            for subpart in textbox:
                if (isinstance(subpart, etree._Element) and
                        (decode_all or subpart.tag == 'i')):
                    if subpart.text:
                        # it might be None, for eg "<i><b>text is in child instead</b></i>"
                        sampletext += subpart.text
    for low_offset, high_offset, unmapped in ((0, 0, []), (0x1d, 0x7a, []), (0x20, 0x40, [0x20])):
        if low_offset and high_offset:
            encodingmap = self.encodingmap(low_offset, high_offset, unmapped)
            decoded_sample = self.decode_string(sampletext, encodingmap)
        else:
            encodingmap = None
            decoded_sample = sampletext
        try:
            lang = detect(decoded_sample)
            if lang == 'sv':
                self.encodingmaps[int(fontid)] = encodingmap
                return low_offset  # used for diagnostic logging
        except LangDetectException:
            pass
    raise errors.PDFDecodeError("cannot detect how to decode font %s using %r" % (fontid, sampletext))
def build_dictionary(review_page):
    # dictionary for each product
    product = {}
    # counter used to detect review for particular product
    counter = 0
    # parsing the review page
    dom = parse(StringIO.StringIO(review_page))
    # extracting the item tags
    name = dom.getElementsByTagName('item')
    # iterating over each item
    for child in name:
        # extracting the given title tag from the item
        title = child.getElementsByTagName('title')[0]
        # extracting the text title from the item
        text = child.getElementsByTagName('text')[0]
        # checking if the corresponding tag is present in the item
        if (title.hasChildNodes() and len(title.childNodes) > 0 and
                title.childNodes[0].data is not None and
                text.hasChildNodes() and len(text.childNodes) > 0 and
                text.childNodes[0].data is not None):
            key = title.childNodes[0].data
            value = text.childNodes[0].data
            # checking if the product is present in the dictionary
            if key in product:
                product[key].append(value)
            else:
                # if the product is not present, then adding it to the dictionary, based on the language
                if counter == 0:
                    # langdetect library used to detect the language of the particular review
                    language = langdetect.detect(value)
                    counter += 1
                product[key] = [value]
    dictionary = (language, product)
    return dictionary
def valid_language(text):
    supported_languages = settings.LANGUAGE_DETECTION
    if supported_languages:
        lang = langdetect.detect(text)
        if lang not in supported_languages:
            raise ValidationError(
                'Language "{0}" is not one of the supported languages {1}!'.format(lang, supported_languages))
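A hypothetical usage sketch for the validator above (not from the original project); it assumes settings.LANGUAGE_DETECTION is a list of ISO codes such as ['en', 'de'] and that ValidationError is the framework's validation exception (for example django.core.exceptions.ValidationError).

# Assumed: settings.LANGUAGE_DETECTION = ['en', 'de']; the sample sentence is invented.
try:
    valid_language("Ceci est un commentaire rédigé en français.")
except ValidationError as exc:
    # Likely output: Language "fr" is not one of the supported languages ['en', 'de']!
    print(exc)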
def titles(self, key, value):
    def is_main_title(key):
        return key.startswith('245')

    def is_translated_title(key):
        return key.startswith('242')

    titles = self.setdefault('titles', [])
    values = force_force_list(value)
    for val in values:
        title_obj = {
            'title': val.get('a'),
            'subtitle': force_single_element(val.get('b')),  # FIXME: #1484
            'source': val.get('9'),
        }
        if is_main_title(key):
            titles.insert(0, title_obj)
        elif is_translated_title(key):
            title = val.get('a')
            if title:
                lang = langdetect.detect(title)
                if lang:
                    title_obj['language'] = lang
                self.setdefault('title_translations', []).append(title_obj)
        else:
            titles.append(title_obj)

    return titles
def language_in_tweet(tweet):
    detected_lang = None
    try:
        detected_lang = detect(tweet['text'])
    except lang_detect_exception.LangDetectException:
        pass
    return any([detected_lang in args])
def analyze(s, language=None):
    # Detect language if not provided
    if language is None:
        language = detect(s)
    if language not in ["en"]:
        raise ValueError("Language " + language + " not supported")

    # Load pattern
    pattern = importlib.import_module("pattern." + language)

    # Perform analysis
    analysis = {}
    pt = pattern.parsetree(s)
    analysis["wordPerSentence"] = stats([len(s.words) for s in pt])

    # Moods
    moods = Counter([pattern.mood(s) for s in pt])
    tot = sum([v for k, v in moods.iteritems()])
    analysis["moods"] = {}
    for k in moods.keys():
        analysis["moods"][k] = round(float(moods[k]) / tot * 100)

    # analysis["modality"] = stats([pattern.modality(s) for s in pt])
    sentiments = [pattern.sentiment(s) for s in pt]
    analysis["polarity"] = stats([s[0] for s in sentiments])
    analysis["subjectivity"] = stats([s[1] for s in sentiments])
    analysis["positivity"] = stats([int(pattern.positive(s)) for s in pt])

    return analysis
def detectLanguage(sentence):
    try:
        lang = detect(sentence)
        return lang
    except Exception, e:
        print "--- ERROR detecting language ---"
        print e
def search_by_brand(self, brand):
    '''Search by brand; returns the mall ids with results repartition.'''
    self.collect_brands()
    if detect(brand) == "ja":
        if brand in self.brands["jap"].keys():
            return self.search_mall_id(self.brands["jap"][brand]["url"])
        else:
            brand_t = re.split("・|ー| |&", brand)
            for k, v in self.brands["jap"].items():
                for t in v["tags"]:
                    if t == brand:
                        return self.search_mall_id(self.brands["jap"][k]["url"])
                    for tag in brand_t:
                        if tag == t:
                            return self.search_mall_id(self.brands["jap"][k]["url"])
    else:
        if brand.lower() in self.brands["en"].keys():
            return self.search_mall_id(self.brands["en"][brand.lower()]["url"])
        else:
            brand_t = re.split("・|ー| |&", brand)
            for k, v in self.brands["en"].items():
                for t in v["tags"]:
                    if t == brand:
                        return self.search_mall_id(self.brands["en"][k]["url"])
                    for tag in brand_t:
                        if tag == t:
                            return self.search_mall_id(self.brands["en"][k]["url"])
def _get_art_context(record):
    reader = LiteratureReader(record)

    abstract = reader.abstract
    try:
        abstract_language = detect(abstract)
    except LangDetectException:
        abstract_language = ''

    return {
        'abstract': abstract,
        'abstract_language': abstract_language,
        'arxiv_id': reader.arxiv_id,
        'authors': get_authors(record),
        'collaborations': reader.collaborations,
        'divulgation': get_divulgation(record),
        'doi': get_doi(record),
        'domains': get_domains(record),
        'inspire_id': get_inspire_id(record),
        'journal_issue': get_journal_issue(record),
        'journal_title': get_journal_title(record),
        'journal_volume': get_journal_volume(record),
        'keywords': reader.keywords,
        'language': get_language(record),
        'page_artid': get_page_artid(record),
        'peer_reviewed': get_peer_reviewed(record),
        'publication_date': get_publication_date(record),
        'subtitle': reader.subtitle,
        'title': reader.title,
    }
def _get_ocr(self, pngs):
    self._render(" OCRing the PDF", 2)

    raw_text = self._ocr(pngs, self.DEFAULT_OCR_LANGUAGE)
    guessed_language = langdetect.detect(raw_text)

    self._render(" Language detected: {}".format(guessed_language), 2)

    if guessed_language not in ISO639:
        self._render("Language detection failed!", 0)
        if settings.FORGIVING_OCR:
            self._render(
                "As FORGIVING_OCR is enabled, we're going to make the best "
                "with what we have.", 1)
            return raw_text
        raise OCRError

    if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
        return raw_text

    try:
        return self._ocr(pngs, ISO639[guessed_language])
    except pyocr.pyocr.tesseract.TesseractError:
        if settings.FORGIVING_OCR:
            self._render(
                "OCR for {} failed, but we're going to stick with what "
                "we've got since FORGIVING_OCR is enabled.".format(guessed_language),
                0,
            )
            return raw_text
        raise OCRError
def read_data():
    """
    INPUT: None
    OUTPUT: pandas data frame from file
    """
    list_of_files = glob.glob('app/uploads/*.csv')  # * means all; if need specific format then *.csv
    latest_file = max(list_of_files, key=os.path.getctime)
    df = pd.read_csv(latest_file, skiprows=12, usecols=range(0, 12))
    if (df.Platform == 'iOS').all():
        keep = ['Date', 'App ID', 'App Name', 'User', 'Version', 'Rating', 'Review']
        df = df[keep]
        df.columns = ['date', 'business_id', 'business_name', 'user_name', 'version', 'review_stars', 'text']
    else:
        df = df[df.Language == 'English']
        keep = ['Date', 'App Name', 'Publisher ID', 'User', 'Rating', 'Review']
        df = df[keep]
        df.columns = ['date', 'business_name', 'business_id', 'user_name', 'review_stars', 'text']
    # detect the language of each review and keep only the English ones
    langs = []
    for rev in df['text']:
        try:
            langs.append(detect(rev))
        except:
            langs.append(None)
    df['lang'] = langs
    df = df[df.lang == 'en']
    return df
def is_english(s):
    """Predicate that estimates whether a given string is in English"""
    try:
        return langdetect.detect(s) == 'en'
    except:
        print("Couldn't detect the language of: {}".format(s))
        return True
def get_lang(self):
    """
    Detect language from the body. This method takes some time.
    @return string lang, can be 'fr' or 'en'
    """
    lang = detect(self.get_body())
    return lang
def _guess_language(self, text):
    try:
        guess = langdetect.detect(text)
        self.log("debug", "Language detected: {}".format(guess))
        return guess
    except Exception as e:
        self.log("warning", "Language detection error: {}".format(e))
def translate_to_en(input_text):
    source = detect(input_text)
    translated = service.translations().list(q=input_text, target='en', source=source).execute()
    return (source, translated['translations'][0]['translatedText'])
def czech_filter(text):
    '''Filter Czech text using the langdetect library.
    To filter another language, simply change 'cs' to a different value, e.g. 'de', 'sk', 'pl' etc.
    '''
    if langdetect.detect(text) == 'cs':
        return True
    return False
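An illustrative usage sketch for the filter above; the sample sentences are invented, and since langdetect is probabilistic, pinning DetectorFactory.seed keeps repeated runs stable.

from langdetect import DetectorFactory

DetectorFactory.seed = 0
print(czech_filter("Dobrý den, jak se máte dnes?"))   # expected True  (likely detected as 'cs')
print(czech_filter("Guten Tag, wie geht es Ihnen?"))  # expected False (likely detected as 'de')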
def filter_russian(content_file_path, output_file_path):
    from langdetect import detect, lang_detect_exception
    line_count = 0
    line_count_mod = 1
    lang = None
    print 'Content file:', content_file_path
    print 'Filtering Russian language...'
    out = codecs.open(output_file_path, 'w+', encoding=ENCODING)
    with codecs.open(content_file_path, encoding=ENCODING) as content_file:
        for line in content_file:
            try:
                lang = detect(line)
            except lang_detect_exception.LangDetectException as e:
                pass
            if lang == 'ru':
                out.write(line)
            line_count += 1
            if line_count % line_count_mod == 0:
                print line_count, 'lines processed...'
                line_count_mod *= 2
    print line_count, 'lines processed in total.'
    out.close()
def produce_raw_layer():
    try:
        print 'START: Insert of data into database at %s.' % datetime.datetime.now()
        cycle_start_time = datetime.datetime.now()
        recent_media_added = 0
        users_added = 0
        users_updated = 0
        user_recent_media_added = 0

        # Get recent popular media
        recent_media = api.media_popular(count=64)
        for media in recent_media:
            # Parse the recent popular media
            parsed_media = RawRecentMediaEntity.parse(media)

            # Determine if english speaking user, if so, continue
            ## TODO: Maybe detect all possible languages and then if 'en' is in it, it passes
            try:
                if langdetect.detect(parsed_media.caption_text) != 'en':
                    continue
            except LangDetectException:
                continue

            # Save the parsed media
            parsed_media.save()
            recent_media_added += 1

            user_recent_media_added, users_added, users_updated = handle_user_info(
                parsed_media, user_recent_media_added, users_added, users_updated)

        log_run_metrics(cycle_start_time, recent_media_added, users_added, users_updated, user_recent_media_added)
    except Exception as e:
        print("ERROR - userId: %d caused error: " + str(e))
        pass
def fetch_item_lyrics(self, lib, item, write, force):
    """Fetch and store lyrics for a single item. If ``write``, then the
    lyrics will also be written to the file itself."""
    # Skip if the item already has lyrics.
    if not force and item.lyrics:
        self._log.info(u"lyrics already present: {0}", item)
        return

    lyrics = None
    for artist, titles in search_pairs(item):
        lyrics = [self.get_lyrics(artist, title) for title in titles]
        if any(lyrics):
            break

    lyrics = u"\n\n---\n\n".join([l for l in lyrics if l])

    if lyrics:
        self._log.info(u"fetched lyrics: {0}", item)
        if HAS_LANGDETECT and self.config["bing_client_secret"].get():
            lang_from = langdetect.detect(lyrics)
            if self.config["bing_lang_to"].get() != lang_from and (
                not self.config["bing_lang_from"] or
                (lang_from in self.config["bing_lang_from"].as_str_seq())
            ):
                lyrics = self.append_translation(lyrics, self.config["bing_lang_to"])
    else:
        self._log.info(u"lyrics not found: {0}", item)
        fallback = self.config["fallback"].get()
        if fallback:
            lyrics = fallback
        else:
            return

    item.lyrics = lyrics
    if write:
        item.try_write()
    item.store()
def __process_tweet(self, tweet, group_name, flush_output=True, verbosity=True):
    # Fetch the full text of the tweet
    if 'extended_tweet' in tweet and tweet['extended_tweet']:
        if flush_output and verbosity:
            print("\t\t Extended tweet\n")
        text = tweet['extended_tweet']['full_text']
    else:
        text = tweet['text']
    text = text.encode('utf-16', 'surrogatepass').decode('utf-16')

    # Fetch the tweet source
    pattern = re.compile(r"(\>)(.+)(\<)")
    source = pattern.search(tweet['source']).group(2)
    if flush_output and verbosity:
        print("\t\tTweeted using {}\n".format(source))

    if tweet['lang']:
        lang = tweet['lang']
        print("\t\tTweet language is {}\n".format(lang))
    else:
        lang = detect(text)
        if flush_output and verbosity:
            print("\t\tLanguage detected as {}\n".format(lang))

    document = {
        '_id': tweet['id_str'],
        'text': text,
        'lang': lang,
        'source': source,
        'category': group_name,
        'quotes': tweet['quote_count'],
        'replies': tweet['reply_count'],
        'faves': tweet['favorite_count'],
        'retweets': tweet['retweet_count'],
        'created_at': tweet['created_at'],
        'quoted_tweet': tweet['quoted_status_id_str'] if 'quoted_status_id_str' in tweet else None,
        'user': {
            '_id': tweet['user']['id_str'],
            'name': tweet['user']['name'],
            'username': tweet['user']['screen_name'],
            'location': tweet['user']['location'],
            'verified': tweet['user']['verified'],
            'followers': tweet['user']['followers_count'],
            'followings': tweet['user']['friends_count'],
            'favourites': tweet['user']['favourites_count'],
            'statuses': tweet['user']['statuses_count']
        }
    }

    if flush_output:
        print("\t\tSaving the tweet... ")
    status, mode, record_id = self.dbi.upsert('tweets', {'_id': tweet['id_str']}, document)

    altered = False
    if status:
        if flush_output:
            print("Done")
        if mode == self.dbi.MODE_INSERTED:
            altered = True
            if flush_output:
                print(" - Inserted")
        elif mode == self.dbi.MODE_UPDATED:
            altered = True
            if flush_output:
                print(" - Updated")
        elif mode == self.dbi.MODE_NOT_CHANGED and flush_output:
            print(" - No Change")
    elif flush_output:
        print("Failed")

    if flush_output:
        print("\n")
    return altered
def _language(self, article):
    if not article.meta_lang:
        text = article.title + ' ' + article.summary
        return detect(text)
    return article.meta_lang
def find_or_create(cls, session, _url: str, language=None, sleep_a_bit=False):
    """
    If not found, download and extract all the required info for this article.

    :param _url:
    :return:
    """
    from zeeguu_core.model import Url, Article, Language
    import newspaper

    url = Url.extract_canonical_url(_url)

    try:
        found = cls.find(url)
        if found:
            return found

        art = newspaper.Article(url=url)
        art.download()
        art.parse()

        if art.text == '':
            raise Exception("Newspaper got empty article from: " + url)

        if sleep_a_bit:
            import time
            from random import randint

            print("GOT: " + url)
            sleep_time = randint(3, 33)
            print(f"sleeping for {sleep_time}s... so we don't annoy our friendly servers")
            time.sleep(sleep_time)

        if not language:
            if art.meta_lang == '':
                art.meta_lang = detect(art.text)
                zeeguu_core.log(f"langdetect: {art.meta_lang} for {url}")
            language = Language.find_or_create(art.meta_lang)

        # Create new article and save it to DB
        url_object = Url.find_or_create(session, url)

        new_article = Article(
            url_object,
            art.title,
            ', '.join(art.authors),
            art.text[0:32000],  # any article longer than this will be truncated...
            art.summary,
            None,
            None,
            language
        )
        session.add(new_article)
        session.commit()

        return new_article
    except (sqlalchemy.exc.IntegrityError, sqlalchemy.exc.DatabaseError):
        for i in range(10):
            try:
                session.rollback()
                u = cls.find(url)
                print("Found article by url after recovering from race")
                return u
            except:
                print("Exception of second degree in article..." + str(i))
                time.sleep(0.3)
                continue
            break
def get_lang(text): return detect(text)
# -*- coding: utf-8 -*-
"""
Created on Sat Mar  3 20:20:50 2018

@author: rqz
"""
import pandas as pd

# change the file name as needed
df = pd.read_csv('/Users/yilixia/Downloads/raw_test_ylxia.csv')

from googletrans import Translator
from langdetect import detect

chinese = []
other_lan = []
for i in range(len(df)):
    if detect(df.iloc[i, 1]) == 'ko' or detect(df.iloc[i, 1]) == 'zh-tw':
        chinese.append(i)
    elif detect(df.iloc[i, 1]) != 'en':
        translator = Translator()
        fake2 = translator.translate(df.iloc[i, 1])
        df.iloc[i, 1] = fake2.text
        other_lan.append(i)
    else:
        print(i)

# after the remaining Chinese rows have been translated manually, write the output
df.to_csv('/Users/yilixia/Downloads/translation_lyi.csv', index=False, encoding='utf-8')
def func_news_retrieve(*args, **kwarg):
    # init console log
    print("[01_news_retrieve] S Started job at " + str(datetime.datetime.utcnow()))

    # grab the current time
    dt = datetime.datetime.utcnow()

    # create a dictionary of rss feeds
    feeds = dict(
        thaipr_fin=r'http://www.thaipr.net/finance/feed',
        thaipr_property=r'http://www.thaipr.net/estate/feed',
        posttoday_econ=r'https://www.posttoday.com/rss/src/economy.xml',
        posttoday_fin=r'https://www.posttoday.com/rss/src/money.xml',
        posttoday_market=r'https://www.posttoday.com/rss/src/market.xml',
        posttoday_property=r'https://www.posttoday.com/rss/src/property.xml',
        bbkbiznews_buz=r'http://www.bangkokbiznews.com/rss/feed/business.xml',
        bkkbiznews_econ=r'http://www.bangkokbiznews.com/rss/feed/economic.xml',
        bkkbiznews_fin=r'http://www.bangkokbiznews.com/rss/feed/finance.xml',
        bkkbiznews_property=r'http://www.bangkokbiznews.com/rss/feed/property.xml',
        thaipbs_econ=r'http://news.thaipbs.or.th/rss/news/economy',
        matichon_econ=r'https://www.matichon.co.th/category/economy/feed',
        manager_stock=r'http://www.manager.co.th/RSS/StockMarket/StockMarket.xml',
        manager_mutualfund=r'http://www.manager.co.th/RSS/MutualFund/MutualFund.xml',
        manager_biz=r'http://www.manager.co.th/RSS/iBizChannel/iBizChannel.xml',
    )

    news_cat = dict(thaipr_fin='Finance', thaipr_property='Property',
                    posttoday_econ='Economy', posttoday_fin='Finance',
                    posttoday_market='Business', posttoday_property='Property',
                    bbkbiznews_buz='Business', bkkbiznews_econ='Economy',
                    bkkbiznews_fin='Finance', bkkbiznews_property='Property',
                    thaipbs_econ='Economy', matichon_econ='Economy',
                    manager_stock='Finance', manager_mutualfund='Finance',
                    manager_biz='Business')

    news_source = dict(thaipr_fin='ThaiPR', thaipr_property='ThaiPR',
                       posttoday_econ='PostToday', posttoday_fin='PostToday',
                       posttoday_market='PostToday', posttoday_property='PostToday',
                       bkkbiznews_buz='BangkokBizNews', bkkbiznews_econ='BangkokBizNews',
                       bkkbiznews_fin='BangkokBizNews', bkkbiznews_property='BangkokBizNews',
                       thaipbs_econ='ThaiPBS', matichon_econ='Matichon',
                       manager_stock='Manager', manager_mutualfund='Manager',
                       manager_biz='Manager')

    data = []
    count_insert = 0
    count_duplicate = 0

    filterBOTKeyword = [
        'ธปท', 'ธนาคารแห่งประเทศไทย', 'ธนาคารชาติ', 'ธนาคารกลาง', 'แบงค์ชาติ', 'แบงก์ขาติ',
        'Bank of Thailand', 'กนง', 'คณะกรรมการนโยบายการเงิน', 'ศคง',
        'ศูนย์คุ้มครองผู้ใช้บริการทางการเงิน', 'สถาบันวิจัยเศรษฐกิจป๋วย อึ๊งภากรณ์',
        'กองทุนเพื่อการฟื้นฟู', 'FIDF', 'วิรไท สันติประภพ', 'ไพบูลย์ กิตติศรีกังวาน',
        'เมธี สุภาพงษ์', 'วชิรา อารมย์ดี', 'จาตุรงค์ จันทรังษ์', 'ฤชุกร สิริโยธิน',
        'รณดล นุ่มนนท์', 'สิริธิดา พนมวัน ณ อยุธยา', 'ณัฐวุฒิ พงศ์สิริ', 'เพิ่มสุข สุทธินุ่น',
        'วรพร ตั้งสง่าศักดิ์ศรี', 'นวพร มหารักขกะ', 'พฤทธิพงศ์ ศรีมาจันทร์', 'สุภาวดี ปุณศรี',
        'จันทวรรณ สุจริตกุล', 'ปิติ ดิษยทัต', 'สักกะภพ พันธ์ยานุกูล', 'ดอน นาครทรรพ',
        'สุรัช แทนบุญ', 'ยรรยง ไทยเจริญ', 'รุ่ง มัลลิกะมาส'
    ]

    # Access the 'headlines' collection in the 'news' database
    client = pymongo.MongoClient()
    collection = client.sentifine.news_map
    collection_fin = client.sentifine.news_raw

    for feed, url in feeds.items():
        rss_parsed = feedparser.parse(url)
        for art in rss_parsed['items']:
            # Filter only Thai language from title
            lang = detect(art['title'])
            #print(art)
            if lang == 'th':
                # Checking if each news item is related to BOT
                filter_bot = 'N'
                if any(k in str(art['title']) for k in filterBOTKeyword) or \
                        any(k in str(art['title_detail']) for k in filterBOTKeyword) or \
                        any(k in str(art['summary']) for k in filterBOTKeyword):
                    filter_bot = 'Y'

                published = parser.parse(art['published'])

                sentiment_default = "Retrieved"
                m = {
                    '_id': art['link'],
                    'title': art['title'],
                    'published': published,
                    'url_link': art['link'],
                    'retrieved': dt
                }
                r = {
                    'source': news_source.get(feed),
                    'source_url': feed,
                    'title': art['title'],
                    'published': published,
                    'title_detail': art['title_detail']['value'],
                    'summary': art['summary'],
                    'category': news_cat.get(feed),
                    'url_link': art['link'],
                    'retrieved': dt,
                    'filter_BOT': filter_bot,
                    'status': sentiment_default
                }

                # insert item by item because of the duplicate of some source's links
                try:
                    count_insert = count_insert + 1
                    collection.insert_one(m)      # news_map
                    collection_fin.insert_one(r)  # news_raw
                except pymongo.errors.DuplicateKeyError:
                    count_insert = count_insert - 1
                    count_duplicate = count_duplicate + 1
                    # pass  # allow only this exception
                except Exception as ex:
                    print("[01_news_retrieve] E Unexpected error while inserting collection news_map & news_raw.")
                    print(str(ex))
                    # raise
            else:
                print("[01_news_retrieve] W Non-Thai Content from: " + art['link'])

    # final log
    print("[01_news_retrieve] I Number of Duplicated Records :" + str(count_duplicate))
    print("[01_news_retrieve] I Number of New Records :" + str(count_insert))
    print("[01_news_retrieve] S Finished job at " + str(datetime.datetime.utcnow()))
def verify_language(self, text):
    """Given a text, verify that it is in a relevant language."""
    return langdetect.detect(text) == 'fr'
from collections import Counter
import matplotlib.pyplot as plt

# read the csv file
df = pd.read_csv("music_lyrics.csv")

# remove columns that contain N/A values
df.drop(df.columns[[0, 1]], axis=1, inplace=True)
df1 = df.dropna()
mydf = df1[df1.lyrics != 'No Lyrics']

DetectorFactory.seed = 0

# detect lyrics language types
lang = []
for i in list(set(mydf.lyrics)):
    lang.append(detect(i))

# get the counts for languages of lyrics
Counter(lang)

# Pie chart for English lyrics and lyrics of other languages
labels = ['English', 'Other languages']
sizes = [3982, 37]

# change figure size by modifying figsize
fig, ax1 = plt.subplots(figsize=(8, 8))
# explosion
explode = (0.05, 0.05)
ax1.pie(
    sizes,
    labels=labels,
    autopct='%1.1f%%',
    startangle=90,
def safedetect(text):
    try:
        return detect(text)
    except:
        return 'nan'
if count < running_from:
    continue
#if count > running_to:
#    break

print('Processing mm ' + str(count) + ': ' + mul_id)

# retrieve data and save in SQLite
data = []
if type == 'image':
    links = link_retrieval.find_related_links(abs_path, nPages)
    for l in links:
        text = getTextFromLink(l)
        try:
            if detect(text) == 'en':
                print("===>" + l)
                # accumulate data
                data.append((mul_id, l, text))
        except Exception as e:
            print(e)
if type == 'video':
    text = getTextFromVideoLink(abs_path)
    if text != '':
        print("===>" + abs_path)
        # accumulate data
        data.append((mul_id, abs_path, text))

# insert data into database
c.executemany("INSERT INTO website_from_img VALUES (?,?,?)", data)
def find_lang(x):
    try:
        return detect(x)
    except:
        return 'none'
from gtts import gTTS
import os
from langdetect import detect

Text_generated = "hello"
language = detect(Text_generated)
text = Text_generated

speech = gTTS(text=text, lang=language, slow=False)
speech.save("text2.mp3")
os.system("start text2.mp3")

'''import pytesseract
import shutil
import os
import random
try:
    from PIL import Image
except ImportError:
    import Image
import glob
import cv2

def read_img(img_list, img):
    n = cv2.imread(img, 0)
    img_list.append(n)
    return img_list

path = glob.glob("*.bmp")  #or jpg
list_ = []
cv_image = [read_img(list_, img) for img in path]
image_path_in_colab=r'C:\\Users\\charv\\PicTalk\\uploads\\spacejam2.png'
def detect_title(title: str):
    strc = " ".join([token.capitalize() for token in title.split()])
    lang = resolve(detect(strc))
    return lang
def tokenization_process(text: str) -> list:
    # tokenization of text words using spacy and other techniques
    if re.sub(re.compile('\d|\:|\s|\-|\+|\!|\/|\,|\.|\=|\?|\!|\砰'), '', text) != '':
        # STOPWORDS
        lang = [
            'arabic', 'azerbaijani', 'danish', 'dutch', 'english', 'finnish', 'french',
            'german', 'greek', 'hungarian', 'indonesian', 'italian', 'kazakh', 'nepali',
            'norwegian', 'portuguese', 'romanian', 'russian', 'slovene', 'spanish',
            'swedish', 'tajik', 'turkish'
        ]
        try:
            stopwords = set(sw.words('english'))
        except:
            stopwords = set()
        for l in lang:
            try:
                stopwords = stopwords.union(set(sw.words(l)))
            except:
                stopwords = stopwords

        try:
            language = detect(text)
        except:
            language = 'en'

        if language == 'en':
            nlp = spacy.load("en_core_web_sm")
        elif language == 'zh':
            nlp = spacy.load("zh_core_web_sm")
        elif language == 'da':
            nlp = spacy.load("da_core_news_sm")
        elif language == 'nl':
            nlp = spacy.load("nl_core_news_sm")
        elif language == 'fr':
            nlp = spacy.load("fr_core_news_sm")
        elif language == 'de':
            nlp = spacy.load("de_core_news_sm")
        elif language == 'el':
            nlp = spacy.load("el_core_news_sm")
        elif language == 'it':
            nlp = spacy.load("it_core_news_sm")
        elif language == 'ja':
            nlp = spacy.load("ja_core_news_sm")
        elif language == 'lt':
            nlp = spacy.load("lt_core_news_sm")
        elif language == 'nb':
            nlp = spacy.load("nb_core_news_sm")
        elif language == 'pl':
            nlp = spacy.load("pl_core_news_sm")
        elif language == 'pt':
            nlp = spacy.load("pt_core_news_sm")
        elif language == 'ro':
            nlp = spacy.load("ro_core_news_sm")
        elif language == 'es':
            nlp = spacy.load("es_core_news_sm")
        else:
            # print('inter')
            nlp = spacy.load("xx_ent_wiki_sm")

        tokens = [
            x.lemma_.lower() for x in nlp(text)
            if (x.pos_ not in ['PUNCT', 'SPACE']) and (not x.is_stop)
        ]
        trash_tokens = [
            '–', '-', 'le', 'de', 'del', 'dell', 'della', 'l', 'degli', "dell'", "l'", '’',
            'l’', 'dell’', '.', '?', '!', '¡', 'a', 'do', '(', ')', 'e-', 'e', 'el', 'r',
            'n', 'se', 'una', 'alla', 'la', "'", 'to', 'of', 'o', "'n", 'y', "'s", ',',
            "'t", 'don', 'the', '・', 'u', '」', '「', 'в', 'por', 'el', 'du', 'les', ''
        ]
        tokens = [
            x for x in tokens
            if (x not in punctuation) and (x not in stopwords) and (x not in trash_tokens)
        ]
        return tokens
    else:
        return [text]
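The long if/elif chain above could arguably be collapsed into a lookup table. The following is a possible refactor sketch, not part of the original code; the model names assume the standard spaCy distributions, with xx_ent_wiki_sm as the multilingual fallback.

import spacy

# Hypothetical mapping from langdetect codes to spaCy pipeline names.
SPACY_MODELS = {
    'en': 'en_core_web_sm', 'zh': 'zh_core_web_sm', 'da': 'da_core_news_sm',
    'nl': 'nl_core_news_sm', 'fr': 'fr_core_news_sm', 'de': 'de_core_news_sm',
    'el': 'el_core_news_sm', 'it': 'it_core_news_sm', 'ja': 'ja_core_news_sm',
    'lt': 'lt_core_news_sm', 'nb': 'nb_core_news_sm', 'pl': 'pl_core_news_sm',
    'pt': 'pt_core_news_sm', 'ro': 'ro_core_news_sm', 'es': 'es_core_news_sm',
}

def load_pipeline(language: str):
    """Load the spaCy model for `language`, falling back to the multilingual model."""
    return spacy.load(SPACY_MODELS.get(language, 'xx_ent_wiki_sm'))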
def detect_language(x): return detect(x)
count = -1
with open('links.txt', 'r') as f:
    links = f.readlines()

for i in trange(len(links)):
    if i <= count:
        continue
    link = links[i]
    try:
        html = requests.get(link).text
    except (ConnectionError, ChunkedEncodingError):
        continue
    soup = BeautifulSoup(html, 'html.parser')
    song = soup.find_all('p', {'class': 'songtext'})[0]
    song_filtered = song.get_text()
    try:
        lang = detect(song_filtered[:50])
    except Exception:
        continue
    if lang != 'en':
        with open('songs_done.txt', 'w') as done_file:
            done_file.write(str(i))
        continue
    song_title = re.findall(r'/s.*?.html', link)[0][1:-5]
    try:
        with open(songs_folder + song_title + '.txt', 'w') as sf:
            sf.write(song_filtered)
    except UnicodeEncodeError as err:
        print(song_title + lang)
        print('%d files skipped' % (files_skipped + 1))
        files_skipped += 1
    try:
def detect_lang_type(x):
    lang_type = detect(x)
    return lang_type
def is_french(stringx):
    try:
        if detect(stringx.lower()) == 'fr':
            return True
    except:
        return False
def parse_page(self, response, **kwargs):
    try:
        o = {}
        title = response.xpath('//h1[@class="citation_title"]/text()').get()
        if title:
            self.parser.reset()
            self.parser.feed(title)
            title = self.parser.get_text()
            d_l = detect(title)
            if d_l == 'fa':
                t = {'title_fa': title}
            else:
                return None
            t['title_en'] = None
            o.update(t)
        else:
            return None

        volume = None
        number = None
        t = response.xpath('(//span[@id="ar_row_ind"]/following-sibling::a)[1]/text()').get()
        if t is not None:
            t = re.findall(number_pattern, t)
            try:
                volume = int(t[0])
                number = int(t[1])
            except (IndexError, ValueError, TypeError):
                volume = None
                number = None

        download_url = response.xpath('//a[@class="pdf"]/@href').get()
        download_url = response.urljoin(download_url)
        if download_url:
            file_name = os.path.basename(download_url)
        else:
            file_name = None

        summary = response.xpath('//td[@id="abs_fa"]').get()
        if summary:
            self.parser.reset()
            self.parser.feed(summary)
            summary = self.parser.get_text()
        t = {'summary_fa': None, 'summary_en': None}
        if summary:
            d_l = detect(summary)
            if d_l == 'fa':
                t.update({'summary_fa': summary})
            elif d_l == 'en':
                t.update({'summary_en': summary})
        o.update(t)

        o.update({
            'volume': volume,
            'number': number,
            'file_name': '%s_%s' % (self.name, file_name) if file_name else None,
            'download_url': download_url
        })

        keywords = response.xpath('//a[starts-with(@href, "./?_action=article&kw=")]/text()').getall()
        keywords_fa = []
        keywords_en = []
        for kw in keywords:
            d_l = detect(kw)
            if d_l == 'fa':
                keywords_fa.append(kw)
            elif d_l == 'en':
                keywords_en.append(kw)
        o['keywords_fa'] = keywords_fa if keywords_fa else None
        o['keywords_en'] = keywords_en if keywords_en else None

        yield o
    except LangDetectException:
        pass
def generate_meme(path, file_name, cmnd_type, cmnd_value, tpos=1, bpos=1):
    lang = detect(cmnd_value[0])
    if lang == 'ar':
        generate_meme_ar(path, file_name, cmnd_type, cmnd_value, tpos=1, bpos=1)
        return

    for cmn in range(len(cmnd_type)):
        temp = cmnd_type[cmn]
        if temp == 'top':
            top_txt = cmnd_value[cmn]
        elif temp == 'bot':
            bot_txt = cmnd_value[cmn]
        elif temp == 'bpos':
            b_per = int(cmnd_value[cmn])
        elif temp == 'tpos':
            t_per = int(cmnd_value[cmn])
        elif temp == 'font':
            font = int(cmnd_value[cmn])
        elif temp == 'font size':
            font_size = int(cmnd_value[cmn])

    img = cv2.imread(path)
    ary = np.asarray(img)
    print(ary.shape)

    if tpos == 1:    # left
        top_pos = 0
    elif tpos == 2:  # mid
        top_pos = int(ary.shape[0] / 2)
    else:
        top_pos = int(ary.shape[0])

    if bpos == 1:    # left
        bot_pos = 0
    elif bpos == 2:  # mid
        bot_pos = int(ary.shape[0] / 2)
    else:
        bot_pos = int(ary.shape[0])

    print(100 * ary.shape[1] / 100)
    print(int(100 * ary.shape[1] / 100))
    top_percent = int(t_per * ary.shape[1] / 100)
    bot_percent = int(b_per * ary.shape[1] / 100)

    cv2.putText(img, top_txt, (top_pos, top_percent), font, font_size, (255, 255, 255), font_size, cv2.LINE_AA)
    cv2.putText(img, bot_txt, (bot_pos, bot_percent), font, font_size, (255, 255, 255), font_size, cv2.LINE_AA)
    # Parameters are as follows:
    # cv2.putText(img, text, (org), font, fontScale, color, thickness, lineType)
    #   img: your image
    #   text: a string of text to print on image
    #   org: bottom-left corner of the text string in the image (x, y)
    #   font: font type
    #   fontScale: font scale
    #   color: text color (B, G, R)
    #   thickness: text line thickness
    #   lineType: line type (8)
    cv2.imwrite("./meme_generated/" + file_name, img)
        content_buff += "\n"
        continue
    content_buff_temp = ""
    if line == prev_st:
        content_buff_temp = prev_tok
    else:
        if language_mixed_en(line):
            word_list = active_content = line.split(" ")
            text_tmp = ""
            tokens_eng = ""
            tokens_th = ""
            for word in word_list:
                # print(word)
                try:
                    lang = detect(word)
                except:
                    lang = "en"
                if lang == "th":
                    # tokenize_thai(text):
                    # print(word + " : " + detect(word))
                    tokens_eng = tokenize_eng(text_tmp)
                    tokens_th = tokenize_thai(word)
                    content_buff_temp = content_buff_temp + " " + tokens_eng + " " + tokens_th
                    tokens_eng = ""
                    tokens_th = ""
                    text_tmp = ""
                else:
                    text_tmp = text_tmp + " " + word
            if text_tmp != "":
def detect(sentence): return langdetect.detect(sentence) != 'en'
if __name__ == "__main__":
    new_movies_df = pd.read_csv(NEW_MOVIES_PATH)
    new_movie_ids = get_movie_ids(new_movies_df.shape[0], count_movies)
    new_movies_data = list(new_movies_df.title + ". " + new_movies_df.plot)
    for ind, elem in enumerate(new_movies_df.description):
        if elem:
            new_movies_data[ind] += " " + elem
    sent_data = div_to_sent(new_movies_data)

    print(colored("Построение векторных представлений сюжетов", "yellow"))  # "Building vector representations of the plots"
    rubert_embedings = []
    multilingual_embedings = []
    for ind, plot in tqdm(enumerate(sent_data)):
        if detect(plot) != "ru":
            _, _, _, _, _, _, bert_pooler_outputs = multilingual_bert(plot)
            multilingual_embedings.append(
                {"embeding": bert_pooler_outputs.mean(axis=0), "index": ind}
            )
        else:
            _, _, _, _, _, _, bert_pooler_outputs = rubert(plot)
            rubert_embedings.append(
                {"embeding": bert_pooler_outputs.mean(axis=0), "index": ind}
            )

    multilingual_emb_matrix = [elem["embeding"] for elem in multilingual_embedings]
    rubert_emb_matrix = [elem["embeding"] for elem in rubert_embedings]

    print(colored("Предсказание векторов SVD", "yellow"))  # "Predicting SVD vectors"
    multilingual_svd = multilingual_2_svd.predict(multilingual_emb_matrix)
def from_txt(self, file):
    with open(file, 'r') as f:
        data = f.readlines()
    for paragraph in data:
        if detect(paragraph) == 'en':
            self.paragraphs.append(Text(paragraph))
def language_detect(entry):
    try:
        return detect(entry)
    except Exception as e:
        print(f"{e} Using default language as english.")
        return 'en'
def langtype(v):
    try:
        lang = detect(unicode(v)[0])
        return lang
    except:
        return v