def test_guess(self):
    """Exercise guess_language and its tag/name/id/info convenience variants."""
    # Plain guesses: each fixture text must map to the expected language name.
    for text, name in self.tests:
        self.assertEqual(guess_language(text), name)
    # Hinted guesses: a hint list narrows the candidate languages.
    for text, name, hints in self.tests_with_hints:
        self.assertEqual(guess_language(text, hints), name)
    # One French sample checked through every convenience wrapper.
    text = "Vérifions que le détecteur de langue fonctionne."
    self.assertEqual(guess_language_tag(text), "fr")
    self.assertEqual(guess_language_name(text), "French")
    self.assertEqual(guess_language_id(text), 26150)
    self.assertEqual(guess_language_info(text), ("fr", 26150, "French"))
def parse(self, **kwargs):
    """Extract metadata from an R Markdown file.

    Keyword args:
        p: path to the file to parse.
        md: master metadata dict passed through to the YAML parser.
        m: if truthy, apply the multiline rule set; else parse the whole
           file as one R code block.
        is_debug: debug flag forwarded to status_note.
        xo: "stay offline" flag forwarded to the YAML parser.

    Returns the populated metadata dict, or the string 'error' on failure.
    """
    try:
        path_file = kwargs.get('p', None)
        MASTER_MD_DICT = kwargs.get('md', None)
        multiline = kwargs.get('m', None)
        is_debug = kwargs.get('is_debug', None)
        stay_offline = kwargs.get('xo', None)
        data_dict = {'mainfile': path_file, 'depends': []}
        try:
            with open(path_file, encoding='utf-8') as input_file:
                content = input_file.read()
                if multiline:
                    # reset key; try to guess language from a mid-sized sample:
                    data_dict['paperLanguage'] = []
                    t = re.search(r'([\w\d\s\.\,\:]{300,1200})', content,
                                  flags=re.DOTALL)
                    if t:
                        if guess_language(t.group(1)) is not None:
                            data_dict['paperLanguage'].append(
                                guess_language(t.group(1)))
                        else:
                            data_dict['paperLanguage'] = []
                    # process rules (each rule is "name<TAB>pattern")
                    for rule in rule_set_rmd_multiline:
                        this_rule = rule.split('\t')
                        s = re.search(this_rule[1], content, flags=re.DOTALL)
                        if s:
                            if this_rule[0].startswith('yaml'):
                                from parsers.parse_yaml import ParseYaml
                                parsed = ParseYaml().internal_parse(
                                    s.group(1), MASTER_MD_DICT,
                                    stay_offline, is_debug)
                                if parsed == 'error':
                                    return parsed
                                else:
                                    data_dict.update(parsed)
                            if this_rule[0].startswith('rblock'):
                                data_dict = parse_r(s.group(1), data_dict)
                else:
                    # parse entire file as one code block
                    data_dict.update(r_codeblock=parse_r(content, data_dict))
        except UnicodeDecodeError:
            # BUGFIX: the original referenced <md_file>, an undefined name,
            # so this handler raised NameError instead of reporting the path.
            status_note(['! error, failed to decode <', path_file, '>'],
                        d=is_debug)
            return 'error'
        # save to list of extracted metadata:
        data_dict['provenance'] = get_prov(path_file)
        return data_dict
        # save or output results
        # todo: reenable that option:
        # if metafiles_all:
        #     output_extraction(data_dict, out_format, out_mode, path_file)
    except Exception as exc:
        status_note('! error while extracting Rmd', d=is_debug)
        return 'error'
def _to_db_row(fields): source_text = fields.get('sourceText', "").strip() target_text = fields.get('targetText', "").strip() res = {} if source_text: res.update({ 'source_text': source_text, 'source_lang': guess_language(source_text), }) if target_text: res.update({ 'target_text': target_text, 'target_lang': guess_language(target_text), }) return res
def get_language_tool_results(filename, file_contents, locale):
    """Run LanguageTool over the joined file contents and yield a
    (message, diffs, source_range) triple for every match found."""
    text = "".join(file_contents)
    if locale == 'auto':
        locale = guess_language(text)
    if not locale:
        locale = 'en-US'
    checker = LanguageTool(locale)
    for match in checker.check(text):
        if match.replacements:
            corrected = correct(text, [match]).splitlines(True)
            diffs = {filename: Diff.from_string_arrays(file_contents,
                                                       corrected)}
        else:
            diffs = None
        rule = match.ruleId
        if match.subId is not None:
            rule += '[{}]'.format(match.subId)
        yield (match.msg + ' (' + rule + ')',
               diffs,
               SourceRange.from_values(filename,
                                       match.fromy + 1, match.fromx + 1,
                                       match.toy + 1, match.tox + 1))
def urldata(url):
    """Fetch *url* and return the list of visible text lines guessed to be
    Arabic, printing progress along the way."""
    print('Opening page...', end=' ')
    request = urllib.request.Request(url,
                                     headers={'User-Agent': 'Mozilla/5.0'})
    page = urllib.request.urlopen(request).read()
    print('Done')
    print('Parsing...', end=' ')
    soup = BeautifulSoup(page, 'html.parser')
    print('Done')
    # print(soup.prettify())
    print('Finding all text...', end=' ')
    text_nodes = soup.findAll(text=True)
    print('Done')
    print('Filtering visible texts...', end=' ')
    visible_texts = [stripped
                     for stripped in (node.strip()
                                      for node in filter(visible, text_nodes))
                     if stripped]
    print('Done')
    print('Filtering arabic texts...', end=' ')
    arabic = [line for line in visible_texts
              if guess_language.guess_language(line) == 'ar']
    print('Done')
    return arabic
def find_out_language(candidate_languages, *args):
    """Guess the language of the given text samples.

    Each sample casts votes via guess_language and langdetect, restricted to
    candidate_languages; the most frequent vote wins (first seen on ties).

    Returns the winning language code, or None when there are no votes or
    the winner is UNKNOWN_LANGUAGE.
    """
    candidates = []
    for sample in args:
        candidate = guess_language(sample)
        if candidate != UNKNOWN_LANGUAGE and candidate in candidate_languages:
            candidates.append(candidate)
        try:
            for detected in detect_langs(sample):
                if detected.lang in candidate_languages:
                    candidates.append(detected.lang)
        except LangDetectException:
            continue
    if not candidates:
        return None
    # BUGFIX: the original iterated candidates[1:0] (always an empty slice),
    # rebound its accumulator dict to a plain string, and wrote 'size' while
    # reading 'count' — so the majority vote never actually ran. Use a plain
    # most-frequent selection; max() keeps the first-seen element on ties.
    winner = max(candidates, key=candidates.count)
    if winner == UNKNOWN_LANGUAGE:
        return None
    return winner
def fetch_word():
    # Build one big space-joined string of all words from the first 1000
    # tweets in the DB that are guessed to be English. Also registers every
    # non-empty tweet text in the module-level tfidf/wordcount dicts.
    Tweet = db.Tweet
    session = db.session
    tweets = session.query(Tweet).limit(1000)
    all_words = ''
    for tweet in tweets:
        if tweet.text:
            #print tweet.text
            # Initializing final dictionary {tweet:tfidf total} & {tweet:# of words}
            tweet_tfidf_dict[tweet.text] = None
            tweet_wordcount_dict[tweet.text] = None
            if guess_language(tweet.text) == 'en':
                tweet_text = str(tweet.text.encode('utf-8'))
                tweet_lower = tweet_text.lower()
                # NOTE(review): str.translate(None, ...) is the Python 2 API;
                # this function appears to target Python 2 — confirm runtime.
                tweet_final = tweet_lower.translate(None, string.punctuation)
                # Concatenate all words (including hashtags) in tweet
                all_words = all_words + ' ' + tweet_final
    return all_words
def main():
    """Sample 25 random *.txt files under sys.argv[1] and append up to 500
    English tweets from each batch to tweetdata.txt.

    Each line of the input files is expected to be one JSON-encoded tweet;
    unparseable lines are skipped.
    """
    rootdirname = sys.argv[1]
    # read all the files in the directory with .txt extension
    print(rootdirname)
    listfiles = recursive_glob(rootdirname, "*.txt")
    if len(listfiles) == 0:
        print("No files present")
        sys.exit(1)
    i = 0
    randfileslist = random.sample(listfiles, 25)
    for file in randfileslist:
        print("Current file is :", file)
        # BUGFIX: the original never closed either file handle; use context
        # managers so both are released deterministically.
        with open(file, "r") as f:
            lines = f.readlines()
        with open('tweetdata.txt', mode='at') as txtfile:
            for line in lines:
                try:
                    tweet = simplejson.loads(line)
                    if "text" in tweet:
                        text = str(tweet["text"].encode('utf-8', 'ignore'))
                        language = guess_language(text)
                        if language == 'en':
                            i = i + 1
                            # stop after 500 written tweets, reset the counter
                            if i == 501:
                                i = 0
                                break
                            txtfile.write(line)
                except ValueError:
                    # not valid JSON — skip the line
                    pass
def run(self, filename, file,
        natural_language: str = 'auto',
        languagetool_disable_rules: typed_list(str) = (),
        ):
    """
    Checks the code with LanguageTool.

    :param natural_language:
        A locale representing the language you want to have
        checked. If set to 'auto' the language is guessed.
        If the language cannot be guessed or an unsupported
        language is guessed, 'en-US' is used.
    :param languagetool_disable_rules:
        List of rules to disable checks for.
    """
    # Defer import so the check_prerequisites can be run without
    # language_check being there.
    from language_check import LanguageTool, correct

    joined_text = ''.join(file)
    natural_language = (guess_language(joined_text)
                        if natural_language == 'auto'
                        else natural_language)

    try:
        tool = LanguageTool(natural_language, motherTongue='en_US')
    except ValueError:
        # Using 'en-US' if guessed language is not supported.
        # BUGFIX: logging.warn is a deprecated alias; use logging.warning.
        logging.warning(
            "Changing the `natural_language` setting to 'en-US' as "
            '`language_check` failed to guess a valid language.'
        )
        natural_language = 'en-US'
        tool = LanguageTool(natural_language, motherTongue='en_US')

    tool.disabled.update(languagetool_disable_rules)

    matches = tool.check(joined_text)
    for match in matches:
        # Offer a diff only when LanguageTool suggests replacements.
        if not match.replacements:
            diffs = None
        else:
            replaced = correct(joined_text, [match]).splitlines(True)
            diffs = {filename: Diff.from_string_arrays(file, replaced)}

        rule_id = match.ruleId
        if match.subId is not None:
            rule_id += '[{}]'.format(match.subId)

        message = match.msg + ' (' + rule_id + ')'
        # Convert LanguageTool's positions to 1-based SourceRange values.
        source_range = SourceRange.from_values(filename,
                                               match.fromy + 1,
                                               match.fromx + 1,
                                               match.toy + 1,
                                               match.tox + 1)
        yield Result(self, message, diffs=diffs,
                     affected_code=(source_range,))
def getFeedLanguage(feed, debug=False): print "Feed:", feed.feed.title langCount = defaultdict(int) guessed_lang = guess_language(feed.feed.title) # .encode('utf-8') langCount[guessed_lang] += 2 for entry in feed.entries[:10]: if hasattr(entry, 'title'): txtValue = entry.title # print entry.title soup = getSoupParser(txtValue) txtValue = ' '.join(soup.findAll(text=True)) if debug: print "entry-title:", txtValue guessed_lang = guess_language(txtValue) langCount[guessed_lang] += 1 # print entry if hasattr(entry, 'summary'): txtValue = entry.summary soup = getSoupParser(txtValue) txtValue = ' '.join(soup.findAll(text=True)) if debug: print "entry-summary:", txtValue guessed_lang = guess_language(txtValue) langCount[guessed_lang] += 1 # print entry.content[0] if hasattr(entry, 'content'): allContents = ' '.join([content.value for content in entry.content]) print allContents soup = getSoupParser(txtValue) txtValue = ' '.join(soup.findAll(text=True)) if debug: print "entry-allcontents:", txtValue guessed_lang = guess_language(txtValue) langCount[guessed_lang] += 1 langCount['UNKNOWN'] = 0 if debug: print "langCount:", langCount key, value = max(langCount.iteritems(), key=lambda x: x[1]) return key
def test_guess_enchant(self):
    """Run the guess tests that require an enchant spelling dictionary,
    warning (instead of failing) for languages with no dictionary installed."""
    languages = enchant.list_languages()
    for text, name in self.enchant_tests:
        # Dictionaries are named like "en_US"; match on the language prefix.
        if any(language.startswith(name) for language in languages):
            self.assertEqual(guess_language(text), name)
        else:
            warnings.warn("no spelling dictionary for language {!r}"
                          .format(name))
def lang_percentage(statusi, language="mk"):
    """Return the fraction of statuses whose text is guessed to be in the
    given language (0 for an empty list). Python 2 code (`unicode`)."""
    if len(statusi) == 0:
        return 0
    matches = 0
    for status in statusi:
        if guess_language(unicode(status.text)) == language:
            matches += 1
    return matches / float(len(statusi))
def on_success(self, status_data):
    """Queue an incoming status if it carries text and passes the optional
    language filter (UNKNOWN guesses are let through)."""
    # check if the message has any text
    if 'text' not in status_data:
        return
    accept = True
    if self.lang_filter:
        inferred = guess_language(status_data['text'])
        accept = inferred == self.lang_filter or inferred == UNKNOWN
    if accept:
        self.queue.put(status_data, False)
def filterEnglishTweets(self, filePathToFilter):
    """Rewrite the file in place, keeping only lines whose decoded text is
    guessed to be English. Python 2 code (bytes lines are .decode()d)."""
    with open(filePathToFilter, 'r') as source:
        kept = [line for line in source.readlines()
                if guess_language(line.decode("utf-8")) == u"en"]
    with open(filePathToFilter, 'w') as sink:
        for line in kept:
            sink.write(line)
def strip_stopwords(words):
    """Filter language-specific stopwords out of *words*.

    Returns *words* unchanged when the language cannot be guessed or no
    stopword list exists for it; otherwise returns a filter of the words
    not in the stopword set.
    """
    joined = ' '.join(words)
    code = guess_language.guess_language(joined)
    if code == 'UNKNOWN':
        return words
    stops = munin.stopwords.load_stopwords(code)
    if not stops:
        return words
    return filter(lambda word: word not in stops, words)
def on_success(self, status_data):
    """Log an incoming status as "@author: text" when it carries text and
    passes the optional language filter (UNKNOWN guesses pass)."""
    # check if the message has any text
    if 'text' not in status_data:
        return
    accept = True
    if self.lang_filter:
        inferred = guess_language(status_data['text'])
        accept = inferred == self.lang_filter or inferred == UNKNOWN
    if accept:
        tweet = "@{author}: {text}".format(
            author=status_data['user']['screen_name'],
            text=status_data['text'])
        self.logger.debug(tweet)
def reply_post():
    """Persist a reply to the post identified by the submitted parent_id,
    tag it with a guessed language, then redirect to the index."""
    form = PostForm(request.form)
    detected = guess_language(form.body.data)
    if detected == 'UNKNOWN' or len(detected) > 5:
        detected = ''
    parent_post = Post.query.filter_by(path=form.parent_id.data).first()
    reply = Post(body=form.body.data, parent=parent_post,
                 author=current_user, language=detected)
    reply.save()
    flash('Your post is now live!')
    # TODO: Figure out how to add post without redirecting; if redirecting is
    # necessary, then have it redirect to the last known visted page
    return redirect(url_for('main.index'))
def normalize_namen(groups):
    """Clean whitespace in group titles, warning when the French or Italian
    title does not look like the expected language.

    Each group tuple is passed through unchanged except for the three
    (possibly missing) title fields, which are whitespace-cleaned.
    """
    new_groups = []
    for (titles, members, sekretariat, konstituierung, zweck,
         art_der_aktivitaeten, mitgliederliste) in groups:
        title_de = titles[0]
        title_fr = titles[1] if len(titles) > 1 else None
        title_it = titles[2] if len(titles) > 2 else None
        if title_fr:
            # Hoisted: the original called guess_language twice per title.
            guessed_fr = guess_language(title_fr, ['de', 'fr', 'it'])
            if guessed_fr not in ['fr', 'UNKNOWN']:
                # BUGFIX: message typo ("guess lanuage is guessed").
                print("Warning: title_fr '{}' language is guessed '{}'\n".
                      format(title_fr, guessed_fr))
        if title_it:
            guessed_it = guess_language(title_it, ['de', 'fr', 'it'])
            if guessed_it not in ['it', 'UNKNOWN']:
                print("Warning: title_it '{}' language is guessed '{}'\n".
                      format(title_it, guessed_it))
        new_groups.append(
            (clean_whitespace(title_de), clean_whitespace(title_fr),
             clean_whitespace(title_it), members, sekretariat,
             konstituierung, zweck, art_der_aktivitaeten, mitgliederliste))
    return new_groups
def get_lang_guess(text: str) -> str:
    """Guess the language of *text* via guess_language.

    Returns "" when no guess is available or when the guess is a protected
    language (glovar.lang_protect); errors are logged and yield "".
    """
    guessed = ""
    try:
        guessed = guess_language(text)
        if not guessed:
            return ""
        if guessed != "UNKNOWN" and guessed in glovar.lang_protect:
            return ""
    except Exception as e:
        logger.info(f"Get lang guess error: {e}", exc_info=True)
    return guessed
def index():
    """Home page: create a post (optionally with a file attachment), then
    show the paginated feed of followed posts."""
    form = PostForm()
    if form.validate_on_submit():
        # Tag the post with its guessed language; blank out unknown or
        # suspiciously long codes.
        language = guess_language(form.post.data)
        if language == 'UNKNOWN' or len(language) > 5:
            language = ''
        post = Post(body=form.post.data, author=current_user,
                    language=language)
        db.session.add(post)
        db.session.commit()
        """pref = Preferences(preferPost = Post.query.order_by(Post.id.desc()).first(),
                    science = bool(form.science.data),
                    sport = bool(form.sport.data),
                    people = bool(form.people.data),
                    policy = bool(form.policy.data))
        db.session.add(pref)
        db.session.commit()"""
        if form.file.data:
            print(Post.query.order_by(Post.id.desc()).first())
            print("Name saved file: ", form.file.data.name)
            # Attach the uploaded file to the most recently created post.
            newFile = FileContent(name=form.file.data.filename,
                                  data=form.file.data.read(),
                                  postId=Post.query.order_by(
                                      Post.id.desc()).first())
            db.session.add(newFile)
            db.session.commit()
        flash(_('Your post is now live!'))
        return redirect(url_for('main.index'))
    # Paginate the followed-posts feed.
    page = request.args.get('page', 1, type=int)
    posts = current_user.followed_posts().paginate(
        page, current_app.config['POSTS_PER_PAGE'], False)
    # Next/previous links only when such a page exists.
    next_url = url_for('main.index', page=posts.next_num) \
        if posts.has_next else None
    prev_url = url_for('main.index', page=posts.prev_num) \
        if posts.has_prev else None
    posts = formatLaTeX(posts.items)
    leng = User.query.filter_by(username=current_user.username).first()
    len_post = leng.len_post
    return render_template('index.html', title=_('Home'), form=form,
                           posts=posts, next_url=next_url,
                           prev_url=prev_url, len_post=int(len_post))
def run(self, filename, file, natural_language: str='auto', languagetool_disable_rules: typed_list(str)=()): ''' Checks the code with LanguageTool. :param natural_language: A locale representing the language you want to have checked. If set to 'auto' the language is guessed. If the language cannot be guessed, 'en-US' is used. :param languagetool_disable_rules: List of rules to disable checks for. ''' # Defer import so the check_prerequisites can be run without # language_check being there. from language_check import LanguageTool, correct joined_text = ''.join(file) natural_language = (guess_language(joined_text) if natural_language == 'auto' else natural_language) natural_language = 'en-US' if not natural_language \ else natural_language tool = LanguageTool(natural_language, motherTongue='en_US') tool.disabled.update(languagetool_disable_rules) matches = tool.check(joined_text) for match in matches: if not match.replacements: diffs = None else: replaced = correct(joined_text, [match]).splitlines(True) diffs = {filename: Diff.from_string_arrays(file, replaced)} rule_id = match.ruleId if match.subId is not None: rule_id += '[{}]'.format(match.subId) message = match.msg + ' (' + rule_id + ')' source_range = SourceRange.from_values(filename, match.fromy+1, match.fromx+1, match.toy+1, match.tox+1) yield Result(self, message, diffs=diffs, affected_code=(source_range,))
def main():
    """Split tweets from a CSV into Nepalese and English lists and write
    them side by side to tweets_filter2.csv."""
    # empty lists to store the nepalese and english tweets
    nepalese = []
    english = []
    # guess_language detects these symbols as non-english
    restrictions = ['”', '…', '“', '’', 'ç', '‘', '..']
    # BUGFIX: the original non-raw string 'd:\Users\...' is a SyntaxError on
    # Python 3 ('\U' starts a unicode escape); use a raw string.
    location = r'd:\Users\user\Desktop\weets.csv'
    df = pd.read_csv(location)  # Dataframe containing the original tweets
    for user in df.text:
        for rest_syb in restrictions:
            user = user.replace(rest_syb, '')
        try:
            # Hoisted: the original called guess_language up to three times.
            guessed = guess_language(user)
            if guessed == "ne":
                nepalese.append(user)
            else:
                # 'en', 'UNKNOWN' and everything else count as English.
                english.append(user)
        except Exception:
            # guess failed entirely — treat as Nepalese, as before
            nepalese.append(user)
    # Creating a dataset
    NameDataSet = list(zip(nepalese, english))
    name_types = pd.DataFrame(data=NameDataSet,
                              columns=['Nepali Text', 'English Text'])
    name_types.to_csv('tweets_filter2.csv', index=False, header=True)
def get_headline(topic, otit, topic_1_name):
    '''
    Gets the url of the topic. Parses the html for all the titles.
    Checks each title:
        wasn't used recently to generate a title,
        contains the topic,
        doesn't end in ellipsis (...),
        is in spanish,
        doesn't already contain the text that will be used as replacement.
    If at least one title is valid it selects one randomly and returns it.
    Returns False when no valid headline is found.
    '''
    valid_headlines = []
    sleep(1)  # small delay before hitting the site
    tree = html.parse(urlopen(topic['url']))
    headlines = tree.xpath("//span[@class='titletext']")
    while len(headlines) > 0:
        headline = headlines.pop()
        # Whole-word, case-insensitive match of the topic name.
        regex = re.compile(r'\b{0}\b'.format(topic['name']), re.IGNORECASE)
        if headline.text_content() in otit:
            log.info('Headline recently used to generate title: %s',
                     headline.text_content())
            continue
        if regex.search(headline.text_content()) is None:
            log.debug('Invalid headline, regex failed on: %s',
                      headline.text_content())
            continue
        if headline.text_content()[-3:] == '...':
            log.debug('Invalid headline, ellipsis on: %s',
                      headline.text_content())
            continue
        if guess_language(headline.text_content()) != 'es':
            log.debug('Invalid headline, not spanish on: %s',
                      headline.text_content())
            continue
        if topic_1_name in headline.text_content():
            log.debug('Invalid headline %s, contains replacement: %s',
                      headline.text_content(), topic_1_name)
            continue
        valid_headlines.append(headline.text_content())
        log.debug('Valid headline: %s', headline.text_content())
    if len(valid_headlines) > 0:
        log.info('%s valid headlines found for topic %s',
                 len(valid_headlines), topic['name'])
        return (random.choice(valid_headlines))
    else:
        log.info('No valid headlines found for topic %s', topic['name'])
        return False
def nlp():
    """Render the NLP page; on submit, store a post tagged with its guessed
    language and redirect back here."""
    user = User.query.filter_by(username=current_user.username).first_or_404()
    form = PostForm()
    if form.validate_on_submit():
        lang = guess_language(form.post.data)
        if lang == 'UNKNOWN' or len(lang) > 5:
            lang = ''
        entry = Post(body=form.post.data, author=current_user, language=lang)
        db.session.add(entry)
        db.session.commit()
        flash(_('Форма заполнена'))
        return redirect(url_for('main.nlp'))
    return render_template("nlp.html", form=form, user=user)
def send_message(recipient):
    """Send a private message to *recipient*, tagged with a guessed language,
    and bump the recipient's unread-message notification."""
    user = User.query.filter_by(username=recipient).first_or_404()
    form = MessageForm()
    if form.validate_on_submit():
        lang = guess_language(form.message.data)
        if lang == 'UNKNOWN' or len(lang) > 5:
            lang = ''
        msg = Message(author=current_user, recipient=user,
                      body=form.message.data, language=lang)
        db.session.add(msg)
        user.add_notification('unread_message_count', user.new_messages())
        db.session.commit()
        flash('Your message has been sent.')
        # redirect (when and) only when form is successfully submitted
        return redirect(url_for('main.user', username=recipient))
    return render_template('send_message.html', title='Send Message',
                           form=form, user=user)
def notes():
    """Notes page: create a post on submit, otherwise list followed posts."""
    form = PostForm()
    if form.validate_on_submit():
        lang = guess_language(form.post.data)
        if lang == 'UNKNOWN' or len(lang) > 5:
            lang = ''
        note = Post(body=form.post.data, author=current_user, language=lang)
        db.session.add(note)
        db.session.commit()
        flash(_("Ваш пост опубликован!"))
        return redirect(url_for("main.notes"))
    followed = current_user.followed_posts().all()
    return render_template("main/notes.html", notes=followed, form=form)
def create_subreddit():
    """Create a subreddit whose language is guessed from its description,
    then return to the page the user came from."""
    form = CreateSubredditForm()
    if form.validate_on_submit():
        lang = guess_language(form.description.data)
        if lang == 'UNKNOWN' or len(lang) > 5:
            lang = ''
        db.session.add(Subreddit(name=form.name.data,
                                 description=form.description.data,
                                 language=lang))
        db.session.commit()
        return redirect(session['prior_thread_create_page'])
    return render_template('create_subreddit.html', form=form,
                           page_title=_('Reddit - Create Subreddit'))
def POST(self, text):
    """Return the guessed language of POSTed text.

    Args:
        text (str): text to guess the language of.

    Returns:
        A dict with the guessed language. For example: {'language': 'en'}
    """
    return {'language': guess_language(text)}
def index():
    """Followed-posts feed with pagination; creates a post on submission."""
    form = PostForm()
    if form.validate_on_submit():
        lang = guess_language(form.post.data)
        if lang == 'UNKNOWN' or len(lang) > 5:
            lang = ''
        entry = Post(body=form.post.data, author=current_user, language=lang)
        db.session.add(entry)
        db.session.commit()
        flash(_("Your post has been created"))
        return redirect(url_for('main.index'))
    page = request.args.get('page', 1, type=int)
    posts = current_user.followed_posts().paginate(
        page, current_app.config['POSTS_PER_PAGE'], False)
    next_url = url_for('main.index', page=posts.next_num) \
        if posts.has_next else None
    prev_url = url_for('main.index', page=posts.prev_num) \
        if posts.has_prev else None
    return render_template("index.html", title=_("home"), posts=posts.items,
                           form=form, next_url=next_url, prev_url=prev_url)
def index():
    """Home page: create a post on submit, otherwise show the paginated feed.

    Note: this view blanks language codes longer than 10 characters (not 5
    like the sibling views) — behavior preserved as-is.
    """
    form = PostForm()
    if form.validate_on_submit():
        lang = guess_language(form.post.data)
        if lang == 'UNKNOWN' or len(lang) > 10:
            lang = ""
        entry = Post(body=form.post.data, author=current_user, language=lang)
        db.session.add(entry)
        db.session.commit()
        flash(_('Your post is now live!'))
        return redirect(url_for('index'))
    page = request.args.get('page', 1, type=int)
    posts = current_user.followed_posts().paginate(
        page, app.config['POST_PER_PAGE'], False)
    next_url = url_for('index', page=posts.next_num) \
        if posts.has_next else None
    prev_url = url_for('index', page=posts.prev_num) \
        if posts.has_prev else None
    return render_template('index.html', title=_('Home'), form=form,
                           posts=posts.items, next_url=next_url,
                           prev_url=prev_url)
def index():
    """Paginated feed of followed posts, with a post-creation form."""
    form = PostForm()
    if form.validate_on_submit():
        lang = guess_language(form.post.data)
        if lang == "UNKNOWN" or len(lang) > 5:
            lang = ""
        db.session.add(Post(body=form.post.data, author=current_user,
                            language=lang))
        db.session.commit()
        flash(_("Your post is now live!"))
        return redirect(url_for("main.index"))
    page = request.args.get("page", 1, type=int)
    posts = current_user.followed_posts().paginate(
        page, current_app.config["POSTS_PER_PAGE"], False)
    next_url = url_for("main.index", page=posts.next_num) \
        if posts.has_next else None
    prev_url = url_for("main.index", page=posts.prev_num) \
        if posts.has_prev else None
    return render_template("index.html", title=_("Home"), form=form,
                           posts=posts.items, next_url=next_url,
                           prev_url=prev_url)
def new_ad():
    """Create a classified ad, guessing the language of its description."""
    form = AdForm()
    if form.validate_on_submit():
        lang = guess_language(form.description.data)
        if lang == 'UNKNOWN' or len(lang) > 5:
            lang = ''
        ad = Ad(title=form.title.data, category=form.category.data,
                description=form.description.data, language=lang,
                author=current_user)
        db.session.add(ad)
        db.session.commit()
        flash(_('New ad posted!'))
        return redirect(url_for('main.index'))
    return render_template('new_ad.html', title=_('New ad'), form=form)
def index(): form = PostForm() # 게시글 작성하기 폼 if form.validate_on_submit(): #새 게시물의 언어를 저장 language = guess_language(form.post.data) if language == 'UNKNOWN' or len(language) > 5: language = '' # 언어가 알려지지 않은 상태로 돌아오거나 예기치 않게 긴 결과가 나오면 안전하게 재생하고 빈 문자열을 데이터베이스에 저장. # 언어가 빈 문자열로 설정된 게시물은 알 수 없는 언어로 간주된다는 규칙을 채택 post = Post(body=form.post.data, author=current_user, language=language) db.session.add(post) db.session.commit() flash('Your post is now live!') return redirect(url_for('main.index')) # 웹 양식을 제출한 후 사용자가 실수로 페이지를 새로고침할 때 중복으로 게시물이 삽입되지 않도록 하기 위해 # redirection으로 응답해줌 -> Post/Redirect/Get 패턴 (그래서 POST,GET 두 경로에서 요청을 수락하도록 설정함 ) # ==> POST 요청이 리디렉션으로 응답되면 이제 브라우저는 리디렉션에 표시된 페이지를 가져오기 위해 # GET 요청을 보내도록 지시 받으므로 마지막 요청은 더 이상 POST 요청이 아님 -> 새로 고침할 때 중복 게시물 삽입되는 것을 방지 """ posts = [ { 'author': {'username': '******'}, 'body': 'Beautiful day in Portland!' }, { 'author': {'username': '******'}, 'body': 'The Avengers movie was so cool!' } ] """ # posts = current_user.followed_posts().all() # 팔로우한 유저의 게시글들 가져오기 # 페이지 매김해서 게시글 가져오기 page = request.args.get('page', 1 ,type=int) # 1. page 쿼리 문자열 인수 또는 기본값 1에서 표시할 페이지 번호를 결정한 다음, posts = current_user.followed_posts().paginate( # 2. 원하는 결과 페이지만 검색하기 위해 paginate() 메서드 사용 page, current_app.config['POSTS_PER_PAGE'], False) # 페이지 크기를 결정하는 POSTS_PER_PAGE 구성 항목은 app.config 개체를 통해 액세스됩니다. # 다음 및 이전 페이지 링크 생성하기 next_url = url_for('main.index', page=posts.next_num) \ if posts.has_next else None prev_url = url_for('main.index', page=posts.prev_num) \ if posts.has_prev else None # view 함수의 next_url 및 prev_url은 (Flask-SQLAlchemy의 Pagination 클래스 객체에 있음) # 해당 방향에 페이지가 있는 경우에만 url_for()에서 반환하는 URL로 설정. return render_template('index.html', title='Home Page', form=form, posts=posts.items, next_url=next_url, prev_url=prev_url)
def guess_external_subtitles(dest_folder, subtitles):
    # For every subtitle file with no known language, try to detect the
    # language from the file's text content and rebuild the Language entry.
    # Entries that cannot be analyzed are left unchanged.
    for subtitle, language in subtitles.items():
        if not language:
            subtitle_path = os.path.join(dest_folder, subtitle)
            if os.path.exists(subtitle_path) and os.path.splitext(
                    subtitle_path)[1] in core.SUBTITLE_EXTENSIONS:
                logging.debug(
                    "BAZARR falling back to file content analysis to detect language."
                )
                detected_language = None
                # to improve performance, skip detection of files larger that 1M
                if os.path.getsize(subtitle_path) > 1 * 1024 * 1024:
                    logging.debug(
                        "BAZARR subtitles file is too large to be text based. Skipping this file: "
                        + subtitle_path)
                    continue
                with open(subtitle_path, 'rb') as f:
                    text = f.read()
                try:
                    # Detect the byte encoding first, then guess the language
                    # of the decoded text.
                    guess = chardet.detect(text)
                    logging.debug('BAZARR detected encoding %r', guess)
                    text = text.decode(guess["encoding"])
                    detected_language = guess_language(text)
                except (UnicodeDecodeError, TypeError):
                    logging.exception(
                        "BAZARR subtitles file doesn't seems to be text based. Skipping this file: "
                        + subtitle_path)
                except:
                    logging.exception(
                        'BAZARR Error trying to detect language for this subtitles file: '
                        + subtitle_path +
                        ' You should try to delete this subtitles file manually and ask '
                        'Bazarr to download it again.')
                else:
                    if detected_language:
                        logging.debug(
                            "BAZARR external subtitles detected and guessed this language: "
                            + str(detected_language))
                        try:
                            subtitles[subtitle] = Language.rebuild(
                                Language.fromietf(detected_language))
                        except:
                            # invalid/unmappable IETF tag — keep the entry as-is
                            pass
    return subtitles
def post_comment():
    """AJAX endpoint: attach a comment to a post (language guessed from the
    body) and return the updated comment count plus commenter info."""
    form = g.comment_form
    post_id = request.form['id']
    body = request.form['comment']
    post = Post.query.filter_by(id=post_id).first()
    lang = guess_language(body)
    if lang == 'UNKNOWN' or len(lang) > 5:
        lang = ''
    db.session.add(Comment(body=body, language=lang, post_id=int(post_id),
                           user_id=current_user.id))
    db.session.commit()
    return jsonify({
        'comment_num': post.comments.count(),
        'avatarURL': current_user.avatar(70),
        'comment_username': current_user.username,
    })
def index(page=1):
    """Show the followed-posts feed; on submit, store a post whose language
    is guessed from its body (blank when unknown)."""
    form = PostForm()
    if form.validate_on_submit():
        now_time = datetime.utcnow()
        language = guess_language(form.post.data)
        # BUGFIX: the original compared against 'UNKNOW' (typo). That branch
        # was dead; 'UNKNOWN' only got blanked because its length exceeds 5.
        # Fixed for correctness and consistency with the sibling views.
        if language == 'UNKNOWN' or len(language) > 5:
            language = ''
        post = Post(body=form.post.data, timestamp=now_time,
                    author=g.user, language=language)
        db.session.add(post)
        db.session.commit()
        flash('Your post is now live')
        return redirect(url_for('auth.index'))
    user = g.user
    posts = user.followed_posts().paginate(
        page, current_app.config['POSTS_AVG_PAGE'], False)
    return render_template('index.html', title='Home', user=user,
                           posts=posts, form=form)
def new_post():
    """Create a blog post (requires the NEW_POST permission) and redirect to
    its detail page."""
    if not current_user.can('NEW_POST'):
        abort(403)
    form = PostForm()
    if form.validate_on_submit():
        lang = guess_language(form.post.data)
        if lang == 'UNKNOWN' or len(lang) > 5:
            lang = ''
        chosen_category = Category.query.get(form.category.data)
        entry = Post(title=form.title.data, body=form.post.data,
                     author=current_user, language=lang,
                     category=chosen_category)
        db.session.add(entry)
        db.session.commit()
        flash(_('Your post is now live!'))
        return redirect(url_for('main.show_post', post_id=entry.id))
    return render_template('new_post.html', form=form)
def articleeditorWTF():
    """CKEditor article form: publish an article tagged with the guessed
    language of its content, then redirect back to the editor."""
    ckarticle = CKarticle()
    if ckarticle.validate_on_submit():
        lang = guess_language(ckarticle.content.data)
        if lang == 'UNKNOWN' or len(lang) > 5:
            lang = ''
        db.session.add(Article(body=ckarticle.content.data,
                               author=current_user, language=lang,
                               title=ckarticle.title.data))
        db.session.commit()
        flash(_('Your article is now live!'))
        return redirect(url_for('main.articleditorWTF'))
    return render_template('/ckeditor/articleEditorWTF.html', form=ckarticle)
def register():
    """Register a new user and redirect to the login page.

    NOTE(review): the language is guessed from the *username*, which looks
    unusual — confirm this is intentional.
    """
    if current_user.is_authenticated:
        return redirect(url_for('index'))
    form = RegistrationForm()
    if form.validate_on_submit():
        lang = guess_language(form.username.data)
        if lang == "UNKNOWN" or len(lang) > 5:
            lang = ''
        new_user = User(username=form.username.data, email=form.email.data,
                        language=lang)
        new_user.set_password(form.password.data)
        db.session.add(new_user)
        db.session.commit()
        flash('Congratulations, you are now a registered user!')
        return redirect(url_for('login'))
    return render_template('register.html', title='Register', form=form)
def index():
    """Feed page: create posts and show the paginated followed-posts list."""
    form = PostForm()
    if form.validate_on_submit():
        lang = guess_language(form.post.data)
        if lang == 'UNKNOWN' or len(lang) > 5:
            lang = ''
        entry = Post(body=form.post.data, author=current_user, language=lang)
        db.session.add(entry)
        db.session.commit()
        flash(_('Postado com sucesso!'))
        return redirect(url_for('main.index'))
    page = request.args.get('page', 1, type=int)
    pagination = current_user.followed_posts().paginate(
        page, current_app.config['POSTS_PER_PAGE'], False)
    return render_template('index.html', title=_('Página inicial'),
                           posts=pagination.items, form=form,
                           pagination=pagination, endpoint='main.index')
def extract_keywords(text, use_stemmer=True):
    """Extract the keywords from a certain text.

    :param use_stemmer: If True a Snowball Stemmer will be used for all words.
    :returns: A sorted mapping between a set of keywords and their rating.
    :rtype: :class:`collections.OrderedDict`
    """
    lang = guess_language.guess_language(text)
    sentence_phrases = extract_phrases(split_sentences(text), lang,
                                       use_stemmer)
    # No phrases: no stopwords are available, or a one-word input was used.
    if sentence_phrases is None:
        return None, OrderedDict()
    ratings = word_scores(sentence_phrases)
    keyword_scores = candidate_keywordscores(sentence_phrases, ratings)
    return lang, filter_subsets(keyword_scores)
def get_txt(df, field, output_path): df.fillna('', inplace=True) output_str = '' tot = df.shape[0] cnt = 0 for idx, row in df.iterrows(): try: if row[field] is not '' and guess_language(row[field]) == 'en': output_str += (row[field].encode('utf-8') + '\n') except Exception as e: print type(e) print e.args print e cnt += 1 print '%d/%d' % (cnt, tot) with open(output_path, 'w') as f: f.write(output_str.replace('&', ''))
def index():
    """Home page: create a post on submit, otherwise list all posts."""
    form = PostForm()
    if form.validate_on_submit():
        language = guess_language(form.post.data)
        if language == 'UNKNOWN' or len(language) > 5:
            language = ''
        post = Post(body=form.post.data, author=current_user,
                    language=language)
        db.session.add(post)
        db.session.commit()
        # BUGFIX: category= was being passed to the translation function _()
        # instead of to flash(), so the flash message had no category.
        flash(_('Your post is now live'), category='info')
        return redirect(url_for('main.index'))
    return render_template('index.html', title='Home Page', form=form,
                           posts=Post.query.all())
def guess_external_subtitles(dest_folder, subtitles):
    # For every subtitle file with no known language, detect it from the
    # file's text content (Python 2/3 compatible via six) and rebuild the
    # Language entry. Entries that cannot be analyzed are left unchanged.
    for subtitle, language in six.iteritems(subtitles):
        if not language:
            subtitle_path = os.path.join(dest_folder, subtitle)
            if os.path.exists(subtitle_path) and os.path.splitext(
                    subtitle_path)[1] in core.SUBTITLE_EXTENSIONS:
                logging.debug(
                    "BAZARR falling back to file content analysis to detect language."
                )
                # Binary files cannot be language-guessed; skip them.
                if is_binary(subtitle_path):
                    logging.debug(
                        "BAZARR subtitles file doesn't seems to be text based. Skipping this file: "
                        + subtitle_path)
                    continue
                detected_language = None
                if six.PY3:
                    with open(subtitle_path, 'r', errors='ignore') as f:
                        text = f.read()
                else:
                    with open(subtitle_path, 'r') as f:
                        text = f.read()
                try:
                    # On Python 2 the bytes are decoded with the encoding
                    # sniffed by UnicodeDammit before guessing.
                    encoding = UnicodeDammit(text)
                    if six.PY2:
                        text = text.decode(encoding.original_encoding)
                    detected_language = guess_language(text)
                except Exception as e:
                    logging.exception(
                        'BAZARR Error trying to detect language for this subtitles file: '
                        + subtitle_path +
                        ' You should try to delete this subtitles file manually and ask '
                        'Bazarr to download it again.')
                else:
                    if detected_language:
                        logging.debug(
                            "BAZARR external subtitles detected and guessed this language: "
                            + str(detected_language))
                        try:
                            subtitles[subtitle] = Language.rebuild(
                                Language.fromietf(detected_language))
                        except:
                            # invalid/unmappable IETF tag — keep entry as-is
                            pass
    return subtitles
def index(page=1):
    """Home feed: accept a new post and show followed posts, paginated.

    A valid submission stores the post with an auto-detected language
    ('en' when unknown, blank when the tag is over-long) and redirects
    back here; otherwise the requested feed page is rendered.
    """
    form = PostForm()
    if form.validate_on_submit():
        detected = guess_language(form.post.data)
        if detected == 'UNKNOWN':
            detected = 'en'
        elif len(detected) > 5:
            detected = ''
        new_post = Post(body=form.post.data,
                        timestamp=datetime.utcnow(),
                        author=g.user,
                        language=detected)
        db.session.add(new_post)
        db.session.commit()
        flash(gettext('Your post is now live!'))
        return redirect(url_for('index'))
    # Page load or failed validation: render the followed-posts feed.
    posts = g.user.followed_posts().paginate(page, POSTS_PER_PAGE, False)
    return render_template('index.html', title='Home', form=form,
                           posts=posts)
def edit_profile():
    """Display and process the profile-editing form.

    On a valid POST the username and about-me text are saved, the
    about-me language is auto-detected (blanked when detection fails or
    yields an over-long tag), and the user is redirected back.  On GET
    the form is pre-filled with the current values.
    """
    form = EditProfileForm(current_user.username)
    if form.validate_on_submit():
        current_user.username = form.username.data
        current_user.about_me = form.about_me.data
        detected = guess_language(form.about_me.data)
        # Discard unusable detection results.
        if detected == 'UNKNOWN' or len(detected) > 5:
            detected = ''
        current_user.language = detected
        db.session.commit()
        flash(_('Your changes have been saved.'))
        return redirect(url_for('main.edit_profile'))
    if request.method == 'GET':
        form.username.data = current_user.username
        form.about_me.data = current_user.about_me
    return render_template('edit_profile.html', title=_('Edit Profile'),
                           form=form)
def index(page=1):
    """Render the home feed and handle new-post submissions.

    Language handling: unknown detections default to 'en'; over-long
    tags are stored as empty.
    """
    form = PostForm()
    if not form.validate_on_submit():
        # No submission (or invalid one): show the paginated feed.
        posts = g.user.followed_posts().paginate(page, POSTS_PER_PAGE, False)
        return render_template('index.html', title='Home', form=form,
                               posts=posts)
    lang = guess_language(form.post.data)
    if lang == 'UNKNOWN':
        lang = 'en'
    elif len(lang) > 5:
        lang = ''
    db.session.add(Post(body=form.post.data,
                        timestamp=datetime.utcnow(),
                        author=g.user,
                        language=lang))
    db.session.commit()
    flash(gettext('Your post is now live!'))
    return redirect(url_for('index'))
def index():
    """Home page: post submission form plus the followed-posts list."""
    form = PostForm()
    if form.validate_on_submit():
        # Auto-detect the post language; store '' when unusable.
        lang = guess_language(form.post.data)
        if lang == 'UNKNOWN' or len(lang) > 5:
            lang = ''
        new_post = Post(body=form.post.data, author=current_user,
                        language=lang)
        db.session.add(new_post)
        db.session.commit()
        flash('Your post is now live!')
        return redirect(url_for('index'))
    posts = current_user.followed_posts().all()
    return render_template("index.html", title='Home Page', form=form,
                           posts=posts)
def run(self, filename, file, language: str='auto',
        languagetool_disable_rules: typed_list(str)=()):
    '''
    Checks the code with LanguageTool.

    :param language: A locale representing the language you want to have
                     checked. If set to 'auto' the language is guessed.
                     If the language cannot be guessed, 'en-US' is used.
    :param languagetool_disable_rules: List of rules to disable checks
                                       for.
    '''
    text = "".join(file)
    # Resolve the locale: guess when requested, then fall back to en-US.
    if language == 'auto':
        language = guess_language(text)
    if not language:
        language = 'en-US'
    tool = LanguageTool(language, motherTongue="en_US")
    tool.disabled.update(languagetool_disable_rules)
    for match in tool.check(text):
        # Offer a patch only when LanguageTool proposes replacements.
        diffs = None
        if match.replacements:
            corrected = correct(text, [match]).splitlines(True)
            diffs = {filename: Diff.from_string_arrays(file, corrected)}
        rule_id = match.ruleId
        if match.subId is not None:
            rule_id = '{}[{}]'.format(rule_id, match.subId)
        affected = SourceRange.from_values(filename,
                                           match.fromy + 1,
                                           match.fromx + 1,
                                           match.toy + 1,
                                           match.tox + 1)
        yield Result(self, match.msg + ' (' + rule_id + ')',
                     diffs=diffs, affected_code=(affected,))
def index():
    """Home page: handle post submission and show the paginated feed.

    Builds next/previous page links from the pagination object; either
    link is None at the corresponding end of the feed.
    """
    form = PostForm()
    if form.validate_on_submit():
        lang = guess_language(form.post.data)
        # Blank the language when detection fails or the tag is too long.
        if lang == 'UNKNOWN' or len(lang) > 5:
            lang = ''
        new_post = Post(body=form.post.data, author=current_user,
                        language=lang)
        db.session.add(new_post)
        db.session.commit()
        flash(_('Your post is now live!'))
        return redirect(url_for('main.index'))
    page = request.args.get('page', 1, type=int)
    posts = current_user.followed_posts().paginate(
        page, current_app.config['POSTS_PER_PAGE'], False)
    next_url = (url_for('main.index', page=posts.next_num)
                if posts.has_next else None)
    prev_url = (url_for('main.index', page=posts.prev_num)
                if posts.has_prev else None)
    return render_template('index.html', title=_('Home'), form=form,
                           posts=posts.items, next_url=next_url,
                           prev_url=prev_url)
def index(page=1):
    """Home feed: accept a new post and show followed posts, paginated.

    Reached both on initial page load and after a failed form
    validation; in both cases the paginated feed is rendered.
    """
    post_form = PostForm()
    if post_form.validate_on_submit():
        # Build the new post record field by field.
        post = Post()
        post.body = post_form.post.data
        post.timestamp = datetime.utcnow()
        post.user_id = g.user.id
        # Auto-detect the post language; blank it when detection fails
        # or the returned tag is over-long.
        detected = guess_language(post.body)
        if detected == 'UNKNOWN' or len(detected) > 5:
            detected = ''
        post.language = detected
        db.session.add(post)
        db.session.commit()
        flash('Your post is now live!', 'info')
        return redirect(url_for('index'))
    # posts is a Paginate object, not a plain list.
    posts = g.user.followed_posts().paginate(page, POSTS_PER_PAGE, False)
    return render_template('index.html', title='Home', user=g.user,
                           posts=posts, form=post_form)
def find_language(string):
    """Return the language tag that guess_language detects for *string*."""
    guessed = guess_language(string)
    return guessed
def main():
    """Command-line entry point: check each input file with LanguageTool.

    Returns 0 when no problems were found, 1 on setup failures (missing
    guess_language package or undetectable language), and 2 when at
    least one rule match was reported.
    """
    args = parse_args()
    status = 0
    for filename in args.files:
        # With multiple inputs, announce which file is being processed.
        if len(args.files) > 1:
            print(filename, file=sys.stderr)
        if filename == "-":
            # Read from stdin; pick an encoding from the terminal or the
            # locale unless one was given explicitly.
            filename = sys.stdin.fileno()
            encoding = args.encoding or (
                sys.stdin.encoding if sys.stdin.isatty()
                else locale.getpreferredencoding())
        else:
            encoding = args.encoding or "utf-8"
        lang_tool = LanguageTool(motherTongue=args.mother_tongue)
        # Placeholder so the name exists before the conditional import
        # below -- presumably shadows a module-level import; confirm.
        guess_language = None
        try:
            text = get_text(filename, encoding, ignore=args.ignore_lines)
        except UnicodeError as exception:
            print("{}: {}".format(filename, exception), file=sys.stderr)
            continue
        if args.language:
            if args.language.lower() == "auto":
                # Language auto-detection needs the optional
                # guess_language package.
                try:
                    from guess_language import guess_language
                except ImportError:
                    print("guess_language is unavailable.", file=sys.stderr)
                    return 1
                else:
                    language = guess_language(text)
                    if not args.api:
                        print("Detected language: {}".format(language),
                              file=sys.stderr)
                    if not language:
                        return 1
                    lang_tool.language = language
            else:
                lang_tool.language = args.language
        if not args.spell_check:
            lang_tool.disable_spellchecking()
        lang_tool.disabled.update(args.disable)
        lang_tool.enabled.update(args.enable)
        try:
            if args.api:
                # Raw API output mode.
                print_unicode(lang_tool._check_api(text).decode())
            elif args.apply:
                # Apply corrections and print the fixed text.
                print_unicode(lang_tool.correct(text))
            else:
                # Report each match in file:line:col: rule: message form.
                for match in lang_tool.check(text):
                    rule_id = match.ruleId
                    if match.subId is not None:
                        rule_id += "[{}]".format(match.subId)
                    replacement_text = ", ".join(
                        "'{}'".format(word)
                        for word in match.replacements).strip()
                    message = match.msg
                    # Messages that end with punctuation already include
                    # the suggestion.
                    if replacement_text and not message.endswith((".", "?")):
                        message += "; suggestions: " + replacement_text
                    print_unicode(
                        "{}:{}:{}: {}: {}".format(
                            filename,
                            match.fromy + 1,
                            match.fromx + 1,
                            rule_id,
                            message)
                    )
                    status = 2
        except Error as exception:
            print("{}: {}".format(filename, exception), file=sys.stderr)
            continue
    return status
def parse_tweets():
    """Consumer loop: pull tweets from queue_ and store them in SQLite.

    Runs until the module-level running_ flag is cleared.  Each tweet is
    assigned to a city via its bounding box, filtered to English, and
    its words, hashtags, and URLs are written to the database.

    NOTE(review): reconstructed from collapsed source -- indentation of
    the commit calls is assumed; confirm against the original file.
    """
    db = sqlite3.connect(dbfile_)
    cursor = db.cursor()
    # WAL journaling for better concurrent read/write behavior.
    cursor.execute("PRAGMA journal_mode = WAL")
    if cursor.fetchone()[0] != "wal":
        print "Could not set journal_mode!"
    while(running_):
        tweet = queue_.get()
        # place tweet into a city
        if not tweet['place'] or not tweet['place']['bounding_box']:
            continue
        poly = asShape(tweet['place']['bounding_box'])
        city = None
        for c,p in AREAS.items():
            if p.intersects(poly):
                city = c
                break
        if not city:
            city = "unknown"
        # parse the tweet for unique words and figure out the language
        tweet_text = tweet['text'].encode('utf-8')
        cleaned = [m.group() for m in
                   (WORD_REGEX.match(t) for t in tweet_text.split()) if m]
        lang = guess_language(" ".join(cleaned))
        # Lowercase, strip punctuation, and drop possessive "'s".
        words = [w.lower().translate(None, '.?!,\"').replace("'s",'')
                 for w in cleaned]
        if words and lang == 'en':
            tstamp = int(time.time())
            # save to sqlite db and to a set of files because we don't completely
            # trust the sqlite db to not be corrupted at some point
            cursor.execute("INSERT INTO tweets VALUES (?, (SELECT city_id FROM cities WHERE name=?), ?)", \
                           (tstamp, city, " ".join(cleaned)))
            for w in words:
                cursor.execute("INSERT OR IGNORE INTO vals(val) VALUES (?)", (w,))
                cursor.execute("INSERT INTO word VALUES " \
                               "(?, (SELECT city_id FROM cities WHERE name=?), (SELECT val_id FROM vals WHERE val=?))",
                               (tstamp, city, w))
            if tweet['entities']['hashtags']:
                tags = [m.group().lower() for m in
                        (HASH_REGEX.match(h['text'])
                         for h in tweet['entities']['hashtags']) if m]
                for t in tags:
                    cursor.execute("INSERT OR IGNORE INTO vals(val) VALUES (?)", (t,))
                    cursor.execute("INSERT INTO hash VALUES " \
                                   "(?, (SELECT city_id FROM cities WHERE name=?), (SELECT val_id FROM vals WHERE val=?))",
                                   (tstamp, city, t))
            if tweet['entities']['urls']:
                urls = [u['expanded_url'] for u in tweet['entities']['urls']]
                for u in urls:
                    cursor.execute("INSERT OR IGNORE INTO vals(val) VALUES (?)", (u,))
                    cursor.execute("INSERT INTO link VALUES " \
                                   "(?, (SELECT city_id FROM cities WHERE name=?), (SELECT val_id FROM vals WHERE val=?))",
                                   (tstamp, city, u))
            db.commit()
    # done collecting, clean up
    db.commit()
    db.close()
# filters out tweets in non-english language __author__ = 'Marie' from guess_language import guess_language filename = 'C:\\Users\Marie\\Documents\\Project\\Data\\tweets2009-06.txt\\processed_tweets2009-06.txt' f = open(filename, 'r') out = open(filename + "_filtered", 'w') i = 0 for s in f: ssplitted = s.split(",.||") if (len(ssplitted) > 3 ): try: # if tweet is in eng, we output it to the final file. if (guess_language(unicode(ssplitted[3])) == 'en'): out.write(s + "\n") except Exception: pass i = i + 1 # track progress if (i % 10000 == 0): print i f.close() out.close()
# Retrieve the original utf-8 codification on the text and eliminate hashtags and cite # Inside the function calls the slang translation before eliminating marks utftext = put_readable(tweet['text'].decode('utf-8'), slg) # Count uppercases and marks tweet['uppercases'] = n_upper_chars(utftext) tweet['marks'] = n_marks_chars(utftext) # tweet['uppercases'] /= (float(len(utftext)) + 0.000001) # tweet['marks'] /= (float(len(utftext)) + 0.000001) # # Remove useless punctuation and put everything in lower case utftext = lower_punct(utftext) # Guess the language of the text and eliminate everything that is not English tw_lang = guess_language(utftext) if not (tw_lang == 'en'): archive_list.remove(tweet) else: #tweet['text_processed_unigrams'] = nltk.word_tokenize(utftext) tweet['text_processed_unigrams'] = utftext.split() tweet['text_processed_bigrams'] = nltk.bigrams(tweet['text_processed_unigrams']) utftext = " ".join(word for word in spell_correct(tweet['text_processed_unigrams'], Dict)) # Process remaining features and save them into a dictionary tweet['rawvulgarity'] = process_vulgarity(tweet['text_processed_unigrams'], pwl) tweet['vulgarity'] = process_insults(tweet['text_processed_unigrams'], pwl) tweet['unpoliteness'] = process_politeness(tweet['text_processed_unigrams'], tweet['text_processed_bigrams']) tweet['disagreement'] = process_vs(utftext)
def is_english(text):
    """Return True when *text* (UTF-8 encoded bytes) is detected as English."""
    return guess_language.guess_language(text.decode("utf-8")) == 'en'
def analyze(self, movie, attr_name='plot'):
    """Guess the language of one movie attribute and record it.

    The detected language (stringified) is stored under this analyzer's
    name in ``movie.analyzer_data``; a missing attribute is treated as
    an empty string.
    """
    source_text = movie.attributes.get(attr_name) or ''
    movie.analyzer_data[self.name] = str(guess_language(source_text))