def __init__(self, lng_list=None):
    if lng_list is None:
        langid.set_languages(['en', 'ru'])
    else:
        langid.set_languages(lng_list)
    self.logger = logging.getLogger(__name__)
def language_filter(self):
    '''
    Discarding sentence pairs whose detected languages are not it-de.
    Workaround for an unsolved langid issue (it does not work on segments with
    only UPPERCASE characters): if all uppercase, convert to lowercase, then run langid.
    '''
    tree = self.tree
    print("Removing TUs with wrong detected language...")
    langid.set_languages(['de', 'it'])
    nsmap = {"xml": "http://www.w3.org/XML/1998/namespace"}
    root = tree.getroot()
    body = root.find("body")
    counter = 0  # for testing purposes
    for tu in root.iter("tu"):
        source_segment = tu.find(f"./tuv[@xml:lang='{LANG_PAIR[0]}']/seg", namespaces=nsmap).text
        target_segment = tu.find(f"./tuv[@xml:lang='{LANG_PAIR[1]}']/seg", namespaces=nsmap).text
        if source_segment.isupper() and target_segment.isupper():
            source_segment = source_segment.lower()
            target_segment = target_segment.lower()
        try:
            detect_it = langid.classify(source_segment)
            detect_de = langid.classify(target_segment)
        except Exception:
            continue
        if "de" in detect_it or "it" in detect_de:
            # broader alternative (every language other than it or de) =>
            # if "it" not in detect_it or "de" not in detect_de:
            if len(source_segment.split()) > 8 and len(target_segment.split()) > 8:
                # just considering longer segments, shorter ones are more likely to be false positives
                # print("\n\n", detect_it, detect_de, "\t", source_segment, "\n\t\t", target_segment)
                body.remove(tu)
                counter += 1
    print("%i TUs removed" % counter)
def setup(self):
    ## subscribe to all receivers
    self.subscribeToAll()
    """
    ## or pick which ones
    for k in self.allReceivers.keys():
        self.subscribeTo(k)
    ## or subscribe to osc
    self.subscribeTo('osc')
    """
    ## some variables
    self.queueDelay = 1
    self.lastQueueCheck = time.time()
    ## turn up the volume
    subprocess.call("amixer set PCM -- -100", shell=True)
    ## for language identification
    langid.set_languages(['en', 'es'])
    ## for tagging
    input = open('uniTag.en.pkl', 'rb')
    self.enTagger = load(input)
    input.close()
    input = open('uniTag.es.pkl', 'rb')
    self.esTagger = load(input)
    input.close()
    self.tagDict = {}
def __init__(self, preferred_languages=None, preferred_factor=80.0,
             engine_preference=None, **config):
    self.preferred_languages = preferred_languages or []
    self.preferred_factor = preferred_factor
    engine_preference = engine_preference or enumerate_engines()
    for ename in enumerate_engines():
        if ename not in engine_preference:
            engine_preference.append(ename)
    self.engines = []
    self.languages = set()
    for ename in engine_preference:
        try:
            options = config.get(ename, {}).get('options', {})
            defaults = config.get(ename, {}).get('defaults', {})
            eng = create_engine(ename, options=options, defaults=defaults)
            self.engines.append(eng)
            languages = config.get(ename, {}).get('languages', {})
            for lang, conf in languages.items():
                eng.configure(language=lang, **conf)
        except TTSError:
            pass
    for eng in self.engines:
        self.languages.update(eng.languages.keys())
    langid.set_languages(self.languages)
    if not self.languages:
        raise TTSError('No supported languages')
def main(source, target, scores, output, from_language='de', to_language='en',
         num_threads=1, min_sent_length=3, max_sent_len_diff=15,
         number_url_ratio_threshold=0.6):
    langid.set_languages([from_language, to_language])
    fr_stopwords = set(ds.stopwords.words(ds.lang_map[from_language]))
    to_stopwords = set(ds.stopwords.words(ds.lang_map[to_language]))
    with open(output, 'w') as fout, tqdm() as pbar:
        wc.server(helper, writer, reader(source, target, scores),
                  num_threads=num_threads,
                  output_stream=fout,
                  fr_stopwords=fr_stopwords,
                  to_stopwords=to_stopwords,
                  pbar=pbar,
                  min_sent_length=min_sent_length,
                  max_sent_len_diff=max_sent_len_diff,
                  number_url_ratio_threshold=number_url_ratio_threshold,
                  from_language=from_language,
                  to_language=to_language)
def split_by_language(reviews):
    """
    Split the reviews based on their language.

    input arguments:
        reviews: a list of review items
    output arguments:
        reviews_dict_languages: a dictionary with languages as keys, and a list
            of the corresponding reviews as value.
    """
    # Initialization
    reviews_dict_languages = {}
    langid.set_languages(language_list)

    # Use a counter to visualize the progress
    count = 1

    # Loop over all reviews
    for review in reviews:
        # Detect the language
        language = langid.classify(review.content)[0]
        # Store the review in the corresponding dictionary by language
        if language in reviews_dict_languages:
            reviews_dict_languages[language].append(review)
        else:
            reviews_dict_languages[language] = []
            reviews_dict_languages[language].append(review)

    return reviews_dict_languages
def classify_lang_langid(text):
    import langid

    # constrain the language set
    langid.set_languages(['en', 'zh', 'ru', 'ja', 'ko'])

    # e.g. ('zh', -19.244097471237183)
    return langid.classify(text)
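The score in the tuple above is langid's raw, unnormalized log-probability (hence the negative value). A minimal sketch, using the same langid package, of how a 0-1 probability can be obtained instead via LanguageIdentifier with norm_probs=True; the sample sentence is illustrative:

from langid.langid import LanguageIdentifier, model

# Identifier whose scores are normalized into probabilities in [0, 1].
identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
identifier.set_languages(['en', 'zh', 'ru', 'ja', 'ko'])

# e.g. ('zh', 0.99...) instead of ('zh', -19.24...)
print(identifier.classify('这是一个测试'))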
def voice_things(local_voicedb, args):
    langs = args.langs or []
    tts_func = None

    if args.engine:
        tts_engine = ENGINE_MAP[args.engine]
        tts_func = tts_engine.get_tts_func(args)
        if not langs:
            langs = tts_engine.legal_langs

    lang_map = {}
    for _lang in langs:
        lang_map[_lang[:2]] = _lang

    languages_to_set = [lang_id_code for lang_id_code in lang_map.keys()]
    langid.set_languages(languages_to_set)

    def classify_tts_lang(text):
        langid_code = langid.classify(text)[0]
        if 'zh' in languages_to_set:
            langid_code = fix_zh(langid_code, text)
        tts_lang = lang_map[langid_code]
        return tts_lang

    def local_voice_path(text, lang):
        tts_lang = lang
        log('try local voice: {}, {}'.format(repr(lang), repr(text)))
        voice_path = local_voicedb.get_path(text, tts_lang)

        if not voice_path and callable(tts_func):
            # The voice not in local
            log('not in local.')
            log('try TTS engine {} ... '.format(args.engine))
            voice_data = tts_func(text, tts_lang)
            log('done!')

            tmp_file = tempfile.NamedTemporaryFile(delete=False)
            tmp_file_name = tmp_file.name
            tmp_file.close()
            with open(tmp_file_name, 'wb') as f:
                f.write(voice_data)

            local_voicedb.add_voice(tmp_file_name, text, tts_lang)
            os.remove(tmp_file_name)

            voice_path = local_voicedb.get_path(text, tts_lang)

        return voice_path

    return classify_tts_lang, local_voice_path
def langid_classifier(texts, lowercase=True, langs=['en', 'fr']):
    langid.set_languages(langs)
    if lowercase:
        detected_langs = [langid.classify(str(text).lower())[0] for text in texts]
    else:
        detected_langs = [langid.classify(str(text))[0] for text in texts]
    return detected_langs
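A minimal usage sketch of the helper defined above; the sample strings and the expected output are illustrative:

texts = ["The quick brown fox jumps over the lazy dog.", "Bonjour tout le monde."]
print(langid_classifier(texts))                   # e.g. ['en', 'fr']
print(langid_classifier(texts, lowercase=False))  # same call without lowercasing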
def get_langid(str_text):
    """
    When needed: identify language
    """
    dic_lg = {"el": "Greek", "en": "English", "pl": "Polish",
              "ru": "Russian", "zh": "Chinese"}  # TODO: more generic
    langid.set_languages(list(dic_lg.keys()))
    lang = dic_lg[langid.classify(str_text)[0]]
    return lang
def __init__(self, init_langid=True, line_postprocessor=None):
    ContentHandler.__init__(self)
    self.currentTag = None
    self.series = []
    self.item = None
    if init_langid:
        langid.set_languages(langs=['en', 'ru', 'uk'])
    if line_postprocessor is None:
        self.line_postprocessor = RePostprocessor().process
    else:
        self.line_postprocessor = line_postprocessor
def __init__(self):
    self.reqparse = reqparse.RequestParser()
    self.reqparse.add_argument('content', type=unicode, location='json')
    self.reqparse.add_argument('lang', location='json')
    self.reqparse.add_argument('id', location='json')
    self.reqparse.add_argument('date', location='json')
    self.mordecai_ip = app.config['MORDECAI_ADDR']
    self.mordecai_port = app.config['MORDECAI_PORT']
    self.hypnos_ip = app.config['HYPNOS_ADDR']
    self.hypnos_port = app.config['HYPNOS_PORT']
    langid.set_languages(['ar', 'en'])
    super(HermesAPI, self).__init__()
def detect_language(texts, corpus_name="en_core_web_lg", langs=['en', 'fr']):
    """Detect the language of a list of texts.
    return detected languages"""
    # nlp = spacy.load(corpus_name)
    # nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)
    # detected_langs = [nlp(str(text))._.language['language'] for text in texts]

    # convert text into lowercase.
    langid.set_languages(langs)
    detected_langs = [langid.classify(text.lower())[0] for text in texts]
    return detected_langs
def extract(text: str, verbose: bool = False) -> datetime:
    langid.set_languages(['en', 'ru'])
    page = BeautifulSoup(text, "lxml")

    if verbose:
        print("Run site checkers...")

    _ = _check_sites(page, verbose)
    _date = _extract_of_site(page, _) if _ is not None else None
    if _date is not None:
        return _date

    if verbose:
        print("Extract tags from page")
        print('=' * 20)
        print(' ' * 20)
        print(' ' * 20)

    _date_lines = _extract_date_tags(page, TAGS_PATH, verbose)
    shuffle(_date_lines)
    if verbose:
        print(_date_lines)

    _date, _day = _extract_date_with_regex(_date_lines)
    if _date is not None:
        return _date

    for x in _date_lines:
        _date = reg_exp_datetime(x)
        if _date is not None:
            return _date

    _cal = Calendar(Constants("en"))
    funcs = [
        _cal.parseDateText,
        _cal.parse,
        _cal.parseDT,
    ]
    key_minutes = 'minutes_hour'
    data = [_get_date_info(key_minutes, x, verbose) for x in _date_lines]
    result = _create_datetime(
        *_extract_day_time(data, key_minutes, [x.__name__ for x in funcs], verbose))

    if verbose:
        print(' ' * 20)
        print(' ' * 20)
        print('=' * 20)

    return result
def eng_log_prob(directory='cleaned_paraphrases', sentences_file='sentences_200k.pkl',
                 paras_file='paraphrases_all.pkl'):
    sentences = load_pkl(f'{directory}/{sentences_file}')
    all_para = load_pkl(f'{directory}/{paras_file}')
    langid.set_languages(['en'])
    eng_log_prob_sents, eng_log_prob_paras = [], []
    for sent, paras in tqdm(zip(sentences, all_para)):
        eng_log_prob_sents.append(langid.classify(sent)[1])
        eng_log_prob_paras.append([langid.classify(para)[1] for para in paras])
    save_pkl(eng_log_prob_sents, 'stats/eng_log_prob_sents.pkl')
    save_pkl(eng_log_prob_paras, 'stats/eng_log_prob_paras.pkl')
def detect_language(text, langset=None) -> tuple:
    """Detect the language of the text. Return (lang, error).

    args:
        "langset" is the set of languages that the result should be limited to.
    return:
        "lang" will be a string containing an ISO 639-1 code.
        "error" will be an integer indicating a percentage. (Rounded to 2 digits)
    """
    if langset:
        langid.set_languages(langset)
    language, confidence = langid.classify(text)
    error = round((1 - confidence) * 100, 2)
    return language, error
def detect_language(text, langset={}):
    """Detect the language of the text. Return (lang, error).

    args:
        "langset" is the set of languages that the result should be limited to.
    return:
        "lang" will be a string containing an ISO 639-1 code.
        "error" will be an integer indicating a percentage. (Rounded to 2 digits)
    """
    if langset:
        langid.set_languages(langset)
    lang, confidence = langid.classify(text)
    error = round((1 - confidence) * 100, 2)
    return lang, error
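A short usage sketch of the function above; the sample sentence is illustrative. Note that the module-level langid.classify returns an unnormalized log-probability, so the derived error percentage is only meaningful if the underlying identifier has been configured to normalize probabilities (see the LanguageIdentifier norm_probs example elsewhere in this listing):

lang, error = detect_language("Ceci est une phrase en français.", langset=['en', 'fr'])
# e.g. 'fr' and an error percentage derived from the confidence score
print(lang, error)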
def language_detect(self, fast=True, possible_languages=['es', 'en']):
    '''
    Language identification of the texts: this process is more precise when
    using TextBlob. If fast=True, the language detection is done with the
    langid module, which is faster but less precise.
    '''
    if fast:
        langid.set_languages(possible_languages)
        self.data['language'] = self.corpus[self.text].map(
            lambda x: langid.classify(x)[0])
        return self.data['language']
    else:
        self.data['language'] = self.corpus[self.text].map(
            lambda x: TextBlob(x).detect_language())
        return self.data['language']
def language_identify_thres(self, msgs, lang_list, thres):
    nw = []
    tally = 0
    list_removed = []
    for post in msgs:
        langid.set_languages(lang_list)
        out = langid.classify(post)
        out2 = list(out)
        if out2[0] == 'en':
            nw.append(post)
        elif out2[1] > thres:
            nw.append(post)
        else:
            tally += 1
            list_removed.append(tuple([post, out2[0], out2[1]]))
    return nw, tally, list_removed
def get_body(soup):
    body_list = []
    find_body = soup.find(class_="documentoesteso").find_all(
        re.compile("(p(?!a)|li|h2)"))  # scraping for the body
    if find_body is not None:
        for p in find_body:
            for br in p.find_all("br"):
                br.replace_with("\n")  # replacing <br> tags with newline
            body_line = p.get_text().strip()  # getting text and stripping whitespace
            if body_line:
                body_list.append(body_line)
    body_len = len(body_list)
    body = "\n".join(body_list)
    langid.set_languages(['de', 'it'])
    detected_lang = langid.classify(body)  # detecting text language
    return body, detected_lang, body_len
def outputData(folder_name, comments, shares, status):
    langid.set_languages(['en', 'es'])
    output_array = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    for entry in comments:
        output_array[langCheck(entry["text"])] += 1
    for entry in shares:
        output_array[langCheck(entry["owner_comment"]) + 4] += 1
    for entry in status:
        output_array[langCheck(entry["message"]) + 8] += 1
    output_string = folder_name
    for i in output_array:
        output_string += "," + str(i)
    print output_string
def get_langid_predictions(X):
    pred = []
    print('Request started-Langid')
    langid.set_languages(['bg', 'es', 'ms', 'sr', 'bs', 'hr', 'pt', 'cs', 'id', 'mk', 'sk'])
    i = 0
    for text in X:
        try:
            i = i + 1
            (lang, confidence) = langid.classify(text)
            pred.append(lang)
            print('lang:', i, lang, text)
            if (i % 50 == 0):
                print(i)
        except Exception as inst:
            print('Exception', i, inst)
    print('Request ended')
    return pred
def language_detection(text):
    language_set = ['en', 'fr', 'es', 'nl', 'it', 'pl', 'ro', 'pt']
    langid.set_languages(language_set)
    text = re.sub(r'[^\w\s]', '', text).strip()
    text = text.replace('\n', ' ')
    if text.strip():
        try:
            lang = language_model.predict_lang(text)
            if lang in language_set:
                return lang
            else:
                return classify(text)[0]
        except Exception:
            return classify(text)[0]
    # with GoogleTranslate(text) as output:
    #     lang = output["language"]
    #     if lang in language_set:
    #         return lang
    return classify(text)[0]
def detect_lang(text_sequence, _type="whole"):
    # Only import langid if required.
    import langid
    langid.set_languages(
        ['ml', 'ta', 'bn', 'ur', 'hi', 'en', 'te', 'gu', 'pa', 'mr', 'or'])

    def _detect_segmented(text_sequence):
        warnings.warn("Detect segmented is not recommended. "
                      "This might lead to large slowdowns.")
        tokens = text_sequence.split()
        lang_assignments = []
        for token in tokens:
            lang, prob = langid.classify(token)
            lang_assignments.append(lang)

        prev = None
        d_idxs = []
        for i, lang in enumerate(lang_assignments):
            if lang != prev:
                d_idxs.append(i)
            prev = lang
        d_idxs.append(len(tokens))

        ranges = zip(d_idxs, d_idxs[1:])
        export = []
        for l, u in ranges:
            segment = ' '.join(tokens[l:u])
            tpl = (segment, lang_assignments[l])
            export.append(tpl)
        return export

    def _detect_whole(text_sequence):
        lang, prob = langid.classify(text_sequence)
        return [(text_sequence, lang)]

    switch = {"whole": _detect_whole, "segmented": _detect_segmented}
    if _type not in switch:
        warnings.warn("Unknown type {}, defaulting to whole".format(_type))
    # Fall back to the callable, not the string "whole", so the default actually works.
    return switch.get(_type, _detect_whole)(text_sequence)
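A brief usage sketch contrasting the two modes of detect_lang above; the code-mixed sample is illustrative and word-level assignments vary:

print(detect_lang("this is plain english text", _type="whole"))
# e.g. [('this is plain english text', 'en')]

print(detect_lang("yeh hindi hai and this is english", _type="segmented"))
# token-level classification is noisy; output is only indicative, e.g.
# [('yeh hindi hai', 'hi'), ('and this is english', 'en')]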
def sentiment_analysis(data):
    langid.set_languages(['en', 'ru'])
    lang = langid.classify(data['text'][0])[0]
    if lang == 'ru':
        labels = data['from'].unique()
        msg_df = data.loc[data.text != '']
        messages_1 = list(msg_df.text[msg_df['from'] == labels[0]])
        messages_2 = list(msg_df.text[msg_df['from'] == labels[1]])

        tokenizer = RegexTokenizer()
        model = FastTextSocialNetworkModel(tokenizer=tokenizer)

        results_1 = model.predict(messages_1, k=2)
        sentiments_1 = []
        for sentiment in results_1:
            # привет ("hi") -> {'speech': 1.0000100135803223, 'skip': 0.0020607432816177607}
            # люблю тебя!! ("love you!!") -> {'positive': 0.9886782765388489, 'skip': 0.005394937004894018}
            # малолетние дебилы (an insult) -> {'negative': 0.9525841474533081, 'neutral': 0.13661839067935944}
            tone = 0
            if 'positive' in sentiment:
                tone += sentiment['positive']
            if 'negative' in sentiment:
                tone -= sentiment['negative']
            sentiments_1.append(tone)

        results_2 = model.predict(messages_2, k=2)
        sentiments_2 = []
        for sentiment in results_2:
            tone = 0
            if 'positive' in sentiment:
                tone += sentiment['positive']
            if 'negative' in sentiment:
                tone -= sentiment['negative']
            sentiments_2.append(tone)

        return sentiments_1, sentiments_2
def __init__(self):
    self.reqparse = reqparse.RequestParser()
    self.reqparse.add_argument("content", type=unicode, location="json")
    self.reqparse.add_argument("lang", location="json")
    langid.set_languages(["ar", "en"])
    super(HermesAPI, self).__init__()
from asgiref.sync import async_to_sync
from channels.layers import get_channel_layer
from django.conf import settings
from django.contrib.auth import get_user_model
from django.core.cache import cache

from .models import Profile, Message

app = Celery('chat')
channel_layer = get_channel_layer()
User = get_user_model()
chatbot = ChatBot(**settings.CHATTERBOT)

langid.set_languages([code for code, _ in settings.LANGUAGES])


@app.task
def update_user_statuses():
    """
    Task to update user online statuses via websockets.
    """
    # Chat bot is always online.
    now = datetime.datetime.now()
    cache.set('seen_chatbot', now, settings.USER_ONLINE_TIMEOUT)
    async_to_sync(channel_layer.group_send)(
        'users', {
            'type': 'users.update',
            'content': Profile.get_online_users()
        }
    )
                    '--lang1', help='source language', dest='source_lang', default='en')
parser.add_argument('-t', '--lang2', help='target language',
                    dest='target_lang', default='fr')
args = parser.parse_args()

deletions = defaultdict(list)
endCount = 0
totalCount = 0

langid.set_languages([args.source_lang, args.target_lang])

for line in args.infile:
    totalCount += 1
    [url1, url2, source, target, score] = line.split("\t")
    langid_source = langid.classify(source.lower())
    langid_target = langid.classify(target.lower())

    if not source.strip():
        deletions["source_empty"].append(source)
    elif not target.strip():
        deletions["target_empty"].append(target)
    elif langid_source[0] != args.source_lang and langid_source[1] > 0.9:
        deletions["source_lang"].append(
            "%s\t%s\t%f" % (source, langid_source[0], langid_source[1]))
    elif langid_target[0] != args.target_lang and langid_target[1] > 0.9:
        deletions["target_lang"].append(
            "%s\t%s\t%f" % (target, langid_target[0], langid_target[1]))
def main(input_dir: str, output_dir: str, args: argparse.Namespace):
    input_files = glob.glob(f"{input_dir}/*")
    input_files.sort()

    for path in input_files:
        scheme = FileScheme(path, output_dir=output_dir)

        scheme.add_step(1, "extracted_texts")
        out_path = scheme.path(1)
        if not scheme.file_exists(out_path):
            extractor = TextExtractor.create_by_file_ext(path)
            extractor.extract(path, out_path)

        scheme.add_step(2, "manual_cleaning")
        copy_path = scheme.todo_path(2)
        done_path = scheme.done_path(2)
        if not (scheme.file_exists(copy_path) or scheme.file_exists(done_path)):
            scheme.copy_file(out_path, copy_path, create_dirs=True)
        if args.skip_manual_cleaning and not scheme.file_exists(done_path):
            scheme.copy_file(out_path, done_path, create_dirs=True)

        scheme.add_step(3, "cleanup_whitespace")
        out_path = scheme.path(3)
        if not scheme.file_exists(out_path):
            done_path = scheme.done_path(2)
            if not scheme.file_exists(done_path):
                print(
                    "WARN:",
                    f"No input file at: '{done_path}'. Was this file manually cleaned?")
                continue
            lines = scheme.read_lines(scheme.done_path(2))
            lines = cleaning.cleanup_whitespace(lines)
            scheme.write_lines(out_path, lines, create_dirs=True)

        # detect the document language
        language_code = ""
        if scheme.file_exists(scheme.path(3)):
            content = scheme.read_file(scheme.path(3))
            langid.set_languages(_project_languages.keys())
            language_code, _ = langid.classify(content)

        scheme.add_step(4, "de_hyphenate")
        out_path = scheme.path(4)
        if not scheme.file_exists(out_path):
            lines = scheme.read_lines(scheme.path(3))
            lines = cleaning.remove_end_of_line_hyphens(
                lines, language_code, args.always_combine_hyphens)
            scheme.write_lines(out_path, lines, create_dirs=True)

        scheme.add_step(5, "tokenize_sententces")
        out_path = scheme.path(5)
        if not scheme.file_exists(out_path):
            content = scheme.read_file(scheme.path(4))
            tokenizer = sentence_tokenizer(_project_languages[language_code])
            sentences = tokenizer.tokenize(content)
            # since no hyphen should exist at this point, we can just cat the lines together
            sentences = map(lambda s: re.sub(r"\s+", " ", s), sentences)
            sentences = map(str.strip, sentences)
            scheme.write_lines(out_path, sentences, create_dirs=True)

        scheme.add_step(6, "escape_xml_chars")
        out_path = scheme.path(6)
        if not scheme.file_exists(out_path):
            content = scheme.read_file(scheme.path(5))
            new_content = cleaning.escape_xml_chars(content)
            scheme.write_file(out_path, new_content, create_dirs=True)

        scheme.add_step(7, "sentence_window")
        out_path = scheme.path(7)
        if args.random_window > 0 and not scheme.file_exists(out_path):
            lines = scheme.read_lines(scheme.path(6))
            if len(lines) > args.random_window:
                begin = random.randrange(0, len(lines) - args.random_window)
                lines = lines[begin:(begin + args.random_window)]
            scheme.write_lines(out_path, lines, create_dirs=True)
        else:
            # reset for the last step
            out_path = scheme.path(6)

        scheme.add_step(42, "separate_by_language")
        path_from_last_step = out_path
        language_dir = os.path.join(scheme.dirname(42), language_code)
        os.makedirs(language_dir, exist_ok=True)
        scheme.copy_file(path_from_last_step, language_dir)
def detect_language(self, str):
    """
    Detect language of given string. First, attempt to detect language with
    langid. Langid uses probabilities, so if the probability score is low,
    use characters in the string to detect the language.

    :param str str: The given string
    :return: language in 2 letter ISO format.
    :rtype: str
    """
    # If passed an empty string, return empty result
    if len(str.strip()) < 1:
        return ''

    str_langid = str.strip()
    str = str.lower().strip()
    detected_langid_value = ''

    langid.set_languages(self.langid_languages)
    # langid.set_languages(['de', 'fr', 'it'])
    detected_lang = self.langid_identifier.classify(str_langid)
    # print(detected_lang)
    # if detected_lang[1] < 0.5:
    #     print(detected_lang)
    #     print(str_langid)

    '''
    If the statistical probability of a detected language is larger than 0.9999,
    return that language, if it is included in langid_languages.
    If not, set it equal to 'en'.
    '''
    if detected_lang[1] > 0.9999 and detected_lang[0] in self.langid_languages:
        detected_langid_value = detected_lang[0]
        return detected_langid_value
    # else:
    #     detected_langid_value = 'en'

    # Greek characters
    chars_el = set('αβγδεζηθικλμνξοπρσςτυφχψω')
    # Latin characters
    chars_en = set('abcdefghijklmnopqrstuvwxyz')
    # French characters
    chars_fr = set('éàèùâêîôûçëïü')
    # German characters
    chars_de = set('äöüß')
    # Turkish characters
    chars_tr = set('şĞİğı')
    # Spanish characters
    chars_es = set('ñóíáã')
    # Slovak characters
    chars_sk = set('ýúčžň')
    # Czech characters
    chars_cz = set('řťšůď')

    '''
    If a greek character exists, return greek language immediately
    '''
    if any((c in chars_el) for c in str):
        return 'el'

    return_value = ''
    # if 'LATIN' in unicodedata.name(str.strip()[0]):
    if any((c in chars_en) for c in str):
        if any((c in chars_fr) for c in str):
            return_value = 'fr'
        if any((c in chars_de) for c in str):
            return_value = 'de'
        if any((c in chars_tr) for c in str):
            return_value = 'tr'
        if any((c in chars_es) for c in str):
            return_value = 'es'
        if any((c in chars_sk) for c in str):
            return_value = 'sk'
        if any((c in chars_cz) for c in str):
            return_value = 'cz'
        # Latin text without any language-specific characters defaults to English.
        if not return_value:
            return_value = 'en'
    '''
    If no language is detected, return an empty string.
    This helps set DC values with no language.
    xstr = lambda s: s or ""
    '''
    return return_value
import json
import logging
import os

from flask import Blueprint, jsonify, request
import langid
from transformers import MarianTokenizer, MarianMTModel

from qanary_helpers.qanary_queries import get_text_question_in_graph, insert_into_triplestore

logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)

mt_helsinki_nlp = Blueprint('mt_helsinki_nlp', __name__, template_folder='templates')

SERVICE_NAME_COMPONENT = os.environ['SERVICE_NAME_COMPONENT']

supported_langs = ['ru', 'es', 'de', 'fr']
langid.set_languages(supported_langs)
models = {
    lang: MarianMTModel.from_pretrained(
        'Helsinki-NLP/opus-mt-{lang}-en'.format(lang=lang))
    for lang in supported_langs
}
tokenizers = {
    lang: MarianTokenizer.from_pretrained(
        'Helsinki-NLP/opus-mt-{lang}-en'.format(lang=lang))
    for lang in supported_langs
}


@mt_helsinki_nlp.route("/annotatequestion", methods=['POST'])
def qanary_service():
    """the POST endpoint required for a Qanary service"""
from urllib.parse import urlsplit
from DomainFinderSrc.Utilities import FilePath, FileIO
from unittest import TestCase
from DomainFinderSrc.Scrapers.LinkChecker import LinkChecker
import shortuuid
from DomainFinderSrc.Utilities.StrUtility import StrUtility
import langid

# https://zh.wikipedia.org/wiki/ISO_639-1
good_lang_code = ['en', 'de', 'fr', 'es', 'fr', 'it', 'pt', 'ga', 'el', 'da']
bad_lang_code = ['zh', 'ja', 'ko', 'ru', 'vi']
langid.set_languages(langs=good_lang_code + bad_lang_code)


class LanguageTest(TestCase):
    def test_from_text(self):
        result = langid.classify('i made an icecream for you.')
        print(result)

    def test_from_url(self):
        response = LinkChecker.get_page_source(link="http://www.frenchweb.fr/sisense-decroche-50-millions-de-dollars-pour-accelerer-dans-lanalyse-de-donnees/221848")
        print(langid.classify(response.text))
import json
from itertools import islice
import langid
import sys

langid.set_languages(["nl", "en"])

for line_no, line in enumerate(islice(open(sys.argv[1]), None)):
    if line_no > 0:
        if line_no % 1000 == 0:
            print("%s" % (line_no), file=sys.stderr)
        elif line_no % 100 == 0:
            print(".", end=" ", file=sys.stderr)
    try:
        user = json.loads(line)
    except ValueError:
        continue
    uid = user.get("user_id", None)
    reviews = user.get("reviews", [])
    if reviews:
        counter = 0
        for rn, review in enumerate(reviews, 1):
            text = " ".join(["".join(sent) for sent in review.get("text", [])])
            review["langid"] = langid.classify(text)[0]
            print(text)
    print(json.dumps(user, ensure_ascii=False))
from typing import List

from fastapi import APIRouter
from pydantic import BaseModel
import langid

langid.set_languages(['de', 'en'])  # ISO 639-1 codes

#
# not a good idea to work with global variables like this.
#
from backend import api

DB_INDEX_AUTOCOMPLETE = "autocomplete"

router = APIRouter()


class Request(BaseModel):
    search: str


def addQuestionToAutocomplete(question: str):
    # todo: if it already exists; we need to increment count;
    body = {'phrase': question, 'count': 1}
    res = api.elasticsearch_client.index(index=DB_INDEX_AUTOCOMPLETE, body=body)


@router.get("/query/autocomplete")
def ask(search: str):
# This parses the EC declassified archives and creates a single file per language present in the original files.
# Simon Hengchen - [email protected] - http://homepages.ulb.ac.be/~shengche

import codecs
import re
import io
import os, glob
import langid
from multiprocessing import Pool
from langid.langid import LanguageIdentifier, model

identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
langid.set_languages(['en', 'it', 'fr', 'nl', 'de', 'da', 'el'])  # we restrict the possible languages to the official ones of the EU between 1958 and 1982, scope of our corpus


def get_path(dossier):
    return (os.path.join(dossier, f) for f in os.listdir(dossier) if 'txt' in f)


def langiding(chemin):
    f = io.open(chemin, "r", encoding="utf8")
    fen = io.open(chemin + "_en", "w", encoding="utf8")  # one file for each language
    fit = io.open(chemin + "_it", "w", encoding="utf8")
    ffr = io.open(chemin + "_fr", "w", encoding="utf8")
    fnl = io.open(chemin + "_nl", "w", encoding="utf8")
    fde = io.open(chemin + "_de", "w", encoding="utf8")
import re, codecs, os, sys, getopt
#import urllib, urllib2
#import json
from bs4 import BeautifulSoup as bs
from collections import defaultdict
#from jellyfish import levenshtein_distance as ld
#from nltk.tokenize import wordpunct_tokenize
import langid

langid.set_languages(['en', 'nl', 'fr'])


def get_files(folder):
    listbase = 'corpus/' + folder + "/"
    listall = os.listdir(listbase)
    listing = [f for f in listall if len(f) in [22, 23, 24]]
    return (listbase, listing)


folders = os.listdir('corpus/')
#folders = []
#output = codecs.open("output.txt", 'a', encoding='utf-8')
#gsc = codecs.open("gsc2.txt").readlines()
#gs = [l.split("\t") for l in gsc]

for folder in sorted(['TYT']):
    #dic = defaultdict(lambda: 0)
    listbase, listing = get_files(folder)
    for f in sorted(listing):
        ff = listbase + f
        text = open(ff).read()
        soup = bs(text, "xml")
if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("infile", nargs="?", type=argparse.FileType("r"), default=sys.stdin) parser.add_argument("outfile", nargs="?", type=argparse.FileType("w"), default=sys.stdout) parser.add_argument("-minscore", type=float, default=0, help="minimum score from hunalign") parser.add_argument("-s", "--lang1", help="source language", dest="source_lang", default="en") parser.add_argument("-t", "--lang2", help="target language", dest="target_lang", default="fr") args = parser.parse_args() deletions = defaultdict(list) n_written = 0 n_total = 0 langid.set_languages([args.source_lang, args.target_lang]) for line in args.infile: n_total += 1 source, target, score = line.split("\t") if float(score) < args.minscore: deletions["low score"].append("") if source == target: deletions["identical"].append(target) continue if not source.strip(): deletions["source_empty"].append("") continue elif not target.strip(): deletions["target_empty"].append("") continue
import pandas as pd, pycld2 as cld2, re, langid
from sqlalchemy import create_engine
from bs4 import BeautifulSoup
from tqdm import tqdm

langid.set_languages(['ru', 'uk', 'en'])

with open('../psql_engine.txt') as f:
    psql = create_engine(f.read())


def get_lang(text):
    rel, _, matches = cld2.detect(text)
    if not rel:
        return
    matches = list(filter(lambda m: m[1] in ['ru', 'uk', 'en'], matches))
    if len(matches) == 0:
        return langid.classify(text)[0]
    return matches[0][1]


chunks = pd.read_sql('''SELECT html_id, ra_summary
                        FROM htmls
                        WHERE lang isnull and ra_summary notnull;
                     ''', psql, chunksize=20000)

for df in tqdm(chunks):
    df['text'] = df.ra_summary.apply(lambda s: re.sub(r'\s+', ' ', BeautifulSoup(s, 'lxml').get_text()).strip())
    df['text'] = df.text.apply(lambda t: ''.join([ch for ch in t if ch.isprintable()]))
    df['lang'] = df.text.apply(get_lang)
    vals = ',\n'.join([f"({html_id}, '{lang}')"
                       for html_id, lang in df.loc[pd.notnull(df.lang)].reindex(['html_id', 'lang'], axis=1).values])
    psql.execute(f'''
    for lsfile in sfile:
        lsfile = lsfile.rstrip()
        remlist.append(lsfile)

if args.regexFromFile:
    regfile = codecs.open(args.regexFromFile, "r", encoding="utf-8")
    reglist = []
    for lsfile in regfile:
        lsfile = lsfile.rstrip()
        reglist.append(lsfile)

if args.vSetLanguages:
    toset = []
    for l in args.vSetLanguages.split(","):
        toset.append(l)
    langid.set_languages(toset)

for linia in entrada:
    toWrite = True
    linia = linia.strip()
    camps = linia.split("\t")
    if len(camps) >= 1:
        slsegment = camps[0]
        tlsegment = ""
        if len(camps) >= 2:
            tlsegment = camps[1]

        if args.unescape_html and toWrite:
            slsegment = unescape_html(slsegment)
            tlsegment = unescape_html(tlsegment)

        if args.remove_tags and toWrite:
def language_id(content):
    langid.set_languages(['en', 'zh'])
    return langid.classify(content)
    d['name_lower'] = d.name.apply(capital_encode)
    #d['name_lower'] = d.name
    return d


if os.path.exists('lang_train_cache.csv'):
    train = pandas.DataFrame.from_csv('lang_train_cache.csv')
    test = pandas.DataFrame.from_csv('lang_test_cache.csv')
else:
    print 'can not load from cache'
    d = load_twitter('data/lang/new_lang_data.txt.gz')
    lang_counts = d.groupby('lang')['lang'].agg('count')
    langs = set(lang_counts[lang_counts.values > 10000].index)
    d = d[d.lang.apply(lambda x: x in langs)]  # use only big languages
    langid.set_languages(langs)
    langid_labels = []
    langid_scores = []
    for i, idx in enumerate(d.index):
        if i % 10000 == 0:
            print i
        langid_label, langid_score = langid.classify(d.text[idx])
        langid_labels.append(langid_label)
        langid_scores.append(langid_score)
    d['lid_label'] = langid_labels
    d['lid_score'] = langid_scores
    d = d[(d.lid_score > 0.995) & (d.lang == d.lid_label)]

    # random partioning
def langidFeature(word):
    import langid
    langid.set_languages(['zu', 'en'])
    return langid.classify(word)[0]
try:
    import langid
    LANGID_FLAG = True
except ImportError:
    LANGID_FLAG = False

from .lru import LRUCache
from .settings import (DETECTION_LANGUAGES, LRU_SIZE, MAX_REPETITIONS,
                       MIN_DUPLCHECK_SIZE)

LOGGER = logging.getLogger(__name__)

LRU_TEST = LRUCache(maxsize=LRU_SIZE)

if LANGID_FLAG is True:
    langid.set_languages(DETECTION_LANGUAGES)

RE_FILTER = re.compile(
    r'\W*(Facebook|Twitter|Google|Linkedin|Whatsapp|Xing|Instagram|Pinterest|PDF|E-Mail|Drucken)$',
    flags=re.IGNORECASE)
# |.hnliche Beitr| Instagram
# (r'\W*(Gef.llt mir|[Ss]hare (on|via)|Fill in your details below|Trage deine Daten unten|Kommentar verfassen|Bitte logge dich|Hinterlasse einen Kommentar| to %s| mit %s)', line) or


def put_in_cache(body):
    '''Implement LRU cache'''
    global LRU_TEST
    for element in body:
        try:
            teststring = ' '.join(element.itertext())
        except AttributeError:  # justext Paragraph
__date__ = "$30-jul-2015 12:13:01$" import langid import sys import codecs if (len(sys.argv) > 1): name = sys.argv[1] else: print "no parametros" corrects = 0 total = 0 langid.set_languages(['es','fr','it','en','pt','ro']) if (name): f = open(name,"r") for line in f: text,lang = line.split(";") lang = lang.strip() total = total + 1 ident,conf = langid.classify(text) ident = ident.strip() if (lang == ident): corrects = corrects + 1 print "Aciertos ",corrects,"de",total,"Porcentaje",(float(corrects) / float(total))*100
def get_lang(words):
    langid.set_languages(['it', 'en', 'de', 'fr', 'es', 'ja'])
    array = langid.classify(words)
    lang = array[0]
    return lang
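A minimal usage sketch of get_lang above (the sample sentences are illustrative):

print(get_lang("Where is the train station?"))   # e.g. 'en'
print(get_lang("Dov'è la stazione dei treni?"))  # e.g. 'it'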
__author__ = 'alex'

from forms import *
from django.template import RequestContext
from django.shortcuts import render_to_response
from bs4 import BeautifulSoup
import langid
import wikipedia_updated

supported_prefixes = [prefix[0] for prefix in SUPPORTED_PREFIXES]
print 'supported_prefixes: ' + str(supported_prefixes)
langid.set_languages(supported_prefixes)


def searchView(request):
    if ('search_query' in request.GET) and request.GET['search_query'].strip():
        search_query = request.GET['search_query']
        if ('prefix' in request.GET) and request.GET['prefix'].strip():
            prefix = request.GET['prefix']
            print BeautifulSoup(search_query, from_encoding="utf-8")
            detected_langid = langid.classify(search_query)[0]
            print "Detected Lang: %s" % detected_langid
            print "Translate To: %s" % prefix
            if prefix == 'en' and detected_langid == 'en':
                detected_langid = 'de'
            if prefix == 'de' and detected_langid == 'de':
                detected_langid = 'en'
            translationFound = False
            wikipedia_updated.set_lang(detected_langid)
'''
Created on 05-Nov-2015

@author: unni
'''
import langid
from db_manager import connect_to_db

host_name = "10.5.23.213"
user_name = "root"
password = "******"


def find_lang(db_name, table):
    cursor_mysql = connect_to_db(host_name, user_name, password)
    cursor_mysql.execute("select distinct tweetText from %s.%s" % (db_name, table))
    lang_dict = {'hi': 0, 'en': 0}
    for row in cursor_mysql.fetchall():
        lang = langid.classify(row[0])
        lang_dict[lang[0]] += 1
    print lang_dict


if __name__ == '__main__':
    langid.set_languages(['hi', 'en'])
    #find_lang('Events', 'BigBillionDay')
    print langid.classify('yeh hindi hein')
      | \.\.\.
      | [][.,;"'?():-_`]
    '''

tokenized_sents = []
print '*** 2. Tokenization'
print 'Tokenization in progress ...'
for sent in sents:
    tokenized_sent = nltk.regexp_tokenize(sent, token_pattern)
    tokenized_sents.append(tokenized_sent)
#print(tokenized_sents)

# 3. Language detection (possibly even per sentence!)
print '*** 3. Language recognition (not in use)'
print_i = 1
langid.set_languages(['en', 'de'])
detected_language = ''
language_matching = []
for sent in sents:
    detected_language = langid.classify(sent)
    #print(print_i, sent, detected_language)
    language_matching.append([print_i, sent, detected_language[0]])
    print_i += 1
print(language_matching)

# 4. Normalization (lowercase, stopwords, spellcheck, thesauri)
print '*** 4. Normalization (lowercase, stopwords, spellcheck)'
stopwords_en = nltk.corpus.stopwords.words('english')  # definition of the stopword corpora
stopwords_de = nltk.corpus.stopwords.words('german')
print 'Stripping the stopwords ...'
for sent in sents: