def __init__(self, data_path, data_files: list, paper_set_file=None):
    self.data_files, self.data_path = data_files, data_path
    if paper_set_file is not None:
        self.paper_set = utils.get_paper_set(paper_set_file)
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    self.lang = lambda s: identifier.classify(str(s))

def tag_lang(data, txt_var='text_clean'):
    """
    Tag language in all text in data; return language, score and post IDs.

    :param data: data frame
    :param txt_var: text var
    :returns lang_id_data:: lang, score and post ID
    """
    lang_id_model = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    # parallel
    MAX_JOBS = 5
    pandarallel.initialize(nb_workers=MAX_JOBS)
    lang_score_vals = data.loc[:, txt_var].parallel_apply(lang_id_model.classify)
    # serial
    # TODO: why does langid wreck CPU use?
    # lang_score_vals = data.loc[:, txt_var].apply(lang_id_model.classify)
    # separate lang/score
    lang_val, lang_score = zip(*lang_score_vals)
    lang_var = 'lang'
    lang_score_var = 'lang_score'
    post_id_var = 'id'
    data = data.assign(**{
        lang_var: lang_val,
        lang_score_var: lang_score,
    })
    lang_id_data = data.loc[:, [lang_var, lang_score_var, post_id_var]]
    return lang_id_data

def langid_pred(inputText):
    # https://github.com/saffsd/langid.py
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    output = identifier.classify(inputText)
    confidence = output[1]
    pred = output[0]
    return inputText, output, confidence, clean_output(pred)

def __init__(self):
    self.cli()
    self.infiles = self.get_files(self.indir, self.pattern)
    self.n_proceedings = 0
    self.identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    self.main()

def get_language(paragraphs):
    '''
    Determine the dominant language of a text.
    Input: list of paragraphs.
    Output: ISO 639-1 code of the most likely language ('fr', 'en' or 'es').
    '''
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    count_languages = {'fr': 0, 'en': 0, 'es': 0}
    total_count = 0
    for p in paragraphs:
        infos_language = language_detect(p, identifier)
        # Only count confident predictions for the tracked languages
        if infos_language[1] >= 0.7 and infos_language[0] in count_languages:
            count_languages[infos_language[0]] += float(infos_language[1])
            total_count += 1
    probability_max = 0
    initials = ''
    if total_count == 0:
        return initials
    for k in count_languages.keys():
        if count_languages[k] / total_count > probability_max:
            probability_max = count_languages[k] / total_count
            initials = k
    logger.debug(
        f'Text in {initials} with {probability_max * 100}% of confidence')
    return initials

def are_words_valid(clean_words: List[str],
                    english_word_count: int,
                    remove_english: bool,
                    use_langid: bool) -> bool:
    """
    Determines whether a list of words is valid based on the provided parameters.

    :param clean_words: a list of clean word strings.
    :param english_word_count: the number of English words removed from the string during cleaning.
    :param remove_english: whether or not to remove English words.
    :param use_langid: whether or not to use the langid library to determine if a word is English.
    :return: True if the utterance is valid, False otherwise.
    """
    # Exclude utterance if empty after cleaning
    cleaned_transcription = " ".join(clean_words).strip()
    if cleaned_transcription == "":
        return False

    # Exclude utterance if > 10% English
    if remove_english and len(clean_words) > 0 and english_word_count / len(clean_words) > 0.1:
        # print(round(english_word_count / len(clean_words)), trans, file=sys.stderr)
        return False

    # Exclude utterance if langid thinks it's English
    if remove_english and use_langid:
        langid_identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
        lang, prob = langid_identifier.classify(cleaned_transcription)
        if lang == "en" and prob > 0.5:
            return False

    return True

def __init__(self, src_lang, tgt_lang, threshold=0.8):
    self.src_lang = src_lang
    self.tgt_lang = tgt_lang
    self.identifier = LanguageIdentifier.from_modelstring(m, norm_probs=True)
    self.identifier.set_languages([src_lang, tgt_lang])
    self.threshold = threshold

def __init__(self):
    self.active = settings.ANALYZE_LANGUAGE
    if self.active:
        from langid.langid import LanguageIdentifier as lid, model
        self.identifier = lid.from_modelstring(model, norm_probs=True)
        langs = set(settings.LANGUAGES)
        langs = langs.intersection(self.identifier.nb_classes)
        self.identifier.set_languages(langs)

def getLanguages(data):
    res = dict()
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    for filename, datum in data.items():
        lang, prob = identifier.classify(datum)
        res[filename] = pycountry.languages.get(alpha_2=lang)
    return res

def is_chinese2(content):
    if content:
        identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
        result = identifier.classify(content)
        language, score = result
        if language == 'zh' and score > 0.7:
            return True
        else:
            return False

def transform_df(df):
    '''
    Clean the text column of a dataframe partition and map star ratings
    to sentiment labels.
    '''
    print("transform_df called for partition...")
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)  # takes 2 seconds
    out = pd.DataFrame(columns=["text", "sentiment"])
    preprocessor = Preprocessor()
    out["text"] = df["text"].apply(func=preprocessor.clean_text, args=(identifier,))
    out["sentiment"] = df["stars"].map(preprocessor.map_rating)
    return out

def load_langid_model(model_path: Optional[str], lang_set: Sequence[str]) -> LanguageIdentifier:
    """
    Loads the provided langid.py model. If none provided, then it loads the default model.

    :param model_path: path to model to load
    :param lang_set: language set to which the model should be restricted.
        Provide empty list for no restrictions.
    :return: language identifier
    """
    if model_path is None:
        from langid import langid
        langider = LanguageIdentifier.from_modelstring(langid.model, norm_probs=True)
    else:
        langider = LanguageIdentifier.from_modelpath(model_path, norm_probs=True)
    if len(lang_set) > 0:
        langider.set_languages(langs=lang_set)
    return langider

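# Hedged usage sketch for load_langid_model above (not part of the original
# source): a None model_path falls back to langid.py's bundled model, and the
# language set restricts classification to the listed codes. The sample text
# and language codes here are illustrative assumptions.
langider = load_langid_model(model_path=None, lang_set=["en", "fr", "de"])
lang, prob = langider.classify("Bonjour tout le monde")
print(lang, prob)  # e.g. 'fr' with a probability close to 1.0
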
def __init__(self, src_lang=None, tgt_lang=None, src_threshold=0, tgt_threshold=0, **kwargs):
    if not (isinstance(src_lang, str) and isinstance(tgt_lang, str)):
        logging.error("Both source and target languages need to be defined")
        raise ValueError("Strings expected, got: %s %s" % (src_lang, tgt_lang))
    self.src_lang = src_lang
    self.tgt_lang = tgt_lang
    self.src_threshold = src_threshold
    self.tgt_threshold = tgt_threshold
    self.identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    super().__init__(**kwargs)

def Languages(base_path, photo):
    photo_folder = os.path.join(base_path, photo)
    num_iterations = len([
        fol for fol in os.listdir(photo_folder)
        if os.path.isdir(os.path.join(photo_folder, fol)) and "source" not in fol
    ])
    start_iter = 1
    range_iter = [str(i) for i in list(range(1, num_iterations + 1))]
    folder_base = os.path.join(base_path, photo, photo)
    print('INFO: Scraping Languages for photo {}'.format(photo))
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    language_dict = {}
    for iteration in range_iter:
        if not os.path.isdir(os.path.join(folder_base + "_" + str(iteration) + "/txt")):
            print('INFO: No texts in this iteration, stopping..')
            continue
        language_dict.update({str(iteration): {}})
        list_json = [
            js for js in os.listdir(
                os.path.join(base_path, photo, photo + "_" + str(iteration), "txt"))
            if ".json" in js
        ]
        for js in list_json:
            with open(os.path.join(base_path, photo, photo + "_" + str(iteration), "txt", js)) as f:
                json_content = json.load(f)
            for id_, text in json_content.items():
                language_score = identifier.classify(str(text))
                print(language_score)
                language_dict[str(iteration)].update(
                    {id_: [language_score[0], language_score[1]]})
    # Write detected languages to language.json
    print("INFO: Writing detected languages to {}".format(
        os.path.join(photo, 'languages.json')))
    with open(os.path.join(base_path, photo, 'languages-{}.json'.format(photo)), 'w') as fp:
        json.dump(language_dict, fp)

def identifier(self):
    cls = type(self)
    if not hasattr(cls, '_id'):
        # https://github.com/saffsd/langid.py
        from langid.langid import LanguageIdentifier as lid, model
        cls._id = lid.from_modelstring(model, norm_probs=True)
        if len(settings.LANGUAGES):
            langs = set(settings.LANGUAGES)
            langs = langs.intersection(cls._id.nb_classes)
            cls._id.set_languages(langs)
    return cls._id

def convert_to_metafeatures(training_data):
    result = []
    lang_identifier = LanguageIdentifier.from_modelstring(model)
    for i, chunk in enumerate(training_data):
        langid_features = np.array([lang_identifier.instance2fv(tweet) for tweet in chunk])
        num_occurrences = langid_features.sum(axis=0)
        averages = [occurrences / len(chunk) for occurrences in num_occurrences]
        sparseness = (langid_features > 0).sum(axis=0)
        result.append(np.array(averages + sparseness))
        if i % 50 == 0:
            print('Converted {} inputs to meta-features...'.format(i))
    result = np.array(result)
    return result

def run_main(self):
    if not self.FETCH:
        print("Use option --fetch to fetch new twitter streams. "
              "Currently using tweet streams from folder tweet_archive/")
    self.solution_file = open(self.SOLUTION_FILE, "w+")
    self.identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    if self.FETCH:
        print("Fetching new Twitter stream.")
        self.fetch_stream_to_file(15000, self.GENERAL_TWEETS_FILE)
    self.process_tweets_in_file(self.GENERAL_TWEETS_FILE)
    self.local_tweet_analysis()
    self.additional_analysis()

def set_languages(self, langs):
    try:
        from langid.langid import LanguageIdentifier, model
    except ImportError:
        print('Please install the langid package')
    self.langid = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    try:
        self.langid.set_languages(langs)
    except ValueError:
        self.langid.set_languages(['en'])

def extract_languages(self):
    from langid.langid import LanguageIdentifier, model
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    count = 0
    for review in self.reviews:
        result = identifier.classify(review[1])
        count = count + 1
        print("#%d: lang & accuracy: %s, content: %s" % (count, result, review[1]))
        if self.save_to_db:
            self.parser.add_new_lang_feature(review[0], result[0], result[1])

def _get_p_language(self):
    """
    Returns the language of the paste (guessing).

    :Example: PST._get_p_language()

    ..note:: The language returned is purely a guess and may not be accurate
        if the paste doesn't contain any human dictionary words.

    ..seealso: https://github.com/saffsd/langid.py
    """
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    return identifier.classify(self.get_p_content())

def __init__(
    self,
    language_scope: List = supported_languages_dict.keys(),
    minimum_score: float = 0.0,
    fallback_language: AnyStr = "",
):
    self.language_scope = language_scope
    self.minimum_score = float(minimum_score)
    self.fallback_language = fallback_language
    self.column_description_dict = self.COLUMN_DESCRIPTION_DICT  # may be changed by detect_languages_df
    self._langid_identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    self._langid_identifier.set_languages(
        [l for l in self.language_scope if l not in SUPPORTED_LANGUAGES_IN_CLD3_NOT_IN_LANGID]
    )

def __init__(
    self,
    language_scope: List = SUPPORTED_LANGUAGES_PYCLD3.keys(),
    minimum_score: float = 0.0,
    fallback_language: AnyStr = "",
):
    store_attr()
    self.column_descriptions = self.COLUMN_DESCRIPTIONS.copy()  # may be changed by detect_languages_df
    self._langid_identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    self._langid_identifier.set_languages([
        l for l in self.language_scope
        if l not in SUPPORTED_LANGUAGES_PYCLD3_NOT_LANGID
    ])

def removeUnicodeAndLangId(data):
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    i = 0
    j = 0
    copy = {}
    for d in data:
        listTweet = []
        for tweet in data[d]:
            tweet = tweet.encode('ascii', 'ignore').decode("utf-8")
            lang = (identifier.classify(tweet))[0]
            if lang == "en":
                listTweet.append(tweet)
        if not len(listTweet) == 0:
            copy[d] = listTweet
    return copy

def is_defective_pp(clean_pp):
    """
    Checks if a pp is defective; eliminates javascript and non-English pp
    using nlp language detection.

    :param clean_pp: clean pp
    :return: True if defective, otherwise False
    """
    low_text = clean_pp.lower()
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    language_detect = identifier.classify(clean_pp)
    if low_text is None or language_detect[0] != "en" \
            or (language_detect[0] == "en" and language_detect[1] < 0.9) \
            or 'privacy' not in low_text or 'class=' in low_text or 'function(' in low_text \
            or 'function (' in low_text or 'catch(' in low_text or 'exception(' in low_text \
            or '{' in low_text:
        return True
    else:
        return False

def set_english(df, text_column):
    '''
    set_english: classifies the language of each value in a text column and
    returns the predictions as a list, which can be used to mask a pandas
    DataFrame.

    Parameters
    ----------
    df: pandas DataFrame
    text_column: name of the text column (string)

    Returns
    -------
    Python list with the predicted language for each row
    '''
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    lst = []
    for i in df[text_column].values:
        lang, score = identifier.classify(i)
        lst.append(lang)
    return lst

def detectar_lenguaje(texto, devolver_proba=False):
    """
    Identifies the language in which the input text is written.

    :param texto: Input text.
    :type texto: str
    :param devolver_proba: Indicates whether to return the confidence score \
        of the identified language. Default value `False`.
    :type devolver_proba: bool, optional
    :return: (str) Code of the identified language following the \
        `ISO 639-1 <https://es.wikipedia.org/wiki/ISO_639-1>`_ standard. \
        If `devolver_proba = True`, returns a tuple.
    """
    identificador = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    if devolver_proba:
        return identificador.classify(texto)
    else:
        return identificador.classify(texto)[0]

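# Hedged usage sketch for detectar_lenguaje (not from the original source);
# the sample sentence and expected outputs are illustrative assumptions.
# With devolver_proba=False only the ISO 639-1 code is returned; with True,
# the (language, score) tuple from langid.py's classify() is passed through.
print(detectar_lenguaje("Este es un texto de ejemplo."))        # e.g. 'es'
print(detectar_lenguaje("Este es un texto de ejemplo.", True))  # e.g. ('es', 0.99...)
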
def gatherHashtag(data):
    data = json.load(data)
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    usrDict = {}
    numbHashtag = {}
    numHashtag = 0
    i = 0
    j = 0
    for key in data:
        try:
            usrDict[key['id']] = []
            tmp = api.GetUserTimeline(user_id=key['id'], count=200)
            for t in tmp:
                hashtags = t.hashtags
                numHashtag += len(hashtags)
                for h in hashtags:
                    hashtag = h.text
                    hashtag = hashtag.encode('ascii', 'ignore').decode("utf-8")
                    lang = (identifier.classify(hashtag))[0]
                    if lang == "en":
                        usrDict[key['id']].append(hashtag)
            meanHashTag = numHashtag / 200
            numHashtag = 0
            numbHashtag[key['id']] = meanHashTag
        except Exception as e:
            print(e)
            with open('logs/log.txt', 'a') as log:
                log.write(str(e))
                log.write(str(key['id']))
                log.write("\n")
    with open('data/twitter_Hashtag.json', 'w+') as tweetsFile:
        json.dump(usrDict, tweetsFile)
    with open('data/numb_Hashtag.json', 'w+') as tweetsFile:
        json.dump(numbHashtag, tweetsFile)

def get_country_by_language(text):
    """Get the countries which speak the language of the given text.

    Args:
        text: the text.

    Returns:
        A list of countries
    """
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    lang = identifier.classify(text)[0]
    countries_matching = []
    with open('./data/country.csv', 'rb') as csvfile:
        countries = csv.reader(csvfile, delimiter=';', quotechar='|')
        for country in countries:
            if lang in country[51].replace(' ', '').split(','):
                countries_matching.append(country[4].strip())
    logger.info(u'Matched text with countries: {0}'.format(countries_matching))
    return countries_matching

def detect(self, text):
    if not LanguageDetector.identifier:
        LanguageDetector.identifier = LanguageIdentifier.from_modelstring(
            model, norm_probs=True)

    language_abbrs = [
        lang for lang, prob in LanguageDetector.identifier.rank(text)
        if prob > self.DETECT_THRESHOLD_PROB
    ][:self.DETECT_THRESHOLD_LEN]

    if not language_abbrs:
        raise self.BrokenRequest(
            'UNSUPPORTED_LANGUAGE_DETECTED',
            data={'text': text},
            is_critical=True)

    return [
        self.languages_index[abbr]
        for abbr in language_abbrs
        if abbr in self.languages_index]

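# Brief sketch of the rank() call used above (assumed standard langid.py
# behaviour, not code from the original source): rank() returns a
# (language, probability) pair for every candidate language ordered
# best-first, whereas classify() returns only the single top pair.
from langid.langid import LanguageIdentifier, model

_identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
print(_identifier.rank("Ceci est une phrase en français.")[:3])
# e.g. [('fr', 0.99...), ('en', ...), ('ca', ...)]
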
def clean_json_data(json_data: List[Dict[str, str]],
                    remove_english: bool = False,
                    use_langid: bool = False) -> List[Dict[str, str]]:
    """
    Clean a list of utterances (Python dictionaries) based on the given parameters.

    :param json_data: list of Python dictionaries, each must have a 'transcription' key-value.
    :param remove_english: whether or not to remove English from the utterances.
    :param use_langid: whether or not to use the langid library to identify English to remove.
    :return: cleaned list of utterances (list of dictionaries).
    """
    punctuation_to_remove = string.punctuation + "…’“–”‘°"
    special_cases = ["<silence>"]  # Any words you want to ignore
    langid_identifier = None

    if remove_english:
        english_words = get_english_words()  # pre-load English corpus
        if use_langid:
            langid_identifier = LanguageIdentifier.from_modelstring(
                model, norm_probs=True)
    else:
        english_words = set()

    cleaned_data = []
    for utterance in json_data:
        clean_words, english_word_count = clean_utterance(
            utterance=utterance,
            remove_english=remove_english,
            english_words=english_words,
            punctuation=punctuation_to_remove,
            special_cases=special_cases)

        if is_valid_utterance(clean_words,
                              english_word_count,
                              remove_english,
                              use_langid,
                              langid_identifier):
            cleaned_transcript = " ".join(clean_words).strip()
            utterance["transcript"] = cleaned_transcript
            cleaned_data.append(utterance)

    return cleaned_data

def compare_language_distribution(tweet_list):
    agreement_dic = {}
    disagreement_dic = {}
    guess_dic = {}
    pos_conf = []
    neg_conf = []
    clf = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    for tweet in tweet_list:
        language = tweet['lang']
        if language == 'und':
            continue
        text = str(tweet['text'])
        # print(text)
        text = " ".join(map(str, tw.tokenizeRawTweetText(text)))
        # print(text)
        guess = clf.classify(text)
        if guess[0] in guess_dic:
            guess_dic[guess[0]] += 1
        else:
            guess_dic[guess[0]] = 1
        if guess[0] == language:
            pos_conf.append(guess[1])
            if language in agreement_dic:
                agreement_dic[language] += 1
            else:
                agreement_dic[language] = 1
        else:
            neg_conf.append(guess[1])
            if language in disagreement_dic:
                disagreement_dic[language] += 1
            else:
                disagreement_dic[language] = 1
    print('The agreed inferences had a confidence of ' +
          str(sum(pos_conf) / len(pos_conf)) +
          '. The disagreed inferences had a confidence of ' +
          str(sum(neg_conf) / len(neg_conf)))
    return agreement_dic, disagreement_dic, guess_dic

def setup_pass_langid(model_path):
    global __identifier
    print("setting up an identifier")
    __identifier = LanguageIdentifier.from_modelpath(model_path)

def identifier(self):
    if not hasattr(LanguageAnalyzer, '_identifier'):
        LanguageAnalyzer._identifier = \
            LanguageIdentifier.from_modelstring(model, norm_probs=True)
    return LanguageAnalyzer._identifier

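# The class-level cache above avoids rebuilding the identifier on every call,
# since from_modelstring() has to load the embedded model each time. A minimal
# alternative sketch (an assumption, not from the original source) achieves
# the same with functools.lru_cache at module level:
from functools import lru_cache

from langid.langid import LanguageIdentifier, model


@lru_cache(maxsize=1)
def get_cached_identifier():
    # Built once per process; subsequent calls return the cached instance.
    return LanguageIdentifier.from_modelstring(model, norm_probs=True)
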
def detect_language(html_content):
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    identifier.set_languages(SUMY_LANGUAGES.keys())
    iso_lang, _ = identifier.classify(html_content)
    return iso_lang

def __init__(self):
    from langid.langid import LanguageIdentifier, model
    self.identifier = LanguageIdentifier.from_modelstring(model, norm_probs=False)

def __init__(self):
    self._identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)

tempWechat = WechatBasic(conf=tempConf)
confList = {'gh_ae02c9f6f14e': tempConf}

global conversationStatusList
conversationStatusList = {'WeChat': {}, 'Facebook': {}}
global topTopics
topTopics = {'WeChat': {'lang': 'en', 'topics': {}},
             'Facebook': {'lang': 'en', 'topics': {}}}
global wechatList
wechatList = {'gh_ae02c9f6f14e': tempWechat}
global accountStatusList
accountStatusList = {'WeChat': {'gh_ae02c9f6f14e': (True, False)},
                     'Facebook': {'129262697152826': (True, False)}}  # (on_help, on_kms_failure)

languageCode_en = 'en-US'
languageCode_zh = 'zh-CN'
identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)

token = 'EAAB1kFElgToBAHRJmoshPkpQzpEF2FviWyY9GdA5lUZBPwqRVb3tQdz9vlOkkLZBpp0nihxN5yyBJxDEZC3nTROBaosUYhiMWwwPcqUJiFEZA6lqQwcFHwfpWYZB8d7v5OsaZB2YDgLqRmpdNxvHy7s4pPiuPe8xK1MhFdgoRimgZDZD'
messengerTokenList = {'276165652474701': token}

if app.debug is not True:
    import logging
    from logging import Formatter
    from logging.handlers import RotatingFileHandler
    TEXT_MAX_PRINT_LENGTH = 25
    LOG_FILENAME = 'wechat_test.log'
    handler = RotatingFileHandler(LOG_FILENAME, maxBytes=10000000, backupCount=10)
    formatter = Formatter('%(asctime)s - %(levelname)s - %(message)s')