Example No. 1
 def __init__(self, data_path, data_files: list, paper_set_file=None):
     self.data_files, self.data_path = data_files, data_path
     if paper_set_file is not None:
         self.paper_set = utils.get_paper_set(paper_set_file)
     identifier = LanguageIdentifier.from_modelstring(model,
                                                      norm_probs=True)
     self.lang = lambda s: identifier.classify(str(s))
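Example No. 2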
def tag_lang(data, txt_var='text_clean'):
    """
    Tag the language of every text in data;
    return the language, score and post ID for each row.

    :param data: data frame
    :param txt_var: name of the text column
    :returns: data frame with lang, lang_score and post ID columns
    """
    lang_id_model = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    # parallel
    MAX_JOBS = 5
    pandarallel.initialize(nb_workers=MAX_JOBS)
    lang_score_vals = data.loc[:, txt_var].parallel_apply(lang_id_model.classify)
    # serial
    # TODO: why does langid wreck CPU use?
#     lang_score_vals = data.loc[:, txt_var].apply(lang_id_model.classify)
    # separate lang/score
    lang_val, lang_score = zip(*lang_score_vals)
    lang_var = 'lang'
    lang_score_var = 'lang_score'
    post_id_var = 'id'
    data = data.assign(**{
        lang_var : lang_val,
        lang_score_var : lang_score,
    })
    lang_id_data = data.loc[:, [lang_var, lang_score_var, post_id_var]]
    return lang_id_data
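A minimal call sketch for tag_lang, assuming a pandas DataFrame whose column names match the example's defaults ('text_clean' for the text, 'id' for the post ID); the sample rows are hypothetical:

import pandas as pd

# hypothetical sample data; column names follow tag_lang's defaults
posts = pd.DataFrame({
    'id': [1, 2],
    'text_clean': ['hello world, how are you today',
                   'bonjour tout le monde, comment allez-vous'],
})
lang_id_data = tag_lang(posts)  # columns: lang, lang_score, id
print(lang_id_data)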
Example No. 3
def langid_pred(inputText):
    # https://github.com/saffsd/langid.py
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    output = identifier.classify(inputText)    
    confidence = output[1]
    pred = output[0]
    return inputText, output, confidence, clean_output(pred)
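A quick usage sketch; clean_output is referenced by the example but not shown here, so its behavior is assumed:

text, output, confidence, label = langid_pred("this is an english sentence")
print(output)      # a (language, probability) pair, e.g. ('en', 0.99...)
print(confidence)  # normalized probability, since norm_probs=True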
Example No. 4
 def __init__(self):
     self.cli()
     self.infiles = self.get_files(self.indir, self.pattern)
     self.n_proceedings = 0
     self.identifier = LanguageIdentifier.from_modelstring(model,
                                                           norm_probs=True)
     self.main()
Example No. 5
def get_language(paragraphs):
    '''
    Function to detect the dominant language of a text.
    Input: list of paragraphs.
    Output: ISO 639-1 code of the most likely language ('fr', 'en' or 'es').
    '''
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    count_languages = {'fr': 0, 'en': 0, 'es': 0}
    total_count = 0
    for p in paragraphs:
        infos_language = language_detect(p, identifier)
        if infos_language[1] >= 0.7 and infos_language[0] in count_languages:
            count_languages[infos_language[0]] += float(infos_language[1])
            total_count += 1
        # paragraphs with a low-confidence detection are simply ignored
    probability_max = 0
    initials = ''
    for k in count_languages.keys():
        if total_count and count_languages[k] / total_count > probability_max:
            probability_max = count_languages[k] / total_count
            initials = k
    logger.debug(
        f'Text in {initials} with {probability_max*100}% of confidence')
    return initials
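A call sketch, assuming the language_detect helper (used above but not shown) wraps identifier.classify and a module-level logger is configured:

# hypothetical sample paragraphs
paragraphs = [
    "Ceci est un paragraphe écrit en français.",
    "Un autre paragraphe, toujours en français.",
]
print(get_language(paragraphs))  # e.g. 'fr'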
Example No. 6
def are_words_valid(clean_words: List[str],
                    english_word_count: int,
                    remove_english: bool,
                    use_langid: bool) -> bool:
    """
    Determines whether a list of words is valid based on the provided parameters.
    :param clean_words: a list of clean word strings.
    :param english_word_count: the number of english words removed from the string during cleaning.
    :param remove_english: whether or not to remove english words.
    :param use_langid: whether or not to use the langid library to determine if a word is English.
    :return: True if utterance is valid, False otherwise.
    """
    # Exclude utterance if empty after cleaning
    cleaned_transcription = " ".join(clean_words).strip()
    if cleaned_transcription == "":
        return False

    # Exclude utterance if > 10% english
    if remove_english and len(clean_words) > 0 and english_word_count / len(clean_words) > 0.1:
        # print(round(english_word_count / len(clean_words)), trans, file=sys.stderr)
        return False

    # Exclude utterance if langid thinks it's English
    if remove_english and use_langid:
        langid_identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
        lang, prob = langid_identifier.classify(cleaned_transcription)
        if lang == "en" and prob > 0.5:
            return False
    return True
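A hedged sketch of calling the validator, assuming the langid model is already imported at module level as in the example; the word list is made up:

clean_words = ["ngaju", "warrki", "hello"]
# english_word_count counts English words removed during cleaning;
# here 1 of 3 (> 10%) already fails the ratio check
print(are_words_valid(clean_words, english_word_count=1,
                      remove_english=True, use_langid=True))  # False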
Example No. 7
 def __init__(self, src_lang, tgt_lang, threshold=0.8):
     self.src_lang = src_lang
     self.tgt_lang = tgt_lang
     self.identifier = LanguageIdentifier.from_modelstring(m,
                                                           norm_probs=True)
     self.identifier.set_languages([src_lang, tgt_lang])
     self.threshold = threshold
Example No. 8
 def __init__(self):
     self.active = settings.ANALYZE_LANGUAGE
     if self.active:
         from langid.langid import LanguageIdentifier as lid, model
         self.identifier = lid.from_modelstring(model, norm_probs=True)
         langs = set(settings.LANGUAGES)
         langs = langs.intersection(self.identifier.nb_classes)
         self.identifier.set_languages(langs)
Example No. 9
def getLanguages(data):
    res = dict()
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    for filename, datum in data.items():
        lang, prob = identifier.classify(datum)
        res[filename] = pycountry.languages.get(alpha_2=lang)
    return res
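Example No. 10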
def is_chinese2(content):
    if not content:
        return False
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    language, score = identifier.classify(content)
    return language == 'zh' and score > 0.7
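A quick check sketch; results depend on langid's confidence for the given string:

print(is_chinese2('这是一个用中文写的句子'))  # True when langid is confident (score > 0.7)
print(is_chinese2('plain english text'))      # False
print(is_chinese2(''))                        # False (empty input)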
Example No. 11
def transform_df(df):
    '''Transform one dataframe partition: clean text and map star ratings to sentiment.'''
    print("transform_df called for partition...")
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)  #takes 2 seconds
    out = pd.DataFrame(columns=["text","sentiment"])
    preprocessor = Preprocessor()
    out["text"] = df["text"].apply(func=preprocessor.clean_text,args=(identifier,))
    out["sentiment"] = df["stars"].map(preprocessor.map_rating)
    return out
Example No. 12
def load_langid_model(model_path: Optional[str],
                      lang_set: Sequence[str]) -> LanguageIdentifier:
    """
    Loads the provided langid.py model. If none provided, then it loads the default model.
    :param model_path: path to model to load
    :param lang_set: language set to which the model should be restricted. Provide empty list for
        no restrictions.
    :return: language identifier
    """
    if model_path is None:
        from langid import langid
        langider = LanguageIdentifier.from_modelstring(langid.model,
                                                       norm_probs=True)
    else:
        langider = LanguageIdentifier.from_modelpath(model_path,
                                                     norm_probs=True)
    if len(lang_set) > 0:
        langider.set_languages(langs=lang_set)
    return langider
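A minimal usage sketch; passing model_path=None falls back to the bundled langid model, and an empty lang_set leaves the identifier unrestricted:

langider = load_langid_model(model_path=None, lang_set=["en", "de", "fr"])
print(langider.classify("guten morgen"))  # e.g. ('de', 0.99...)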
Example No. 13
 def __init__(self, src_lang=None, tgt_lang=None, src_threshold=0, tgt_threshold=0, **kwargs):
     if not (isinstance(src_lang, str) and isinstance(tgt_lang, str)):
         logging.error("Both source and target languages need to be defined")
         raise ValueError("Strings expected, got: %s %s" % (src_lang, tgt_lang))
     self.src_lang = src_lang
     self.tgt_lang = tgt_lang
     self.src_threshold = src_threshold
     self.tgt_threshold = tgt_threshold
     self.identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
     super().__init__(**kwargs)
Example No. 14
    def Languages(base_path, photo):
        photo_folder = os.path.join(base_path, photo)
        num_iterations = len([
            fol for fol in os.listdir(photo_folder)
            if os.path.isdir(os.path.join(photo_folder, fol))
            and "source" not in fol
        ])
        start_iter = 1
        range_iter = [str(i) for i in list(range(1, num_iterations + 1))]
        folder_base = os.path.join(base_path, photo, photo)

        print('INFO: Scraping Languages for photo {}'.format(photo))

        identifier = LanguageIdentifier.from_modelstring(model,
                                                         norm_probs=True)

        language_dict = {}
        for iteration in range_iter:

            if not os.path.isdir(
                    os.path.join(folder_base + "_" + str(iteration), "txt")):
                print('INFO: No texts in this iteration, skipping..')
                continue

            language_dict.update({str(iteration): {}})

            list_json = [
                js for js in os.listdir(
                    os.path.join(base_path, photo, photo + "_" +
                                 str(iteration), "txt")) if ".json" in js
            ]

            for js in list_json:
                with open(
                        os.path.join(base_path, photo,
                                     photo + "_" + str(iteration), "txt",
                                     js)) as f:
                    json_content = json.load(f)

                for id_, text in json_content.items():

                    language_score = identifier.classify(str(text))
                    print(language_score)
                    language_dict[str(iteration)].update(
                        {id_: [language_score[0], language_score[1]]})

        # Write Detected Languages to language.json
        print("INFO: Writing detected languages to {}".format(
            os.path.join(photo, 'languages.json')))
        with open(
                os.path.join(base_path, photo,
                             'languages-{}.json'.format(photo)), 'w') as fp:
            json.dump(language_dict, fp)
Example No. 15
 def identifier(self):
     cls = type(self)
     if not hasattr(cls, '_id'):
         # https://github.com/saffsd/langid.py
         from langid.langid import LanguageIdentifier as lid, model
         cls._id = lid.from_modelstring(model, norm_probs=True)
         if len(settings.LANGUAGES):
             langs = set(settings.LANGUAGES)
             langs = langs.intersection(cls._id.nb_classes)
             cls._id.set_languages(langs)
     return cls._id
Example No. 16
def convert_to_metafeatures(training_data):
    result = []
    lang_identifier = LanguageIdentifier.from_modelstring(model)
    for i, chunk in enumerate(training_data):
        langid_features = np.array([lang_identifier.instance2fv(tweet) for tweet in chunk])
        num_occurrences = langid_features.sum(axis=0)
        averages = [occurrences/len(chunk) for occurrences in num_occurrences]
        sparseness = (langid_features > 0).sum(axis=0)
        result.append(np.array(averages + sparseness))
        if i % 50 == 0: print('Converted {} inputs to meta-features...'.format(i))
    result = np.array(result)
    return result
Example No. 17
 def run_main(self):
     if not self.FETCH:
         print("Use option --fetch to fetch new twitter streams. "
               "Currently using tweet streams from folder tweet_archive/")
     self.solution_file = open(self.SOLUTION_FILE, "w+")
     self.identifier = LanguageIdentifier.from_modelstring(model,
                                                           norm_probs=True)
     if self.FETCH:
         print("Fetching new Twitter stream.")
         self.fetch_stream_to_file(15000, self.GENERAL_TWEETS_FILE)
     self.process_tweets_in_file(self.GENERAL_TWEETS_FILE)
     self.local_tweet_analysis()
     self.additional_analysis()
Example No. 18
    def set_languages(self, langs):
        try:
            from langid.langid import LanguageIdentifier, model
        except ImportError:
            print('Please install the langid package')
            raise

        self.langid = LanguageIdentifier.from_modelstring(
            model, norm_probs=True)
        try:
            self.langid.set_languages(langs)
        except ValueError:
            self.langid.set_languages(['en'])
Example No. 19
 def extract_languages(self):
     from langid.langid import LanguageIdentifier, model
     identifier = LanguageIdentifier.from_modelstring(model,
                                                      norm_probs=True)
     count = 0
     for review in self.reviews:
         result = identifier.classify(review[1])
         count = count + 1
         print("#%d: lang & accuracy: %s, content: %s" %
               (count, result, review[1]))
         if self.save_to_db:
             self.parser.add_new_lang_feature(review[0], result[0],
                                              result[1])
Example No. 20
    def _get_p_language(self):
        """
        Returning and setting the language of the paste (guessing)

        :Example: PST._get_p_language()

        ..note:: The language returned is purely guessing and may not be accurate
        if the paste doesn't contain any human dictionnary words
        ..seealso: [email protected]:saffsd/langid.py.git

        """
        identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
        return identifier.classify(self.get_p_content())
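Example No. 22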
 def __init__(
     self,
     language_scope: List = supported_languages_dict.keys(),
     minimum_score: float = 0.0,
     fallback_language: AnyStr = "",
 ):
     self.language_scope = language_scope
     self.minimum_score = float(minimum_score)
     self.fallback_language = fallback_language
     self.column_description_dict = self.COLUMN_DESCRIPTION_DICT  # may be changed by detect_languages_df
     self._langid_identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
     self._langid_identifier.set_languages(
         [l for l in self.language_scope if l not in SUPPORTED_LANGUAGES_IN_CLD3_NOT_IN_LANGID]
     )
Example No. 23
 def __init__(
     self,
     language_scope: List = SUPPORTED_LANGUAGES_PYCLD3.keys(),
     minimum_score: float = 0.0,
     fallback_language: AnyStr = "",
 ):
     store_attr()
     self.column_descriptions = self.COLUMN_DESCRIPTIONS.copy(
     )  # may be changed by detect_languages_df
     self._langid_identifier = LanguageIdentifier.from_modelstring(
         model, norm_probs=True)
     self._langid_identifier.set_languages([
         l for l in self.language_scope
         if l not in SUPPORTED_LANGUAGES_PYCLD3_NOT_LANGID
     ])
Example No. 24
def removeUnicodeAndLangId(data):
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    copy = {}
    for d in data:
        listTweet = []
        for tweet in data[d]:
            tweet = tweet.encode('ascii', 'ignore').decode("utf-8")
            lang = identifier.classify(tweet)[0]
            if lang == "en":
                listTweet.append(tweet)

        if len(listTweet) != 0:
            copy[d] = listTweet
    return copy
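Example No. 25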
def is_defective_pp(clean_pp):
    """
    Checks if a privacy policy (pp) is defective; eliminates JavaScript and
    non-English policies using NLP language detection.
    :param clean_pp: cleaned privacy policy text
    :return: True if defective, otherwise False
    """
    low_text = clean_pp.lower()
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    language, score = identifier.classify(clean_pp)
    return (language != "en" or score < 0.9 or
            'privacy' not in low_text or 'class=' in low_text or
            'function(' in low_text or 'function (' in low_text or
            'catch(' in low_text or 'exception(' in low_text or
            '{' in low_text)
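A hedged call sketch with a made-up snippet; a short English policy without JavaScript markers should not be flagged, though the langid score on very short texts can vary:

pp_text = ("Privacy Policy. We collect and process your personal data "
           "in accordance with this policy.")
print(is_defective_pp(pp_text))  # expected False for confident English text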
Example No. 26
def set_english(df, text_column):
    '''
    set_english: takes a DataFrame and the name of a text column,
    predicts the language of each row's text,
    and returns the predictions as a list,
    which can be used to mask a pandas DataFrame.
    Parameters
    ----------
    df: pandas DataFrame
    text_column: name of the text column
    Returns
    -------
    lst: Python list with the predicted language of each row
    '''
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    lst = []
    for i in df[text_column].values:
        lang, score = identifier.classify(i)
        lst.append(lang)
    return lst
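A sketch of masking a DataFrame with the returned list, assuming pandas is available; the rows are hypothetical:

import pandas as pd

df = pd.DataFrame({'text': ['good morning everyone',
                            'buenos días a todos']})
df['lang'] = set_english(df, 'text')
english_only = df[df['lang'] == 'en']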
Example No. 27
def detectar_lenguaje(texto, devolver_proba=False):
    """
    Identifies the language in which the input text is written.

    :param texto: Input text.
    :type texto: str
    :param devolver_proba: Whether to also return the confidence score of \
        the identified language. Defaults to `False`.
    :type devolver_proba: bool, optional
    :return: (str) Code of the identified language following the \
        `ISO 639-1 <https://es.wikipedia.org/wiki/ISO_639-1>`_ standard. \
        If `devolver_proba = True`, returns a (language, score) tuple.
    """
    identificador = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    if devolver_proba:
        return identificador.classify(texto)
    else:
        return identificador.classify(texto)[0]
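Usage sketch (the function and parameter names are Spanish, as in the original project):

print(detectar_lenguaje('este texto está escrito en español'))        # 'es'
print(detectar_lenguaje('este texto está escrito en español', True))  # ('es', 0.99...)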
Example No. 28
def gatherHashtag(data):
    data = json.load(data)
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    usrDict = {}
    numbHashtag = {}
    numHashtag = 0
    for key in data:
        try:
            usrDict[key['id']] = []
            tmp = api.GetUserTimeline(user_id=key['id'], count=200)
            for t in tmp:
                hashtags = t.hashtags
                numHashtag += len(hashtags)
                for h in hashtags:
                    hashtag = h.text
                    hashtag = hashtag.encode('ascii', 'ignore').decode("utf-8")
                    lang = identifier.classify(hashtag)[0]
                    if lang == "en":
                        usrDict[key['id']].append(hashtag)

            meanHashTag = numHashtag / 200
            numHashtag = 0
            numbHashtag[key['id']] = meanHashTag
        except Exception as e:
            print(e)
            with open('logs/log.txt', 'a') as log:
                log.write(str(e))
                log.write(str(key['id']))
                log.write("\n")

    with open('data/twitter_Hashtag.json', 'w+') as tweetsFile:
        json.dump(usrDict, tweetsFile)
    with open('data/numb_Hashtag.json', 'w+') as tweetsFile:
        json.dump(numbHashtag, tweetsFile)
Example No. 29
def get_country_by_language(text):
    """Get the country which speak the language of the given text.

    Args:
        text: the text.

    Returns:
        A list of country
    """
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    lang = identifier.classify(text)[0]

    countries_matching = []
    with open('./data/country.csv', newline='') as csvfile:
        countries = csv.reader(csvfile, delimiter=';', quotechar='|')
        for country in countries:
            if lang in country[51].replace(' ', '').split(','):
                countries_matching.append(country[4].strip())

    logger.info(u'Matched text with countries: {0}'.format(countries_matching))
    return countries_matching
Example No. 30
    def detect(self, text):

        if not LanguageDetector.identifier:
            LanguageDetector.identifier = LanguageIdentifier.from_modelstring(
                model, norm_probs=True)

        language_abbrs = [
            lang
            for lang, prob in LanguageDetector.identifier.rank(text)
            if prob > self.DETECT_THRESHOLD_PROB
        ][:self.DETECT_THRESHOLD_LEN]

        if not language_abbrs:
            raise self.BrokenRequest(
                'UNSUPPORTED_LANGUAGE_DETECTED',
                data={'text': text},
                is_critical=True)

        return [
            self.languages_index[abbr]
            for abbr in language_abbrs
            if abbr in self.languages_index]
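Example No. 31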
def clean_json_data(json_data: List[Dict[str, str]],
                    remove_english: bool = False,
                    use_langid: bool = False) -> List[Dict[str, str]]:
    """
    Clean a list of utterances (Python dictionaries) based on the given parameters.
    :param json_data: list of Python dictionaries, each must have a 'transcription' key-value.
    :param remove_english: whether or not to remove English from the utterances.
    :param use_langid: whether or not to use the langid library to identify English to remove.
    :return: cleaned list of utterances (list of dictionaries).
    """
    punctuation_to_remove = string.punctuation + "…’“–”‘°"
    special_cases = ["<silence>"]  # Any words you want to ignore
    langid_identifier = None

    if remove_english:
        english_words = get_english_words()  # pre-load English corpus
        if use_langid:
            langid_identifier = LanguageIdentifier.from_modelstring(
                model, norm_probs=True)
    else:
        english_words = set()

    cleaned_data = []
    for utterance in json_data:
        clean_words, english_word_count = clean_utterance(
            utterance=utterance,
            remove_english=remove_english,
            english_words=english_words,
            punctuation=punctuation_to_remove,
            special_cases=special_cases)

        if is_valid_utterance(clean_words, english_word_count, remove_english,
                              use_langid, langid_identifier):
            cleaned_transcript = " ".join(clean_words).strip()
            utterance["transcript"] = cleaned_transcript
            cleaned_data.append(utterance)

    return cleaned_data
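A minimal sketch, assuming clean_utterance, is_valid_utterance and get_english_words are defined in the surrounding project as referenced above; the utterance is hypothetical:

utterances = [{"transcription": "ngaju warrki <silence>"}]
cleaned = clean_json_data(utterances, remove_english=True, use_langid=True)
for utterance in cleaned:
    print(utterance["transcript"])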
Example No. 32
def compare_language_distribution(tweet_list):
    agreement_dic = {}
    disagreement_dic = {}
    guess_dic = {}
    pos_conf = []
    neg_conf = []
    clf = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    for tweet in tweet_list:
        language = tweet['lang']
        if language == 'und':
            continue
        text = str(tweet['text'])
        #print(text)
        text = " ".join(map(str, tw.tokenizeRawTweetText(text)))
        #print(text)
        guess = clf.classify(text)
        if guess[0] in guess_dic:
            guess_dic[guess[0]] += 1
        else:
            guess_dic[guess[0]] = 1

        if guess[0] == language:
            pos_conf.append(guess[1])
            if language in agreement_dic:
                agreement_dic[language] += 1
            else:
                agreement_dic[language] = 1
        else:
            neg_conf.append(guess[1])
            if language in disagreement_dic:
                disagreement_dic[language] += 1
            else:
                disagreement_dic[language] = 1
    print('The agreed inferences had a confidence of ' +
          str(sum(pos_conf) / len(pos_conf)) +
          '. The disagreed inferences had a confidence of ' +
          str(sum(neg_conf) / len(neg_conf)))
    return agreement_dic, disagreement_dic, guess_dic
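Example No. 33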
def setup_pass_langid(model_path):
    global __identifier
    print("setting up an identifier")
    __identifier = LanguageIdentifier.from_modelpath(model_path)
Example No. 34
 def identifier(self):
     if not hasattr(LanguageAnalyzer, '_identifier'):
         LanguageAnalyzer._identifier = \
             LanguageIdentifier.from_modelstring(model, norm_probs=True)
     return LanguageAnalyzer._identifier
Example No. 35
def detect_language(html_content):
    identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    identifier.set_languages(SUMY_LANGUAGES.keys())
    iso_lang, _ = identifier.classify(html_content)
    return iso_lang
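A call sketch, assuming SUMY_LANGUAGES is keyed by ISO 639-1 codes, as the set_languages call above implies:

html = "<p>Dies ist ein kurzer Absatz auf Deutsch.</p>"
print(detect_language(html))  # e.g. 'de'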
Example No. 36
 def __init__(self):
     from langid.langid import LanguageIdentifier, model
     self.identifier = LanguageIdentifier.from_modelstring(model, norm_probs=False)
Example No. 37
 def __init__(self):
     self._identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
tempWechat = WechatBasic(conf=tempConf)
confList = {'gh_ae02c9f6f14e': tempConf}

global conversationStatusList
conversationStatusList = {'WeChat': {}, 'Facebook': {}}
global topTopics
topTopics = {'WeChat': {'lang': 'en', 'topics': {}}, 'Facebook': {'lang': 'en', 'topics': {}}}
global wechatList
wechatList = {'gh_ae02c9f6f14e': tempWechat}
global accountStatusList
accountStatusList = {'WeChat': {'gh_ae02c9f6f14e': (True, False)},
                     'Facebook': {'129262697152826': (True, False)}}  # (on_help, on_kms_failure)

languageCode_en = 'en-US'
languageCode_zh = 'zh-CN'
identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)

token = 'EAAB1kFElgToBAHRJmoshPkpQzpEF2FviWyY9GdA5lUZBPwqRVb3tQdz9vlOkkLZBpp0nihxN5yyBJxDEZC3nTROBaosUYhiMWwwPcqUJiFEZA6lqQwcFHwfpWYZB8d7v5OsaZB2YDgLqRmpdNxvHy7s4pPiuPe8xK1MhFdgoRimgZDZD'
messengerTokenList = {'276165652474701': token}

if app.debug is not True:
    import logging
    from logging import Formatter
    from logging.handlers import RotatingFileHandler

    TEXT_MAX_PRINT_LENGTH = 25
    LOG_FILENAME = 'wechat_test.log'

    handler = RotatingFileHandler(LOG_FILENAME, maxBytes=10000000, backupCount=10)

    formatter = Formatter('%(asctime)s - %(levelname)s - %(message)s')