Exemplo n.º 1
0
 async def detect_language(
         self, text: str) -> Optional[Union[Language, List[Language]]]:
     """Work out which language (or languages) *text* refers to.

     Two strategies are tried in order, sharing one HTTP session:

     1. Every entity yielded by ``self._detect_entities`` is inspected.
        A ``language``/``country``/``country_flag`` entity whose value
        resolves through ``iso639.find`` is returned immediately.
        Otherwise a ``country``/``country_flag`` value is looked up in
        ``available_langs`` and the resolvable languages mapped to it are
        returned as a list (which can be empty).
     2. If no entity produced a result, ``self.tr.detect_language`` is
        asked for the language of the text itself.

     Returns the ``iso639.find`` result, a list of such results, or
     ``None`` when nothing usable was found.
     """
     def _resolved(candidate):
         # A failed lookup or the "Undetermined" placeholder both count as
         # "no language" (the original notes these as NLU false positives).
         return candidate and candidate['name'] != "Undetermined"

     async with ClientSession() as http:
         # 1) Entities recognised by the NLU model
         async for found in self._detect_entities(text, http):
             kind = found.entity
             if kind in ('language', 'country', 'country_flag'):
                 direct_hit = iso639.find(found.value)
                 if _resolved(direct_hit):
                     logging.info(
                         f"NLU model detected language: ({text})[{direct_hit['name']}]"
                     )
                     return direct_hit
             if kind in ('country', 'country_flag'):
                 # The entity value is a country name; it only helps when
                 # our mapping knows which languages belong to it.
                 mapped_codes = available_langs.get(found.value)
                 if mapped_codes:
                     logging.info(
                         f"NLU model detected country: ({text})[{kind}]"
                     )
                     lookups = [iso639.find(code) for code in mapped_codes]
                     return [hit for hit in lookups if _resolved(hit)]

         # 2) No entity matched: detect the language the text is written in
         language = await self.tr.detect_language(text, http)
         logging.info(
             f"Translator detected language: ({text})[{language}]")
         spoken = iso639.find(language)
         if _resolved(spoken):
             return spoken
     return None
Exemplo n.º 2
0
def bib2std(code):
    """
    Map an ISO 639-2 three letter code (bibliographic variant) to the code
    used for comparison with the language detectors' output.

    The code is looked up with ``iso639.find(iso639_2=code)``:

    * no entry found -> the original ``code`` is returned untranslated.
      This may be a discontinued code like scc for Serbian (instead of the
      now standard srp), which does not appear to be included in the package.
    * entry has an ``iso639_1`` key -> that value is returned.
    * otherwise -> the entry's ``iso639_2_t`` value is returned.
    """
    record = iso639.find(iso639_2=code)
    if not record:
        return code
    if u'iso639_1' in record:
        return record[u'iso639_1']
    return record[u'iso639_2_t']
Exemplo n.º 3
0
    async def entry(self, context: Context, user: User, db):
        """Open the language question for *user* and send the first prompt.

        ``user['context']['bq_state']`` is set to 1 in every case.

        When no ``language_state`` is stored yet and the request carries a
        client-side ``lang_code`` that ``iso639.find`` resolves, the user's
        language is set from it and a confirmation message with yes/no/stop
        buttons is sent (``language_state`` becomes 4).

        Otherwise the generic "choose language" prompt is sent, with a
        "skip" button when a ``lang_code`` is present (``language_state``
        becomes 1).

        ``db`` is not used here. Returns ``base_state.OK``.
        """
        user['context']['bq_state'] = 1

        # Special case for telegram-like client side language code entity
        if user['context'].get('language_state') is None:
            client_code = context['request']['user']['lang_code']
            if client_code:
                lang = iso639.find(client_code)
                if lang and lang['name'] != "Undetermined":
                    # Update current context
                    iso_code = lang['iso639_1']
                    user['language'] = iso_code
                    self.set_language(iso_code)

                    # Ask if user wants to continue with the language
                    confirm_text = self.strings["app_confirm_language"].format(lang['native'])
                    request = context['request']
                    request['message']['text'] = confirm_text
                    request['has_buttons'] = True
                    request['buttons_type'] = "text"
                    request['buttons'] = [
                        {"text": self.strings[label]}
                        for label in ('yes', 'no', 'stop')
                    ]

                    user['context']['language_state'] = 4
                    self.send(user, context)
                    return base_state.OK

        # Send language message
        prompt = self.strings["choose_lang"]
        request = context['request']
        request['message']['text'] = prompt

        # Add confirmation button to skip
        skip_code = request['user']['lang_code']
        if skip_code:
            user['context']['lang_code'] = skip_code
            request['has_buttons'] = True
            request['buttons_type'] = "text"
            request['buttons'] = [
                {"text": self.strings['skip_lang'].format(skip_code)}
            ]

        # Set user context as 'Was Asked Language Question'
        user['context']['language_state'] = 1
        # Don't forget to add task
        self.send(user, context)
        return base_state.OK
Exemplo n.º 4
0
def turn_into_dictionary(input_data: List[str]) -> dict:
    """
    Transform a list with fic data into a dictionary.

    Each item is classified as follows:

    * ``"Key: value"`` (any item containing ``":"``) -> ``{Key: value}``.
      A value made only of digits with optional comma separators
      (e.g. ``"1,234"``) is stored as an ``int``.
    * ``"OC"`` -> ``{"Characters": "OC"}``.
    * ``"Complete"`` -> ``{"Status": "Complete"}``.
    * a language name known to ``iso639`` -> ``{"Language": <name>}``.
    * an item with at least one ``"/"``-separated part in ``GENRES``
      -> ``{"Genres": [parts]}``.
    * anything else -> ``{"Characters": [comma-separated names]}``.

    Later items overwrite earlier ones that map to the same key.

    :param input_data: list of metadata strings.
    :return: dictionary built from the items.
    :raises TypeError: if ``input_data`` is not a list.
    """
    if not isinstance(input_data, list):
        raise TypeError(f"'{type(input_data)}' cannot be used here")
    result_dictionary = {}
    for data in input_data:
        if ":" in data:
            # Split on the first separator only, so a value that itself
            # contains ": " is kept whole. Fall back to a bare ":" so that
            # "Key:value" or "Key:" no longer raises IndexError.
            if ": " in data:
                key, raw_value = data.split(": ", 1)
            else:
                key, _, raw_value = data.partition(":")
            key = key.strip()
            raw_value = raw_value.strip()
            if match(r"^\d+(,\d+)*$", raw_value):
                val = int(raw_value.replace(",", ""))
            else:
                val = raw_value
        elif data == "OC":
            key, val = "Characters", data
        elif data == "Complete":
            key, val = "Status", data
        else:
            lang = iso639.find(language=data)
            if lang:
                key, val = "Language", lang["name"]
            else:
                parts = data.split("/")
                if any(part in GENRES for part in parts):
                    key, val = "Genres", parts
                else:
                    key = "Characters"
                    val = [x.strip() for x in data.split(",")]
        result_dictionary[key] = val

    return result_dictionary
 if key in keep:
     # Emit one delimited output row for a title we were asked to keep,
     # translating the region/language codes into display names as we go.
     country_code = line['region']
     language_code = line['language']
     original = line['isOriginalTitle']

     # '\\N' is presumably the dataset's "no value" marker -- TODO confirm.
     if country_code == '\\N':
         country_code = ''
     else:
         try:
             country[country_code] = iso3166.countries.get(country_code).name
         except Exception:
             # Any lookup failure gets a placeholder name
             # (tgg22: I was not able to fix these).
             country[country_code] = "Not an ISO-3166 country?"

     if language_code == '\\N':
         language_code = ''
     else:
         try:
             language[language_code] = iso639.find(language_code)['name']
         except Exception:
             # Not an ISO-639 language: use the hand-maintained table.
             # A code missing from fix_iso639 as well still raises KeyError.
             language[language_code] = fix_iso639[language_code]

     # Normalise the original-title flag: "0" -> "false", anything else -> "true".
     if original == '\\N':
         original = ''
     elif original == "0":
         original = "false"
     else:
         original = "true"

     # Per-key running index: 0 for the first row seen, then 1, 2, ...
     rowkey[key] = rowkey.get(key, -1) + 1

     print(bar.join([key, str(rowkey[key]), line['title'], country_code, language_code, original]), file=has_alt_out)
         
Exemplo n.º 6
0
 def is_valid_language(item: str) -> bool:
     """Return True when ``iso639.find`` finds an entry for *item*.

     The lookup uses the ``whatever`` keyword of ``iso639.find``.
     """
     found = iso639.find(whatever=item)
     return found is not None
Exemplo n.º 7
0
for entry in data['entries']:
    # Read the article
    article = Article(entry.link)
    article.download()
    article.parse()
    document = article.text
    words = word_tokenize(document)

    print "reading article", article.title
    # Remove punctuation
    words = [word.lower() for word in words if word.isalpha()]

    # Get language
    try:
        language = iso639.find(detect(entry.title))['name'].lower()
        stemmer = SnowballStemmer(language)
    except Exception, e:
        print str(e)
        stemmer = SnowballStemmer("english")
        print "stem language is set to english"

    # Stem text
    for word in words:
        word = stemmer.stem(word)

    document = ' '.join(words)

    for keyword in keywords:
        relevance = Document.get_relevance(document, stemmer.stem(keyword))
        if relevance > 0: