def detect_language(text):
    """
    Guess the dominant language of ``text`` and return its language code
    (see https://cloud.google.com/translate/v2/using_rest#language-params).

    Detection is delegated to ``cld2_detect`` from the `cld2-cffi
    <https://pypi.python.org/pypi/cld2-cffi>`_ package, called with
    ``bestEffort=True``. For control over the optional parameters, call
    :func:`cld2.detect()` directly.

    Args:
        text (str): Text to identify. When ``compat.is_python2`` is truthy
            it is first passed through ``compat.unicode_to_bytes``.

    Returns:
        str: Code of the top-ranked guess. A warning is logged when the
        detector flags its result as unreliable.

    Raises:
        ImportError: If the name ``cld2_detect`` is not defined.
    """
    # Touching the bare name is enough to find out whether it was imported.
    try:
        cld2_detect
    except NameError:
        raise ImportError(
            '`cld2-cffi` must be installed to use textacy\'s automatic language detection; '
            'you may do so via `pip install cld2-cffi` or `pip install textacy[lang]`.'
        )

    payload = compat.unicode_to_bytes(text) if compat.is_python2 else text
    reliable, _, guesses = cld2_detect(payload, bestEffort=True)

    if reliable is False:
        LOGGER.warning(
            'Text language detected with low confidence; best guesses: %s', guesses)

    top_guess = guesses[0]
    return top_guess[1]
def detect_language(text):
    """
    Identify the most probable language of ``text`` and return its code
    (see https://cloud.google.com/translate/v2/using_rest#language-params).

    Relies on ``cld2_detect`` from the `cld2-cffi
    <https://pypi.python.org/pypi/cld2-cffi>`_ package, invoked with
    ``bestEffort=True``. To use the optional parameters, call
    :func:`cld2.detect()` directly.

    Args:
        text (str): Text to identify. When ``is_python2`` is truthy it is
            converted with ``unicode_to_bytes`` before detection.

    Returns:
        str: Code of the first (best) guess. A warning is logged when the
        detector reports the result as unreliable.

    Raises:
        ImportError: If the name ``cld2_detect`` is not defined.
    """
    # A bare name lookup tells us whether the optional dependency was imported.
    try:
        cld2_detect
    except NameError:
        raise ImportError(
            'cld2-cffi is not installed, so language detection won\'t work; '
            'install it individually, or with textacy via `pip install textacy[lang]`'
        )

    detector_input = unicode_to_bytes(text) if is_python2 else text
    reliable, _, guesses = cld2_detect(detector_input, bestEffort=True)

    if reliable is False:
        logger.warning(
            'Text language detected with low confidence; best guesses: %s',
            guesses,
        )

    best = guesses[0]
    return best[1]
def is_good_snippet(snippet, len_range, min_text_frac, exclude_en):
    """
    Decide whether ``snippet`` passes the length, content, and language filters.

    A snippet is rejected when it is empty, falls outside ``len_range``, has
    too small a share of characters matched by ``re_text``, contains math
    markup or URLs, matches a DOI/ISSN/ISBN pattern, or (optionally) is
    reliably detected as English.

    Args:
        snippet (str): Candidate text.
        len_range (Tuple[int, int]): Accepted length as ``(min, max)``; the
            minimum is inclusive and the maximum is exclusive.
        min_text_frac (float): Minimum fraction of the snippet's characters
            that must be covered by ``re_text`` matches.
        exclude_en (bool): If ``True``, reject snippets that ``cld2_detect``
            reliably identifies as English.

    Returns:
        bool: ``True`` if the snippet passes every filter, else ``False``.
    """
    len_snippet = len(snippet)
    # an empty snippet is never good; rejecting it here also prevents a
    # ZeroDivisionError in the text-fraction check when ``len_range[0]`` is 0
    if len_snippet == 0:
        return False
    if len_snippet < len_range[0] or len_snippet >= len_range[1]:
        return False
    # make sure snippet is *mostly* text
    len_text = sum(match.end() - match.start() for match in re_text.finditer(snippet))
    if len_text / len_snippet < min_text_frac:
        return False
    # ugh, math and urls!
    if any(s in snippet for s in (r"\displaystyle", "http://", "https://")):
        return False
    # check for citations/references
    if any(re_pat.search(snippet) for re_pat in (re_doi, re_issn, re_isbn)):
        return False
    # filter out english copy-paste jobs
    if exclude_en is True:
        is_reliable, _, best_guesses = cld2_detect(
            snippet.encode("utf-8"), bestEffort=True
        )
        # only trust the detector when it says its own result is reliable
        if is_reliable is True and best_guesses[0][1] == "en":
            logging.debug(
                "found english-heavy snippet in non-english wiki text:\n%s", snippet
            )
            return False
    return True
def is_english(text):
    """
    Report whether ``text`` is detected as English.

    Runs ``cld2_detect`` with ``bestEffort=True`` and compares the language
    code of the top guess against ``"en"``. The detector's reliability flag
    is not consulted.

    Args:
        text: Text handed to ``cld2_detect`` unchanged.

    Returns:
        bool: ``True`` when the top guess's code is ``"en"``; ``False``
        otherwise, including when detection raises any exception.
    """
    try:
        _, _, best_guesses = cld2_detect(text, bestEffort=True)
        return best_guesses[0][1] == "en"
    except Exception:
        # Deliberately best-effort: any detection failure is treated as
        # "not English" instead of propagating to the caller.
        return False
def detect_language(text):
    """
    Guess the dominant language of ``text`` and return its language code
    (see https://cloud.google.com/translate/v2/using_rest#language-params).

    Detection is performed by ``cld2_detect`` from the `cld2-cffi
    <https://pypi.python.org/pypi/cld2-cffi>`_ package, called with
    ``bestEffort=True``. To use the optional parameters, call
    :func:`cld2.detect()` directly.

    Args:
        text (str): Text to identify. It is coerced with ``str()``, and
            additionally encoded as ``'utf8'`` when ``PY2`` is truthy.

    Returns:
        str: Code of the top-ranked guess. A warning line is printed when
        the detector flags its result as unreliable.
    """
    payload = str(text).encode('utf8') if PY2 else str(text)
    reliable, _, guesses = cld2_detect(payload, bestEffort=True)

    if reliable is False:
        print(
            '**WARNING: Text language detected with low confidence; best guesses: {}'.format(
                guesses
            )
        )

    top_guess = guesses[0]
    return top_guess[1]
def detect_language(text):
    """
    Identify the most probable language of ``text`` and return its code
    (see https://cloud.google.com/translate/v2/using_rest#language-params).

    Relies on ``cld2_detect`` from the `cld2-cffi
    <https://pypi.python.org/pypi/cld2-cffi>`_ package, invoked with
    ``bestEffort=True``. To use the optional parameters, call
    :func:`cld2.detect()` directly.

    Args:
        text (str): Text to identify. When ``PY2`` is truthy it is
            converted with ``unicode_to_bytes`` before detection.

    Returns:
        str: Code of the first (best) guess. A warning is logged when the
        detector reports the result as unreliable.
    """
    detector_input = unicode_to_bytes(text) if PY2 else text
    reliable, _, guesses = cld2_detect(detector_input, bestEffort=True)

    if reliable is False:
        logger.warning(
            'Text language detected with low confidence; best guesses: %s',
            guesses,
        )

    best = guesses[0]
    return best[1]