Example #1
def get_language(entry):
    """ hacky language detection for domains. Downloads the index page, and runs
    cld2 on the html """
    index_url = entry.url.replace("robots.txt", "")

    # hack around some issues here,
    if entry.domain in KNOWN_LANGUAGES:
        language = KNOWN_LANGUAGES.get(entry.domain)

    else:
        try:
            page = requests.get(index_url)
            try:
                languages = cld2.detect(page.content, isPlainText=False,
                                        hintTopLevelDomain=entry.domain.split('.')[-1])
            except Exception:
                languages = cld2.detect(page.text.encode("utf8"), isPlainText=False,
                                        hintTopLevelDomain=entry.domain.split('.')[-1])

            # ignoring the 'is_reliable' flag here, set on baidu.com etc.
            # (even though the language is detected appropriately)
            language = languages.details[0].language_name if languages.details else 'Unknown'
            index_url = page.url

        except Exception:
            log.exception("Failed to analyze language for '%s'", entry.domain)
            language = 'Failed'

    language = language.title()
    # traditional chinese -> chinese
    if language == 'Chineset':
        language = 'Chinese'
    return language, not urlparse(index_url).netloc.endswith(entry.domain)
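A minimal standalone sketch of the same fetch-and-detect pattern, assuming the cld2 binding used above; the URL and helper name are illustrative, not from the original:

import requests
import cld2

def sketch_domain_language(url="https://example.com"):
    # fetch the index page and let cld2 inspect the raw HTML,
    # hinting with the top-level domain as the example above does
    page = requests.get(url, timeout=10)
    languages = cld2.detect(page.content, isPlainText=False,
                            hintTopLevelDomain="com")
    return languages.details[0].language_name if languages.details else 'Unknown'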
Example #2
def do_resegment(args):
    docid, text_pruned = args
    lang = cld2.detect(text_pruned, want_chunks=True)
    segmented = [ { "l": c[0].code,
                    "t": list(word_seg.segment(c[0].code, c[1])) }
                  for c in lang.chunks ]
    return (docid, json.dumps(segmented))
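Under the chunked binding used here, the serialized result is a list of per-language chunks; a hedged illustration of the shape, with invented values:

segmented_example = [
    {"l": "en", "t": ["hello", "world"]},    # language code + segmented tokens
    {"l": "fr", "t": ["bonjour", "monde"]},  # one entry per detected chunk
]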
Example #3
def parse(self, response):
    now = datetime.utcnow().replace(microsecond=0).isoformat()
    url = response.url
    datetimeToday = now + 'Z'
    # prefer the rich-text container; fall back to the page's #main element
    contentElement = response.css('.bpa-richtext').get()
    if contentElement is None:
        contentElement = response.css('#main').get()
    converter = html2text.HTML2Text()
    converter.ignore_links = True
    updatedDate = response.css('.bpa-time > time::attr(datetime)').get()
    title = response.css('title::text').get().split(' | ')[-1]
    text = converter.handle(contentElement)
    isReliable, textBytesFound, details = cld2.detect(text)
    language = details[0].language_name
    yield {
        'title': title,
        'source': 'Presse- und Informationsamt der Bundesregierung',
        'published': updatedDate,
        'url': url,
        'scraped': datetimeToday,
        'classes': ['Government'],
        'country': 'Germany',
        'municipality': 'National',
        'language': language,
        'text': text
    }
Example #4
def parse(self, response):
    now = datetime.utcnow().replace(microsecond=0).isoformat()
    url = response.url
    datetimeToday = now + 'Z'
    dateElement = response.css('time::text').get()
    # collapse the tabs, newlines and runs of spaces around the date text
    dateElementText = ' '.join(dateElement.split())
    dateElementArray = dateElementText.split(',')
    updatedDateISO = dateparser.parse(dateElementArray[0],
                                      languages=['en']).date()
    updatedDateTime = str(updatedDateISO)
    title = response.css('h1::text').get()
    contentArray = response.css('p::text').extract()
    converter = html2text.HTML2Text()
    converter.ignore_links = True
    # convert each extracted paragraph once, then concatenate
    text = ''.join(converter.handle(part) for part in contentArray)
    isReliable, textBytesFound, details = cld2.detect(text)
    textMinusUnnecessaryChars = text.replace('\\', '')
    language = details[0].language_name
    yield {
        'title': title,
        'source': 'Georgia State Government',
        'published': updatedDateTime,
        'url': url,
        'scraped': datetimeToday,
        'classes': ['Government'],
        'country': 'United States of America',
        'municipality': 'Georgia',
        'language': language,
        'text': textMinusUnnecessaryChars
    }
Example #5
def main():
    for fn in sys.argv[1:]:
        with open(fn) as f:
            guesses = cld2.detect(f.read())
            sys.stdout.write("{}: {}\n"
                             .format(fn, " / ".join(dumplang(g)
                                                    for g in guesses)))
Example #6
def detect_language(text):
    # details is 3x (langName, langCode, percent, score)
    lang_is_reliable, _, lang_details = cld2.detect(text)
    lang_details = lang_details[0]  # take only the first lang detected
    lang_name, lang_code, lang_percent, lang_score = lang_details

    return lang_name, lang_code, lang_score, lang_is_reliable
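For instance, on unambiguous English input the helper above unpacks as follows (values are illustrative; score and reliability vary with text length):

name, code, score, reliable = detect_language("This is an English sentence.")
# name == 'ENGLISH', code == 'en' on typical pycld2-style bindings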
Example #7
def do_content_extraction(args):
    docid, page, baseurl = args
    try:
        page = zlib.decompress(page)
    except zlib.error:
        page = ''
    extr = html_extractor.ExtractedContent(baseurl, page)
    lang = cld2.detect(extr.text_pruned, want_chunks=True)

    segmented = [ { "l": c[0].code,
                    "t": list(word_seg.segment(c[0].code, c[1])) }
                  for c in lang.chunks ]

    pagelen = len(page)
    content = extr.text_content.encode("utf-8")
    chash   = hashlib.sha256(content).digest()
    pruned  = extr.text_pruned.encode("utf-8")
    phash   = hashlib.sha256(pruned).digest()
    segmtd  = json.dumps(segmented).encode("utf-8")
    heads   = json.dumps(extr.headings).encode("utf-8")
    hhash   = hashlib.sha256(heads).digest()
    links   = json.dumps(extr.links).encode("utf-8")
    lhash   = hashlib.sha256(links).digest()
    rsrcs   = json.dumps(extr.resources).encode("utf-8")
    rhash   = hashlib.sha256(rsrcs).digest()
    domst   = json.dumps(extr.dom_stats.to_json()).encode("utf-8")
    dhash   = hashlib.sha256(domst).digest()

    return (docid, pagelen,
            chash, content,
            phash, pruned, segmtd,
            hhash, heads,
            lhash, links,
            rhash, rsrcs,
            dhash, domst)
Example #8
def do_content_extraction(args):
    page, url, locale, sources, access_time, result, detail, ourl, rurl = args
    page = zlib.decompress(page)
    pagelen = len(page)
    pagehash = hashlib.sha256(page).digest()
    extr = html_extractor.ExtractedContent(url, page)
    langs = cld2.detect(extr.text_pruned)
    return (zlib.compress(extr.text_pruned.encode("utf-8")),
            zlib.compress(extr.text_content.encode("utf-8")),
            zlib.compress(json.dumps(extr.headings).encode("utf-8")),
            zlib.compress(json.dumps(extr.links).encode("utf-8")),
            zlib.compress(json.dumps(extr.resources).encode("utf-8")),
            zlib.compress(json.dumps(extr.dom_stats.to_json())
                          .encode("utf-8")),
            langs[0].code,
            langs[0].percent,
            locale,
            sources,
            access_time,
            result,
            detail,
            ourl,
            rurl,
            pagelen,
            pagehash)
Example #9
def check_message(message_content, pattern, set_lang="en"):
    lang_details = cld2.detect(message_content)
    # detect() returns (is_reliable, bytes_found, details); take the top language code
    lang = lang_details[2][0][1]
    if lang == set_lang or lang == "un":
        if re.search(pattern, message_content):
            return lang
        else:
            return None
Example #10
def detect_language(text, proba_threshold):
    _, _, details = cld2.detect(text)

    language_code = details[0].language_code
    probability = details[0].percent

    if language_code != 'un' and probability > proba_threshold:
        return language_code
Example #11
def do_segmentation(args):
    id, text = args
    lang = cld2.detect(text, want_chunks=True)
    segmented = [{
        "l": c[0].code,
        "t": list(word_seg.segment(c[0].code, c[1]))
    } for c in lang.chunks]
    return id, quote_utf8_as_text(json.dumps(segmented).encode("utf-8"))
Example #12
def is_english(document):
    '''checks if document is in English
    '''

    import cld2

    reliable, _, details = cld2.detect(document, bestEffort=True)
    return reliable and details[0][0] == 'ENGLISH' and details[0][2] >= MINENGPER
Example #13
def detect_language(text):
    """Detect language using CLD2 library."""
    try:  # todo: figure out what's causing an occasional error
        _, _, details = cld2.detect(text)
        lang = details[0].language_code
    except ValueError:
        lang = 'un'
    return lang if lang != 'un' else None
Example #14
def cld2_(text, label):
    try:
        result = cld2.detect(text.strip())
        if result[2][0].language_code == label:
            return True
    except Exception as e:
        print(e)
    return False
Example #15
def do_redetect(args):
    id, text = args
    try:
        text = zlib.decompress(text)
    except zlib.error:
        text = ''
    langs = cld2.detect(text)

    return (id, json.dumps([{"l":l.code, "s":l.score} for l in langs.scores]))
Example #16
def guess_lang_from_data(data, is_html, default_lang='en'):
    data = TextSanitizer.clean_utf8(data)  # cld2 needs clean input
    reliable, text_bytes, detected_languages = cld2.detect(
        data.encode('utf-8', 'ignore'), isPlainText=(not is_html),
        useFullLangTables=True, bestEffort=True)
    if not reliable:
        return default_lang
    else:
        return detected_languages[0][1]
Example #17
def fortnite_entries(file, entries, lang, limit):
    with open(file, 'r') as tsv_file:
        tsv_reader = csv.reader(tsv_file, delimiter='\t')
        for line in tsv_reader:
            lang_details = cld2.detect(line[2])
            if lang_details[2][0][1] == lang:
                entries.append(Entries.EntryFortnite(line[0], line[1], line[2], line[3], lang_details[2][0][1]))
            if len(entries) > limit:
                break
Example #18
def guess_lang_from_data(data, is_html, default_lang='en'):
    assert isinstance(data, unicode)
    data = TextSanitizer.clean_utf8(data)  # cld2 needs clean input
    reliable, text_bytes, detected_languages = cld2.detect(
        data.encode('utf-8', 'ignore'), isPlainText=(not is_html),
        useFullLangTables=True, bestEffort=True)
    if not reliable:
        return default_lang
    else:
        return detected_languages[0][1]
Example #19
def gen_sentences(fileobj):
  for line in fileobj:
    line = line.strip()
    if len(line.split(' ')) < 5 and len(line.encode('utf-8')) < 20:
      print >> sys.stderr, "skipping too-short line"
      continue
    if cld2.detect(line.encode('utf-8'))[2][0][0] == 'ENGLISH':
      print >> sys.stderr, "skipping English line"
      continue
    for sentence in tokenizer.tokenize(line):
      yield sentence
Example #20
    def predict(self, text):
        """
        Predicting the language of a text.

        Parameters
        ----------
        text : str
        """
        is_reliable, text_bytes_found, details = cld2.detect(text,
                                                             bestEffort=True)
        return self.map2wili(details[0].language_code)
Example #21
def lang_detect(text):
    printable_str = ''.join(x for x in text if x.isprintable())
    isReliable, textBytesFound, details = cld2.detect(printable_str)
    language_code = details[0].language_code
    if language_code != 'en' and language_code != 'ru':
        res = 0
    elif language_code == 'ru':
        res = 2
    elif language_code == 'en':
        res = 1
    return res
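A quick illustration of the mapping above, with illustrative inputs:

lang_detect("Hello there, how are you?")  # -> 1 (English)
lang_detect("Привет, как дела?")          # -> 2 (Russian)
lang_detect("Bonjour tout le monde")      # -> 0 (anything else)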
Example #22
def _guess_lang_from_data(data, is_html, default_lang='en'):
    assert isinstance(data, unicode)
    data = TextSanitizer.clean_utf8(data)
    # data = data.encode('utf-8', 'ignore').decode('utf-8', 'ignore')
    # print "broken", data.encode("utf-8")[17929 - 17933]

    reliable, text_bytes, detected_languages = cld2.detect(
        data.encode('utf-8', 'ignore'), isPlainText=(not is_html),
        useFullLangTables=True, bestEffort=True)
    if not reliable:
        return default_lang
    else:
        return detected_languages[0][1]
Example #23
def getlanguageSpans(documentText):
    # Mimics what JANUS will do when detecting languages:
    # provides each span together with its language

    spanList = []

    isReliable, textBytesFound, details, langVector = cld2.detect(
        documentText, returnVectors=True)
    for offset, num_bytes, lang_name, lang_code in langVector:
        textSpan = documentText[offset:offset + num_bytes]
        spanList.append((lang_code, textSpan))

    return spanList
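A hedged usage sketch of the span output; the mixed-language input is illustrative:

mixed_text = "This part is English. Esta parte es español."
for lang_code, span in getlanguageSpans(mixed_text):
    # one (code, span) pair per contiguous language region cld2 reports
    print(lang_code, repr(span))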
Example #24
def check_language(text, languages):
    """Checks if text is written in any of the specified languages."""
    import cld2
    try:
        _, _, lang = cld2.detect(text)
        return lang[0].language_code in languages
    except Exception as cld_ex:  # noqa
        # cld2 cannot handle some UTF-8 characters that Python can. See
        # https://github.com/mikemccand/chromium-compact-language-detector/issues/22
        # There is a workaround, but I'd rather just call langid in this case
        import langid
        lang, _ = langid.classify(text)
        return lang in languages
Example #25
def get_language(entry):
    """ hacky language detection for domains. Downloads the index page, and runs
    cld2 on the html """
    index_url = entry.url.replace("robots.txt", "")

    # hack around some issues here,
    if entry.domain in KNOWN_LANGUAGES:
        language = KNOWN_LANGUAGES.get(entry.domain)

    else:
        try:
            page = requests.get(index_url)
            try:
                languages = cld2.detect(
                    page.content,
                    isPlainText=False,
                    hintTopLevelDomain=entry.domain.split('.')[-1])
            except Exception:
                languages = cld2.detect(
                    page.text.encode("utf8"),
                    isPlainText=False,
                    hintTopLevelDomain=entry.domain.split('.')[-1])

            # ignoring the 'is_reliable' flag here, set on baidu.com etc.
            # (even though the language is detected appropriately)
            language = languages.details[
                0].language_name if languages.details else 'Unknown'
            index_url = page.url

        except Exception:
            log.exception("Failed to analyze language for '%s'", entry.domain)
            language = 'Failed'

    language = language.title()
    # traditional chinese -> chinese
    if language == 'Chineset':
        language = 'Chinese'
    return language, not urlparse(index_url).netloc.endswith(entry.domain)
Example #26
def main():
    wrapper = textwrap.TextWrapper(initial_indent="    ",
                                   subsequent_indent="    ")
    for fn in sys.argv[1:]:
        with open(fn) as f:
            result = cld2.detect(f.read(), want_chunks=True)
            sys.stdout.write("{}: {}\n"
                             .format(fn, " / ".join(dumplang(g)
                                                    for g in result.scores)))
            for i, chunk in enumerate(result.chunks):
                sys.stdout.write("  Chunk {}: {}={}\n"
                                 .format(i+1, chunk[0].code, chunk[0].name))
                sys.stdout.write(wrapper.fill(chunk[1][:2000]))
                sys.stdout.write("\n\n")
Example #27
def _guess_lang_from_data(data, is_html, default_lang='en'):
    assert isinstance(data, unicode)
    data = TextSanitizer.clean_utf8(data)
    # data = data.encode('utf-8', 'ignore').decode('utf-8', 'ignore')
    # print "broken", data.encode("utf-8")[17929 - 17933]

    reliable, text_bytes, detected_languages = cld2.detect(
        data.encode('utf-8', 'ignore'),
        isPlainText=(not is_html),
        useFullLangTables=True,
        bestEffort=True)
    if not reliable:
        return default_lang
    else:
        return detected_languages[0][1]
Example #28
def do_content_extraction(args):
    page, url, locale, sources, access_time, result, detail, ourl, rurl = args
    page = zlib.decompress(page)
    pagelen = len(page)
    pagehash = hashlib.sha256(page).digest()
    extr = html_extractor.ExtractedContent(url, page)
    langs = cld2.detect(extr.text_pruned)
    return (zlib.compress(extr.text_pruned.encode("utf-8")),
            zlib.compress(extr.text_content.encode("utf-8")),
            zlib.compress(json.dumps(extr.headings).encode("utf-8")),
            zlib.compress(json.dumps(extr.links).encode("utf-8")),
            zlib.compress(json.dumps(extr.resources).encode("utf-8")),
            zlib.compress(
                json.dumps(extr.dom_stats.to_json()).encode("utf-8")),
            langs[0].code, langs[0].percent, locale, sources, access_time,
            result, detail, ourl, rurl, pagelen, pagehash)
Example #29
def detect(text):
    text = illegalChars.sub('', text)

    text = text.encode('utf-8')
    try:
        isReliable, textBytesFound, details = cld2.detect(text)
    except Exception:
        logging.exception('Error processing text: %r', text)
        return None

    if isReliable:
        # top language, get the language code:
        # details: (('ENGLISH', 'en', 95, 1736.0), ('Unknown', 'un', 0, 0.0), ('Unknown', 'un', 0, 0.0))
        return details[0][1]
    else:
        return None
Example #30
    def language(self):
        """
        Detect the language of a text if no language was provided along with the text

        >>> doc = Doc('Test sentence for testing text', language='en')
        >>> doc.language
        'en'
        >>> doc = Doc('Test sentence for testing text')
        >>> doc.language
        'en'
        """
        if not self._language:
            _, _, best_guesses = cld2.detect(self.clean_text, hintLanguage=self._hint_language,
                                             bestEffort=True)
            self._language = best_guesses[0][1]
        return self._language
Example #31
def translate_line_or_not(line, src_lang, translator):  # -> (translation, tag)
    if not line or (line.startswith('<') and line.endswith('>')):
        return line, '<>'

    try:
        isReliable, _, lang_details = cld2.detect(line)
    except ValueError:
        return line, 'ValueError'

    if not isReliable or lang_details[0][1].decode() != src_lang:
        return line, lang_details[0][1].decode()

    trans_line = translator(line)

    return trans_line, src_lang
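For example, with a trivial stand-in for the translator callable (purely illustrative):

line, tag = translate_line_or_not("Hello world.", "en", lambda s: s)
# if cld2 reliably detects English, the line is translated and tag == "en";
# otherwise the original line comes back with the detected code as the tag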
Example #32
def detect(text):
    text = illegalChars.sub('', text)

    text = text.encode('utf-8')
    try:
        isReliable, textBytesFound, details = cld2.detect(text)
    except Exception:
        logging.exception('Error processing text: %r', text)
        return None

    if isReliable:
        # top language, get the language code:
        # details: (('ENGLISH', 'en', 95, 1736.0), ('Unknown', 'un', 0, 0.0), ('Unknown', 'un', 0, 0.0))
        return details[0][1]
    else:
        return None
Example #33
def cld2_(text, label):
    score = 0
    try:
        result = cld2.detect(text.strip())
        if result[2][0].language_code in label:
            score += 1
        if result[2][1].language_code in label:
            score += 1
        if result[2][0].language_code == label[0]:
            score += 2
        if (result[2][0].language_code == label[0]
                and result[2][1].language_code == label[1]):
            score += 3
    except Exception as e:
        print(e)
    return score
Example #34
def detect_tweet_lang(text):
    try:
        #Remove mentions and URLs before trying to detect language
        text = RE_MENTION.sub(" ", text)
        text = RE_URL.sub(" ", text)
        vals = {}

        text = text.encode("UTF-8")
        vals["cld_reliable"], vals["cld_bytes"], details = cld2.detect(text)
        if len(details) > 1:
            vals["cld_lang1"] = details[0][1]
            vals["cld_lang1_percent"] = details[0][2]
        if len(details) > 2:
            vals["cld_lang2"] = details[1][1]
            vals["cld_lang2_percent"] = details[1][2]
        return vals
    except Exception as e2:
        # text is bytes here after the encode above; avoid '+' concatenation
        logging.debug("CLD error: %s\n%r", e2, text)
Example #35
def detect_from_document(self, document, html=None):
    return cld2.detect(
        html or document.source_data,
        isPlainText=False,
        useFullLangTables=True,
        hintTopLevelDomain=document.get_url().suffix,  # TODO: doesn't seem to have any influence?
        hintLanguage=None,
        hintLanguageHTTPHeaders=None,  # TODO from headers
        hintEncoding=None,  # TODO from headers
        returnVectors=False,
        debugScoreAsQuads=False,
        debugHTML=False,
        bestEffort=True,
        debugCR=False,
        debugVerbose=False,
        debugQuiet=True,
        debugEcho=False)
Example #36
def detect_from_document(self, document, html=None):
    return cld2.detect(
        html or document.source_data,
        isPlainText=False,
        useFullLangTables=True,
        hintTopLevelDomain=document.get_url().suffix,  # TODO: doesn't seem to have any influence?
        hintLanguage=None,
        hintLanguageHTTPHeaders=None,  # TODO from headers
        hintEncoding=None,  # TODO from headers
        returnVectors=False,
        debugScoreAsQuads=False,
        debugHTML=False,
        bestEffort=True,
        debugCR=False,
        debugVerbose=False,
        debugQuiet=True,
        debugEcho=False
    )
Example #37
def do_content_extraction(args):
    origin, page, baseurl = args
    try:
        page = zlib.decompress(page)
    except zlib.error:
        page = ''
    pagelen = len(page)
    extr = html_extractor.ExtractedContent(baseurl, page)
    langs = cld2.detect(extr.text_pruned)

    pcontent = zlib.compress(extr.text_pruned.encode("utf-8"))
    phash = hashlib.sha256(pcontent).digest()
    headings = zlib.compress(json.dumps(extr.headings).encode("utf-8"))
    links = zlib.compress(json.dumps(extr.links).encode("utf-8"))
    resources = zlib.compress(json.dumps(extr.resources).encode("utf-8"))
    domstats = zlib.compress(json.dumps(extr.dom_stats.to_json()).encode("utf-8"))

    return (origin, pagelen, phash, langs[0].code, langs[0].percent,
            pcontent, links, resources, headings, domstats)
Example #38
def do_content_extraction(args):
    origin, page, baseurl = args
    try:
        page = zlib.decompress(page)
    except zlib.error:
        page = ''
    pagelen = len(page)
    extr = html_extractor.ExtractedContent(baseurl, page)
    langs = cld2.detect(extr.text_pruned)

    pcontent = zlib.compress(extr.text_pruned.encode("utf-8"))
    phash = hashlib.sha256(pcontent).digest()
    headings = zlib.compress(json.dumps(extr.headings).encode("utf-8"))
    links = zlib.compress(json.dumps(extr.links).encode("utf-8"))
    resources = zlib.compress(json.dumps(extr.resources).encode("utf-8"))
    domstats = zlib.compress(
        json.dumps(extr.dom_stats.to_json()).encode("utf-8"))

    return (origin, pagelen, phash, langs[0].code, langs[0].percent, pcontent,
            links, resources, headings, domstats)
Example #39
def parse(self, response):
    now = datetime.utcnow().replace(microsecond=0).isoformat()
    url = response.url
    datetimeToday = now + 'Z'
    dateElement = response.css('time::text').get()
    # collapse the tabs, newlines and runs of spaces around the date text
    dateElementText = ' '.join(dateElement.split())
    dateElementArray = dateElementText.split(',')
    updatedDateISO = dateparser.parse(dateElementArray[0],
                                      languages=['en']).date()
    updatedDateTime = str(updatedDateISO)
    tempTitle = response.css('.content-title.text-center::text').get()
    if tempTitle == '\n  ':
        title = response.css(".field--item::text").get()
    else:
        title = tempTitle
    contentArray = response.css('p::text').extract()
    converter = html2text.HTML2Text()
    converter.ignore_links = True
    # convert each extracted paragraph once, then concatenate
    text = ''.join(converter.handle(part) for part in contentArray)
    isReliable, textBytesFound, details = cld2.detect(text)
    # strip the boilerplate .gov security notice and stray backslashes
    textMinusUnnecessaryChars = text.replace(
        "Federal government websites often end in .gov or .mil. Before sharing\nsensitive information, make sure you're on a federal government site. The\nensures that you are connecting to the official website and that any\ninformation you provide is encrypted and transmitted securely.",
        "").replace('\\', '')
    language = details[0].language_name
    yield {
        'title': title,
        'source': 'Federal Dept. Of Agriculture',
        'published': updatedDateISO,
        'url': url,
        'scraped': datetimeToday,
        'classes': ['Government'],
        'country': 'United States of America',
        'municipality': 'Federal Government',
        'language': language,
        'text': textMinusUnnecessaryChars
    }
Example #40
    def should_be_good_language(self, lang_interface="fil"):
        lang_interface = MainPageLocators.LANGUAGE_BUTTONS_DICT[
            lang_interface][1]
        text_on_page = ""
        for i in range(1, 6):
            text_on_page = text_on_page + WebDriverWait(self.browser, 5).until(
                EC.presence_of_element_located(
                    MainPageLocators.HEADER_DICT[i])).text + " "
        title = WebDriverWait(self.browser, 5).until(
            EC.visibility_of_element_located(MainPageLocators.TITLE_TEXT)).text
        text_on_page += title
        for i in range(1, 9):
            text_on_page = text_on_page + " " + WebDriverWait(
                self.browser, 5).until(
                    EC.presence_of_element_located(
                        MainPageLocators.COLUMNS_DICT[i])).text

        isReliable, textBytesFound, lang_lit = cld2.detect(text_on_page)
        detect_lang = lang_lit[0][1]

        assert detect_lang == lang_interface, f"The title was not changed, detected {detect_lang}, used {lang_interface}"
Example #41
def cld2_detector(text):
    """
    cld2 library functionality implementation for the language detection API
    this library is used for multiple language detection
    Args:
        text: (string) text value sent for language detection

    Returns: (dict) detected languages with its confidence values or (boolean) if error returns false

    """
    try:
        text = ''.join(x for x in text if x in string.printable)
        result = cld2.detect(text.strip())
        if result[2][1].language_code != "un":
            return [(result[2][0].language_code, result[2][0].percent),
                    (result[2][1].language_code, result[2][1].percent)]
        else:
            return [(result[2][0].language_code, result[2][0].percent)]

    except Exception as e:
        return str(e)
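A sketch of the two return shapes, with invented confidence values:

cld2_detector("Bonjour tout le monde. Hello everyone here today.")
# -> [('fr', 52), ('en', 47)]   two languages detected
cld2_detector("Hello everyone.")
# -> [('en', 95)]               second language is 'un', so one tuple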
Example #42
def language_code_for_text(text: str):
    """Returns an ISO 690 language code for the plain text passed as a parameter.

    :param text: Text that should be identified
    :return: ISO 690 language code (e.g. 'en') on successful identification, empty string ('') on failure
    """
    text = decode_object_from_bytes_if_needed(text)

    if not text:
        return ''

    if len(text) > __MAX_TEXT_LENGTH:
        log.warning("Text is longer than %d, trimming..." % __MAX_TEXT_LENGTH)
        text = text[:__MAX_TEXT_LENGTH]

    # We need to verify that the text can cleanly encode and decode because CLD can segfault on bad UTF-8
    text = __recode_utf8_string(text)

    try:
        is_reliable, text_bytes_found, details = cld2.detect(
            utf8Bytes=text, useFullLangTables=True)
    except Exception as ex:
        log.error("Error while detecting language: %s" % str(ex))
        return ''

    if not details:
        return ''

    best_match = details[0]
    language_name = best_match.language_name.lower()
    language_code = best_match.language_code.lower()

    if language_name in {'unknown', 'tg_unknown_language'} or language_code == 'un':
        return ''

    if not language_is_supported(language_code):
        return ''

    return language_code
Example #43
def language_code_for_text(text: str):
    """Returns an ISO 690 language code for the plain text passed as a parameter.

    :param text: Text that should be identified
    :return: ISO 690 language code (e.g. 'en') on successful identification, empty string ('') on failure
    """
    text = decode_object_from_bytes_if_needed(text)

    if not text:
        return ''

    if len(text) > __MAX_TEXT_LENGTH:
        log.warning("Text is longer than %d, trimming..." % __MAX_TEXT_LENGTH)
        text = text[:__MAX_TEXT_LENGTH]

    # We need to verify that the text can cleanly encode and decode because CLD can segfault on bad UTF-8
    text = __recode_utf8_string(text)

    try:
        is_reliable, text_bytes_found, details = cld2.detect(utf8Bytes=text, useFullLangTables=True)
    except Exception as ex:
        log.error("Error while detecting language: %s" % str(ex))
        return ''

    if not details:
        return ''

    best_match = details[0]
    language_name = best_match.language_name.lower()
    language_code = best_match.language_code.lower()

    if language_name in {'unknown', 'tg_unknown_language'} or language_code == 'un':
        return ''

    if not language_is_supported(language_code):
        return ''

    return language_code
Example #44
def is_slovene(content):
    is_reliable, text_bytes, languages = cld2.detect(content)
    return 'SLOVENIAN' in [x[0] for x in languages]