def get_language(entry):
    """Hacky language detection for domains.

    Downloads the index page and runs cld2 on the HTML.
    """
    index_url = entry.url.replace("robots.txt", "")
    # Hack around some issues here:
    if entry.domain in KNOWN_LANGUAGES:
        language = KNOWN_LANGUAGES.get(entry.domain)
    else:
        try:
            page = requests.get(index_url)
            try:
                languages = cld2.detect(
                    page.content, isPlainText=False,
                    hintTopLevelDomain=entry.domain.split('.')[-1])
            except Exception:
                # Fall back to the decoded text if the raw bytes trip up cld2.
                languages = cld2.detect(
                    page.text.encode("utf8"), isPlainText=False,
                    hintTopLevelDomain=entry.domain.split('.')[-1])
            # Ignoring the 'is_reliable' flag here; it gets set on baidu.com etc.
            # (even though the language is detected appropriately).
            language = (languages.details[0].language_name
                        if languages.details else 'Unknown')
            index_url = page.url
        except Exception:
            log.exception("Failed to analyze language for '%s'", entry.domain)
            language = 'Failed'
    language = language.title()
    # Traditional Chinese -> Chinese
    if language == 'Chineset':
        language = 'Chinese'
    return language, not urlparse(index_url).netloc.endswith(entry.domain)
def do_resegment(args):
    docid, text_pruned = args
    lang = cld2.detect(text_pruned, want_chunks=True)
    segmented = [{"l": c[0].code,
                  "t": list(word_seg.segment(c[0].code, c[1]))}
                 for c in lang.chunks]
    return (docid, json.dumps(segmented))
def parse(self, response):
    now = datetime.utcnow().replace(microsecond=0).isoformat()
    url = response.url
    datetimeToday = now + 'Z'
    contentElement = response.css('.bpa-richtext').get()
    if contentElement is None:
        contentElement = response.css('#main').get()
    converter = html2text.HTML2Text()
    converter.ignore_links = True
    updatedDate = response.css('.bpa-time > time::attr(datetime)').get()
    title = response.css('title::text').get().split(' | ')[-1]
    text = converter.handle(contentElement)
    isReliable, textBytesFound, details = cld2.detect(text)
    language = details[0].language_name
    yield {
        'title': title,
        'source': 'Presse- und Informationsamt der Bundesregierung',
        'published': updatedDate,
        'url': url,
        'scraped': datetimeToday,
        'classes': ['Government'],
        'country': 'Germany',
        'municipality': 'National',
        'language': language,
        'text': text
    }
def parse(self, response):
    now = datetime.utcnow().replace(microsecond=0).isoformat()
    url = response.url
    datetimeToday = now + 'Z'
    textContent = 'todo'
    dateElement = response.css('time::text').get()
    dateElementText = dateElement.replace('\t', '').replace(
        '\n', '').replace(' ', '').replace(' ', '')
    dateElementArray = dateElementText.split(',')
    updatedDateISO = dateparser.parse(dateElementArray[0], languages=['en']).date()
    updatedDateTime = str(updatedDateISO)
    title = response.css('h1::text').get()
    contentArray = response.css('p::text').extract()
    converter = html2text.HTML2Text()
    converter.ignore_links = True
    text = reduce(
        lambda first, second: converter.handle(first) + converter.handle(second),
        contentArray)
    isReliable, textBytesFound, details = cld2.detect(text)
    textMinusUnnecessaryChars = text.replace('\\', '')
    language = details[0].language_name
    yield {
        'title': title,
        'source': 'Georgia State Government',
        'published': updatedDateTime,
        'url': url,
        'scraped': datetimeToday,
        'classes': ['Government'],
        'country': 'United States of America',
        'municipality': 'Georgia',
        'language': language,
        'text': textMinusUnnecessaryChars
    }
def main():
    for fn in sys.argv[1:]:
        with open(fn) as f:
            guesses = cld2.detect(f.read())
        sys.stdout.write("{}: {}\n"
                         .format(fn, " / ".join(dumplang(g) for g in guesses)))
def detect_language(text):
    # details is 3x (langName, langCode, percent, score)
    lang_is_reliable, _, lang_details = cld2.detect(text)
    lang_details = lang_details[0]  # take only the first detected language
    lang_name, lang_code, lang_percent, lang_score = lang_details
    return lang_name, lang_code, lang_score, lang_is_reliable
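# A minimal, hypothetical usage sketch for detect_language() above (not from
# the original source). It assumes `cld2` here is a cld2-cffi style binding
# whose detect() returns an (is_reliable, bytes_found, details) tuple with
# named detail fields; the sample string is illustrative only.
def _demo_detect_language():
    name, code, score, reliable = detect_language(
        "Bonjour tout le monde, ceci est un petit test de détection.")
    # Typically prints something like: FRENCH fr <score> True
    print(name, code, score, reliable)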
def do_content_extraction(args):
    docid, page, baseurl = args
    try:
        page = zlib.decompress(page)
    except Exception:
        page = ''
    extr = html_extractor.ExtractedContent(baseurl, page)
    lang = cld2.detect(extr.text_pruned, want_chunks=True)
    segmented = [{"l": c[0].code,
                  "t": list(word_seg.segment(c[0].code, c[1]))}
                 for c in lang.chunks]
    pagelen = len(page)
    content = extr.text_content.encode("utf-8")
    chash = hashlib.sha256(content).digest()
    pruned = extr.text_pruned.encode("utf-8")
    phash = hashlib.sha256(pruned).digest()
    segmtd = json.dumps(segmented).encode("utf-8")
    heads = json.dumps(extr.headings).encode("utf-8")
    hhash = hashlib.sha256(heads).digest()
    links = json.dumps(extr.links).encode("utf-8")
    lhash = hashlib.sha256(links).digest()
    rsrcs = json.dumps(extr.resources).encode("utf-8")
    rhash = hashlib.sha256(rsrcs).digest()
    domst = json.dumps(extr.dom_stats.to_json()).encode("utf-8")
    dhash = hashlib.sha256(domst).digest()
    return (docid, pagelen, chash, content, phash, pruned, segmtd,
            hhash, heads, lhash, links, rhash, rsrcs, dhash, domst)
def do_content_extraction(args):
    page, url, locale, sources, access_time, result, detail, ourl, rurl = args
    page = zlib.decompress(page)
    pagelen = len(page)
    pagehash = hashlib.sha256(page).digest()
    extr = html_extractor.ExtractedContent(url, page)
    langs = cld2.detect(extr.text_pruned)
    return (zlib.compress(extr.text_pruned.encode("utf-8")),
            zlib.compress(extr.text_content.encode("utf-8")),
            zlib.compress(json.dumps(extr.headings).encode("utf-8")),
            zlib.compress(json.dumps(extr.links).encode("utf-8")),
            zlib.compress(json.dumps(extr.resources).encode("utf-8")),
            zlib.compress(json.dumps(extr.dom_stats.to_json()).encode("utf-8")),
            langs[0].code, langs[0].percent,
            locale, sources, access_time, result, detail, ourl, rurl,
            pagelen, pagehash)
def check_message(message_content, pattern, set_lang="en"):
    lang_details = cld2.detect(message_content)
    lang = lang_details[2][0][1]
    if lang == set_lang or lang == "un":
        if re.search(pattern, message_content):
            return lang
    return None
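# Hypothetical usage sketch for check_message() above (not from the original
# source); the pattern and message text are made up for illustration.
def _demo_check_message():
    lang = check_message("Free skins giveaway, click now!", r"giveaway",
                         set_lang="en")
    # Returns the detected code ('en' or 'un') when the pattern matches and the
    # language is the configured one or unknown; None otherwise.
    print(lang)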
def detect_language(text, proba_threshold):
    _, _, details = cld2.detect(text)
    language_code = details[0].language_code
    probability = details[0].percent
    if language_code != 'un' and probability > proba_threshold:
        return language_code
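# Hypothetical usage sketch for detect_language(text, proba_threshold) above
# (not from the original source). The function falls through and implicitly
# returns None when the language is 'un' or the detected percentage does not
# exceed the threshold; the sample text and threshold are illustrative.
def _demo_threshold_detection():
    code = detect_language("Dies ist ein kurzer deutscher Beispielsatz.",
                           proba_threshold=80)
    if code is not None:
        print("confident detection:", code)
    else:
        print("detection below threshold or unknown")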
def do_segmentation(args):
    id, text = args
    lang = cld2.detect(text, want_chunks=True)
    segmented = [{"l": c[0].code,
                  "t": list(word_seg.segment(c[0].code, c[1]))}
                 for c in lang.chunks]
    return id, quote_utf8_as_text(json.dumps(segmented).encode("utf-8"))
def is_english(document):
    '''Checks whether the document is in English.'''
    import cld2
    reliable, _, details = cld2.detect(document, bestEffort=True)
    return (reliable
            and details[0][0] == 'ENGLISH'
            and details[0][2] >= MINENGPER)
def detect_language(text): """Detect language using CLD2 library.""" try: # todo: figure out what's causing an occasional error _, _, details = cld2.detect(text) lang = details[0].language_code except ValueError: lang = 'un' return lang if lang != 'un' else None
def cld2_(text, label):
    try:
        result = cld2.detect(text.strip())
        if result[2][0].language_code == label:
            return True
    except Exception as e:
        print(e)
    return False
def do_redetect(args):
    id, text = args
    try:
        text = zlib.decompress(text)
    except Exception:
        text = ''
    langs = cld2.detect(text)
    return (id, json.dumps([{"l": l.code, "s": l.score} for l in langs.scores]))
def guess_lang_from_data(data, is_html, default_lang='en'):
    data = TextSanitizer.clean_utf8(data)  # cld2 needs clean input
    reliable, text_bytes, detected_languages = cld2.detect(
        data.encode('utf-8', 'ignore'),
        isPlainText=(not is_html),
        useFullLangTables=True,
        bestEffort=True)
    if not reliable:
        return default_lang
    else:
        return detected_languages[0][1]
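# Hypothetical usage sketch for guess_lang_from_data() above (not from the
# original source). It assumes TextSanitizer is available in the surrounding
# module and that cld2.detect() accepts the keyword arguments used there; the
# HTML snippet is illustrative.
def _demo_guess_lang_from_data():
    html = u"<html><body><p>Ceci est une page de test en français.</p></body></html>"
    # Expected to return a language code such as 'fr', or the default ('en')
    # when detection is not reliable.
    print(guess_lang_from_data(html, is_html=True))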
def fortnite_entries(file, entries, lang, limit):
    with open(file, 'r') as tsv_file:
        tsv_reader = csv.reader(tsv_file, delimiter='\t')
        for line in tsv_reader:
            lang_details = cld2.detect(line[2])
            if lang_details[2][0][1] == lang:
                entries.append(Entries.EntryFortnite(
                    line[0], line[1], line[2], line[3],
                    lang_details[2][0][1]))
            if len(entries) > limit:
                break
def guess_lang_from_data(data, is_html, default_lang='en'):
    assert isinstance(data, unicode)
    data = TextSanitizer.clean_utf8(data)  # cld2 needs clean input
    reliable, text_bytes, detected_languages = cld2.detect(
        data.encode('utf-8', 'ignore'),
        isPlainText=(not is_html),
        useFullLangTables=True,
        bestEffort=True)
    if not reliable:
        return default_lang
    else:
        return detected_languages[0][1]
def gen_sentences(fileobj):
    for line in fileobj:
        line = line.strip()
        if len(line.split(' ')) < 5 and len(line.encode('utf-8')) < 20:
            print >> sys.stderr, "skipping too-short line"
            continue
        if cld2.detect(line.encode('utf-8'))[2][0][0] == 'ENGLISH':
            print >> sys.stderr, "skipping English line"
            continue
        for sentence in tokenizer.tokenize(line):
            yield sentence
def predict(self, text):
    """Predicting the language of a text.

    Parameters
    ----------
    text : str
    """
    is_reliable, text_bytes_found, details = cld2.detect(text, bestEffort=True)
    return self.map2wili(details[0].language_code)
def lang_detect(text):
    printable_str = ''.join(x for x in text if x.isprintable())
    isReliable, textBytesFound, details = cld2.detect(printable_str)
    language_code = details[0].language_code
    if language_code != 'en' and language_code != 'ru':
        res = 0
    elif language_code == 'ru':
        res = 2
    elif language_code == 'en':
        res = 1
    return res
def _guess_lang_from_data(data, is_html, default_lang='en'):
    assert isinstance(data, unicode)
    data = TextSanitizer.clean_utf8(data)
    # data = data.encode('utf-8', 'ignore').decode('utf-8', 'ignore')
    # print "broken", data.encode("utf-8")[17929 - 17933]
    reliable, text_bytes, detected_languages = cld2.detect(
        data.encode('utf-8', 'ignore'),
        isPlainText=(not is_html),
        useFullLangTables=True,
        bestEffort=True)
    if not reliable:
        return default_lang
    else:
        return detected_languages[0][1]
def getlanguageSpans(documentText):
    # Mimics what JANUS will do when detecting languages.
    # Provides each span and its language.
    spanList = []
    isReliable, textBytesFound, details, langVector = cld2.detect(
        documentText, returnVectors=True)
    for offset, num_bytes, lang_name, lang_code in langVector:
        textSpan = documentText[offset:offset + num_bytes]
        spanList.append((lang_code, textSpan))
    return spanList
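# Hypothetical usage sketch for getlanguageSpans() above (not from the
# original source). It assumes a cld2 binding that supports returnVectors=True
# and yields (offset, num_bytes, lang_name, lang_code) entries; the
# mixed-language sample text is illustrative.
def _demo_language_spans():
    mixed = "This part is written in English. Cette partie est en français."
    for lang_code, span in getlanguageSpans(mixed):
        print(lang_code, repr(span))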
def check_language(text, languages):
    """Checks if text is written in any of the specified languages."""
    import cld2
    try:
        _, _, lang = cld2.detect(text)
        return lang[0].language_code in languages
    except Exception as cld_ex:  # noqa
        # cld2 cannot handle some UTF-8 characters that Python can. See
        # https://github.com/mikemccand/chromium-compact-language-detector/issues/22
        # There is a workaround, but I'd rather just call langid in this case.
        import langid
        lang, _ = langid.classify(text)
        return lang in languages
def main():
    wrapper = textwrap.TextWrapper(initial_indent=" ", subsequent_indent=" ")
    for fn in sys.argv[1:]:
        with open(fn) as f:
            result = cld2.detect(f.read(), want_chunks=True)
        sys.stdout.write("{}: {}\n"
                         .format(fn, " / ".join(dumplang(g)
                                                for g in result.scores)))
        for i, chunk in enumerate(result.chunks):
            sys.stdout.write(" Chunk {}: {}={}\n"
                             .format(i+1, chunk[0].code, chunk[0].name))
            sys.stdout.write(wrapper.fill(chunk[1][:2000]))
            sys.stdout.write("\n\n")
def detect(text):
    text = illegalChars.sub('', text)
    text = text.encode('utf-8')
    try:
        isReliable, textBytesFound, details = cld2.detect(text)
    except Exception:
        logging.exception('Error processing text: %r', text)
        return None
    if isReliable:
        # Top language; return its language code. `details` looks like:
        # (('ENGLISH', 'en', 95, 1736.0), ('Unknown', 'un', 0, 0.0), ('Unknown', 'un', 0, 0.0))
        return details[0][1]
    else:
        return None
def language(self):
    """Detect the language of a text if no language was provided along
    with the text.

    >>> doc = Doc('Test sentence for testing text', language='en')
    >>> doc.language
    'en'
    >>> doc = Doc('Test sentence for testing text')
    >>> doc.language
    'en'
    """
    if not self._language:
        _, _, best_guesses = cld2.detect(self.clean_text,
                                         hintLanguage=self._hint_language,
                                         bestEffort=True)
        self._language = best_guesses[0][1]
    return self._language
def translate_line_or_not(line, src_lang, translator):
    # -> (translation, tag)
    if not line or (line.startswith('<') and line.endswith('>')):
        return line, '<>'
    try:
        isReliable, _, lang_details = cld2.detect(line)
    except ValueError:
        return line, 'ValueError'
    if not isReliable or lang_details[0][1].decode() != src_lang:
        return line, lang_details[0][1].decode()
    trans_line = translator(line)
    return trans_line, src_lang
def cld2_(text, label):
    score = 0
    try:
        result = cld2.detect(text.strip())
        if result[2][0].language_code in label:
            score += 1
        if result[2][1].language_code in label:
            score += 1
        if result[2][0].language_code == label[0]:
            score += 2
        if (result[2][0].language_code == label[0]
                and result[2][1].language_code == label[1]):
            score += 3
    except Exception as e:
        print(e)
    return score
def detect_tweet_lang(text):
    try:
        # Remove mentions and URLs before trying to detect the language.
        text = RE_MENTION.sub(" ", text)
        text = RE_URL.sub(" ", text)
        vals = {}
        text = text.encode("UTF-8")
        vals["cld_reliable"], vals["cld_bytes"], details = cld2.detect(text)
        if len(details) > 1:
            vals["cld_lang1"] = details[0][1]
            vals["cld_lang1_percent"] = details[0][2]
        if len(details) > 2:
            vals["cld_lang2"] = details[1][1]
            vals["cld_lang2_percent"] = details[1][2]
        return vals
    except Exception as e2:
        logging.debug("CLD error: " + str(e2) + "\n" + text)
def detect_from_document(self, document, html=None):
    return cld2.detect(
        html or document.source_data,
        isPlainText=False,
        useFullLangTables=True,
        hintTopLevelDomain=document.get_url().suffix,  # TODO: doesn't seem to have any influence?
        hintLanguage=None,
        hintLanguageHTTPHeaders=None,  # TODO: from headers
        hintEncoding=None,  # TODO: from headers
        returnVectors=False,
        debugScoreAsQuads=False,
        debugHTML=False,
        bestEffort=True,
        debugCR=False,
        debugVerbose=False,
        debugQuiet=True,
        debugEcho=False)
def do_content_extraction(args):
    origin, page, baseurl = args
    try:
        page = zlib.decompress(page)
    except Exception:
        page = ''
    pagelen = len(page)
    extr = html_extractor.ExtractedContent(baseurl, page)
    langs = cld2.detect(extr.text_pruned)
    pcontent = zlib.compress(extr.text_pruned.encode("utf-8"))
    phash = hashlib.sha256(pcontent).digest()
    headings = zlib.compress(json.dumps(extr.headings).encode("utf-8"))
    links = zlib.compress(json.dumps(extr.links).encode("utf-8"))
    resources = zlib.compress(json.dumps(extr.resources).encode("utf-8"))
    domstats = zlib.compress(json.dumps(extr.dom_stats.to_json()).encode("utf-8"))
    return (origin, pagelen, phash, langs[0].code, langs[0].percent,
            pcontent, links, resources, headings, domstats)
def parse(self, response):
    now = datetime.utcnow().replace(microsecond=0).isoformat()
    url = response.url
    datetimeToday = now + 'Z'
    textContent = 'todo'
    dateElement = response.css('time::text').get()
    dateElementText = dateElement.replace('\t', '').replace(
        '\n', '').replace(' ', '').replace(' ', '')
    dateElementArray = dateElementText.split(',')
    updatedDateISO = dateparser.parse(dateElementArray[0], languages=['en']).date()
    updatedDateTime = str(updatedDateISO)
    tempTitle = response.css('.content-title.text-center::text').get()
    if tempTitle == '\n ':
        title = response.css(".field--item::text").get()
    else:
        title = tempTitle
    contentArray = response.css('p::text').extract()
    converter = html2text.HTML2Text()
    converter.ignore_links = True
    text = reduce(
        lambda first, second: converter.handle(first) + converter.handle(second),
        contentArray)
    isReliable, textBytesFound, details = cld2.detect(text)
    textMinusUnnecessaryChars = text.replace(
        "Federal government websites often end in .gov or .mil. Before sharing\n"
        "sensitive information, make sure you're on a federal government site. The\n"
        "ensures that you are connecting to the official website and that any\n"
        "information you provide is encrypted and transmitted securely.",
        "").replace('\\', '')
    language = details[0].language_name
    yield {
        'title': title,
        'source': 'Federal Dept. Of Agriculture',
        'published': updatedDateISO,
        'url': url,
        'scraped': datetimeToday,
        'classes': ['Government'],
        'country': 'United States of America',
        'municipality': 'Federal Government',
        'language': language,
        'text': textMinusUnnecessaryChars
    }
def should_be_good_language(self, lang_interface="fil"):
    lang_interface = MainPageLocators.LANGUAGE_BUTTONS_DICT[lang_interface][1]
    text_on_page = ""
    for i in range(1, 6):
        text_on_page = text_on_page + WebDriverWait(self.browser, 5).until(
            EC.presence_of_element_located(
                MainPageLocators.HEADER_DICT[i])).text + " "
    title = WebDriverWait(self.browser, 5).until(
        EC.visibility_of_element_located(MainPageLocators.TITLE_TEXT)).text
    text_on_page += title
    for i in range(1, 9):
        text_on_page = text_on_page + " " + WebDriverWait(self.browser, 5).until(
            EC.presence_of_element_located(
                MainPageLocators.COLUMNS_DICT[i])).text
    isReliable, textBytesFound, lang_lit = cld2.detect(text_on_page)
    detect_lang = lang_lit[0][1]
    assert detect_lang == lang_interface, (
        f"The title was not changed, detected {detect_lang}, used {lang_interface}")
def cld2_detector(text):
    """cld2-based implementation for the language detection API; this
    library is used for multiple language detection.

    Args:
        text: (string) text value sent for language detection

    Returns:
        (list) detected language codes with their confidence values,
        or (string) the error message if detection fails
    """
    try:
        text = ''.join(x for x in text if x in string.printable)
        result = cld2.detect(text.strip())
        if result[2][1].language_code != "un":
            return [(result[2][0].language_code, result[2][0].percent),
                    (result[2][1].language_code, result[2][1].percent)]
        else:
            return [(result[2][0].language_code, result[2][0].percent)]
    except Exception as e:
        return str(e)
def language_code_for_text(text: str): """Returns an ISO 690 language code for the plain text passed as a parameter. :param text: Text that should be identified :return: ISO 690 language code (e.g. 'en') on successful identification, empty string ('') on failure """ text = decode_object_from_bytes_if_needed(text) if not text: return '' if len(text) > __MAX_TEXT_LENGTH: log.warning("Text is longer than %d, trimming..." % __MAX_TEXT_LENGTH) text = text[:__MAX_TEXT_LENGTH] # We need to verify that the file can cleany encode and decode because CLD can segfault on bad UTF-8 text = __recode_utf8_string(text) try: is_reliable, text_bytes_found, details = cld2.detect( utf8Bytes=text, useFullLangTables=True) except Exception as ex: log.error("Error while detecting language: %s" % str(ex)) return '' if not details: return '' best_match = details[0] language_name = best_match.language_name.lower() language_code = best_match.language_code.lower() if language_name in {'unknown', 'tg_unknown_language' } or language_code == 'un': return '' if not language_is_supported(language_code): return '' return language_code
def is_slovene(content):
    a, b, languages = cld2.detect(content)
    return 'SLOVENIAN' in [x[0] for x in languages]
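# Hypothetical usage sketch for is_slovene() above (not from the original
# source); assumes cld2.detect() returns (is_reliable, bytes_found, details)
# where each detail entry starts with the upper-case language name. The
# sample sentences are illustrative.
def _demo_is_slovene():
    print(is_slovene("To je kratek stavek v slovenščini."))    # expected: True
    print(is_slovene("This sentence is written in English."))  # expected: False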