def detect(text):
    """Return the language code cld3 predicts for *text*.

    Falls back to re-detecting on ``clean(text)`` when the first
    prediction has confidence below 0.5.
    """
    result = cld3.get_language(text)
    # cld3.get_language returns None for empty/undetectable input;
    # the original tuple-unpack would raise TypeError here.
    if result is None:
        return cld3.get_language(clean(text))[0]
    prediction, confidence, isreliable, proportion = result
    if confidence >= 0.5:
        return prediction
    # Low confidence: retry on the cleaned text; index 0 is the language field.
    return cld3.get_language(clean(text))[0]
def get_misclassification_stats(dataset):
    """Print how often cld3 and langid label tweets in *dataset* as non-English.

    Tweets are assumed to be English; mentions, URLs and hashtags are
    stripped and only the first 8 tokens are classified.
    """
    total_tweets = 0
    misclassified_cld = 0
    misclassified_langid = 0
    # Hoisted out of the loop: matches @mentions, #hashtags and URLs.
    noise_pattern = re.compile(r"(?:\@|\#|https?\://)\S+")
    for t in dataset:
        if not t:
            continue
        # Remove mentions, URLs, and hashtags; keep the first 8 tokens.
        t = noise_pattern.sub("", t)
        t = " ".join(t.split()[:8])
        if not t:
            continue
        cld_prediction = cld3.get_language(t)
        langid_prediction = langid.classify(t)
        if cld_prediction[0] != "en":
            misclassified_cld += 1
        if langid_prediction[0] != "en":
            misclassified_langid += 1
        total_tweets += 1
    # Guard: an empty/unusable dataset would divide by zero below.
    if total_tweets == 0:
        print("No usable tweets in dataset.")
        return
    print("CLD accuracy: ", total_tweets - misclassified_cld, "/",
          total_tweets, "=",
          (total_tweets - misclassified_cld) / total_tweets)
    print("Langid accuracy: ", total_tweets - misclassified_langid, "/",
          total_tweets, "=",
          (total_tweets - misclassified_langid) / total_tweets)
def find_new_links(htmlstring, base_url, known_links, language=None, rules=None):
    """Extract and filter new internal links after an optional language check."""
    new_links = []
    # Optional language gate: baseline-extract the page text and identify it.
    if language is not None and LANGID_FLAG is True:
        _, text, _ = baseline(htmlstring)
        detected = cld3.get_language(text)
        if detected is not None and detected.language != language:
            return new_links, known_links
    # Walk candidate links, dropping robots-forbidden, known, or uncrawlable ones.
    candidates = extract_links(htmlstring, base_url, False,
                               language=language, with_nav=True)
    for link in candidates:
        if rules is not None and not rules.can_fetch("*", link):
            continue
        if is_known_link(link, known_links) is True or is_not_crawlable(link):
            continue
        new_links.append(link)
        known_links.add(link)
    return new_links, known_links
def parse_item(self, response):
    """Parse one article page and yield its heading, date, tags, language and body."""
    article = response.css('section.article-column > div.article-text')
    heading = article.css('h1::text')
    date = article.css('div.item.time::text')
    sub_heading = article.css('div.like-h2::text').extract_first()
    if not sub_heading:
        sub_heading = article.css('h2::text').extract_first()
    text = article.css('div').xpath('p//text()').extract()
    body = ([sub_heading] + text) if sub_heading else text
    body = ' '.join(body)
    # cld3.get_language returns None for empty/undetectable text; the
    # original `.language` access would raise AttributeError then.
    prediction = cld3.get_language(body)
    lang = prediction.language if prediction is not None else None
    tags = article.css('div.tags > a::text')
    yield {
        'heading': heading.extract_first(),
        'date': date.extract_first(),
        'tags': tags.extract(),
        'lang': lang,
        'body': body
    }
def convert_warc_to_csv(arguments):
    # Stream a WARC archive and save cleaned HTML bodies that match the
    # requested language to a CSV file, stopping after arguments["count"] rows.
    # `arguments` keys read here: input_path, output_path, language, count.
    counter = 0
    with open(arguments["input_path"], 'rb') as input_file, \
            open(arguments["output_path"], "w", newline='') as output_file:
        writer = csv.writer(output_file, delimiter=',', quotechar='"')
        for record in ArchiveIterator(input_file):
            if record.rec_type == 'response':
                # Only exact 'text/html' matches; charset-suffixed values are skipped.
                if record.http_headers.get_header(
                        'Content-Type') == 'text/html':
                    html = record.content_stream().read()
                    clean_text = clean_html(html)
                    if len(clean_text) > 0:
                        # NOTE(review): cld3.get_language can return None on
                        # undetectable text — `.language` would then raise; confirm.
                        language_prediction = cld3.get_language(clean_text)
                        if language_prediction.language == arguments[
                                "language"]:
                            # NOTE(review): writes the whole prediction object,
                            # not just .language — confirm intended CSV format.
                            writer.writerow([clean_text, language_prediction])
                            counter += 1
                            if counter >= int(arguments["count"]):
                                return
                            if counter % 100 == 0:
                                logger.info("Saved " + str(counter) + " websites")
def _cld3_detection(self, doc: AnyStr) -> (AnyStr, float):
    """Detect *doc*'s language with cld3 and return (lang_id, probability)."""
    detection = cld3.get_language(doc)
    lang_id = detection.language[:2]
    # Remap cld3 language codes so they line up with langid's inventory.
    for original_code, new_code in LANGUAGE_REMAPPING.items():
        lang_id = lang_id.replace(original_code, new_code)
    return (lang_id, float(detection.probability))
def get_lange_cld(text, get_prob=False):
    """Return cld3's language code for *text*.

    When *get_prob* is true, return a ``(language, probability)`` tuple instead.
    """
    output = cld3.get_language(text)
    lang = output.language
    # Idiom fix: truth-test the flag instead of comparing `== False`.
    if get_prob:
        return lang, output.probability
    return lang
def _cld3_detection(self, doc: AnyStr) -> (AnyStr, float):
    """Detect the language of a string using the `cld3` library."""
    detection = cld3.get_language(doc)
    code = detection.language[:2]
    # Translate cld3 codes into langid-compatible ones.
    for original_code, new_code in LANGUAGE_REMAPPING_PYCLD3_LANGID.items():
        code = code.replace(original_code, new_code)
    probability = float(detection.probability)
    return (code, probability)
def language_filter(temp_text, temp_comments, target_language, docmeta):
    '''Run external component (if installed) for language identification'''
    # No target language requested: nothing to filter.
    if target_language is None:
        return False
    if LANGID_FLAG is not True:
        LOGGER.warning('Detector not installed, no language detection run')
        return False
    # Classify whichever text candidate is longer (comments vs. body).
    sample = temp_comments if len(temp_comments) > len(temp_text) else temp_text
    result = cld3.get_language(sample)
    if result.language != target_language:
        LOGGER.warning('wrong language: %s %s %s',
                       result, docmeta['id'], docmeta['url'])
        return True
    return False
def _validate_language(text, language):
    """Return True when *text* reliably matches *language*.

    Languages outside CLD3's inventory are accepted unconditionally.
    """
    if language not in CLD3_LANG_CODES:
        return True
    lang_res = cld3.get_language(text)
    # cld3 returns None for empty/undetectable input; the original would
    # raise AttributeError here — treat it as a failed validation instead.
    if lang_res is None:
        return False
    return bool(lang_res.is_reliable and lang_res.language == language)
def detect():
    """Flask endpoint: classify the language of the posted 'sentence'."""
    payload = request.get_json()
    prediction = cld3.get_language(payload['sentence'])
    body = {
        "lang": prediction.language,
        "probability": prediction.probability,
        "is_reliable": prediction.is_reliable,
    }
    return jsonify(body)
def _detect_language(self, text):
    """Tries to detect the language of a text input.

    Outputs a BCP-47-style language code (e.g. 'en'), or None when the
    prediction is missing or unreliable."""
    info = cld3.get_language(text)
    if info is None or not info.is_reliable:
        return None
    return info.language
def remove_mixed_language_items(samples):
    """Remove samples whose dominant-language probability is below 0.95.

    Mutates *samples* in place (callers may hold a reference) and
    returns it for convenience.
    """
    doomed = []
    for i, item in enumerate(samples):
        info = cld3.get_language(item['text'])
        # None means cld3 could not classify the text at all (the original
        # crashed here); treat that like a low-confidence / mixed sample.
        if info is None or info.probability < 0.95:
            doomed.append(i)
    # Delete from the end so earlier indices remain valid.
    for idx in reversed(doomed):
        del samples[idx]
    return samples
def detect_lang(self, text):
    """Detect the source language of *text*; ask the user when cld3 is unsure."""
    src_lang, _, is_reliable, _ = cld3.get_language(text)
    if is_reliable:
        return src_lang
    # Unreliable prediction: announce it aloud and fall back to manual entry.
    os.system(
        'say "Not certain which language that is. Please decide."')
    print("Text:\n{}".format(text))
    return input(
        "Please enter source language abbreviation after scheme in CLD3 github:"
    )
def _select_tip_text(tip):
    """Pick the longer of media_text/media_title (preferring media_text) and strip emoji."""
    if tip['media_text'] != 'NA' and len(tip['media_text']) >= len(tip['media_title']):
        return remove_emoji(tip['media_text'])
    return remove_emoji(tip['media_title'])


def load_covid_data():
    """Load claim-type tip-line requests from covid.csv.

    Tags each tip with its cld3-detected language, attaches sentence
    embeddings, groups by partner and language, and de-duplicates.
    Returns (partners, {partner: {language: [tips]}}).
    """
    # csv.DictReader replaces the hand-rolled header/row zipping of the original.
    with open('covid.csv') as csvfile:
        tips = [dict(row) for row in csv.DictReader(csvfile)]
    tips = [tip for tip in tips if tip['claim_type'] == 'Claim']
    for tip in tips:
        tip['text'] = _select_tip_text(tip)
        lang_data = cld3.get_language(tip['text'])
        if lang_data is not None:
            tip['language'] = lang_data.language
    # Keep only tips with real text and a successful language detection.
    tips = [
        tip for tip in tips
        if tip['text'] != 'NA' and not tip['text'].isspace()
        and 'language' in tip
    ]
    partners = set(tip['team_slug'] for tip in tips)
    grouped = {}
    for partner in partners:
        partner_tips = [t for t in tips if t['team_slug'] == partner]
        grouped[partner] = {lang: [] for lang in partner_languages[partner]}
        for tip in partner_tips:
            if tip['language'] in partner_languages[partner]:
                tip['embedding'] = get_sentence_embedding(
                    tip['text'], tip['language'])
                grouped[partner][tip['language']].append(tip)
        for language in partner_languages[partner]:
            grouped[partner][language] = remove_duplicate_requests(
                grouped[partner][language])
    return partners, grouped
def get_lang(self):
    """Lazily detect and cache the language of the page's main body text."""
    if self.successfully_read and self._lang is None:
        # Normalise the XML declaration before boilerplate removal.
        # NOTE(review): the pattern's '<?' means an optional '<' — it likely
        # intended r'<\?xml.*encoding.*\?>'; kept as-is to preserve behavior.
        cleaned = re.sub(r'<?xml.*encoding.*?>',
                         '<?xml version="1.0"?>', self.utf_text)
        try:
            article = alcazar.bodytext.parse_article(cleaned)
            if article.body_text:
                self._lang = cld3.get_language(article.body_text)
        except Exception:
            # Was a bare `except:` which also swallowed KeyboardInterrupt /
            # SystemExit; parsing failures simply leave the language unknown.
            self._lang = None
    return self._lang
def group_tiplines_by_language(tip_line_requests, languages=None):
    """Tag each tip with its cld3 language and group tips by language.

    Only tips with non-trivial text (> 20 chars, not 'NA'/whitespace) and a
    successful detection are kept. Returns {language: [tips]} for *languages*.
    """
    # Avoid a mutable default argument; the previous list literal default
    # was shared across calls.
    if languages is None:
        languages = ['en', 'pt', 'hi', 'hi-Latn', 'mr', 'bn', 'ta', 'te', 'ml']
    for tip in tip_line_requests:
        # Prefer media_text when present and at least as long as media_title.
        tip['text'] = remove_emoji(
            tip['media_text']
            if tip['media_text'] != 'NA'
            and len(tip['media_text']) >= len(tip['media_title'])
            else tip['media_title'])
        lang_data = cld3.get_language(tip['text'])
        if lang_data is not None:
            tip['language'] = lang_data.language
    usable = [tip for tip in tip_line_requests
              if tip['text'] != 'NA' and not tip['text'].isspace()
              and 'language' in tip and len(tip['text']) > 20]
    return {language: [t for t in usable if t['language'] == language]
            for language in languages}
def get_summary(text: str, percentage: float = None, abstractive: bool = False):
    """Summarise Hindi text; reject any other language with HTTP 418."""
    if get_language(text).language != 'hi':
        raise HTTPException(status_code=418,
                            detail="Summarization only available for Hindi.")
    summary = Summary(text, percentage, abstractive)
    return {
        "summary": summary,
        "response_length": len(summary),
        "original_length": len(text),
    }
def detect_lang_neural(text, return_multiple=False, return_dict=False,
                       hint_language=None, filter_unreliable=False):
    """Detect the language(s) of *text* with pycld3.

    Returns a single language code (or None) by default; a list when
    *return_multiple*; dicts with code/name/confidence when *return_dict*.
    A matching *hint_language* short-circuits to that single prediction.
    Raises ImportError when pycld3 is not installed.
    """
    if cld3 is None:
        LOG.debug("run pip install pycld3")
        raise ImportError("pycld3 not installed")
    languages = []
    if return_multiple or hint_language:
        preds = sorted(cld3.get_frequent_languages(text, num_langs=5),
                       key=lambda p: p.probability, reverse=True)
        for pred in preds:
            if filter_unreliable and not pred.is_reliable:
                continue
            if return_dict:
                languages.append({
                    "lang_code": pred.language,
                    "lang": code_to_name(pred.language),
                    "conf": pred.probability
                })
            else:
                languages.append(pred.language)
            # A hint match wins outright: keep only that entry.
            if hint_language and hint_language == pred.language:
                languages = [languages[-1]]
                break
    else:
        pred = cld3.get_language(text)
        # Guard: cld3 returns None for empty/undetectable input; the
        # original crashed on pred.is_reliable here.
        if pred is None or (filter_unreliable and not pred.is_reliable):
            pass
        elif return_dict:
            languages = [{
                "lang_code": pred.language,
                "lang": code_to_name(pred.language),
                "conf": pred.probability
            }]
        else:
            languages = [pred.language]
    # Return top language only when a single result was requested.
    if not return_multiple:
        return languages[0] if languages else None
    return languages
def song_to_lines(song, min_lines=20, acceptable_languages=None):
    """Take a song dict and return its lyrics as a list of lowercased lines.

    Returns [] when lyrics are missing, shorter than *min_lines* lines,
    or not reliably in one of *acceptable_languages* (default: English).
    """
    # Avoid a mutable default argument (was acceptable_languages=["en"]).
    if acceptable_languages is None:
        acceptable_languages = ["en"]
    # Guard clauses replace the original's nested if/else pyramid.
    if "lyrics" not in song or not song["lyrics"]:
        return []
    lines = song["lyrics"].lower().split("\n")
    if len(lines) < min_lines:
        return []
    lang_prediction = cld3.get_language(song["lyrics"])
    if (lang_prediction.is_reliable
            and lang_prediction.language in acceptable_languages):
        return lines
    return []
def parse_news(self, response):
    """Parse a news article page and yield heading, date, language, tags and body."""
    self.log(response.body)
    article = response.css('div.article-content')
    heading = article.css('div.title > h1::text')
    date = article.css('div.title > div > span::text')
    text = article.css('span._ga1_on_').xpath('p//text()')
    body = ' '.join(text.extract())
    # cld3.get_language returns None for empty text; the original's
    # `.language` access would raise AttributeError then.
    prediction = cld3.get_language(body)
    lang = prediction.language if prediction is not None else None
    tags = response.css('div.article-content > div.tag > a::text')
    yield {
        'heading': heading.extract_first(),
        'date': date.extract_first(),
        'lang': lang,
        'tags': tags.extract(),
        'body': body
    }
def parse_article(self, response):
    """Parse an article page and yield its fields plus the request URL."""
    article = response.css('main.main-col > div.main-col__left')
    heading = article.css(
        'div.news-full__head > h1.news-full__title::text')
    date = article.css('div.news-full__head > time.news-full__date::text')
    text = article.css('div.news-full__text').xpath('p//text()')
    tags = article.css('div.news-full-tags > div > span::text')
    body = ' '.join(text.extract())
    # cld3.get_language returns None for empty text; guard the
    # `.language` access that previously raised AttributeError.
    prediction = cld3.get_language(body)
    lang = prediction.language if prediction is not None else None
    yield {
        'heading': heading.extract_first(),
        'date': date.extract_first(),
        'lang': lang,
        'tags': tags.extract(),
        'url': response.request.url,
        'body': body
    }
def run(self):
    """Handle one client connection: detect the language of each received message."""
    print("Connected: " + str(self.address))
    while True:
        try:
            request = self.socket.recv(4096)
            request = request.decode('utf-8')
            # Empty read means the peer closed the connection.
            if not request:
                break
            print("Text in: " + request)
            result = str(cld3.get_language(request))
            print("Result: " + result)
            # BUG FIX: reply on this connection's socket. The original sent on
            # the module-level `client`, answering the wrong peer when several
            # handler threads are alive.
            self.socket.send(str.encode(result))
            print()
        except socket.timeout as e:
            print(str(e))
            break
        except UnicodeDecodeError:
            print("Decode Error")
            self.socket.send(str.encode("X"))
    self.socket.close()
    print("Disconnect")
def save(self, *args, **kwargs):
    """Normalise empty fields, regenerate derived content and language, then persist.

    Recognised kwargs (all popped): generate, update_lead, update_slug,
    update_language.
    """
    # BUG FIX: `len(x) is 0` compared identity with an int literal (a
    # SyntaxWarning in modern Python and implementation-dependent); use ==.
    if isinstance(self.title, str) and len(self.title) == 0:
        self.title = None
    if isinstance(self.body, str) and len(self.body) == 0:
        self.body = None
    if not self.sketch or kwargs.pop('generate', False):
        if self.body:
            self.html = generate.html(self.body)
            self.raw = generate.raw(self.html)
            if self.raw and kwargs.pop('update_lead', True):
                self.lead = generate.lead(self.raw)
    if self.title and kwargs.pop('update_slug', True):
        self.slug = generate.slug(self.title, uuid=self.uuid)
    if not self.published_at and not self.sketch:
        self.published_at = timezone.now()
    if not self.sketch or kwargs.pop('update_language', False):
        if self.body and len(self.body) > self.LANG_MIN_LEN:
            cleaned = generate.clean(self.body)
            result = cld3.get_language(cleaned)
            logger.debug(
                f'NarrativeTranslation language classification results: {result}'
            )
            if result.is_reliable:
                # Keep only the base code, e.g. 'zh-Hant' -> 'zh'.
                language = result.language.split('-')[0][:5]
                self.language = language
            else:
                # f-prefix removed: these messages contain no placeholders.
                logger.debug(
                    'NarrativeTranslation failed language classification.')
        else:
            logger.debug(
                'NarrativeTranslation skipped language classification.')
    super().save(*args, **kwargs)
def test_get_language(self):
    """cld3.get_language: None on empty input, correct codes otherwise."""
    # Empty or missing text yields no prediction at all.
    for empty_input in ("", None):
        self.assertIsNone(cld3.get_language(empty_input))
    # Short samples in several scripts map to the expected ISO codes.
    self.assertEqual(
        cld3.get_language("影響包含對氣候的變化以及自然資源的枯竭程度").language,  # noqa
        "zh",
    )
    self.assertEqual(cld3.get_language("This is a test").language, "en")
    arabic = cld3.get_language("وفي وقت سابق اليوم السبت قالت الرئاسة المصرية -في بيان- إنها تتطلع لقيام الولايات المتحدة بدور فعال، خاصة في ضوء وصول المفاوضات بين الدول الثلاث لطريق مسدود.")  # noqa
    self.assertEqual(arabic.language, "ar")
    urdu = cld3.get_language("مغلوں کی خام اور سفید و سیاہ میں تصویر کشی دراصل مودی کی دائیں بازو والی بی جے پی حکومت کے اقتدار میں بھارتی مسلمانوں سے روا رکھے جانے سلوک کو درست ٹھہرانے کی کوشش کے سوا کچھ نہیں۔ ")  # noqa
    self.assertEqual(urdu.language, "ur")
def group_tiplines_by_language(tip_line_requests, languages=None):
    """Tag tips with a high-confidence cld3 language and return training rows.

    Keeps tips with 60-1200 chars of real text, detection probability >= 0.95,
    and no URLs or phone numbers. Returns [{'text', 'language', 'source'}, ...]
    limited to *languages*.
    """
    # Avoid a mutable default argument; the previous list literal default
    # was shared across calls.
    if languages is None:
        languages = ['en', 'pt', 'hi', 'mr', 'bn', 'ta', 'te', 'ml']
    for tip in tip_line_requests:
        # Prefer media_text when present and at least as long as media_title.
        tip['text'] = remove_emoji(
            tip['media_text']
            if tip['media_text'] != 'NA'
            and len(tip['media_text']) >= len(tip['media_title'])
            else tip['media_title'])
        lang_data = cld3.get_language(tip['text'])
        if lang_data is not None and lang_data.probability >= 0.95:
            tip['language'] = lang_data.language
    kept = [
        tip for tip in tip_line_requests
        if tip['text'] != 'NA' and not tip['text'].isspace()
        and 'language' in tip and (60 <= len(tip['text']) <= 1200)
        and not contains_url(tip['text'])
        and not contains_phone_number(tip['text'])
    ]
    return [{
        'text': item['text'],
        'language': item['language'],
        'source': SourceName.TIPLINE.value
    } for item in kept if item['language'] in languages]
def get_lid(line, threshold=150):
    """Return (cld2_code, cld3_code) for *line*, mapped through mapping_dict.

    Lines of *threshold* characters or fewer, and any detection or mapping
    failure, yield "ukn".
    """
    # Too short to classify reliably.
    if len(line) <= threshold:
        return "ukn", "ukn"
    # CLD2 — detection can raise on odd input; mapping can miss the code.
    try:
        isReliable, textBytesFound, details = cld2.detect(
            line, isPlainText=True)
        code_cld2 = mapping_dict[details[0][1]]
    except Exception:  # was a bare except:, which also caught SystemExit etc.
        code_cld2 = "ukn"
    # CLD3 — get_language may return None (TypeError on [0]) or map-miss.
    try:
        prediction = cld3.get_language(line)
        code_cld3 = mapping_dict[prediction[0]]
    except Exception:  # was a bare except:
        code_cld3 = "ukn"
    return code_cld2, code_cld3
def detect(self, query):
    """Return cld3's raw language prediction for *query*."""
    return cld3.get_language(query)
import csv
import time

import cld3

# Benchmark cld3 against a labelled CSV: column 0 is the expected language
# code, column 1 the text sample.
with open('lang_detect_test.csv', 'r', encoding='utf-8', newline='') as f:
    rdr = csv.reader(f, delimiter=',')
    point = 0
    lines = 0
    start_time = time.time()
    for line in rdr:
        lines += 1
        lang = line[0]
        detected = cld3.get_language(line[1])
        # cld3 returns None on empty/undetectable text; the original
        # crashed on `.language` in both branches below.
        result = detected.language if detected is not None else None
        if lang == result:
            point += 1
        else:
            print(f'{line[1]}. expected = {lang}, result = {result}.')
    # Guard against an empty file to avoid ZeroDivisionError.
    accuracy = (point / lines) * 100 if lines else 0.0
    print(f'accuracy = {accuracy}% . elapsed={time.time() - start_time}')
] #for i in kcu: #cnt+=1 fsn = "CoreBotTweetsCombinedEN.csv" #dff = pd.DataFrame([["The", " Core Bot UserID", " is:", " "+str(i)]], columns=["tweetid", "tweet_text", "hashtags", "urls"]) #dff[["tweet_text"]] = dff["tweet_text"].apply(translator.translate, dest='en').apply(getattr, args=('text',)) #dff[["tweetid", "tweet_text", "hashtags", "urls"]].to_csv(fsn, mode='a', header=i, index=False) for df_ in dfn: #translators = Translator(to_lang='en', from_lang='ru') df_lst.iloc[0:0] #gs = goslate.Goslate() t0 = time.time() df_lst = df_.loc[df_["userid"].map(lambda x: x is not None)] for z in eng: df_lst["Yes"] = df_["tweet_text"].apply( lambda x: "true" if str(z).lower() in str(x).lower( ) and cld3.get_language(str(x)).language == 'en' else "false") df_lst = df_lst.loc[df_lst["Yes"].map(lambda x: x == "true")] #pdb.set_trace() #print(df_.tweet_text) #print(df_["tweet_text"].apply(lambda x: translators.translate(x))) #df_lst["tweet_text"] = df_lst.tweet_text.apply(lambda x: gs.translate(x, 'en') df_lst.insert(14, "language", "en") df_lst = df_lst.dropna(subset=["tweet_text"]) df_lst[["tweetid", "userid", "tweet_text", "hashtags", "urls", "language"]].to_csv(fsn, mode='a', header=False, index=False) t1 = time.time() print(t1 - t0)