예제 #1
0
def DetectLanguageForScan(filePath, countryLanguage, sample):
    """OCR randomly chosen pages of a scanned PDF until at least `sample`
    characters are collected, then return the most probable language.

    The accepted page is OCR'd twice (once with the expected country
    language, once with English) and the langdetect result with the higher
    probability wins.

    :param filePath: path of the scanned PDF.
    :param countryLanguage: tesseract language code expected for the document.
    :param sample: minimum number of OCR'd characters required before detecting.
    :return: language code string extracted from the winning detection.
    """
    pdfReader = PyPDF2.PdfFileReader(filePath)
    pagesNumber = CountPagesNumber(filePath)
    lan1 = ""
    temporaryFolderPath = "Temporary Folder"
    # Start from a clean scratch folder for the single-page PDFs.
    if os.path.exists(temporaryFolderPath):
        shutil.rmtree(temporaryFolderPath)
    os.makedirs(temporaryFolderPath)

    while len(lan1) < sample:
        randomPage = random.randrange(pagesNumber)
        # A fresh writer per iteration: the old code reused one writer, so
        # pages accumulated and every temp file started with the same first
        # page — the OCR sample below (page [0]) could then never change,
        # looping forever when that page had too little text.
        pdfWriter = PyPDF2.PdfFileWriter()
        pdfWriter.addPage(pdfReader.getPage(randomPage))
        temporaryFilePath = temporaryFolderPath + "/out_" + str(randomPage) + ".pdf"

        with open(temporaryFilePath, "wb") as stream:
            pdfWriter.write(stream)

        lan1 = pytesseract.image_to_string(
            pdf2image.convert_from_path(temporaryFilePath)[0],
            lang=countryLanguage)

        if len(lan1) < sample:
            os.remove(temporaryFilePath)

    # Re-OCR the accepted page with English to compare detection confidence.
    lan2 = pytesseract.image_to_string(
        pdf2image.convert_from_path(temporaryFilePath)[0], lang='eng')
    shutil.rmtree(temporaryFolderPath)
    out_lan1 = langdetect.detect_langs(lan1)[0]
    out_lan2 = langdetect.detect_langs(lan2)[0]
    # NOTE(review): relies on max() ordering langdetect Language objects by
    # probability — confirm Language defines comparison operators.
    lang = re.findall(r"[a-zA-Z]+", str(max(out_lan1, out_lan2)))[0]

    return lang
예제 #2
0
def get_language():
    """Scan train/test comments and record those not detected as English.

    Writes the records, sorted by detection probability (descending), to
    'language_record.json'.  Each record maps a prefixed row index
    ('tr'/'te' + index) to (comment_text, language, probability);
    undetectable comments are recorded with language 'none'.
    """
    from Ref_Data import replace_word
    import json
    from langdetect import detect_langs
    from langdetect.lang_detect_exception import LangDetectException
    train = input.read_dataset('train.csv').fillna(replace_word['unknow'])
    test = input.read_dataset('test.csv').fillna(replace_word['unknow'])

    records = {}

    def scan(dataset, prefix):
        # Shared between train/test: the old code duplicated this loop verbatim.
        for index, row in tqdm(dataset.iterrows()):
            try:
                lang_prob = detect_langs(row['comment_text'])
                language = lang_prob[0].lang
                if language != 'en':
                    records[prefix + str(index)] = (
                        row['comment_text'], language, lang_prob[0].prob)
            except LangDetectException:
                records[prefix + str(index)] = (row['comment_text'], 'none', 0)

    scan(train, 'tr')
    scan(test, 'te')

    # Sort by probability, highest first, before dumping.
    records = sorted(records.items(), key=lambda item: item[1][2], reverse=True)
    with open('language_record.json', 'w') as f:
        f.write(json.dumps(records, indent=4, separators=(',', ': '),
                           ensure_ascii=False))
예제 #3
0
def language_check(dataframe=None):
    """
    Return the indices of rows whose 'lyrics' are not detected as English.

    Iterates over all rows with non-null lyrics; rows whose detected
    languages do not include 'en' (or for which detection raises) are
    collected for removal.

    :param dataframe: pandas DataFrame with a 'lyrics' column; the index
        entries may be plain labels or 2-tuples (e.g. (artist, song)).
    :return: list of index entries to remove.
    """
    
    index_to_remove = []
    
    # Only rows that actually have lyrics are checked.
    progress_bar = tqdm(dataframe[~dataframe['lyrics'].isnull()].index.to_list())

    for index in progress_bar:
        
        if isinstance(index, tuple):
            progress_bar.set_description("Processing %s" % index[0] + ' , ' + index[1])
        else:
            progress_bar.set_description("Processing %s" % index)
            
        
        try:
            if isinstance(index, tuple):
                # Tuple index: drill down one level at a time.
                if 'en' not in [item.lang for item in detect_langs(dataframe['lyrics'].loc[index[0]].loc[index[1]])]:
                    index_to_remove.append(index)
            else:
                if 'en' not in [item.lang for item in detect_langs(dataframe['lyrics'].loc[index])]:
                    index_to_remove.append(index)
                
        except:
            # NOTE(review): bare except — any failure (including langdetect's
            # LangDetectException on undetectable text) marks the row for
            # removal; confirm this catch-all is intended.
            index_to_remove.append(index)
            
    
    return index_to_remove
예제 #4
0
def ConvertFileToText(path, language):
    """Extract the text of a PDF and estimate its language.

    Falls back to OCR when the PDF appears to be a scan (no extractable
    text), and to the slower pdfminer extraction when the detected
    language disagrees with the expected one.

    :param path: path of the PDF file.
    :param language: expected language name.
    :return: (text, scannedFile, languageEstimated) where scannedFile is
        1 when OCR was used and 0 otherwise.
    """
    text = ConvertPdftoText(path)
    pagesNumber = CountPagesNumber(path)
    scannedFile = 0

    # A scanned PDF yields only form-feed characters (one per page) or nothing.
    if text in ["\x0c" * pagesNumber, ""]:
        scannedFile = 1
        text = ConvertScanToText(path, language)

    # str(detect_langs(...)) looks like "[en:0.99...]", so [1:3] is the code.
    languageEstimated = LanguageName(str(langdetect.detect_langs(text))[1:3])

    # If the pdf language is confusing, extract the text with a more precise
    # tool (but less efficient).  Reuse the estimate computed above instead
    # of running detection again: langdetect is randomized, so a second call
    # could even disagree with the first.
    if languageEstimated != language and scannedFile == 0:
        prm = PDFResourceManager()
        iob = io.BytesIO()
        device = TextConverter(prm, iob, codec="utf-8", laparams=LAParams())
        interpreter = PDFPageInterpreter(prm, device)
        with open(path, "rb") as pdf:
            for page in PDFPage.get_pages(pdf, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
        text = iob.getvalue()
        device.close()
        iob.close()

        languageEstimated = LanguageName(str(langdetect.detect_langs(text))[1:3])

    return text, scannedFile, languageEstimated
def auto_detect_text(filename):
    """OCR an image in several languages and pick the best detection.

    Runs tesseract with the English, Spanish, French and Hindi models,
    detects the language of each OCR result and keeps the candidate with
    the highest probability.

    :param filename: path of the image file.
    :return: (text, language_code) for an accepted language, or
        ('XX', 'XX') when nothing usable was detected.
    """
    import platform
    if platform.system() != 'Darwin':
        # Non-mac systems are assumed to have the Windows tesseract install.
        pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

    possible_languages = []
    lang_text_map = {}

    # (tesseract model, ISO code) pairs to try; the old code spelled out the
    # same OCR+detect+record sequence four times.
    for tess_lang, iso_code in (('eng', 'en'), ('spa', 'es'),
                                ('fra', 'fr'), ('hin', 'hi')):
        text = pytesseract.image_to_string(Image.open(filename), lang=tess_lang)
        if text:
            # str(detect_langs(...)[0]) looks like "en:0.99" -> [code, prob].
            possible_languages.append(str(detect_langs(text)[0]).split(":"))
            lang_text_map[iso_code] = text

    if possible_languages:
        # Compare probabilities numerically: the old max(key=li[1]) compared
        # the probability *strings* lexicographically.
        res = max(possible_languages, key=lambda li: float(li[1]))
        if res[0] not in ACCEPTED_LANGUAGES:
            return 'XX', 'XX'
        else:
            # NOTE(review): assumes the detected code is one of the four
            # OCR'd ones — any other detection raises KeyError here; confirm
            # ACCEPTED_LANGUAGES is a subset of {'en','es','fr','hi'}.
            return lang_text_map[res[0]], res[0]

    else:
        return 'XX', 'XX'
def which_lan(cap_clean, rownum):
    """Detect the top language of caption `rownum`.

    :param cap_clean: sequence/Series of cleaned caption strings.
    :param rownum: index of the caption to inspect.
    :return: one-row, two-column DataFrame [language, probability], or
        None when detection (or indexing) fails.
    """
    try:
        # Detect once and split once: the old code called detect_langs twice,
        # which is wasteful and — because detection is randomized — could even
        # pair a language from one run with a probability from another.
        top = str(detect_langs(cap_clean[rownum])[0]).split(':')
        lang_temp = Series(top[0])
        lang_prob_temp = Series(top[1])
    except Exception:
        return None
    print(rownum)  # progress marker (was a Python 2 print statement)
    res = concat([lang_temp, lang_prob_temp], axis=1)
    return res
예제 #7
0
 def judge_pure_english(self, text):
     """Return the detected language code when its probability exceeds
     0.90, otherwise None.

     Byte strings that fail detection are retried decoded as UTF-8
     (Python 2 fallback).
     """
     try:
         top = detect_langs(text)[0]
     except UnicodeDecodeError:
         top = detect_langs(text.decode("utf-8"))[0]
     if top.prob > 0.90:
         return top.lang
     return None
예제 #8
0
def data_from_CSV():
    """Ingest tweets from the CSV named on the command line.

    Hindi tweets are translated to English with IBM Watson; English tweets
    are cleaned, scanned for phone numbers and hashtags, classified, and
    stored in the database when a request type is recognised.
    """
    conn = establish_DB_connection_SQL()
    df = pd.read_csv(sys.argv[1], keep_default_na=False)
    tweets = df.set_index('status_id', drop=False)
    for index, row in tweets.iterrows():
        phone_no = ''
        hashtag = []
        text = str(row['text'])

        # str(detect_langs(...)[0]) looks like "hi:0.99"; [0:2] is the code.
        if str(detect_langs(text)[0])[0:2] == 'hi' and text != 'nan':
            text1 = text
            # SECURITY NOTE(review): hard-coded IBM Cloud API key and service
            # URL below — these credentials should live in configuration, not
            # source code.
            authenticator = IAMAuthenticator(
                'worUzb_Eb5emCaIs0oL7sR86Fb2LeTJGOk1EN1Q-4Cni')
            language_translator = LanguageTranslatorV3(
                version='2018-05-01', authenticator=authenticator)

            language_translator.set_service_url(
                'https://api.eu-gb.language-translator.watson.cloud.ibm.com/instances/cdafbc7e-b59a-40f8-818f-1914f02063cc'
            )
            # Translate Hindi -> English and continue with the translation.
            translation = language_translator.translate(
                text=text1, model_id='hi-en').get_result()
            output = json.loads(
                json.dumps(translation, indent=2, ensure_ascii=False))
            text = output['translations'][0]['translation']

        if str(detect_langs(text)[0])[0:2] == 'en' and text != 'nan':
            text = data_cleaning(text)
            # Deduplicate words before scanning them.
            words = text.split()
            words = set(words)
            words = list(words)
            for w in words:
                # Indian phone number, optionally prefixed with +91/91/0.
                if re.match(
                        '^((\+){0,1}91(\s){0,1}(\-){0,1}(\s){0,1}){0,1}0{0,1}[1-9]{1}[0-9]{9}$',
                        w):
                    phone_no = w[-10:]  # keep the 10 significant digits
                if w.startswith('#'):
                    hashtag.append(w)
            request_type = processML(text)

            if request_type != '':
                name = str(row['screen_name'])
                if name == '': name = None
                if text == '': text = None
                # Placeholder phone number / fixed coordinates when unknown.
                if phone_no == '': phone_no = 9900990099
                latitude = 9.9252
                longitude = 78.1198
                updateDB_SQL(row['screen_name'], text, latitude, longitude,
                             phone_no, request_type, conn)
    disconnect_DB_SQL()
예제 #9
0
def _detect_subtitle_language(srt_path):
    """Detect the language of an srt file from its first five entries.

    Returns a babelfish Language when detection is confident enough,
    otherwise None.
    """
    log.debug('Detecting subtitle language')

    # Load srt file (try first iso-8859-1 with fallback to utf-8)
    try:
        subtitle = pysrt.open(path=srt_path, encoding='iso-8859-1')
    except Exception:
        try:
            subtitle = pysrt.open(path=srt_path, encoding='utf-8')
        except Exception:
            # Unreadable file: nothing to detect
            return None

    # Need at least 5 entries for a meaningful sample
    if len(subtitle) >= 5:
        text = ''.join(sub.text for sub in subtitle[0:5])

        # Keep the most probable language if it clears the configured minimum
        detected_languages = langdetect.detect_langs(text)
        log.debug('Detected subtitle language(s): %s', detected_languages)
        if detected_languages:
            # List is sorted by probability, highest first
            best = detected_languages[0]
            if best.prob >= autosubliminal.DETECTEDLANGUAGEPROBABILITY:
                log.debug('Probability of detected subtitle language accepted: %s', best)
                return Language.fromietf(best.lang)
            log.debug('Probability of detected subtitle language too low: %s', best)

    return None
    def return_data(self, **kwargs) -> dict:
        """Map each language detected in kwargs['text'] to its probability."""
        detections = detect_langs(kwargs['text'])
        return {item.lang: item.prob for item in detections}
예제 #11
0
def clean_data(inputFile, cutoff=0.95):
    """Drop rows that are English or mislabeled, according to langdetect.

    Empty rows are dropped first.  For every remaining row the language is
    detected; the row is scheduled for removal when it is detected as
    English above `cutoff` confidence, or when the detected language
    disagrees with the row's label above `cutoff` confidence.  Progress and
    an estimated time left are printed every 1000 rows, and summary
    statistics are printed on completion.

    :param inputFile: path of a CSV file with 'text' and 'label' columns.
    :param cutoff: minimum detection confidence required to drop a row.
    :return: the cleaned DataFrame.
    """
    ISOcodes = {'sk': 0, 'fr': 1, 'es': 2, 'de': 3, 'pl': 4}

    df = pd.read_csv(inputFile, encoding="utf8")
    df['text'].replace('', np.nan, inplace=True)
    df.dropna(subset=['text'], inplace=True)
    total = len(df)
    englishCount, misclassifiedCount, count = 0, 0, 0
    hitList = []
    startTime = time()
    for line in df.iterrows():
        label = line[1]["label"]
        text = line[1]["text"]
        try:
            detectedLanguage = detect_langs(text)
            # str(...) looks like "en:0.99"; split into [code, probability].
            language = str(detectedLanguage[0]).split(":")
            if language[0] == 'en':
                if float(language[1]) > cutoff:
                    englishCount += 1
                    hitList.append(count)
            elif label != ISOcodes[language[0]]:
                if float(language[1]) > cutoff:
                    misclassifiedCount += 1
                    hitList.append(count)
        except Exception:
            # Undetectable text (or a detected language missing from
            # ISOcodes) is simply kept.
            pass

        count += 1
        if count % 1000 == 0:
            percentComplete = count * 100 / total
            now = time()
            # Estimated minutes remaining, assuming a constant row rate.
            timeLeft = (1 - count / total) * (
                (now - startTime) / 60) / (count / total)
            # Split into whole minutes and seconds.  The old string-based
            # split mis-read the fraction: e.g. 2.5 minutes printed as 2:03
            # instead of 2:30.
            minutes, seconds = divmod(int(timeLeft * 60), 60)
            print("Percent Complete: {}%".format(round(percentComplete, 2)))
            print("Time Left: {}:{:02d}".format(minutes, seconds))
    df.drop(df.index[hitList], inplace=True)

    now = time()
    print("Number of English examples removed: {}".format(englishCount))
    print("Number of misclassified examples removed: {}".format(
        misclassifiedCount))
    print("Number of rows originally in dataframe: {}".format(total))
    print("Percent of training examples classified as English: {}%".format(
        round(englishCount * 100 / total, 2)))
    print("Percent of training examples classified as incorrect: {}%".format(
        round(misclassifiedCount * 100 / total, 2)))
    print("New dataframe length: {}".format(len(df)))
    print("Actual time taken in minutes: {}".format((now - startTime) / 60))

    return df
예제 #12
0
def _detect_message_language(message):
    """Return the full language name of `message`, defaulting to English
    unless the top detection's probability exceeds 0.99."""
    best = detect_langs(message)[0]
    code = best.lang if best.prob > 0.99 else 'en'
    return languages.get(alpha_2=code).name
예제 #13
0
def assert_language(text, expected_language):
    """Detect the language of `text` and compare it with `expected_language`.

    Falls back to `expected_language` when no confident decision is
    possible (no detection, low probability, unknown language, or text
    too short); a confident mismatch is logged but the detected language
    is still returned.
    """
    mapping = {'cs': 'cze', 'en': 'eng'}
    langs = [MatchedLang(mapping.get(detected.lang), detected.prob)
             for detected in detect_langs(text)]

    if not langs or langs[0].prob < 0.50:
        # unable to decide
        return expected_language

    top = langs[0]
    if top.lang != expected_language:
        if top.lang is None:
            # unknown language detected
            return expected_language
        if len(text) < 15:
            # text too short to say
            return expected_language
        # detected but different: keep the detection, but log the mismatch
        log.warning(
            'Warning: error: language does not match. Expected %s, has %s, langs %s, value %s',
            expected_language, top.lang, langs, text)
    # return the detected language
    return top.lang
예제 #14
0
def isLang(post: str, targetLang: str) -> bool:
    """Return True when `targetLang` appears in the top detection of `post`.

    The top detection renders as e.g. "en:0.99", so `targetLang` may be a
    bare code ("en") or a code:probability prefix.  Detection failures
    (empty or symbol-only text) yield False.
    """
    try:
        lang = str(detect_langs(post)[0])
    except Exception:
        # Was a bare except: still broad, but no longer swallows
        # KeyboardInterrupt / SystemExit.
        return False
    return targetLang in lang
예제 #15
0
def find_language(text):
    """Render the detected languages of `text` as HTML.

    :return: an HTML string with one "<p>CODE: prob</p>" entry per
        detected language, wrapped in an outer "<p>...</p>".
    """
    language_list = list(detect_langs(text))

    language_text = "<p>"

    for lang in language_list:
        # str(lang) looks like "en:0.99" -> code and probability.
        l, p = str(lang).split(":")
        language_text += f"<p>{l.upper()}: {round(float(p),2)}</p>"

    # (Removed: a dead triple-quoted block of commented-out EN/FR logic and
    # the results_list it consumed — neither affected the output.)
    language_text += "</p>"

    return language_text
예제 #16
0
def detect_songs_language(song_lyrics):
    """
    Takes the lyrics of a song and returns
    the languages that it has and the probabilities
    in a list of tuples
    Args:
        song_lyrics: str
    returns:
        lang_probs = list of tuples (lang, probability)
    """
    try:
        detections = langdetect.detect_langs(song_lyrics)
        return [get_lang_probability(str(item)) for item in detections]
    except Exception as e:
        print(e)
        # if error return no english language
        # to delete that particular song
        return [("en", 0.9)]
예제 #17
0
    def languages_with_examples(self):
        """Map non-English language names to one example post URL.

        Scans the title/summary of every altmetric post; a language counts
        when the text has more than 7 words and the detection probability
        exceeds 0.90.  Later posts overwrite earlier ones — one example
        per language is enough.

        :return: dict of {language name: post url}.
        """
        resp = {}

        try:
            # .items(): the old .iteritems() call is Python 2 only — under
            # Python 3 it raised AttributeError, which the broad handler
            # below silently swallowed, so this method always returned {}.
            for (source, posts) in self.altmetric_api_raw["posts"].items():
                for post in posts:
                    for key in ["title", "summary"]:
                        try:
                            num_words_in_post = len(post[key].split(" "))
                            top_detection = langdetect.detect_langs(post[key])[0]
                            if (num_words_in_post > 7) and (top_detection.prob > 0.90):

                                if top_detection.lang != "en":
                                    language_name = get_language_from_abbreviation(top_detection.lang)

                                    # overwrites.  that's ok, we just want one example
                                    resp[language_name] = post["url"]

                        except langdetect.lang_detect_exception.LangDetectException:
                            pass

        except (KeyError, AttributeError, TypeError):
            pass

        return resp
예제 #18
0
 def identify(
         self, text,
         constrain_to_discussion_locales=SECURE_IDENTIFICATION_LIMIT):
     """Try to identify the locale of text; boost expected discussion locales.

     :param text: text to identify.
     :param constrain_to_discussion_locales: length threshold under which
         only languages compatible with the discussion locales are trusted.
     :return: (top_locale, {lang: prob}) in the general case.
     """
     if not text:
         return Locale.UNDEFINED, {Locale.UNDEFINED: 1}
     len_nourl = self.strlen_nourl(text)
     if len_nourl < 5:
         # NOTE(review): unlike the other returns this is a single value,
         # not a (locale, probs) tuple — callers must handle both shapes.
         return Locale.NON_LINGUISTIC
     expected_locales = set((
         Locale.extract_root_locale(l)
         for l in self.discussion.discussion_locales))
     language_data = detect_langs(text)
     if constrain_to_discussion_locales and (
             len_nourl < constrain_to_discussion_locales):
         # Short text: only trust languages compatible with the discussion.
         data = [(x.prob, x.lang)
                 for x in language_data
                 if Locale.any_compatible(
                     Locale.extract_root_locale(x.lang),
                     expected_locales)]
     else:
         # boost with discussion locales.
         # Fixed: was Locale.Locale.extract_root_locale — Locale has no
         # attribute Locale (the two calls above use Locale directly), so
         # this branch raised AttributeError.
         data = [
             (x.prob * (
                 5 if Locale.extract_root_locale(x.lang)
                 in expected_locales else 1
             ), x.lang) for x in language_data]
     data.sort(reverse=True)
     top = data[0][1] if (data and (data[0][0] > 0.5)
                          ) else Locale.UNDEFINED
     return top, {lang: prob for (prob, lang) in data}
예제 #19
0
def ocr_core(filename):
    """OCR an image, rotating it in 90-degree steps until English is
    recognised with high confidence.

    :param filename: path of the image file.
    :return: the OCR'd text of the last orientation tried.
    """
    im = Image.open(filename)
    # Simple contrast boost: brighten every pixel above the threshold.
    im = im.point(lambda p: p > 75 and p + 100)
    text = pytesseract.image_to_string(im)
    # Detect once and use the structured result — the old code ran
    # detect_langs twice and parsed its string form by slicing.
    detection = detect_langs(text)[0]
    lang = detection.lang
    confidence = detection.prob
    if lang == 'en':
        attempts = 0
        # After four 90-degree rotations the image is back where it started,
        # so stop there: the old loop could spin forever on an image whose
        # confidence never reached 0.99 in any orientation.
        while confidence < 0.99 and attempts < 4:
            im = im.rotate(-90)
            text = pytesseract.image_to_string(im)
            confidence = detect_langs(text)[0].prob
            print('Processing ...')
            attempts += 1
    else:
        print('No English Detected')

    return text
예제 #20
0
def data_from_CSV():
    """Ingest tweets from the CSV named on the command line.

    English tweets are cleaned, scanned for phone numbers and hashtags,
    classified, and stored in the database when a request type is
    recognised.
    """
    conn=establish_DB_connection_SQL()
    df=pd.read_csv(sys.argv[1], keep_default_na=False)
    tweets = df.set_index('status_id',drop=False)
    for index,row in tweets.iterrows():
        phone_no=''
        hashtag=[]
        text=str(row['text'])
        # str(detect_langs(...)[0]) looks like "en:0.99"; [0:2] is the code.
        if str(detect_langs(text)[0])[0:2] == 'en' and text!='nan':
            text = data_cleaning(text)
            # Deduplicate words before scanning them.
            words=text.split()
            words=set(words)
            words=list(words)
            for w in words:
                # Indian phone number, optionally prefixed with +91/91/0.
                if re.match('^((\+){0,1}91(\s){0,1}(\-){0,1}(\s){0,1}){0,1}0{0,1}[1-9]{1}[0-9]{9}$',w):
                    phone_no = w[-10:]  # keep the 10 significant digits
                if w.startswith('#'):
                    hashtag.append(w)
            request_type = processML(text)
            
            if request_type != '':
                name=str(row['screen_name'])
                if name == '': name = None
                if text == '': text = None
                # Placeholder phone number / fixed coordinates when unknown.
                if phone_no == '': phone_no = 9900990099
                latitude = 9.9252
                longitude = 78.1198
                updateDB_SQL(row['screen_name'],str(row['text']),latitude,longitude,phone_no,request_type, conn)
    disconnect_DB_SQL()
예제 #21
0
    def check_language(self, msg, target=None):
        """Check the language of the message.

        Stores the confidently-detected languages in the message metadata
        and reports whether any of them falls outside the configured ok
        list.

        :return: True if the message language is unwanted, False otherwise.
        """
        min_prob = self["textcat_acceptable_prob"]
        results = langdetect.detect_langs(msg.text)
        self.ctxt.log.debug("TextCat results: %s", results)
        langs = []
        for result in results:
            if result.prob > min_prob:
                langs.append(result.lang)
        if len(langs) > self["textcat_max_languages"]:
            self.ctxt.log.debug("Too many languages.")
            return False
        msg.plugin_tags["LANGUAGES"] = " ".join(langs)
        ok_languages = self["ok_languages"]
        if "all" in ok_languages:
            # Everything is acceptable.
            return False
        return any(lang not in ok_languages for lang in langs)
예제 #22
0
def is_lang(text, lang, prob): # print(is_lang("Smth", "ru", 0.75))
    """True when the top detected language of `text` is `lang` with a
    probability above `prob`; detection errors report False."""
    try:
        detections = detect_langs(text)
    except Exception:
        print("error in detecting language for text: \"" + text + "\"")
        return False
    if not detections:
        return False
    top = detections[0]
    return top.lang == lang and top.prob > prob
예제 #23
0
 def detect(self, strict=True):
     """Run language detection TIMES times and aggregate the probabilities.

     Returns the single best language when `strict`, otherwise every
     detected language sorted by aggregated probability; None on any
     error (logged with traceback).
     """
     try:
         summary = {}
         for _ in range(0, TIMES):
             for res in detect_langs(self.text):
                 # accumulate the probability mass per language
                 summary[res.lang] = summary.get(res.lang, 0.0) + float(res.prob)
         languages = sorted(summary, key=summary.get, reverse=True)
         language = languages[0]
         logger.info("language detection: lang = {} ; summary = {}".format(
             language, summary))
         if strict:
             return language
         return languages
     except:
         logger.info(
             "failed when detecting language for text: {}\nError: {}".
             format(self.text, traceback.format_exc()))
         return None
예제 #24
0
    def CheckLanguage(self, text):
        """Return the first detected language with probability above 0.50,
        or None when no detection is that confident.

        Retries with the text decoded as UTF-8 when detection raises
        UnicodeDecodeError (Python 2 byte strings).
        """
        # identifier.set_languages(DETECT_LANGUAGES)
        try:
            langs = langdetect.detect_langs(text)
        except UnicodeDecodeError:
            langs = langdetect.detect_langs(text.decode("utf-8"))

        # detect_langs already returns candidates ordered by probability,
        # highest first; the old code built a sorted copy it never used.
        for lang in langs:
            if lang.prob > 0.50:
                return lang.lang

        return None
예제 #25
0
    def languages_with_examples(self):
        """Map non-English language names to one example post URL.

        Scans the title/summary of every altmetric post; a language counts
        when the text has more than 7 words and the detection probability
        exceeds 0.90.  Later posts overwrite earlier ones — one example
        per language is enough.

        :return: dict of {language name: post url}.
        """
        resp = {}

        try:
            # .items(): the old .iteritems() call is Python 2 only — under
            # Python 3 it raised AttributeError, which the broad handler
            # below silently swallowed, so this method always returned {}.
            for (source, posts) in self.altmetric_api_raw["posts"].items():
                for post in posts:
                    for key in ["title", "summary"]:
                        try:
                            num_words_in_post = len(post[key].split(" "))
                            top_detection = langdetect.detect_langs(
                                post[key])[0]
                            if (num_words_in_post > 7) and (top_detection.prob
                                                            > 0.90):

                                if top_detection.lang != "en":
                                    language_name = get_language_from_abbreviation(
                                        top_detection.lang)

                                    # overwrites.  that's ok, we just want one example
                                    resp[language_name] = post["url"]

                        except langdetect.lang_detect_exception.LangDetectException:
                            pass

        except (KeyError, AttributeError, TypeError):
            pass

        return resp
def detect_languages(post_text):
    """Detect the dominant Swiss national language of `post_text`.

    :return: (lang, de, fr, it, en) where lang is 'de'/'fr'/'it'/'en' or
        'unclassified', followed by the four raw probabilities.
    """
    try:
        lang_dict = {}
        for detection in detect_langs(post_text):
            code, prob = detection.__repr__().split(':')
            lang_dict[code] = float(prob)
    except:
        lang_dict = dict()

    # Romansh (Rumantsch) cannot be detected by langdetect
    de = lang_dict.get('de', 0)
    fr = lang_dict.get('fr', 0)
    it = lang_dict.get('it', 0)
    en = lang_dict.get('en', 0)
    # Pick the highest-scoring of the four; ties keep the earlier entry.
    lang = 'unclassified'
    best = 0
    for code, prob in (('de', de), ('fr', fr), ('it', it), ('en', en)):
        if prob > best:
            lang = code
            best = prob
    return lang, de, fr, it, en
예제 #27
0
def guess(string):
    """Guess the language of `string`.

    Returns 'UNKNOWN' when the text is too short (< 25 characters) for a
    reliable guess, or when the top detection is below 0.75 probability.
    """
    if len(string) < 25:
        # we cannot guess accurately on short strings
        return 'UNKNOWN'

    top = langdetect.detect_langs(string)[0]
    if top.prob >= 0.75:
        return top.lang
    return 'UNKNOWN'
예제 #28
0
파일: lang.py 프로젝트: Youngboom/clerk
def find_out_language(candidate_languages, *args):
    """Find the most frequently detected candidate language across samples.

    Each text sample votes via guess_language and langdetect; only votes
    that appear in `candidate_languages` count.

    :param candidate_languages: container of acceptable language codes.
    :param args: text samples to analyse.
    :return: the winning language code, or None when nothing matched.
    """
    candidates = []
    for sample in args:
        candidate = guess_language(sample)
        if candidate != UNKNOWN_LANGUAGE and candidate in candidate_languages:
            candidates.append(candidate)
        try:
            for candidate in detect_langs(sample):
                if candidate.lang in candidate_languages:
                    candidates.append(candidate.lang)
        except LangDetectException:
            continue

    if len(candidates) == 0:
        return None
    # Majority vote.  The original tally loop iterated candidates[1:0] — an
    # always-empty slice — and clobbered its own accumulator variable, so
    # votes beyond the first were never counted; ties still favour the
    # earliest vote, matching the old de-facto behaviour.
    leading = max(candidates, key=candidates.count)
    if leading == UNKNOWN_LANGUAGE:
        return None
    return leading
예제 #29
0
def tweeter_user_lang_detect(user: str,
                             limit: int = 10,
                             csv_path: str = pj(TWEETS_DIR, TWEETS_FILENAME),
                             delete_csv: bool = True,
                             scrap: bool = True) -> str:
    """Detect the dominant language of a Twitter user's recent tweets.

    Optionally scrapes up to `limit` tweets first, then detects languages
    per preprocessed tweet and returns the language with the highest mean
    probability.

    NOTE(review): the failure path returns the tuple (0, 'bug') although
    the annotation says -> str — callers must handle both shapes.
    """
    try:
        if scrap:
            scrap_tweets(user=user, limit=limit, group='')
        tweets = pd.read_csv(csv_path, header=0)
    except:
        # Any scraping/reading failure: record the user and bail out.
        with open('failed.txt', 'a') as myfile:
            print('failed')
            myfile.write(user + '\n')
        return (0, 'bug')

    # language code -> list of per-tweet probabilities
    lang_probs = defaultdict(list)

    for tweetLang in chain(
            *[detect_langs(i) for i in preprocess_tweet(tweets.tweet)]):
        lang_probs[tweetLang.lang].append(tweetLang.prob)

    if delete_csv:
        # Clean up the scraped artifacts.
        os.remove(csv_path)
        os.remove(csv_path.replace('tweets.csv', 'users.csv'))

    # Language with the highest mean probability across tweets.
    return max((np.mean(v), k) for k, v in lang_probs.items())[1]
예제 #30
0
def which_language(anchor):
    """Return the 2-letter code of the most probable language of `anchor`,
    or 'en' when estimation is impossible (numbers, acronyms, etc.)."""
    try:
        best = detect_langs(anchor)[0]
    except:  # undetectable input: assume English
        return 'en'
    return str(best)[:2]
예제 #31
0
def detect_all():
    """Detect the language of every .txt mail in store_dir (Python 2 code).

    Each file's encoding is guessed with chardet's UniversalDetector, the
    decoded text is passed to langdetect, and per-language file counts are
    accumulated.

    :return: dict mapping language code (or 'error') to file count.
    """
    detector = UniversalDetector()
    results = []
    for file in os.listdir(store_dir):
        if file.endswith(".txt"):
            with open(store_dir + '/' + file, 'r') as myfile:
                data = ''
                for line in myfile.readlines():
                    data += line
                    detector.feed(line)
                    #if detector.done: break
                detector.close()
            # If the file's encoding could be detected
            if detector.result['encoding']:
                print(detect_langs(data.decode(detector.result['encoding'])))
                dlang = detect(data.decode(detector.result['encoding']))
                if dlang == 'fr':
                    # Dump French mails for manual inspection.
                    print data
                results.append({'mail': file, 'lang': dlang})
            else:
                results.append({'mail': file, 'lang': 'error'})

    # Aggregate per-language counts.
    stats = {}
    for result in results:
        lang = result['lang']
        if lang in stats:
            stats[lang] = stats[lang] + 1
        else:
            stats[lang] = 1
        if lang != u'en':
            # Print every non-English mail for manual inspection.
            print str(lang) + ' - ' + str(result['mail'])
    return stats
예제 #32
0
def receive_feedback(bot, update):
    """Telegram handler: validate and forward a user's feedback message.

    Feedback must be detected as English or Chinese; otherwise the user is
    asked to retry (conversation state 0).  Accepted feedback is emailed
    to the developer (when is_email_feedback) or logged, and the
    conversation ends.
    """
    feedback_msg = update.message.text
    valid_lang = False
    # Fixed seed makes langdetect deterministic for the same input.
    langdetect.DetectorFactory.seed = 0
    langs = langdetect.detect_langs(feedback_msg)

    for lang in langs:
        if lang.lang in ("en", "zh-tw", "zh-cn"):
            valid_lang = True
            break

    if not valid_lang:
        update.message.reply_text(_("The feedback you sent is not in English or Chinese. Please try again."))
        return 0

    install_lang(update.message.from_user.id)
    update.message.reply_text(_("Thank you for your feedback, I will let my developer know."))

    if is_email_feedback:
        # Send the feedback to the developer via SMTP (STARTTLS).
        server = smtplib.SMTP(smtp_host)
        server.ehlo()
        server.starttls()
        server.login(dev_email, dev_email_pw)

        text = "Feedback received from %d\n\n%s" % (update.message.from_user.id, update.message.text)
        message = "Subject: %s\n\n%s" % ("Telegram Big Two Bot Feedback", text)
        server.sendmail(dev_email, dev_email, message)
    else:
        logger.info("Feedback received from %d: %s" % (update.message.from_user.id, update.message.text))

    return ConversationHandler.END
예제 #33
0
def fun_loc(arg):
    """Classify a tweet dict by location and language.

    :param arg: tweet payload with 'location', 'place', 'text' and
        'display_text_range' fields.
    :return: (location_ok, needs_translation) — (False, None) when the
        location is undesirable or the tweet text is undetectable.
    """
    r2 = False
    loc = arg['location']
    pl = arg['place']
    # Desirable location: matches the configured pattern or mentions India
    # (also in Hindi script) in the place field.
    if re.search(string_loc, loc, re.IGNORECASE) != None or re.search(
            r'(\W|\b)India(\W|\b)|हिन्दुस्तान|भारत', pl,
            re.IGNORECASE) != None:
        #if location is desirable then check if it needs translation

        # Restrict to the displayable text range when provided.
        a = 0
        b = len(arg['text']) - 1
        if arg['display_text_range'] != None:
            a = arg['display_text_range'][0]
            b = arg['display_text_range'][1]

        try:
            d = detect_langs(
                arg['text'][a:b])  # list of possible languages
        except:
            return False, None  # detect_langs() raising means the tweet is useless: it contains only smileys, only a link (a shared tweet), or an unintelligible language — so treat it as undesirable even though the location matched

        for el in d:  # does any detected language match a desired one?
            if el.lang in tr_lang:
                r2 = True
                break
        return True, r2
    return False, None
예제 #34
0
def update_languages():
    """Detect languages for every hashtag name and write them back to the sheet."""
    response = gspread.get_sheet_values(SHEET_ID_INSTAGRAM, "hashtag",
                                        "FORMULA")
    label_list, hashtag_list = gspread.convert_to_dict_data(response)

    for idx, entry in enumerate(hashtag_list):
        tag_name = entry['name']
        print(tag_name)

        try:
            candidates = detect_langs(tag_name)
            detected = [candidate.lang for candidate in candidates]
            print(detected)
        except Exception as err:
            # Undetectable name (symbols only, etc.): skip this hashtag.
            print(err)
            continue

        entry['languages'] = ','.join(detected)
        hashtag_list[idx] = entry

    body = {
        'values': gspread.convert_to_sheet_values(label_list, hashtag_list)
    }
    gspread.update_sheet_values(SHEET_ID_INSTAGRAM, 'hashtag', body)
    print("SUCCESS!! update_languages")
예제 #35
0
    def check_language(self, msg, target=None):
        """Check the language of the message.

        Record the confidently-detected languages in the message
        metadata and trigger the rule when some of them are not in the
        configured ok list.

        :return True if the message language is unwanted and False
        otherwise
        """
        threshold = self["textcat_acceptable_prob"]
        detected = langdetect.detect_langs(msg.text)
        self.ctxt.log.debug("TextCat results: %s", detected)
        confident = [item.lang for item in detected if item.prob > threshold]
        if len(confident) > self["textcat_max_languages"]:
            self.ctxt.log.debug("Too many languages.")
            return False
        msg.plugin_tags["LANGUAGES"] = " ".join(confident)
        allowed = self["ok_languages"]
        if "all" in allowed:
            # Everything is acceptable.
            return False
        # Unwanted as soon as one confident language falls outside the list.
        return any(code not in allowed for code in confident)
예제 #36
0
def lan_prob(text):
    """Return langdetect's candidate languages for *text*.

    :param text: string to analyse.
    :return: list of ``Language`` candidates (code + probability), or the
        string 'N/A' when the text holds no detectable features (empty,
        symbols only, ...).
    """
    from langdetect import detect_langs
    from langdetect.lang_detect_exception import LangDetectException
    try:
        return detect_langs(text)
    except LangDetectException:
        # Only swallow detection failures; real programming errors
        # (e.g. passing a non-string) still surface instead of being
        # hidden by the previous bare except.
        return 'N/A'
예제 #37
0
def get_language(text):
    """Classify the language of text.

    Uses Google's language detection algorithm to assign a language to
    a text string.

    Parameters
    ----------
    text :str
        The text to classify.

    Returns
    -------
    str
        The most likely language of the text that meets the
        :const:`~PROBABILTY` threshold, among the list of
        supported :const:`~LANGUAGES`. If no supported language meets
        the threshold, returns the value 'none'.

    Notes
    -----
    When associated with a field called "language", the string "none"
    tells MongoDB's text index to use simple tokenization with no list
    of stop words and no stemming. See http://docs.mongodb.org/manual/reference/text-search-languages/
    for more info.

    """
    # First supported candidate above the threshold wins; 'none' otherwise.
    return next(
        (candidate.lang for candidate in detect_langs(text)
         if candidate.lang in LANGUAGES and candidate.prob >= PROBABILTY),
        'none')
예제 #38
0
def get_html_response_language(response):
    """Guess the language of an HTML response body.

    Returns a ``(primary, secondary)`` pair of language codes; with a
    single candidate the second element repeats the first.  On any
    failure (empty or undetectable body) returns the string "unknown".
    NOTE(review): Python 2 syntax (``except Exception,e`` / print
    statement) — keep on a py2 runtime.
    """
    try:
        raw_body = strip_tags(response.body_as_unicode())
        langs = detect_langs(raw_body)
        # -len(langs)+1 resolves to index 1 (the runner-up) when two or
        # more candidates exist, and to index 0 for a single candidate.
        return (langs[0].lang, langs[-len(langs)+1].lang)
        # return detect(raw_body)
    except Exception,e:
        print str(e)
        return "unknown"
def word_processing(filename):
    """Detect the language(s) of a .docx document's paragraph text."""
    document = docx.Document(filename)
    combined = '\n'.join(paragraph.text for paragraph in document.paragraphs)
    return detect_langs(combined)
예제 #40
0
def get_word_chords_in_lang(s):
    """Return the chord alphabet matching the detected language of *s*.

    Falls back to the Russian chord set (with a warning on stderr) when
    the best guess is neither English nor Russian.
    """
    langs = detect_langs(s)
    if not langs:
        raise Exception("Can't guess lang (langdetect)")
    # detect_langs yields Language objects; compare their .lang code —
    # the previous ``langs[0] == 'en'`` compared the object itself to a
    # string and could never match, so the fallback always fired.
    best = langs[0].lang
    if best == 'en':
        return CHORDS_EN + " "
    if best == 'ru':
        return CHORDS_RU + " "
    print("Lang guess return " + str(langs) + " (but we want 'ru' or 'en' as first)", file=sys.stderr)
    return CHORDS_RU + " "
예제 #41
0
 def detectLang(a):
     """Return the probability (0..1) that string *a* is English, else 0."""
     # detect_langs candidates stringify as "code:prob"; pick the
     # English entry's probability if present.
     for candidate in detect_langs(a):
         code, prob = str(candidate).split(":")
         if code == "en":
             return float(prob)
     return 0
예제 #42
0
파일: training.py 프로젝트: chfoo/tellnext
def is_english(text):
    """Return True when *text* is written in English.

    Non-roman text and text langdetect cannot classify are rejected
    outright.
    """
    if not only_roman_chars(text):
        return False

    try:
        candidates = langdetect.detect_langs(text)
    except LangDetectException:
        return False

    # Explicit bool: the original fell through and returned None when
    # English was not among the candidates.  Also avoids shadowing the
    # list with its own loop variable.
    return any(candidate.lang == 'en' for candidate in candidates)
예제 #43
0
    def discover_language(self, *sentences):
        """Pick the most probable language of the joined sentences.

        Sticks with the preferred language unless another candidate
        beats it by at least 0.15 probability.
        """
        joined = '. '.join(sentences)
        probabilities = {}
        for guess in detect_langs(joined):
            probabilities[guess.lang] = guess.prob

        best = max(probabilities, key=probabilities.get)

        preferred_prob = probabilities.get(self.preferred_language) or 0
        if probabilities.get(best) - preferred_prob < 0.15:
            return self.preferred_language

        return best
def ppt_processing(filename):
    """Detect the language(s) of every text run in a .pptx presentation."""
    presentation = Presentation(filename)
    runs = [
        run.text
        for slide in presentation.slides
        for shape in slide.shapes
        if shape.has_text_frame
        for paragraph in shape.text_frame.paragraphs
        for run in paragraph.runs
    ]
    return detect_langs('\n'.join(runs))
예제 #45
0
  def is_valid(self, whitelist):
    """Decide whether this page should be kept.

    Rejects entries with an empty title or text, very short text,
    whitelisted titles, and pages whose title is not detected as
    English.
    """
    if not self.title.strip():
      logger.info("has invalid title: " + self.url)
      return False

    if not self.text.strip():
      logger.info("has invalid text: " + self.url)
      return False

    if len(self.text.split(" ")) < 40:
      logger.info("text is too short: " + self.url)
      return False

    # NOTE: whitelisted titles are treated as invalid here — presumably
    # "already seen / handled elsewhere"; confirm against the caller.
    if self.normalized_title in whitelist:
      logger.info("is whitelisted: " + self.url)
      return False

    candidates = detect_langs(self.title) + detect_langs(self.normalized_title)
    if "en" not in [candidate.lang for candidate in candidates]:
      logger.info("could not detect english language: " + self.url)
      return False

    return True
def excel_processing(filename):
    """Detect the language(s) of all cell contents in an Excel workbook.

    Cell values are coerced to str before joining: xlrd returns floats
    for numeric cells, which previously made ``''.join()`` raise a
    TypeError on any sheet containing numbers.
    """
    workbook = xlrd.open_workbook(filename)
    holder = []
    for sheet_name in workbook.sheet_names():
        sheet = workbook.sheet_by_name(sheet_name)
        for row in range(sheet.nrows):
            for column in range(sheet.ncols):
                holder.append(str(sheet.cell(row, column).value))

    return detect_langs(''.join(holder))
예제 #47
0
    def prepare_context(self, ctx):
        """Attach the most probable configured language to *ctx*.

        Leaves ``ctx["language"]`` untouched when detection fails or
        none of the configured languages is detected.
        """
        # If there is no support, put "".
        filter_langs = self.cfg.property("languages").split(",")

        try:
            results = langdetect.detect_langs(ctx["text"])
        except Exception:
            # detect_langs raises when the text has no detectable
            # features; keep the context unchanged in that case.
            return

        # Keep only the configured languages.
        results = [result for result in results if result.lang in filter_langs]
        if not results:
            # Previously max() raised ValueError here and the bare
            # except silently hid it — guard explicitly instead.
            return

        # Highest probability wins.
        best = max(results, key=lambda result: result.prob)
        ctx["language"] = best.lang
예제 #48
0
 def identify(self, text, constrain_to_discussion_locales=True):
     """Try to identify locale of text. Boost if one of the expected locales.

     :return: ``(best_locale, {locale: probability})``; the winner must
         exceed 0.5 probability, otherwise ``Locale.UNDEFINED``.
     """
     if not text:
         return Locale.UNDEFINED, {Locale.UNDEFINED: 1}
     expected_locales = set((
         Locale.extract_root_locale(l)
         for l in self.discussion.discussion_locales))
     language_data = detect_langs(text)
     if constrain_to_discussion_locales:
         # Drop candidates outside the discussion's locales.
         data = [(x.prob, x.lang)
                 for x in language_data
                 if Locale.extract_root_locale(x.lang) in expected_locales]
     else:
         # Boost (x5) candidates matching the discussion locales.
         # Fixed: was Locale.Locale.extract_root_locale, which does not
         # match the accessor used twice above in this same method.
         data = [
             (x.prob * (
                 5 if Locale.extract_root_locale(x.lang)
                 in expected_locales else 1
             ), x.lang) for x in language_data]
     data.sort(reverse=True)
     top = data[0][1] if (data and (data[0][0] > 0.5)
                          ) else Locale.UNDEFINED
     return top, {lang: prob for (prob, lang) in data}
def convert_pdf_to_txt(path, encoding):
    """Extract text from a PDF with pdfminer and detect its language(s).

    :param path: filesystem path of the PDF file.
    :param encoding: output codec handed to pdfminer's TextConverter.
    :return: langdetect candidate list for the extracted text.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=encoding, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        # open() instead of the Python-2-only file() builtin, and a
        # with-block so the handle is released even when parsing fails
        # (previously all three resources leaked on any exception).
        with open(path, "rb") as fp:
            for page in PDFPage.get_pages(
                fp, set(), maxpages=0, password="", caching=True,
                check_extractable=True
            ):
                interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()
    return detect_langs(text)
예제 #50
0
						f_text = parsed["content"]
						#print(f_text)
						if f_text != None:
							#print("Found the Text to be None and hence Skipping !")
							fjson["text-count"] = len(f_text)
							#fhandle.close()
							#continue
						f_metadata = parsed["metadata"]
						#print(f_metadata)
						fjson["metadata"] = json.dumps(f_metadata)
						if isinstance(f_metadata,dict):
							fjson["metadata_length"] = len(f_metadata.keys())
							#print("Metadata Fields are: "+str(len(f_metadata.keys())))
						try:
							fjson["languages"] = {}
							languages = detect_langs(f_text)
							for l in languages:
								(lang,probability) = str(l).split(":")
								fjson["languages"][lang] = probability
						except:
							print("\n Language Detection module exncountered error")	
						#print(" Languages Detected {l}".format(l=languages))
						#pp.pprint(fjson["languages"])
					except (KeyError,ValueError):
						print("Tika could not get content for {f}".format(f=fpath))
						fjson["languages"] = " "
					fhandle.close()
					fjson["id"] = fname
					fjson["size"] = os.path.getsize(fpath)
					#print("Size of file : "+str(fjson["size"]))
				except ValueError:
def textFileProcessing(filepath, encoding):
    """Detect the language(s) of a text file decoded with *encoding*.

    Uses a with-block so the handle is closed deterministically — the
    previous ``codecs.open(...).read()`` never closed the file.
    """
    with codecs.open(filepath, "r", encoding) as handle:
        content = handle.read()
    return detect_langs(content)
예제 #52
0
def get_similarity_to_english(text):
    """Return langdetect's probability that *text* is English, or None."""
    for candidate in langdetect.detect_langs(text):
        if candidate.lang == 'en':
            return candidate.prob
    return None
예제 #53
0
    def detect_languages(self, text):
        """Return langdetect's candidate languages with probabilities.

        Override this method in the subclass(es) to plug in a different
        detector.
        """
        return langdetect.detect_langs(text)
예제 #54
0
def detect_language(query):
	# Debug-print the full candidate list and the single best guess,
	# then return the best guess truncated to its first two characters
	# (detect() may return longer codes such as 'zh-cn').
	# NOTE(review): Python 2 print statements — keep on a py2 runtime.
	print detect_langs(query)
	print detect(query)
	return detect(query)[:2]
def get_similarity_to_english(text):
    """Return the probability that *text* is English.

    Single-word input (no spaces) is considered too short to score and
    yields 0.  Returns None when English is not among the candidates.
    """
    if ' ' not in text:
        return 0
    # Return the numeric probability, consistent with the 0 above —
    # previously the Language object itself leaked out of this branch.
    return next((lng.prob for lng in langdetect.detect_langs(text)
                 if lng.lang == 'en'), None)
예제 #56
0
파일: views.py 프로젝트: kibach/saera_web
def search_result(request):
    """Handle a POST search query: parse filters, stem terms, rank documents.

    Supports inline filters in the query string (``domain:``, ``lang:``,
    ``encoding:``); remaining words are stemmed in their detected
    language and matched against the Stem index.

    NOTE(review): Python 2 code — ``dict.items()`` is assumed to return
    a sortable list (see ``rated_docs.sort`` below).
    """
    if not request.method == "POST":
        return redirect('/')

    query = request.POST.get('query')
    q_words = query.split()
    stemmed_words = []
    filters = []
    for word in q_words:
        if word.startswith('domain:'):
            filters.append(('domain', word.replace('domain:', '').lower()))
        elif word.startswith('lang:'):
            filters.append(('lang', word.replace('lang:', '').lower()))
        elif word.startswith('encoding:'):
            filters.append(('encoding', word.replace('encoding:', '').lower()))
        else:
            try:
                lngs = detect_langs(word)
                correct_lng = 'english'
                # NOTE(review): detect_langs() yields Language objects,
                # so ``lng in LANGUAGES`` likely never matches — was
                # ``lng.lang`` intended here? Verify.
                for lng in lngs:
                    if lng in LANGUAGES and LANGUAGES[lng].lower() in snowballstemmer.algorithms():
                        correct_lng = LANGUAGES[lng].lower()
                stemmed_words.append(snowballstemmer.stemmer(correct_lng).stemWord(word))
            except:
                # Undetectable word: keep it unstemmed.
                stemmed_words.append(word)

    # doc_id -> rating accumulated over all query terms.
    doc_ratings = {}

    for word in stemmed_words:
        try:
            stem = Stem.objects.get(stem=word)
        except:
            # Unknown stem: contributes nothing to the ranking.
            continue

        # Per-term ratings before the idf-weighted normalisation below.
        term_ratings = {}
        for relation in DocumentStemMap.objects.filter(stem=stem):
            # Every user-specified filter must match the document.
            corresponding = True
            for fil in filters:
                if fil[0] == 'domain':
                    if not fil[1] in relation.doc.domain:
                        corresponding = False
                elif fil[0] == 'lang':
                    if not fil[1] == relation.doc.language:
                        corresponding = False
                elif fil[0] == 'encoding':
                    if not fil[1] == relation.doc.encoding:
                        corresponding = False

            if not corresponding:
                continue

            # Clamp negative rank components to zero before summing.
            rc = relation.rank_component
            if rc < 0:
                rc = 0
            if relation.doc_id in term_ratings:
                term_ratings[relation.doc_id] += rc
            else:
                term_ratings[relation.doc_id] = rc

        # Saturating normalisation x/(2+x), weighted by the stem's idf.
        for doc_id in term_ratings:
            term_ratings[doc_id] = term_ratings[doc_id] / (2 + term_ratings[doc_id]) * stem.idf
            if doc_id in doc_ratings:
                doc_ratings[doc_id] += term_ratings[doc_id]
            else:
                doc_ratings[doc_id] = term_ratings[doc_id]

        del term_ratings

    rated_docs = doc_ratings.items()

    # NOTE(review): ascending sort puts the *lowest*-rated documents
    # first, so [:10] below selects the worst matches; reverse=True
    # looks intended — confirm against the ranking requirements.
    rated_docs.sort(key=lambda x: x[1])
    results = []
    for doc_id in rated_docs[:10]:
        results.append(Document.objects.get(id=doc_id[0]))

    return render(request, 'searchres/search_result.html', {
        'documents': results,
        'query': query,
    })
예제 #57
0
def load(limit = None,
    host = "localhost", port = 27017, 
    db = "instagram", media_feed_collection = 'media_feeds', **kwargs):
  """Load Instagram media feeds from MongoDB into a pandas DataFrame.

  Extracts per-post features (ids, caption, tags, likes, location,
  language guess) from at most the first 30 feed items per user, then
  derives text/tag length statistics.

  NOTE(review): Python 2 code (print statements, list-returning
  filter()); keep on a py2 runtime or port deliberately.
  """

  # Connect to mongo
  client = MongoClient(host, int(port))
  mongo = client[db][media_feed_collection]

  # Extract the features that we are interested in and put it into a dataframe
  data = {}
  for k in [ 'uid', 'uname', 'mid', 'date', 'text', 'tags', 'tags_count', \
      'likes', 'type', 'locid', 'locname', 'lat', 'long', 'url', 'lang', 'lang_prob' ]:
    data[k] = []

  cnt = 0
  for x in mongo.find():
    if limit is not None and cnt >= int(limit):
      break
    if 'feed' not in x:
      continue

    # Only the 30 most recent items of each user's feed.
    for y in x['feed'][:30]:
      cnt = cnt + 1
      if cnt % 100 == 0:
        print "%d documents loaded" % cnt
      data['uid'].append(x['_id'])
      data['uname'].append(x['name'] if 'name' in x else "instagram_user")
      data['mid'].append(y['id'])
      data['date'].append(y['created'])
      data['text'].append(y['caption'])
      data['type'].append(y['type'])
      # Tags sorted longest-first so longer tags are stripped first in
      # the text_cleaned pass below (avoids partial-prefix removal).
      data['tags'].append(" ".join(sorted(y['tags'], key=lambda x: len(x), reverse=True)))
      data['tags_count'].append(len(y['tags']))
      data['likes'].append(y['like_count'])
      data['locid'].append(y['location']['id'] if y['location'] is not None else None)
      data['locname'].append(y['location']['name'] if y['location'] is not None else None)
      data['lat'].append(y['location']['latitude'] if y['location'] is not None else None)
      data['long'].append(y['location']['longitude'] if y['location'] is not None else None)
      data['url'].append(y['images']['standard_resolution'])
      try:
        # NOTE(review): detect_langs() yields Language objects, so
        # ``x > 0.2`` compares an object with a float — ``x.prob > 0.2``
        # looks intended (py2 permits the comparison silently).
        langs = filter(lambda x: x > 0.2, detect_langs(y['caption'].replace('#', '')))
        data['lang'].append(langs[0].lang)
        data['lang_prob'].append(langs[0].prob)
      except Exception:
        # Undetectable caption: mark the language as unknown.
        data['lang'].append("??")
        data['lang_prob'].append(0.0)

  client.close()

  df = pd.DataFrame(data)
  if limit is not None:
    df = df[:int(limit)]

  # Strip every '#tag' occurrence out of the caption text.
  df['text_cleaned'] = [ reduce(lambda y,z: y.replace('#'+z, ''), x['tags'].split(' '), x['text']) \
    for i,x in df.iterrows() ]
  df['text_length'] = [ len(x) for x in df['text_cleaned'] ]
  # Total tag characters minus the tag count (drops the joining spaces).
  df['tag_length'] = [ len(x) for x in df['tags'] ] - df['tags_count']
  df['tt_ratio'] = (df['tag_length'].astype(np.float)+1) / (df['text_length']+1)

  print df.describe()
  return df
예제 #58
0
	def langd_englishness(_,text):
		"""Return the probability that *text* is English, else 0.

		detect_langs() returns a list of Language objects, so the
		previous ``'en' in l`` membership test could never succeed and
		``l['en']`` would have raised TypeError on a list; compare the
		.lang attribute and return the .prob instead.
		"""
		for candidate in langdetect.detect_langs(text):
			if candidate.lang == 'en':
				return candidate.prob
		return 0