Example #1
def detect_utterance_langs(reply, history, candidates):
    # Try progressively larger contexts until a supported language is found.
    texts = [
        reply,
        " ".join(history) + " " + reply,
        " ".join(history) + " " + reply + " " + " ".join(candidates),
    ]
    for txt in texts:
        try:
            code = Detector(txt, quiet=True).language.code
        except Exception:
            continue
        if code in LANG_LIST:
            return code
    return 'unk'
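All of the snippets in this collection assume polyglot's detector is importable. A minimal usage sketch for the function above (LANG_LIST is a project-defined whitelist; the values here are hypothetical):

from polyglot.detect import Detector

LANG_LIST = ["en", "fr", "de", "es"]  # hypothetical whitelist of ISO 639-1 codes

lang = detect_utterance_langs("bonjour tout le monde", history=[], candidates=[])
print(lang)  # 'fr' if detection succeeds, otherwise 'unk'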
Example #2
def language_detection(table_dict, english):
    if not english:
        for k in list(table_dict):
            # Detect on the first seven tweets of each entry; entries with
            # detector confidence <= 70 are marked English.
            tweets = " ".join(table_dict[k]['tweets'][0:7])
            try:
                detector = Detector(tweets)
                conf = detector.language.confidence
                if conf <= 70.0:
                    table_dict[k]['lang'] = 'en'
                else:
                    table_dict[k]['lang'] = 'non-eng'
            except Exception:
                table_dict[k]['lang'] = 'non-eng'
    else:
        for k in list(table_dict):
            tweets = " ".join(table_dict[k]['tweets'][0:7])
            try:
                detector = Detector(tweets)
                conf = detector.language.confidence
                if conf <= 70.0:
                    table_dict[k]['lang'] = 'en'
            except Exception:
                continue

    return table_dict
Example #3
def determine_text_languages(string):
    # Split the input into sentences, then merge consecutive sentences that
    # share the same detected language.
    input_object = Text(string)

    temp_list = []
    for sentence in input_object.sentences:
        detect = Detector(str(sentence))
        if temp_list and temp_list[-1][1] == detect.language.code:
            temp_list[-1][0] = temp_list[-1][0] + " " + str(sentence)
        else:
            temp_list.append([str(sentence), detect.language.code])

    merged = [chunk for chunk, _ in temp_list]

    # Re-detect each merged chunk and record its (start, end) character span.
    output = []
    start = 0
    for sentence in merged:
        detect = Detector(str(sentence))
        end = start + len(sentence)
        output.append((detect.language.code, (start, end), detect.language.confidence))
        start = end + 1
    return output
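A usage sketch for the function above (assumes polyglot's Text and Detector are importable; the printed values are illustrative, not guaranteed):

from polyglot.text import Text
from polyglot.detect import Detector

for code, span, confidence in determine_text_languages("Hello world. Bonjour le monde."):
    print(code, span, confidence)  # e.g. en (0, 12) 95.0, then fr (13, 30) ...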
Example #4
def getWebpageMeanVector(response) -> list:
    metaDescription: str = response.xpath(
        "//meta[@property='og:description']/@content").extract_first()
    if metaDescription:
        metaTitle: str = response.xpath(
            "//meta[@property='og:title']/@content").extract_first()
        if metaTitle:
            webPageTopic: str = metaTitle
        else:
            webPageHeader: str = getPropertyFromHTMLResponse(
                response, "header").strip()
            webPageTitle: str = getPropertyFromHTMLResponse(response,
                                                            "title").strip()
            webPageTopic: str = webPageHeader + ". " + webPageTitle

        return [
            getTextVectors(webPageTopic),
            metaDescription,
            Detector(metaDescription).language.name,
        ]
    else:
        webPageBody: str = getPropertyFromHTMLResponse(response,
                                                       "body").strip()
        webPageHeader: str = getPropertyFromHTMLResponse(response,
                                                         "header").strip()
        webPageTitle: str = getPropertyFromHTMLResponse(response,
                                                        "title").strip()
        wholeWebPageText: str = webPageBody + ". " + webPageHeader + ". " + webPageTitle
        return [
            getTextVectors(wholeWebPageText),
            webPageBody,
            Detector(wholeWebPageText).language.name,
        ]
Example #5
    def process_tu(self, tu, num_of_finished_scans):
        src_lang = Detector(tu.src_phrase, quiet=True).language.code
        trg_lang = Detector(tu.trg_phrase, quiet=True).language.code

        # self.src_language / self.trg_language may be a single code or a
        # collection of codes; the two checks together cover both cases.
        if src_lang != self.src_language and src_lang not in self.src_language:
            return [0]
        if trg_lang != self.trg_language and trg_lang not in self.trg_language:
            return [0]
        return [1]
Example #6
    def decide(self, tu):
        src_lang = Detector(tu.src_phrase, quiet=True).language.code
        trg_lang = Detector(tu.trg_phrase, quiet=True).language.code

        if src_lang != self.src_language and src_lang not in self.src_language:
            return 'reject'
        if trg_lang != self.trg_language and trg_lang not in self.trg_language:
            return 'reject'

        return 'accept'
Example #7
    def filter_out_non_english_posts(dataobject):
        """
        given a list of posts, filter in clean monolingual english posts
        :param dataobject: user to posts object
        :return: user to posts clean dictionary
        """
        clean_data = {}
        data = Serialization.load_obj(dataobject)
        for author in data:
            print('processing:', author)
            author_eng_posts = []
            for post in data[author]:
                sentences = []
                for sentence in re.split(r'\.|\! |\? |\n', post):
                    if len(sentence.split()) < 10: continue
                    try: detector = Detector(sentence)
                    except Exception: continue

                    if detector.languages[0].name == 'English' and \
                            detector.languages[0].confidence > DETECTOR_CONFIDENCE:
                        sentences.append(sentence)
                    # end if
                # end for
                if len(sentences) == 0: continue
                author_eng_posts.append('. '.join(sentences))
            # end for
            if len(author_eng_posts) == 0: continue
            clean_data[author] = author_eng_posts
        # end for

        Serialization.save_obj(clean_data, dataobject+'.clean')
        for author in clean_data:
            print(author, len(clean_data[author]))
Example #8
def is_valid_lang_name(lang):
    """
    Return True if given language name in English exists in polyglot library; False otherwise.
    :param lang: str (language name in English)
    :return: bool
    """
    return lang.title() in Detector.supported_languages()
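A quick usage sketch for is_valid_lang_name (this assumes, as the function itself does, that Detector.supported_languages() returns English language names):

print(is_valid_lang_name("english"))  # True, since polyglot supports English
print(is_valid_lang_name("klingon"))  # False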
Example #9
async def get_text_language(page_text):
    try:
        detector = Detector(page_text)
    except UnknownLanguage as exc:
        return "un", exc

    return detector.languages[0].code, None
Example #10
    def get_sentiment(self, text):
        try:
            try:
                detector = Detector(text)
                language = detector.language.code
            except UnknownLanguage:
                # Very short texts default to English; otherwise fall back to Google.
                if len(text) <= 2:
                    language = 'en'
                else:
                    language = 'google'

            self.language_counter.update({language: 1})

            if language == 'en':
                return self.proccess_eng(text)

            if language == 'google':
                return self.process_google(text)

            try:
                return self.process_poly(text)
            except ZeroDivisionError:
                return 'N'
            except Exception as e:
                print(e)
                return self.process_google(text)
        except Exception as e:
            print(e)
            return self.proccess_eng(text)
Example #11
def find_langs(raw_text, translation=True):
    """
    :param raw_text: the raw text from the subreddit
    :param translation: bool
        whether translation posts should be removed
    :return: tuple
        None if the post is not a codeswitch post,
        else (lang1, lang2, confidence of lang2 in the post)
    """
    global false_langs
    if "http" in raw_text:
        # Skip posts that contain links (too noisy, and hard to build a
        # regex that strips the links reliably).
        return None
    clean_string = clean_text(raw_text)
    if translation:
        if is_translation(clean_string):
            return None

    detector = Detector(clean_string, quiet=True)
    if ("en" != detector.languages[0].code) and ("en" != detector.languages[1].code):
        # skip posts that don't contain any english
        return None

    if (detector.languages[1].code not in false_langs) and (detector.languages[0].code not in false_langs):
        if detector.reliable:
            lang1 = detector.languages[0].name
            lang2 = detector.languages[1].name
            confidence = detector.languages[1].confidence

            return lang1, lang2, confidence
        else:
            return None
    else:
        return None
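A usage sketch for the code-switch filter above (clean_text, is_translation and false_langs are project helpers/globals the snippet assumes; the sample output is illustrative):

result = find_langs("I love this lugar, es increible")
if result is not None:
    lang1, lang2, confidence = result
    print(lang1, lang2, confidence)  # e.g. English Spanish 41.0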
Example #12
def get_language(x):
    try:
        return Detector(x).language.code
    except (UnknownLanguage, pycld2.error):
        return None
Example #13
    def lang_detect(self, task):
        sentence = task['text']
        languages_result = Detector(sentence, True).languages
        detected = {}
        for lang_result in languages_result:
            code = lang_result.code[:2]
            if code in detected:
                detected[code] += lang_result.confidence * \
                    float(lang_result.read_bytes)
            else:
                detected[code] = lang_result.confidence * \
                    float(lang_result.read_bytes)
        detected_max = max(detected, key=detected.get)
        # Log
        self.logger.info("Detected:%s:%s" % (detected_max, sentence))
        for lang in languages_result:
            self.logger.info(lang)
        self.logger.info('---')
        # create json format
        json_result = dict()
        json_result['predicted'] = detected_max
        json_result['data'] = []
        for ele in languages_result:
            tmp = {}
            # Parse the fields out of polyglot's Language string representation.
            tmp_ele = str(ele).replace("  ", " ").split(" ")
            tmp_ele = [item for item in tmp_ele if item != '']
            tmp['name'] = tmp_ele[1]
            tmp['code'] = tmp_ele[3][:2]
            tmp['score'] = float(detected[tmp['code']])
            tmp['bytes'] = tmp_ele[8]
            json_result['data'].append(tmp)

        return json.dumps(json_result)
Example #14
    def lang_detect(self, text, threshold=0.9):
        # Note: polyglot reports confidence on a 0-100 scale, so the default
        # threshold of 0.9 is passed by almost any successful detection.
        detector = Detector(text, quiet=True)
        if detector.language.confidence > threshold:
            return detector.language.code
        raise LanguageNotRecognisedError('Could not recognize the language')
Example #15
def detect_lang(text,
                print_error=False,
                raise_error=False,
                keep_unreliable=False):
    """
    For detecting language using polyglot, but with exception handling.

    Examples:
    >>> detect_lang("This is a test text")
    ('en', 95.0)
    >>> detect_lang(text = "Dette er åbenbart en norsk tekst", keep_unreliable = True)
    ('no', 97.0)
    >>> detect_lang(text = "Dette er åbenbart en norsk tekst. This is also an english text.", keep_unreliable = True)
    """
    from polyglot.detect import Detector
    import numpy as np

    text = str(text)
    try:
        detector = Detector(text, quiet=True)
        if detector.reliable or keep_unreliable:
            lang = detector.language
            return lang.code, lang.confidence
    except Exception as e:
        if raise_error:
            raise
        if print_error:
            print(e)
    return np.nan, np.nan
Example #16
def langDetect(content):
    global languageDetected
    printmessage("Detecting languages")
    langDict = {}
    try:
        for language in Detector(content).languages:
            if language.code != 'un':
                langDict[language.code] = language.confidence
    except Exception as e:
        # Fall back to langid when polyglot cannot handle the input.
        printmessage("Language detection error -->" + str(e))
        code, score = langid.classify(content)
        confidence = abs(score)
        confidence = 99 if confidence > 99 else round(confidence, 0)
        langDict[code] = confidence
    printmessage(langDict)
    return langDict
Example #17
def detect_langs(text):
    try:
        c = Detector(text, quiet=True).language.code
    except Exception:
        return "unk"
    # Fall back to "unk" when the detected code is not in the whitelist.
    return c if c in LANG_LIST else "unk"
Example #18
def get_valid_videos(tmp_dir: str, lang_code: str) -> list:
    valid_videos = []
    for tmp_file in os.listdir(tmp_dir):
        tmp_file_path = os.path.join(tmp_dir, tmp_file)

        if not os.path.isfile(tmp_file_path):
            logging.error(f'File {tmp_file} does not exist')
            continue

        with open(tmp_file_path) as tmp_json_file:
            try:
                metadata = json.load(tmp_json_file)
            except ValueError:
                logging.error(f'Failed to decode json from file {tmp_file}')

                delete_file(tmp_file_path)

                continue

        text = metadata['title'].strip() + ' ' + \
            metadata['description'].strip()

        # Remove non-printable characters/symbols, which sometimes cause errors in the polyglot Detector
        printable_str = ''.join(x for x in text if x.isprintable())

        detector = Detector(printable_str, quiet=True)

        if detector.language.code == lang_code:
            valid_videos.append(metadata['id'])

        delete_file(tmp_file_path)

    return valid_videos
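A usage sketch for the function above (the directory path is hypothetical; each JSON file is expected to carry id, title and description keys):

valid_ids = get_valid_videos('/tmp/video_metadata', 'en')
print(len(valid_ids), 'videos matched the target language')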
Example #19
def get_language(reader):
    """
    Gets the language of the descriptions.
    :param reader: CSV file content
    :return: string
    """
    desc = [x[4] for x in reader]
    text = ' '.join(desc)
    try:
        language = Detector(text).language
        if language.code == "un":  # detection failed; fall back
            return test_for_language(desc)
        return language.name.lower()
    except Exception:
        return test_for_language(desc)
Example #20
def detect_lang(text1: str, name: Union[bool, int] = False) -> str:
    """
    Detect Chinese and other languages using polyglot.

    Returns the lowercased language name if name is truthy, else the code.
    """
    if not text1.strip():
        detected = "en"
        if name:
            detected = "english"
    else:
        try:
            lang = Detector(text1).language
            detected = lang.name.lower() if name else lang.code
        except Exception as exc:
            LOGGER.debug(
                " Detector(text1).language failed: %s, setting to 'en'/'english' ",
                exc,
            )
            detected = "english" if name else "en"

    return detected
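A usage sketch for the function above (assumes a module-level LOGGER; the exact codes depend on polyglot's model, so the outputs are illustrative):

print(detect_lang("你好，世界"))             # e.g. 'zh' or 'zh_Hant'
print(detect_lang("你好，世界", name=True))  # e.g. 'chinese'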
Example #21
def recognition_language(line):
    try:
        detector = Detector(line)
    except Exception:
        # Detection failed; echo the offending line and return None.
        print(line)
        return None
    return detector.language.name
Example #22
def detect():
    query = request.json['text']

    detector = Detector(query, quiet=True)
    locl = detector.language.locale.getName().replace('_', '-')
    conf = detector.language.confidence
    read = detector.language.read_bytes

    parsed = []
    try:
        blob = Text(query)
        for entity in blob.entities:
            eobj = {}
            eobj['tag'] = entity.tag
            # polyglot entities are word lists; join them so the result is JSON-serializable
            eobj['entity'] = ' '.join(entity)
            parsed.append(eobj)
    except Exception:
        pass

    return {
        "locale": locl,
        "confidence": conf,
        "read_bytes": read,
        "entities": parsed
    }
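This reads as a JSON web endpoint (Flask-style request handling is assumed); a sketch of the expected round trip, with illustrative values:

# POST /detect with body {"text": "Barack Obama visited Paris."}
# -> {"locale": "en", "confidence": 99.0, "read_bytes": 27, "entities": [...]}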
Example #23
def process_text(text_data):
	text_data = ''.join(x for x in text_data if x.isprintable())
	text_data = text_data.replace("#", " ")
	text_data = text_data.replace("\n", " ")
	languages = Detector(text_data, quiet=True).languages

	word_list = []
	if languages[0].code in ["ko"]:
		tokens = okt.pos(text_data)
		#print(tokens)
		for token in tokens:
			word = token[0]
			if token[1] in ['Foreign', 'Number', 'URL', 'Email', 'ScreenName', 'Hashtag']:
				# all Hashtag remaining are Japanese
				continue
			elif token[1] == 'Alpha':
				word = word.lower()
			if word == '그램':
				if len(word_list) > 0:
					if word_list[-1] == '스타':
						word_list[-1] = '스타그램'
					elif word_list[-1] == '맛스타':
						word_list[-1] = '맛스타그램'
					else:
						word_list.append(word)
				else:
					word_list.append(word)
			else:
				word_list.append(word)		
	return word_list
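A usage sketch for the tokenizer above (assumes okt is a konlpy Okt tagger created elsewhere; the output is illustrative):

from konlpy.tag import Okt

okt = Okt()
print(process_text("맛스타 그램 #먹방 hello"))  # e.g. ['맛스타그램', '먹방', 'hello']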
Example #24
    def detect_lang(self):
        result = []
        for line in self.data:
            lang = Detector(line, quiet=True).language.name
            result.append([line, lang])

        self.save(result)
Example #25
def detect():

    payload = request.get_json()  # avoid shadowing the imported json module
    text = payload['text']

    result = {'result': False}
    try:
        detector = Detector(text)
        lc = detector.language.code
        fixCode = lc
        if languageCodes.get(lc):
            fixCode = languageCodes[lc]

        result = {
            'result': True,
            'text': text,
            'language': detector.language.name,
            'code': fixCode
        }
    except Exception as e:
        print(e)

    return jsonify(result)
Example #26
def language_detection(input_keyword):
    language_list = ["English"]

    input_keyword = input_keyword.lower()
    detector = Detector(input_keyword, quiet=True)
    detected_lang = detector.language.name
    confidence = detector.language.confidence
    keyword = keyword_lemmatization(input_keyword)

    try:
        if detected_lang in language_list:
            print("Root of input keyword is {}".format(keyword))
        elif detected_lang == "un":
            raise Exception(
                "Sorry, cannot detect the language of {}; the BNG model only works for English keywords"
                .format(input_keyword))
        else:
            raise Exception(
                "Detected language is {}; the BNG model only works for English keywords"
                .format(detected_lang))
    except Exception as e:
        print(e)
        login_function(e, "warning.log")
    return detected_lang, confidence, keyword
Example #27
    def detectLanguage(self, text):
        try:
            detectionResult = Detector(text)
            lang = detectionResult.language.code
        except Exception:
            lang = "un"
        return lang
Example #28
def save_english_by_paragraph(filename, filename_out):
    """Processes the original corpus, collapses paragraphs into a single string and saves English paragraphs

    Args:
        filename: path to the original corpus
        filename_out: path where to store the English-language corpus
    """
    try:
        from polyglot.detect import Detector
        from polyglot.detect.base import UnknownLanguage

        def flush(paragraph, fout):
            # Write the paragraph out only if it is detected as English.
            try:
                la = Detector(paragraph, quiet=True).language.code
            except UnknownLanguage:
                la = 'un'
            if la == 'en':
                fout.write('{}\n'.format(preprocess(paragraph)))

        with open(filename) as fin, open(filename_out, 'w') as fout:
            paragraph = ''
            for line in fin:
                line = line.strip()
                if line:
                    paragraph += ' ' + line
                elif paragraph:
                    flush(paragraph, fout)
                    paragraph = ''  # reset even when the paragraph is not English
            if paragraph:  # flush the final paragraph at EOF
                flush(paragraph, fout)
    except ImportError:
        print('Error: polyglot has not been installed')
        print('to install polyglot:')
        print('install icu4c - see instruction in the readme file')
        print('pip install polyglot')
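A usage sketch for the function above (file names are hypothetical; preprocess is a project helper the snippet assumes):

save_english_by_paragraph('raw_corpus.txt', 'corpus.en.txt')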
Example #29
def detect_language(tweet):
    try:
        print(tweet)
        languages = Detector(tweet, quiet=True).languages
        is_english = False
        max_confidence = 0

        for language in languages:
            if language.name == "English":
                max_confidence = language.confidence
                print(language.confidence)
                is_english = float(language.confidence) >= 93.0
            elif float(language.confidence) >= 10.0:
                # Any other language with noticeable confidence vetoes English.
                is_english = False
        return is_english
    except UnicodeDecodeError:
        print("UnicodeDecodeError ew")
        return False
Example #30
    def find_error(source):
        delete_list = []
        t0 = time.time()
        found = 0
        proc_name = multiprocessing.current_process().name

        for runner, two in enumerate(source):
            index = two[0]
            sentence = two[1]
            if runner % 1000 == 0:
                # Report throughput every 1000 lines.
                t1 = time.time()
                sys.stdout.write("PROC:" + proc_name + ",Line:" + str(runner) +
                                 ",Time Cost:" + str(1000.0 / (t1 - t0)) + "lines/s\r")
                t0 = t1
                sys.stdout.flush()
            tmp_s = sentence.decode('utf8')  # sentences arrive as UTF-8 byte strings
            try:
                # t_lang is captured from the enclosing scope.
                detected = Detector(tmp_s).language.code[:2]
                if detected == t_lang:
                    delete_list.append(index)
                    found += 1
            except Exception:
                pass

        return delete_list, found