def detect_text_language(text):
    """
    Detect the language of ``text`` and return its language code.

    Detection is now based on langid (langdetect fails miserably).  The text
    is lower-cased before classification.  When the classifier's score is
    below ``MIN_DETECT_PROB``, or the text is shorter than
    ``MIN_DETECT_SIZE`` characters, the undetermined language code
    ``u'und'`` is returned.

    :param text: text to classify; may be empty or ``None``.
    :returns: ``None`` when ``text`` is empty or ``None``; ``u'und'`` when
        the text is too short or the classification is not confident
        enough; otherwise whatever ``brd.get_marc_code`` returns for the
        detected language with ``capital=False`` (three-letter lowercase
        codes in the examples below).

    >>> detect_text_language(u"review with text too small 30")
    u'und'
    >>> detect_text_language(u'Mixed language etrange et muy bieno super confusing')
    u'und'
    >>> detect_text_language(u"סיפור נפלא ממש. תרגום בסדר פלוס.")
    u'heb'
    >>> detect_text_language(u"J'ai adoré ce livre mais il était long")
    u'fre'
    >>> detect_text_language(u"CLASSIC BOOKS YOU CAN'T BELIEVE ANYONE WOULD EVER READ EXCEPT FOR SCHOOL OR TO LOOK SMART?")
    u'eng'
    """
    # MIN_DETECT_SIZE and MIN_DETECT_PROB are module-level settings that are
    # only read here, so no ``global`` declaration is needed.

    # Nothing to classify: callers get None, not the 'und' code.
    if not text:
        return None

    # Too little material for a meaningful classification.
    if len(text) < MIN_DETECT_SIZE:
        return u'und'

    # langid does not work well with all-capitalized text
    classification = _langidentifier.classify(text.lower())

    # classification[1] is the classifier's score for its best guess.
    if classification[1] < MIN_DETECT_PROB:
        return u'und'

    # classification[0] is the detected language identifier; convert it to
    # the lowercase code used by the rest of the project.
    return brd.get_marc_code(classification[0], capital=False)