Example #1
    def __init__(self, lng_list=None):
        if lng_list is None:
            langid.set_languages(['en', 'ru'])
        else:
            langid.set_languages(lng_list)

        self.logger = logging.getLogger(__name__)
Example #2
    def language_filter(self):
        '''
        Discard sentence pairs whose detected languages are not it-de.
        Workaround for an unsolved langid issue (it does not handle segments made up of only
        UPPERCASE characters): if a segment is all uppercase, convert it to lowercase, then run langid.
        '''
        tree = self.tree
        print("Removing TUs with wrong detected language...")
        langid.set_languages(['de', 'it'])
        nsmap = {"xml": "http://www.w3.org/XML/1998/namespace"}
        root = tree.getroot()
        body = root.find("body")
        counter = 0  # for testing purposes

        for tu in root.iter("tu"):
            source_segment = tu.find(f"./tuv[@xml:lang='{LANG_PAIR[0]}']/seg", namespaces=nsmap).text
            target_segment = tu.find(f"./tuv[@xml:lang='{LANG_PAIR[1]}']/seg", namespaces=nsmap).text

            if source_segment.isupper() and target_segment.isupper():
                source_segment = source_segment.lower()
                target_segment = target_segment.lower()

            try:
                detect_it = langid.classify(source_segment)
                detect_de = langid.classify(target_segment)
            except Exception:
                continue

            if "de" in detect_it or "it" in detect_de:    # broader alternative (every language other than it or de) => if "it" not in detect_it or "de" not in detect_de:
                if len(source_segment.split()) > 8 and len(target_segment.split()) > 8: # just considering longer segments, shorter ones are more likely to be false positives
                    #print("\n\n", detect_it, detect_de, "\t", source_segment, "\n\t\t", target_segment)
                    body.remove(tu)
                    counter += 1

        print("%i TUs removed" % counter)
Example #3
    def setup(self):
        ## subscribe to all receivers
        self.subscribeToAll()
        """
        ## or pick which ones
        for k in self.allReceivers.keys():
            self.subscribeTo(k)
        ## or subscribe to osc
            self.subscribeTo('osc')
        """
        ## some variables
        self.queueDelay = 1
        self.lastQueueCheck = time.time()

        ## turn up the volume
        subprocess.call("amixer set PCM -- -100", shell=True)

        ## for language identification
        langid.set_languages(['en','es'])

        ## for tagging
        input = open('uniTag.en.pkl', 'rb')
        self.enTagger = load(input)
        input.close()
        input = open('uniTag.es.pkl', 'rb')
        self.esTagger = load(input)
        input.close()
        self.tagDict = {}
Example #4
    def __init__(self,
                 preferred_languages=None,
                 preferred_factor=80.0,
                 engine_preference=None,
                 **config):
        self.preferred_languages = preferred_languages or []
        self.preferred_factor = preferred_factor
        engine_preference = engine_preference or enumerate_engines()
        for ename in enumerate_engines():
            if ename not in engine_preference:
                engine_preference.append(ename)

        self.engines = []
        self.languages = set()

        for ename in engine_preference:
            try:
                options = config.get(ename, {}).get('options', {})
                defaults = config.get(ename, {}).get('defaults', {})
                eng = create_engine(ename, options=options, defaults=defaults)
                self.engines.append(eng)

                languages = config.get(ename, {}).get('languages', {})
                for lang, conf in languages.items():
                    eng.configure(language=lang, **conf)
            except TTSError:
                pass

        for eng in self.engines:
            self.languages.update(eng.languages.keys())

        langid.set_languages(self.languages)

        if not self.languages:
            raise TTSError('No supported languages')
Example #5
def main(source,
         target,
         scores,
         output,
         from_language='de',
         to_language='en',
         num_threads=1,
         min_sent_length=3,
         max_sent_len_diff=15,
         number_url_ratio_threshold=0.6):

    langid.set_languages([from_language, to_language])

    fr_stopwords = set(ds.stopwords.words(ds.lang_map[from_language]))
    to_stopwords = set(ds.stopwords.words(ds.lang_map[to_language]))

    with open(output, 'w') as fout, tqdm() as pbar:
        wc.server(helper,
                  writer,
                  reader(source, target, scores),
                  num_threads=num_threads,
                  output_stream=fout,
                  fr_stopwords=fr_stopwords,
                  to_stopwords=to_stopwords,
                  pbar=pbar,
                  min_sent_length=min_sent_length,
                  max_sent_len_diff=max_sent_len_diff,
                  number_url_ratio_threshold=number_url_ratio_threshold,
                  from_language=from_language,
                  to_language=to_language)
Example #6
def split_by_language(reviews):
    """
    Split the reviews based on their language.
    input arguments:
        reviews: a list of review items
    output arguments:
        reviews_dict_languages: a dictionary with languages as keys
                                and a list of the corresponding reviews as values.
    """

    # Initialization
    reviews_dict_languages = {}
    langid.set_languages(language_list)

    # Use a counter to visualize the progress
    count = 1

    # Loop over all reviews
    for review in reviews:

        # Detect the language
        language = langid.classify(review.content)[0]

        #Store the review in the corresponding dictionary by language
        if language in reviews_dict_languages:
            reviews_dict_languages[language].append(review)
        else:
            reviews_dict_languages[language] = []
            reviews_dict_languages[language].append(review)

    return reviews_dict_languages
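A hypothetical call to split_by_language, assuming review items expose a .content attribute and that the module-level language_list read by the function is defined:

from types import SimpleNamespace

language_list = ['en', 'nl']  # assumed global used by split_by_language
reviews = [SimpleNamespace(content="Great food and friendly staff."),
           SimpleNamespace(content="Het eten was heerlijk en de bediening vriendelijk.")]
grouped = split_by_language(reviews)
print({lang: len(items) for lang, items in grouped.items()})  # e.g. {'en': 1, 'nl': 1}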
Example #7
    def __init__(self, preferred_languages=None, preferred_factor=80.0, engine_preference=None, **config):
        self.preferred_languages = preferred_languages or []
        self.preferred_factor = preferred_factor
        engine_preference = engine_preference or enumerate_engines()
        for ename in enumerate_engines():
            if ename not in engine_preference:
                engine_preference.append(ename)

        self.engines = []
        self.languages = set()

        for ename in engine_preference:
            try:
                options = config.get(ename, {}).get('options', {})
                defaults = config.get(ename, {}).get('defaults', {})
                eng = create_engine(ename, options=options, defaults=defaults)
                self.engines.append(eng)

                languages = config.get(ename, {}).get('languages', {})
                for lang, conf in languages.items():
                    eng.configure(language=lang, **conf)
            except TTSError:
                pass

        for eng in self.engines:
            self.languages.update(eng.languages.keys())

        langid.set_languages(self.languages)

        if not self.languages:
            raise TTSError('No supported languages')
Example #8
def classify_lang_langid(text):
    import langid
    # constrain the language set
    langid.set_languages(['en', 'zh', 'ru', 'ja', 'ko'])

    # e.g. ('zh', -19.244097471237183)
    return langid.classify(text)
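A possible call to classify_lang_langid; the score returned by langid.classify is an unnormalized log-probability, as in the example comment above:

print(classify_lang_langid("Hello, how are you today?"))  # e.g. ('en', -40.5)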
Example #9
def voice_things(local_voicedb, args):

    langs = args.langs or []

    tts_func = None
    if args.engine:
        tts_engine = ENGINE_MAP[args.engine]
        tts_func = tts_engine.get_tts_func(args)

        if not langs:
            langs = tts_engine.legal_langs

    lang_map = {}
    for _lang in langs:
        lang_map[_lang[:2]] = _lang

    languages_to_set = [lang_id_code for lang_id_code in lang_map.keys()]

    langid.set_languages(languages_to_set)

    def classify_tts_lang(text):
        langid_code = langid.classify(text)[0]

        if 'zh' in languages_to_set:
            langid_code = fix_zh(langid_code, text)

        tts_lang = lang_map[langid_code]

        return tts_lang

    def local_voice_path(text, lang):
        tts_lang = lang

        log('try local voice: {}, {}'.format(repr(lang), repr(text)))

        voice_path = local_voicedb.get_path(text, tts_lang)
        if not voice_path and callable(tts_func):  # The voice not in local
            log('not in local.', )
            log('try TTS engine {} ... '.format(args.engine))
            voice_data = tts_func(text, tts_lang)
            log('done!')

            tmp_file = tempfile.NamedTemporaryFile(delete=False)
            tmp_file_name = tmp_file.name
            tmp_file.close()

            with open(tmp_file_name, 'wb') as f:
                f.write(voice_data)

            local_voicedb.add_voice(tmp_file_name, text, tts_lang)

            os.remove(tmp_file_name)

            voice_path = local_voicedb.get_path(text, tts_lang)

        return voice_path

    return classify_tts_lang, local_voice_path
Example #10
def voice_things(local_voicedb, args):

    langs = args.langs or []

    tts_func = None
    if args.engine:
        tts_engine = ENGINE_MAP[args.engine]
        tts_func = tts_engine.get_tts_func(args)

        if not langs:
            langs = tts_engine.legal_langs

    lang_map = {}
    for _lang in langs:
        lang_map[_lang[:2]] = _lang

    languages_to_set = [lang_id_code for lang_id_code in lang_map.keys()]

    langid.set_languages(languages_to_set)

    def classify_tts_lang(text):
        langid_code = langid.classify(text)[0]

        if 'zh' in languages_to_set:
            langid_code = fix_zh(langid_code, text)

        tts_lang = lang_map[langid_code]

        return tts_lang

    def local_voice_path(text, lang):
        tts_lang = lang

        log('try local voice: {}, {}'.format(repr(lang), repr(text)))

        voice_path = local_voicedb.get_path(text, tts_lang)
        if not voice_path and callable(tts_func):  # The voice not in local
            log('not in local.', )
            log('try TTS engine {} ... '.format(args.engine))
            voice_data = tts_func(text, tts_lang)
            log('done!')

            tmp_file = tempfile.NamedTemporaryFile(delete=False)
            tmp_file_name = tmp_file.name
            tmp_file.close()

            with open(tmp_file_name, 'wb') as f:
                f.write(voice_data)

            local_voicedb.add_voice(tmp_file_name, text, tts_lang)

            os.remove(tmp_file_name)

            voice_path = local_voicedb.get_path(text, tts_lang)

        return voice_path

    return classify_tts_lang, local_voice_path
Example #11
def langid_classifier(texts, lowercase=True, langs=['en', 'fr']):

    langid.set_languages(langs)
    if lowercase:
        detected_langs = [langid.classify(str(text).lower())[0] for text in texts]
    else:
        detected_langs = [langid.classify(str(text))[0] for text in texts]

    return detected_langs
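A hypothetical call to langid_classifier, assuming langid is imported in the same module:

texts = ["The weather is nice today.", "Il fait beau aujourd'hui."]
print(langid_classifier(texts))  # e.g. ['en', 'fr']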
Example #12
def get_langid(str_text):
  """
  When needed: identify language
  """
  dic_lg = {"el": "Greek", "en": "English", "pl": "Polish",
            "ru": "Russian", "zh": "Chinese"}  # TODO: more generic
  langid.set_languages(list(dic_lg.keys()))
  lang = dic_lg[langid.classify(str_text)[0]]
  return lang
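A hypothetical call to get_langid; with the language set restricted to the five dictionary keys, the detected code is mapped to its full name:

print(get_langid("Dzień dobry, jak się masz?"))  # expected: 'Polish'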
Example #13
 def __init__(self, init_langid=True, line_postprocessor=None):
     ContentHandler.__init__(self)
     self.currentTag = None
     self.series = []
     self.item = None
     if init_langid:
         langid.set_languages(langs=['en', 'ru', 'uk'])
     if line_postprocessor is None:
         self.line_postprocessor = RePostprocessor().process
     else:
         self.line_postprocessor = line_postprocessor
Example #14
 def __init__(self):
     self.reqparse = reqparse.RequestParser()
     self.reqparse.add_argument('content', type=unicode, location='json')
     self.reqparse.add_argument('lang', location='json')
     self.reqparse.add_argument('id', location='json')
     self.reqparse.add_argument('date', location='json')
     self.mordecai_ip = app.config['MORDECAI_ADDR']
     self.mordecai_port = app.config['MORDECAI_PORT']
     self.hypnos_ip = app.config['HYPNOS_ADDR']
     self.hypnos_port = app.config['HYPNOS_PORT']
     langid.set_languages(['ar', 'en'])
     super(HermesAPI, self).__init__()
Example #15
def detect_language(texts, corpus_name="en_core_web_lg", langs=['en', 'fr']):
    """Detect the language of a list of texts.
        return detected languages"""

    # nlp = spacy.load(corpus_name)
    # nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)
    # detected_langs = [nlp(str(text))._.language['language'] for text in texts] # convert text into lowercase.

    langid.set_languages(langs)
    detected_langs = [langid.classify(text.lower())[0] for text in texts]

    return detected_langs
Example #16
def extract(text: str, verbose: bool = False) -> datetime:
    langid.set_languages(['en', 'ru'])
    page = BeautifulSoup(text, "lxml")

    if verbose:
        print("Run site checkers...")
    _ = _check_sites(page, verbose)
    _date = _extract_of_site(page, _) if _ is not None else None
    if _date is not None:
        return _date

    if verbose:
        print("Extract tags from page")
        print('=' * 20)
        print(' ' * 20)
        print(' ' * 20)

    _date_lines = _extract_date_tags(page, TAGS_PATH, verbose)
    shuffle(_date_lines)

    if verbose:
        print(_date_lines)

    _date, _day = _extract_date_with_regex(_date_lines)
    if _date is not None:
        return _date

    for x in _date_lines:
        _date = reg_exp_datetime(x)
        if _date is not None:
            return _date

    _cal = Calendar(Constants("en"))

    funcs = [
        _cal.parseDateText,
        _cal.parse,
        _cal.parseDT,
    ]

    key_minutes = 'minutes_hour'
    data = [_get_date_info(key_minutes, x, verbose) for x in _date_lines]
    result = _create_datetime(*_extract_day_time(data, key_minutes, [x.__name__ for x in funcs], verbose))

    if verbose:
        print(' ' * 20)
        print(' ' * 20)
        print('=' * 20)

    return result
Example #17
def eng_log_prob(directory='cleaned_paraphrases',
                 sentences_file='sentences_200k.pkl',
                 paras_file='paraphrases_all.pkl'):
    sentences = load_pkl(f'{directory}/{sentences_file}')
    all_para = load_pkl(f'{directory}/{paras_file}')

    langid.set_languages(['en'])

    eng_log_prob_sents, eng_log_prob_paras = [], []
    for sent, paras in tqdm(zip(sentences, all_para)):
        eng_log_prob_sents.append(langid.classify(sent)[1])
        eng_log_prob_paras.append([langid.classify(para)[1] for para in paras])

    save_pkl(eng_log_prob_sents, 'stats/eng_log_prob_sents.pkl')
    save_pkl(eng_log_prob_paras, 'stats/eng_log_prob_paras.pkl')
Example #18
def detect_language(text, langset=None) -> tuple:
    """Detect the language of the text. Return (lang, error).

    args:
    "langset" is the set of languages that the result should be limited to.

    return:
    "lang" will be a string containing an ISO 639-1 code.
    "error" will be an integer indicating a percentage. (Rounded to 2 digits)
    """
    if langset:
        langid.set_languages(langset)
    language, confidence = langid.classify(text)
    error = round((1 - confidence) * 100, 2)
    return language, error
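A hypothetical call to detect_language with the result restricted to German and English:

lang, error = detect_language("Guten Morgen, wie geht es dir?", langset=['de', 'en'])
print(lang, error)  # e.g. 'de' plus the derived error value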
Example #19
def detect_language(text, langset=None):
    """Detect the language of the text. Return (lang, error).

    args:
    "langset" is the set of languages that the result should be limited to.

    return:
    "lang" will be a string containing an ISO 639-1 code.
    "error" will be an integer indicating a percentage. (Rounded to 2 digits)
    """
    if langset:
        langid.set_languages(langset)
    lang, confidence = langid.classify(text)
    error = round((1 - confidence) * 100, 2)
    return lang, error
Example #20
    def language_detect(self, fast=True, possible_languages=['es', 'en']):
        '''
        Language identification of the texts:
        this process is more precise when using TextBlob. If fast=True, language detection is
        done with the langid module, which is faster but less precise.
        '''

        if fast:
            langid.set_languages(possible_languages)
            self.data['language'] = self.corpus[self.text].map(
                lambda x: langid.classify(x)[0])
            return self.data['language']
        else:
            self.data['language'] = self.corpus[self.text].map(
                lambda x: TextBlob(x).detect_language())
            return self.data['language']
Example #21
 def language_identify_thres (self, msgs, lang_list, thres):
     nw = []
     tally = 0
     list_removed = []
     langid.set_languages(lang_list)  # set once, outside the loop
     for post in msgs:
         out = langid.classify(post)
         out2 = list(out)
         if out2[0]=='en': 
             nw.append(post)
         elif out2[1] > thres:
             nw.append(post)
         else: 
             tally += 1 
             list_removed.append(tuple ([post, out2[0], out2[1]]))
     return nw, tally, list_removed   
Example #22
def get_body(soup):
    body_list = []
    find_body = soup.find(class_="documentoesteso").find_all(
        re.compile("(p(?!a)|li|h2)"))  # scraping for the body
    if find_body is not None:
        for p in find_body:
            for br in p.find_all("br"):
                br.replace_with("\n")  # replacing </br> tags with newline
            body_line = p.get_text().strip(
            )  # getting text and stripping whitespaces
            if body_line:
                body_list.append(body_line)
    body_len = len(body_list)
    body = "\n".join(body_list)
    langid.set_languages(['de', 'it'])
    detected_lang = langid.classify(body)  # detecting text language
    return body, detected_lang, body_len
Example #23
def outputData(folder_name, comments, shares, status):
	langid.set_languages(['en','es'])

	output_array = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
	for entry in comments:
		output_array[langCheck(entry["text"])] += 1

	for entry in shares:
		output_array[langCheck(entry["owner_comment"]) + 4] += 1

	for entry in status:
		output_array[langCheck(entry["message"]) + 8] += 1

	output_string = folder_name
	for i in output_array:
		output_string += "," + str(i)

	print output_string
Example #24
def get_langid_predictions(X):
    pred = []
    print('Request started-Langid')
    langid.set_languages(['bg','es','ms','sr','bs','hr','pt','cs','id','mk','sk'])
    i = 0
    for text in X:
        try:
            i = i + 1
            (lang, confidence) = langid.classify(text) 
            pred.append(lang)
            print('lang:', i, lang, text)
            if (i%50 == 0):
                print(i)
        except Exception as inst:
            print('Exception', i, inst, text)

    print('Request ended')
    return pred
Example #25
def language_detection(text):
    language_set = ['en', 'fr', 'es', 'nl', 'it', 'pl', 'ro', 'pt']
    langid.set_languages(language_set)
    text = re.sub(r'[^\w\s]', '', text).strip()
    text = text.replace('\n', ' ')
    if text.strip():
        try:
            lang = language_model.predict_lang(text)
            if lang in language_set:
                return lang
            else:
                return classify(text)[0]
        except Exception:
            return classify(text)[0]
        # with GoogleTranslate(text) as output:
        #     lang = output["language"]
        #     if lang in language_set:
        #         return lang
    return classify(text)[0]
Example #26
def detect_lang(text_sequence, _type="whole"):
    # Only import langid if required.
    import langid
    langid.set_languages(
        ['ml', 'ta', 'bn', 'ur', 'hi', 'en', 'te', 'gu', 'pa', 'mr', 'or'])

    def _detect_segmented(text_sequence):
        warnings.warn("Detect segmented is not recommended."
                      "This might lead to large slowdowns.")
        tokens = text_sequence.split()
        lang_assignments = []
        for token in tokens:
            lang, prob = langid.classify(token)
            lang_assignments.append(lang)

        prev = None
        d_idxs = []
        for i, lang in enumerate(lang_assignments):
            if lang != prev:
                d_idxs.append(i)
                prev = lang

        d_idxs.append(len(tokens))

        ranges = zip(d_idxs, d_idxs[1:])
        export = []
        for l, u in ranges:
            segment = ' '.join(tokens[l:u])
            tpl = (segment, lang_assignments[l])
            export.append(tpl)

        return export

    def _detect_whole(text_sequence):
        lang, prob = langid.classify(text_sequence)
        return [(text_sequence, lang)]

    switch = {"whole": _detect_whole, "segmented": _detect_segmented}

    if _type not in switch:
        warnings.warn("Unknown type {}, defaulting to whole".format(_type))

    return switch.get(_type, _detect_whole)(text_sequence)
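A hypothetical call to detect_lang in the default "whole" mode; the Hindi sample sentence is illustrative:

print(detect_lang("यह एक परीक्षण वाक्य है"))  # e.g. [('यह एक परीक्षण वाक्य है', 'hi')]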
Example #27
def sentiment_analysis(data):
    langid.set_languages(['en', 'ru'])
    lang = langid.classify(data['text'][0])[0]
    if lang == 'ru':
        labels = data['from'].unique()
        msg_df = data.loc[data.text != '']
        messages_1 = list(msg_df.text[msg_df['from'] == labels[0]])
        messages_2 = list(msg_df.text[msg_df['from'] == labels[1]])

        tokenizer = RegexTokenizer()

        model = FastTextSocialNetworkModel(tokenizer=tokenizer)

        results_1 = model.predict(messages_1, k=2)
        sentiments_1 = []

        for sentiment in results_1:
            # привет ("hi") -> {'speech': 1.0000100135803223, 'skip': 0.0020607432816177607}
            # люблю тебя!! ("love you!!") -> {'positive': 0.9886782765388489, 'skip': 0.005394937004894018}
            # малолетние дебилы ("juvenile morons") -> {'negative': 0.9525841474533081, 'neutral': 0.13661839067935944}

            tone = 0
            if 'positive' in sentiment:
                tone += sentiment['positive']
            if 'negative' in sentiment:
                tone -= sentiment['negative']
            sentiments_1.append(tone)

        results_2 = model.predict(messages_2, k=2)
        sentiments_2 = []

        for sentiment in results_2:

            tone = 0
            if 'positive' in sentiment:
                tone += sentiment['positive']
            if 'negative' in sentiment:
                tone -= sentiment['negative']
            sentiments_2.append(tone)

        return sentiments_1, sentiments_2
Example #28
 def __init__(self):
     self.reqparse = reqparse.RequestParser()
     self.reqparse.add_argument("content", type=unicode, location="json")
     self.reqparse.add_argument("lang", location="json")
     langid.set_languages(["ar", "en"])
     super(HermesAPI, self).__init__()
Example #29
from asgiref.sync import async_to_sync
from channels.layers import get_channel_layer

from django.conf import settings
from django.contrib.auth import get_user_model
from django.core.cache import cache

from .models import Profile, Message

app = Celery('chat')

channel_layer = get_channel_layer()
User = get_user_model()

chatbot = ChatBot(**settings.CHATTERBOT)
langid.set_languages([code for code, _ in settings.LANGUAGES])


@app.task
def update_user_statuses():
    """ Task to update user online statuses via websockets. """
    # Chat bot is always online.
    now = datetime.datetime.now()
    cache.set('seen_chatbot', now, settings.USER_ONLINE_TIMEOUT)

    async_to_sync(channel_layer.group_send)(
        'users',
        {
            'type': 'users.update',
            'content': Profile.get_online_users()
        }
Example #30
                        '--lang1',
                        help='source language',
                        dest='source_lang',
                        default='en')
    parser.add_argument('-t',
                        '--lang2',
                        help='target language',
                        dest='target_lang',
                        default='fr')
    args = parser.parse_args()

    deletions = defaultdict(list)

    endCount = 0
    totalCount = 0
    langid.set_languages([args.source_lang, args.target_lang])
    for line in args.infile:
        totalCount += 1
        [url1, url2, source, target, score] = line.split("\t")
        langid_source = langid.classify(source.lower())
        langid_target = langid.classify(target.lower())
        if not source.strip():
            deletions["source_empty"].append(source)
        elif not target.strip():
            deletions["target_empty"].append(target)
        elif langid_source[0] != args.source_lang and langid_source[1] > 0.9:
            deletions["source_lang"].append(
                "%s\t%s\t%f" % (source, langid_source[0], langid_source[1]))
        elif langid_target[0] != args.target_lang and langid_target[1] > 0.9:
            deletions["target_lang"].append(
                "%s\t%s\t%f" % (target, langid_target[0], langid_target[1]))
Example #31
def main(input_dir: str, output_dir: str, args: argparse.Namespace):

    input_files = glob.glob(f"{input_dir}/*")
    input_files.sort()
    for path in input_files:

        scheme = FileScheme(path, output_dir=output_dir)

        scheme.add_step(1, "extracted_texts")
        out_path = scheme.path(1)
        if not scheme.file_exists(out_path):
            extractor = TextExtractor.create_by_file_ext(path)
            extractor.extract(path, out_path)

        scheme.add_step(2, "manual_cleaning")
        copy_path = scheme.todo_path(2)
        done_path = scheme.done_path(2)
        if not (scheme.file_exists(copy_path)
                or scheme.file_exists(done_path)):
            scheme.copy_file(out_path, copy_path, create_dirs=True)
        if args.skip_manual_cleaning and not scheme.file_exists(done_path):
            scheme.copy_file(out_path, done_path, create_dirs=True)

        scheme.add_step(3, "cleanup_whitespace")
        out_path = scheme.path(3)
        if not scheme.file_exists(out_path):
            done_path = scheme.done_path(2)
            if not scheme.file_exists(done_path):
                print(
                    "WARN:",
                    f"No input file at: '{done_path}'. Was this file manually cleaned?"
                )
                continue
            lines = scheme.read_lines(scheme.done_path(2))
            lines = cleaning.cleanup_whitespace(lines)
            scheme.write_lines(out_path, lines, create_dirs=True)

        # detect the document language
        language_code = ""
        if scheme.file_exists(scheme.path(3)):
            content = scheme.read_file(scheme.path(3))
            langid.set_languages(_project_languages.keys())
            language_code, _ = langid.classify(content)

        scheme.add_step(4, "de_hyphenate")
        out_path = scheme.path(4)
        if not scheme.file_exists(out_path):
            lines = scheme.read_lines(scheme.path(3))
            lines = cleaning.remove_end_of_line_hyphens(
                lines, language_code, args.always_combine_hyphens)
            scheme.write_lines(out_path, lines, create_dirs=True)

        scheme.add_step(5, "tokenize_sententces")
        out_path = scheme.path(5)
        if not scheme.file_exists(out_path):
            content = scheme.read_file(scheme.path(4))
            tokenizer = sentence_tokenizer(_project_languages[language_code])
            sentences = tokenizer.tokenize(content)
            # since no hyphen should exist at this point, we can just cat the lines together
            sentences = map(lambda s: re.sub(r"\s+", " ", s), sentences)
            sentences = map(str.strip, sentences)
            scheme.write_lines(out_path, sentences, create_dirs=True)

        scheme.add_step(6, "escape_xml_chars")
        out_path = scheme.path(6)
        if not scheme.file_exists(out_path):
            content = scheme.read_file(scheme.path(5))
            new_content = cleaning.escape_xml_chars(content)
            scheme.write_file(out_path, new_content, create_dirs=True)

        scheme.add_step(7, "sentence_window")
        out_path = scheme.path(7)
        if args.random_window > 0 and not scheme.file_exists(out_path):
            lines = scheme.read_lines(scheme.path(6))
            if len(lines) > args.random_window:
                begin = random.randrange(0, len(lines) - args.random_window)
                lines = lines[begin:(begin + args.random_window)]
            scheme.write_lines(out_path, lines, create_dirs=True)
        else:
            # reset for the last step
            out_path = scheme.path(6)

        scheme.add_step(42, "separate_by_language")
        path_from_last_step = out_path
        language_dir = os.path.join(scheme.dirname(42), language_code)
        os.makedirs(language_dir, exist_ok=True)
        scheme.copy_file(path_from_last_step, language_dir)
Example #32
    def detect_language(self, str):
        """
           Detect language of given string. 
           First, attempt to detect language with langid. Langid uses probabilities, so if probability score is low,
           use characters in string to detect language.
    
           :param str str: The given string
           :return: language in 2 letter ISO format.
           :rtype: str
        """

        # If passed an empty string, return empty result
        if len(str.strip()) < 1:
            return ''

        str_langid = str.strip()
        str = str.lower().strip()

        detected_langid_value = ''

        langid.set_languages(self.langid_languages)
        # langid.set_languages(['de', 'fr', 'it'])
        detected_lang = self.langid_identifier.classify(str_langid)
        # print(detected_lang)
        # if detected_lang[1] < 0.5:
        #     print(detected_lang)
        #     print(str_langid)
        '''
        If the statistical probability of a detected language is larger than
        0.9999, return that language, if it is included in langid_languages.
        If not, set it equal to 'en'.
        '''

        if detected_lang[1] > 0.9999 and detected_lang[0] in self.langid_languages:
            detected_langid_value = detected_lang[0]
            return detected_langid_value
        # else:
        #     detected_langid_value = 'en'

        # Greek characters
        chars_el = set('αβγδεζηθικλμνξοπρσςτυφχψω')
        # Latin characters
        chars_en = set('abcdefghijklmnopqrstuvwxyz')
        # French characters
        chars_fr = set('éàèùâêîôûçëïü')
        # German characters
        chars_de = set('äöüß')
        # Turkish characters
        chars_tr = set('şĞİğı')
        # Spanish characters
        chars_es = set('ñóíáã')
        # Slovak characters
        chars_sk = set('ýúčžň')
        # Czech characters
        chars_cz = set('řťšůď')

        '''
        If a greek character exists, return greek language immediately
        '''
        if any((c in chars_el) for c in str):
            return 'el'

        return_value = ''
        # if 'LATIN' in unicodedata.name(str.strip()[0]):
        if any((c in chars_en) for c in str):
            if any((c in chars_fr) for c in str):
                return_value = 'fr'
            if any((c in chars_de) for c in str):
                return_value = 'de'
            if any((c in chars_tr) for c in str):
                return_value = 'tr'
            if any((c in chars_es) for c in str):
                return_value = 'es'
            if any((c in chars_sk) for c in str):
                return_value = 'sk'
            if any((c in chars_cz) for c in str):
                return_value = 'cz'
            # fall back to English only when no language-specific character matched
            if not return_value:
                return_value = 'en'

        '''
        If no language is detected, return an empty string.
        This helps set DC values with no language.
        xstr = lambda s: s or ""
        '''
        return return_value
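The 0.9999 threshold above only makes sense with confidences normalized to [0, 1]; a minimal sketch of building such a normalized identifier with langid (the language list here is illustrative):

from langid.langid import LanguageIdentifier, model

identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
identifier.set_languages(['de', 'fr', 'it'])
print(identifier.classify("Guten Morgen"))  # e.g. ('de', 0.99...)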
Example #33
import json
import logging
import os
from flask import Blueprint, jsonify, request
import langid
from transformers import MarianTokenizer, MarianMTModel

from qanary_helpers.qanary_queries import get_text_question_in_graph, insert_into_triplestore

logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)
mt_helsinki_nlp = Blueprint('mt_helsinki_nlp',
                            __name__,
                            template_folder='templates')

SERVICE_NAME_COMPONENT = os.environ['SERVICE_NAME_COMPONENT']
supported_langs = ['ru', 'es', 'de', 'fr']
langid.set_languages(supported_langs)
models = {
    lang: MarianMTModel.from_pretrained(
        'Helsinki-NLP/opus-mt-{lang}-en'.format(lang=lang))
    for lang in supported_langs
}
tokenizers = {
    lang: MarianTokenizer.from_pretrained(
        'Helsinki-NLP/opus-mt-{lang}-en'.format(lang=lang))
    for lang in supported_langs
}


@mt_helsinki_nlp.route("/annotatequestion", methods=['POST'])
def qanary_service():
    """the POST endpoint required for a Qanary service"""
Example #34
from urllib.parse import urlsplit
from DomainFinderSrc.Utilities import FilePath, FileIO
from unittest import TestCase
from DomainFinderSrc.Scrapers.LinkChecker import LinkChecker
import shortuuid
from DomainFinderSrc.Utilities.StrUtility import StrUtility
import langid

# https://zh.wikipedia.org/wiki/ISO_639-1
good_lang_code = ['en', 'de', 'fr', 'es', 'fr', 'it', 'pt', 'ga', 'el', 'da']
bad_lang_code = ['zh', 'ja', 'ko', 'ru', 'vi']

langid.set_languages(langs=good_lang_code+bad_lang_code)


class LanguageTest(TestCase):
    def test_from_text(self):
        result = langid.classify('i made an icecream for you.')
        print(result)

    def test_from_url(self):
        response = LinkChecker.get_page_source(link="http://www.frenchweb.fr/sisense-decroche-50-millions-de-dollars-pour-accelerer-dans-lanalyse-de-donnees/221848")
        print(langid.classify(response.text))

Example #35
import json
from itertools import islice
import langid
import sys

langid.set_languages(["nl", "en"])
for line_no, line in enumerate(islice(open(sys.argv[1]), None)):
    if line_no > 0:
        if line_no % 1000 == 0:
            print("%s" % (line_no), file=sys.stderr)
        elif line_no % 100 == 0:
            print(".", end=" ", file=sys.stderr)

    try:
        user = json.loads(line)
    except ValueError:
        continue

    uid = user.get("user_id", None)

    reviews = user.get("reviews", [])

    if reviews:
        counter = 0
        for rn, review in enumerate(reviews, 1):
            text = " ".join(["".join(sent) for sent in review.get("text", [])])
            review["langid"] = langid.classify(text)[0]
            print(text)

    print(json.dumps(user, ensure_ascii=False))
Example #36
from typing import List

from fastapi import APIRouter
from pydantic import BaseModel

import langid
langid.set_languages(['de', 'en'])  # ISO 639-1 codes

#
# not a good idea to work with global variables like this.
#
from backend import api

DB_INDEX_AUTOCOMPLETE = "autocomplete"

router = APIRouter()


class Request(BaseModel):
    search: str


def addQuestionToAutocomplete(question: str):
    # todo: if it already exists; we need to increment count;
    body = {'phrase': question, 'count': 1}
    res = api.elasticsearch_client.index(index=DB_INDEX_AUTOCOMPLETE,
                                         body=body)


@router.get("/query/autocomplete")
def ask(search: str):
Example #37
# This parses the EC declassified archives and creates a single file per language present in the original files.

# Simon Hengchen - [email protected] - http://homepages.ulb.ac.be/~shengche


import codecs
import re
import io
import os,glob
import langid
from multiprocessing import Pool 
from langid.langid import LanguageIdentifier, model


identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
langid.set_languages(['en','it','fr','nl','de','da','el'])  # we restrict the possible languages to the official ones of the EU between 1958 and 1982, scope of our corpus

def get_path(dossier):
	return (os.path.join(dossier, f)
		for f in os.listdir(dossier)
		if 'txt' in f)
	

def langiding(chemin):
	f = io.open(chemin,"r",encoding="utf8")

	fen = io.open(chemin+"_en","w",encoding="utf8") # one file for each language
	fit = io.open(chemin+"_it","w",encoding="utf8")
	ffr = io.open(chemin+"_fr","w",encoding="utf8")
	fnl = io.open(chemin+"_nl","w",encoding="utf8")
	fde = io.open(chemin+"_de","w",encoding="utf8")
Example #38
import re, codecs, os, sys, getopt
#import urllib,urllib2
#import json
from bs4 import BeautifulSoup as bs
from collections import defaultdict
#from jellyfish import levenshtein_distance as ld
#from nltk.tokenize import wordpunct_tokenize
import langid
langid.set_languages(['en','nl','fr'])

def get_files(folder):
  listbase = 'corpus/'+folder+"/"
  listall = os.listdir(listbase)
  listing = [f for f in listall if len(f) in [22,23,24]]
  return (listbase,listing)

folders = os.listdir('corpus/')
#folders = []

#output = codecs.open("output.txt",'a',encoding='utf-8')

#gsc = codecs.open("gsc2.txt").readlines()
#gs = [l.split("\t") for l in gsc]

for folder in sorted(['TYT']):
  #dic = defaultdict(lambda:0)
  listbase,listing = get_files(folder)
  for f in sorted(listing):
    ff = listbase+f
    text = open(ff).read()
    soup = bs(text,"xml")

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("infile", nargs="?", type=argparse.FileType("r"), default=sys.stdin)
    parser.add_argument("outfile", nargs="?", type=argparse.FileType("w"), default=sys.stdout)
    parser.add_argument("-minscore", type=float, default=0, help="minimum score from hunalign")
    parser.add_argument("-s", "--lang1", help="source language", dest="source_lang", default="en")
    parser.add_argument("-t", "--lang2", help="target language", dest="target_lang", default="fr")
    args = parser.parse_args()

    deletions = defaultdict(list)

    n_written = 0
    n_total = 0
    langid.set_languages([args.source_lang, args.target_lang])
    for line in args.infile:
        n_total += 1
        source, target, score = line.split("\t")
        if float(score) < args.minscore:
            deletions["low score"].append("")
        if source == target:
            deletions["identical"].append(target)
            continue
        if not source.strip():
            deletions["source_empty"].append("")
            continue
        elif not target.strip():
            deletions["target_empty"].append("")
            continue
Example #40
import pandas as pd, pycld2 as cld2, re, langid
from sqlalchemy import create_engine
from bs4 import BeautifulSoup

from tqdm import tqdm

langid.set_languages(['ru','uk','en'])

with open('../psql_engine.txt') as f:
    psql = create_engine(f.read())

def get_lang(text):
    rel, _, matches = cld2.detect(text)
    if not rel:
        return
    matches = list(filter(lambda m: m[1] in ['ru', 'uk', 'en'], matches))
    if len(matches) == 0:
        return langid.classify(text)[0]
    return matches[0][1]
    
chunks = pd.read_sql('''SELECT html_id, ra_summary FROM htmls
                        WHERE lang isnull and ra_summary notnull; 
                     ''', psql, chunksize=20000)

for df in tqdm(chunks):
    df['text'] = df.ra_summary.apply(lambda s: re.sub('\s+', ' ', BeautifulSoup(s, 'lxml').get_text()).strip())
    df['text'] = df.text.apply(lambda t: ''.join([ch for ch in t if ch.isprintable()]))
    df['lang'] = df.text.apply(get_lang)
    vals = ',\n'.join([f"({html_id}, '{lang}')" for html_id, lang
                       in df.loc[pd.notnull(df.lang)].reindex(['html_id', 'lang'], axis=1).values])
    psql.execute(f'''
Example #41
    for lsfile in sfile:
        lsfile = lsfile.rstrip()
        remlist.append(lsfile)

if args.regexFromFile:
    regfile = codecs.open(args.regexFromFile, "r", encoding="utf-8")
    reglist = []
    for lsfile in regfile:
        lsfile = lsfile.rstrip()
        reglist.append(lsfile)

if args.vSetLanguages:
    toset = []
    for l in args.vSetLanguages.split(","):
        toset.append(l)
    langid.set_languages(toset)

for linia in entrada:
    toWrite = True
    linia = linia.strip()
    camps = linia.split("\t")
    if len(camps) >= 1:
        slsegment = camps[0]
        tlsegment = ""
    if len(camps) >= 2:
        tlsegment = camps[1]

    if args.unescape_html and toWrite:
        slsegment = unescape_html(slsegment)
        tlsegment = unescape_html(tlsegment)
    if args.remove_tags and toWrite:
Example #42
def language_id(content):
    langid.set_languages(['en', 'zh'])
    return langid.classify(content)
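A hypothetical call to language_id with the English/Chinese restriction above:

print(language_id("今天天气很好"))  # e.g. ('zh', <score>)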
Example #43
  d['name_lower'] = d.name.apply(capital_encode)
  #d['name_lower'] = d.name
  return d


if os.path.exists('lang_train_cache.csv'):
  train = pandas.DataFrame.from_csv('lang_train_cache.csv')
  test = pandas.DataFrame.from_csv('lang_test_cache.csv')
else:
  print 'can not load from cache'
  d = load_twitter('data/lang/new_lang_data.txt.gz')
  lang_counts = d.groupby('lang')['lang'].agg('count')
  langs = set(lang_counts[lang_counts.values > 10000].index)
  d = d[d.lang.apply(lambda x: x in langs)]  # use only big languages
  
  langid.set_languages(langs)

  langid_labels = []
  langid_scores = []
  for i, idx in enumerate(d.index):
    if i % 10000 == 0:
      print i
    langid_label, langid_score = langid.classify(d.text[idx])
    langid_labels.append(langid_label)
    langid_scores.append(langid_score)
  d['lid_label'] = langid_labels
  d['lid_score'] = langid_scores

  d = d[(d.lid_score > 0.995) & (d.lang == d.lid_label)]

  # random partioning
Example #44
def langidFeature(word):
	import langid
	langid.set_languages(['zu','en'])
	return langid.classify(word)[0]
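A hypothetical per-token call to langidFeature; with the set restricted to Zulu and English, each token receives one of the two codes:

print([langidFeature(w) for w in "ngiyabonga kakhulu my friend".split()])  # e.g. ['zu', 'zu', 'en', 'en']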
Example #45
try:
    import langid
    LANGID_FLAG = True
except ImportError:
    LANGID_FLAG = False

from .lru import LRUCache
from .settings import (DETECTION_LANGUAGES, LRU_SIZE, MAX_REPETITIONS,
                       MIN_DUPLCHECK_SIZE)

LOGGER = logging.getLogger(__name__)

LRU_TEST = LRUCache(maxsize=LRU_SIZE)

if LANGID_FLAG is True:
    langid.set_languages(DETECTION_LANGUAGES)

RE_FILTER = re.compile(
    r'\W*(Facebook|Twitter|Google|Linkedin|Whatsapp|Xing|Instagram|Pinterest|PDF|E-Mail|Drucken)$',
    flags=re.IGNORECASE)
# |.hnliche Beitr| Instagram
# (r'\W*(Gef.llt mir|[Ss]hare (on|via)|Fill in your details below|Trage deine Daten unten|Kommentar verfassen|Bitte logge dich|Hinterlasse einen Kommentar| to %s| mit %s)', line) or


def put_in_cache(body):
    '''Implement LRU cache'''
    global LRU_TEST
    for element in body:
        try:
            teststring = ' '.join(element.itertext())
        except AttributeError:  # justext Paragraph
Example #46
__date__ = "$30-jul-2015 12:13:01$"

import langid
import sys
import codecs


if (len(sys.argv) > 1):
    name = sys.argv[1]
else:
    print "no parametros"

corrects = 0
total = 0

langid.set_languages(['es','fr','it','en','pt','ro'])

if (name):
    f = open(name,"r")    
    
    for line in f:        
        text,lang = line.split(";")
        lang = lang.strip()
        total = total + 1
        ident,conf = langid.classify(text)
        ident = ident.strip()        
        if (lang == ident):
            corrects = corrects + 1
    
    print "Aciertos ",corrects,"de",total,"Porcentaje",(float(corrects) / float(total))*100
    
Example #47
def get_lang(words):
    langid.set_languages(['it', 'en', 'de', 'fr', 'es', 'ja'])
    array = langid.classify(words)
    lang = array[0]
    return lang
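A hypothetical call to get_lang; results are restricted to the six languages listed above:

print(get_lang("Buongiorno, come stai?"))  # e.g. 'it'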
Example #48
__author__ = 'alex'
from forms import *
from django.template import RequestContext
from django.shortcuts import render_to_response
from bs4 import BeautifulSoup
import langid
import wikipedia_updated

supported_prefixes = [prefix[0] for prefix in SUPPORTED_PREFIXES]
print 'supported_prefixes: ' + str(supported_prefixes)
langid.set_languages(supported_prefixes)

def searchView(request):
    if ('search_query' in request.GET) and request.GET['search_query'].strip():
        search_query = request.GET['search_query']
        if ('prefix' in request.GET) and request.GET['prefix'].strip():
            prefix = request.GET['prefix']

            print BeautifulSoup(search_query,from_encoding="utf-8")

            detected_langid = langid.classify(search_query)[0]
            print "Detected Lang: %s" % detected_langid
            print "Translate To: %s" % prefix

            if prefix == 'en' and detected_langid == 'en':
                detected_langid = 'de'
            if prefix == 'de' and detected_langid == 'de':
                detected_langid = 'en'

            translationFound = False
            wikipedia_updated.set_lang(detected_langid)
Example #49
'''
Created on 05-Nov-2015

@author: unni
'''



import langid
from db_manager import connect_to_db

host_name = "10.5.23.213"
user_name = "root"
password = "******"


def find_lang(db_name, table):
    cursor_mysql = connect_to_db(host_name, user_name, password)
    cursor_mysql.execute("select distinct tweetText from %s.%s"%(db_name,table))
    lang_dict = {'hi':0, 'en':0}
    for row in cursor_mysql.fetchall():
        lang = langid.classify(row[0])
        lang_dict[lang[0]]+=1
    print lang_dict 
if __name__ == '__main__':
    langid.set_languages(['hi', 'en'])   
    #find_lang('Events', 'BigBillionDay')
    print langid.classify('yeh hindi hein')
Example #50
                        | \.\.\.
                        | [][.,;"'?():-_`]
                        '''
        tokenized_sents = []
        print '*** 2. Tokenization'
        print 'Tokenization in progress ...'
        for sent in sents:
            tokenized_sent = nltk.regexp_tokenize(sent, token_pattern)
            tokenized_sents.append(tokenized_sent)

        #print(tokenized_sents)

        # 3. Language detection (possibly even feasible per sentence!)
        print '*** 3. Language recognition (not in use)'
        print_i= 1
        langid.set_languages(['en', 'de'])
        detected_language = ''
        language_matching = []
        for sent in sents:
            detected_language = langid.classify(sent)
            #print(print_i, sent, detected_language)
            language_matching.append([print_i, sent, detected_language[0]])
            print_i += 1
        print(language_matching)

        # 4. Normalization (lowercase, stopwords, spellcheck, thesauri)
        print '*** 4. Normalization (lowercase, stopwords, spellcheck)'
        stopwords_en = nltk.corpus.stopwords.words('english')  # definition of the stopword corpora
        stopwords_de = nltk.corpus.stopwords.words('german')
        print 'Stripping the stopwords ...'
        for sent in sents: