Exemplo n.º 1
0
def clean_preprocess(text, do_farasa_tokenization, farasa):
    """Preprocess ``text``, split it on punctuation and normalise whitespace.

    The text is passed through ``preprocess`` (forwarding the two Farasa
    arguments unchanged), the result is handed to ``bt._run_split_on_punc``,
    and the pieces it returns are joined back into one string separated by
    single spaces.

    :param text: the raw text to clean
    :param do_farasa_tokenization: forwarded to ``preprocess`` as-is
    :param farasa: forwarded to ``preprocess`` as-is
        (presumably a Farasa segmenter instance; confirm against callers)
    :return: the cleaned text, with every run of whitespace collapsed to a
        single space and no leading/trailing whitespace
    """
    preprocessed = preprocess(text,
                              do_farasa_tokenization=do_farasa_tokenization,
                              farasa=farasa)
    punc_separated = " ".join(bt._run_split_on_punc(preprocessed))
    # str.split() with no argument splits on any whitespace run and drops
    # empty pieces, so re-joining collapses repeated/edge whitespace.
    tokens = punc_separated.split()
    return " ".join(tokens)
Exemplo n.º 2
0
    def transform(self, sentences_list, extract_and_paste_emojies=False):
        """
        Apply the pre-processing steps to every sentence in the given list.

        When ``self.use_default_farsa_preprocess`` is truthy, each sentence is
        run through ``preprocess`` with Farasa tokenization, split on single
        spaces, and every token made up solely of ``string.punctuation``
        characters (e.g., '?', '!?!') is dropped. Otherwise the sentences are
        returned unchanged (only the default case is currently supported).

        Parameters
        ----------
        :param sentences_list: list (of arabic sentences)
            list of sentences to apply the function on. Each sentence is treated independently
        :param extract_and_paste_emojies: boolean. Default: False
            whether to handle emojis in a special way. Currently emojis are only handled in a very specific
            way (extract them from the original sentence and paste them at the end of the processed
            sentence - not ideal). Only has an effect in the default Farasa pre-processing case.
            TODO: handle emojis in a better way (do not convert them to ?? and "leave" them as is in the sentence)

        :return: list
            a new list of transformed arabic sentences, in the same order as the input

        """
        # Materialise once so any iterable (not only a list) is accepted and
        # so emptiness can be checked before doing any expensive set-up.
        sentences = list(sentences_list)
        # Nothing to segment: return before creating the Farasa segmenter,
        # which is only needed by the default pre-processing path below.
        if not self.use_default_farsa_preprocess or not sentences:
            return sentences

        # One interactive segmenter is shared by all sentences of this call.
        farasa_segmenter = FarasaSegmenter(interactive=True)
        # NOTE: string.punctuation covers ASCII punctuation only.
        punctuation_chars = set(string.punctuation)
        new_sentences_list = []
        for cur_text in sentences:
            preprocessed_text = preprocess(cur_text,
                                           do_farasa_tokenization=True,
                                           farasa=farasa_segmenter,
                                           use_farasapy=True)
            # Drop punctuation-only tokens; empty tokens (produced by
            # consecutive spaces) are dropped too, because all() over an
            # empty string is True.
            kept_tokens = [
                cur_word for cur_word in preprocessed_text.split(" ")
                if not all(ch in punctuation_chars for ch in cur_word)
            ]
            if extract_and_paste_emojies:
                # Emojis are taken from the ORIGINAL sentence and appended
                # at the end of the processed one.
                kept_tokens.extend(self.extract_emojis(text=cur_text))
            new_sentences_list.append(' '.join(kept_tokens))
        return new_sentences_list