Example #1
    def _vectorize_sentence(self,
                            corpus,
                            isi_penting,
                            important_words=10,
                            batch_size=10,
                            retry=5,
                            **kwargs):
        corpus = corpus_checker(corpus)
        splitted_fullstop = [summary_textcleaning(i) for i in corpus]
        original_strings = [i[0] for i in splitted_fullstop]
        cleaned_strings = [i[1] for i in splitted_fullstop]

        # vectorize the cleaned sentences (and the optional query `isi_penting`) in batches
        vectors = self._batching(cleaned_strings, batch_size=batch_size)
        if isi_penting:
            vectors_isi_penting = self._batching([isi_penting],
                                                 batch_size=batch_size)

        # pick important words from model attention when available;
        # skip-thought vectorizers do not expose attention, so top_words stays empty
        if 'DeepSkipThought' in str(self.vectorizer):
            top_words = []
        else:
            if hasattr(self.vectorizer, 'attention'):
                attentions = self.vectorizer.attention(corpus, **kwargs)
                flatten = list(itertools.chain(*attentions))
                r = {}
                for f in flatten:
                    c = simple_textcleaning(f[0])
                    if c in STOPWORDS:
                        continue
                    if c not in r:
                        r[c] = f[1]
                    else:
                        r[c] += f[1]
                top_words = sorted(r, key=r.get,
                                   reverse=True)[:important_words]
            else:
                top_words = []

        # build a sentence-similarity graph and rank it with pagerank; when
        # `isi_penting` is given, weight each sentence by its similarity to the
        # query, otherwise zero out (near-)self similarities
        similar = cosine_similarity(vectors, vectors)
        if isi_penting:
            similar_isi_penting = cosine_similarity(vectors,
                                                    vectors_isi_penting)
            similar = similar * similar_isi_penting
        else:
            similar[similar >= 0.99] = 0
        scores = pagerank(similar + 1e-6, retry)
        ranked_sentences = sorted(
            ((scores[i], s, i) for i, s in enumerate(original_strings)),
            reverse=True,
        )
        return (
            original_strings,
            ranked_sentences,
            top_words,
            cluster_words(top_words),
        )
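The ranking above hinges on two helpers whose exact signatures are internal to the library: `cosine_similarity` (the scikit-learn function) and `pagerank`. A minimal, self-contained numpy sketch of the same scoring step, with a plain power-iteration pagerank standing in for the library's `pagerank` helper and random vectors standing in for `self._batching(...)` output:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


def pagerank_power_iteration(similarity, damping=0.85, iterations=50):
    # row-normalize so the similarity matrix behaves like a transition matrix
    transition = similarity / np.maximum(similarity.sum(axis=1, keepdims=True), 1e-12)
    n = len(similarity)
    scores = np.full(n, 1.0 / n)
    for _ in range(iterations):
        scores = (1 - damping) / n + damping * transition.T.dot(scores)
    return scores


vectors = np.random.RandomState(0).rand(4, 8)   # stand-in for sentence vectors
similar = cosine_similarity(vectors, vectors)
similar[similar >= 0.99] = 0                    # drop (near-)self similarity, as above
scores = pagerank_power_iteration(similar + 1e-6)
print(sorted(((scores[i], i) for i in range(len(scores))), reverse=True))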
Example #2
    def _vectorize_word(self,
                        corpus,
                        isi_penting,
                        window_size=10,
                        important_words=10,
                        batch_size=10,
                        **kwargs):
        corpus = corpus_checker(corpus)
        splitted_fullstop = [summary_textcleaning(i) for i in corpus]
        original_strings = [i[0] for i in splitted_fullstop]
        cleaned_strings = [i[1] for i in splitted_fullstop]
        # n-gram windows are vectorized; ranking is reported per original token
        ngram_list, _ = create_ngram(' '.join(cleaned_strings),
                                     ngram=window_size)
        splitted = ' '.join(original_strings).split()
        if isi_penting:
            isi_penting = [isi_penting]
        else:
            isi_penting = original_strings

        # vectorize every n-gram window and the query strings in batches
        vectors = self._batching(ngram_list, batch_size=batch_size)
        vectors_isi_penting = self._batching(isi_penting,
                                             batch_size=batch_size)

        if 'DeepSkipThought' in str(self.vectorizer):
            top_words = []
        else:
            if hasattr(self.vectorizer, 'attention') and important_words > 0:
                attentions = self.vectorizer.attention(corpus, **kwargs)
                flatten = list(itertools.chain(*attentions))
                r = {}
                for f in flatten:
                    c = simple_textcleaning(f[0])
                    if c in STOPWORDS:
                        continue
                    if c not in r:
                        r[c] = f[1]
                    else:
                        r[c] += f[1]
                top_words = sorted(r, key=r.get,
                                   reverse=True)[:important_words]
            else:
                top_words = []

        # score each window against the mean vector of the query strings
        vectors_isi_penting = np.mean(vectors_isi_penting, axis=0)
        vectors_isi_penting = np.expand_dims(vectors_isi_penting, axis=0)
        similar_isi_penting = cosine_similarity(vectors, vectors_isi_penting)
        scores = similar_isi_penting[:, 0]
        ranked_sentences = sorted(
            ((scores[i], s, i) for i, s in enumerate(splitted)), reverse=True)
        return (splitted, ranked_sentences, top_words,
                cluster_words(top_words))
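The word-level path scores every n-gram window against the mean vector of `isi_penting`. A small numpy sketch of just that scoring step, with random vectors standing in for the `self._batching(...)` outputs:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

rng = np.random.RandomState(1)
window_vectors = rng.rand(6, 8)   # one vector per n-gram window
query_vectors = rng.rand(2, 8)    # vectors for the isi_penting strings

query_mean = np.mean(query_vectors, axis=0, keepdims=True)
scores = cosine_similarity(window_vectors, query_mean)[:, 0]
print(np.argsort(scores)[::-1], scores)   # windows ranked by similarity to the query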
Example #3
def socialmedia_form(word: str):
    """
    Augment a word into social-media form.

    Parameters
    ----------
    word: str

    Returns
    -------
    result: List[str]
    """

    word = simple_textcleaning(word)
    if not len(word):
        raise ValueError('word is too short to augment socialmedia form.')

    results = []

    if len(word) > 1:

        if word[-1] == 'a' and word[-2] in consonants:
            results.append(word[:-1] + 'e')

        if word[0] == 'f' and word[-1] == 'r':
            results.append('p' + word[1:])

        if word[-2] in consonants and word[-1] in vowels:
            results.append(word + 'k')

        if word[-2] in vowels and word[-1] == 'h':
            results.append(word[:-1])

    if len(word) > 2:
        if word[-3] in consonants and word[-2:] == 'ar':
            results.append(word[:-2] + 'o')

        if word[0] == 'h' and word[1] in vowels and word[2] in consonants:
            results.append(word[1:])

        if word[-3] in consonants and word[-2:] == 'ng':
            results.append(word[:-2] + 'g')

        if word[1:3] == 'ng':
            # drop the 'n' from a leading consonant + 'ng' cluster
            results.append(word[:1] + word[2:])

    return list(set(results))
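A usage sketch for `socialmedia_form`; the import path below is illustrative, and the exact candidates depend on the module's `consonants`/`vowels` constants (assumed here to be the usual Latin vowel/consonant sets):

from augmentation import socialmedia_form   # hypothetical module path

print(socialmedia_form('saya'))   # e.g. ['saye', 'sayak'] (set order not guaranteed)
print(socialmedia_form('hang'))   # e.g. ['ang']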
Example #4
def vowel_alternate(word: str, threshold: float = 0.5):
    """
    Augment a word by dropping vowels that sit between two consonants (vowel alternate).

    vowel_alternate('singapore')
    -> sngpore

    vowel_alternate('kampung')
    -> kmpng

    vowel_alternate('ayam')
    -> aym

    Parameters
    ----------
    word: str
    threshold: float, optional (default=0.5)
        a vowel is dropped when random.random() >= threshold.

    Returns
    -------
    result: str
    """

    word = simple_textcleaning(word)
    if not len(word):
        raise ValueError('word is too short to augment vowel alternate.')

    word = list(word[:])
    i = 0
    while i < len(word) - 2:
        subword = word[i: i + 3]
        if subword[0] in consonants and subword[1] in vowels and subword[2] in consonants \
                and random.random() >= threshold:
            word.pop(i + 1)
        i += 1
    return ''.join(word)
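A usage sketch for `vowel_alternate` (illustrative import path). Seeding `random` makes the probabilistic deletion reproducible, and `threshold=0.0` forces every vowel sandwiched between consonants to be dropped:

import random

from augmentation import vowel_alternate   # hypothetical module path

random.seed(42)
print(vowel_alternate('singapore'))               # one possible output: 'sngpore'
print(vowel_alternate('kampung', threshold=0.0))  # drops every eligible vowel -> 'kmpng'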
Example #5
def shortform(
    word: str,
    augment_vowel: bool = True,
    augment_consonant: bool = True,
    prob_delete_vowel: float = 0.5,
    **kwargs,
):
    """
    Augment a formal word into social-media form: purposely introduce typos,
    purposely delete some vowels, and purposely replace some subwords with slang subwords.

    Parameters
    ----------
    word: str
    augment_vowel: bool, (default=True)
        if True, augment vowels for each sample generated.
    augment_consonant: bool, (default=True)
        if True, augment consonants for each sample generated.
    prob_delete_vowel: float, (default=0.5)
        probability to delete a vowel.

    Returns
    -------
    result: list
    """

    if not 0 < prob_delete_vowel < 1:
        raise ValueError(
            'prob_delete_vowel must be bigger than 0 and less than 1')
    word = simple_textcleaning(word)
    if not len(word):
        raise ValueError('word is too short to augment shortform.')

    check_file(PATH_NGRAM['sentencepiece'], S3_PATH_NGRAM['sentencepiece'],
               **kwargs)

    vocab = PATH_NGRAM['sentencepiece']['vocab']
    vocab_model = PATH_NGRAM['sentencepiece']['model']
    tokenizer = load_sentencepiece(vocab, vocab_model)

    # each consonant maps to a single replacement (a dict literal cannot hold
    # duplicate keys, so only the last mapping per key takes effect)
    replace_consonants = {
        'n': 'm',
        't': 'y',
        'r': 't',
        'j': 'k',
        'k': 'l',
        'd': 'f',
        'g': 'f',
        'b': 'n',
    }

    replace_vowels = {'u': 'i', 'i': 'o', 'o': 'u'}

    results = [word]

    if len(word) > 1:

        if word[-1] == 'a' and word[-2] in consonants:
            results.append(word[:-1] + 'e')

        if word[0] == 'f' and word[-1] == 'r':
            results.append('p' + word[1:])

        if word[-2] in consonants and word[-1] in vowels:
            results.append(word + 'k')

        if word[-2] in vowels and word[-1] == 'h':
            results.append(word[:-1])

    if len(word) > 2:
        if word[-3] in consonants and word[-2:] == 'ar':
            results.append(word[:-2] + 'o')

        if word[0] == 'h' and word[1] in vowels and word[2] in consonants:
            results.append(word[1:])

        if word[-3] in consonants and word[-2:] == 'ng':
            results.append(word[:-2] + 'g')

        if word[1:3] == 'ng':
            # drop the 'n' from a leading consonant + 'ng' cluster
            results.append(word[:1] + word[2:])

    if augment_consonant:
        result_consonants = []
        for k, v in replace_consonants.items():
            for r in results:
                result_consonants.extend([r.replace(k, v), r.replace(v, k)])
        results.extend(result_consonants)

    if augment_vowel:
        result_vowels = []
        for k, v in replace_vowels.items():
            for r in results:
                result_vowels.extend([r.replace(k, v), r.replace(v, k)])
        results.extend(result_vowels)

    result_deleted = []
    for s in results:
        deleted = []
        for c in s:
            # delete a vowel with probability `prob_delete_vowel`
            if c in vowels and random.random() < prob_delete_vowel:
                continue
            else:
                deleted.append(c)
        result_deleted.append(''.join(deleted))
    results.extend(result_deleted)

    filtered = []
    for s in results:
        t = tokenizer.tokenize(s)
        if len(t) == 1:
            filtered.append(s)
            continue
        if t[0] == '▁':
            continue
        if any([len(w) < 3 for w in t]):
            continue
        filtered.append(s)

    return list(set(filtered))
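A usage sketch for `shortform` (illustrative import path). The function loads a sentencepiece tokenizer via `check_file`, so it assumes the `PATH_NGRAM['sentencepiece']` files can be downloaded or already exist locally; outputs are random, so the variants below are only examples:

import random

from augmentation import shortform   # hypothetical module path

random.seed(0)
candidates = shortform('sebenarnya')
print(candidates[:10])   # slang-like variants such as 'sebenarnye' or 'sbnarnye'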
Example #6
    def summarize(self,
                  corpus,
                  top_k: int = 3,
                  important_words: int = 3,
                  **kwargs):
        """
        Summarize a list of strings / corpus.

        Parameters
        ----------
        corpus: str, list

        top_k: int, (default=3)
            number of summarized strings.
        important_words: int, (default=3)
            number of important words.

        Returns
        -------
        result: dict
            dictionary of {'summary', 'top-words', 'cluster-top-words'}.
        """
        if not isinstance(corpus, list) and not isinstance(corpus, str):
            raise ValueError('corpus must be a string or a list of strings')
        if isinstance(corpus, list):
            if not isinstance(corpus[0], str):
                raise ValueError('corpus must be list of strings')

        if isinstance(corpus, str):
            corpus = split_into_sentences(corpus)
        else:
            corpus = '. '.join(corpus)
            corpus = split_into_sentences(corpus)

        splitted_fullstop = [summary_textcleaning(i) for i in corpus]
        original_strings = [i[0] for i in splitted_fullstop]
        cleaned_strings = [i[1] for i in splitted_fullstop]

        if 'DEEP_SKIPTHOUGHT' in str(self._vectorizer):

            sequences = skip_thought.batch_sequence(
                cleaned_strings,
                self._vectorizer.dictionary,
                maxlen=self._vectorizer._maxlen,
            )
            vectors, attention = self._vectorizer._sess.run(
                [self._vectorizer._logits, self._vectorizer._attention],
                feed_dict={self._vectorizer._X: np.array(sequences)},
            )
            attention = attention.sum(axis=0)
            indices = np.argsort(attention)[::-1]
            top_words = [
                self._vectorizer._rev_dictionary[i] for i in indices
                if self._vectorizer._rev_dictionary[i] not in STOPWORDS
            ][:important_words]

        else:
            vectors = self._vectorizer.vectorize(corpus)
            attentions = self._vectorizer.attention(corpus, **kwargs)
            flatten = list(itertools.chain(*attentions))
            r = {}
            for f in flatten:
                c = simple_textcleaning(f[0])
                if c in STOPWORDS:
                    continue
                if c not in r:
                    r[c] = f[1]
                else:
                    r[c] += f[1]
            top_words = sorted(r, key=r.get, reverse=True)[:important_words]

        # rank sentences with pagerank over the cosine-similarity graph,
        # zeroing (near-)self similarities first
        similar = cosine_similarity(vectors, vectors)
        similar[similar >= 0.99999] = 0
        scores = pagerank(similar)
        ranked_sentences = sorted(
            ((scores[i], s) for i, s in enumerate(original_strings)),
            reverse=True,
        )
        summary = [r[1] for r in ranked_sentences[:top_k]]

        return {
            'summary': ' '.join(summary),
            'top-words': top_words,
            'cluster-top-words': cluster_words(top_words),
        }
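A usage sketch for `summarize`. It is an instance method, so `model` below stands in for an object whose `_vectorizer` exposes `vectorize`/`attention` (or the skip-thought session path); the loader that produces such an object is not shown here:

text = (
    'Kerajaan mengumumkan pakej rangsangan ekonomi baharu. '
    'Pakej itu memberi tumpuan kepada perusahaan kecil dan sederhana. '
    'Pembangkang mempersoalkan sumber pembiayaan pakej tersebut.'
)
result = model.summarize(text, top_k=2, important_words=5)
print(result['summary'])
print(result['top-words'])
print(result['cluster-top-words'])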