# Example 1
def rake(
    string: str,
    model=None,
    vectorizer=None,
    top_k: int = 5,
    atleast: int = 1,
    stopwords=get_stopwords,
    **kwargs,
):
    """
    Extract keywords using the Rake algorithm.

    Parameters
    ----------
    string: str
    model: Object, optional (default=None)
        Transformer model or any model that has an `attention` method.
    vectorizer: Object, optional (default=None)
        Prefer `sklearn.feature_extraction.text.CountVectorizer` or
        `malaya.text.vectorizer.SkipGramCountVectorizer`.
        If None, will generate ngram automatically based on `stopwords`.
    top_k: int, optional (default=5)
        return top-k results.
    atleast: int, optional (default=1)
        at least count appeared in the string to accept as candidate.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returns a List[str], or a List[str], or a Tuple[str],
        used by the automatic ngram generator.
    **kwargs
        Forwarded to `_base` when a `vectorizer` is supplied
        (e.g. ``ngram=(1, 1)`` for the n-gram size).

    Returns
    -------
    result: Tuple[float, str]
    """
    stopwords = validator.validate_stopwords(stopwords)

    # --- argument validation -------------------------------------------------
    if model is not None and not hasattr(model, 'attention'):
        raise ValueError('model must have `attention` method')
    if top_k < 1:
        raise ValueError('top_k must bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must bigger than 0')
    # No vectorizer supplied -> generate ngram candidates automatically.
    auto_ngram = not vectorizer
    if not auto_ngram and not hasattr(vectorizer, 'fit'):
        raise ValueError('vectorizer must have `fit` method')
    if auto_ngram and not len(stopwords):
        raise ValueError('insert stopwords if auto_ngram')

    # Optionally accumulate the model's per-token attention weights; they are
    # passed to the word-score computation below. Use `is not None` so a model
    # object with a falsy __bool__ is treated consistently with the validation
    # above (original code used a bare truthiness test here).
    if model is not None:
        string = transformer_textcleaning(string)
        attention = model.attention([string])[0]
        d = defaultdict(float)
        for token, weight in attention:
            d[token] += weight
    else:
        d = None

    # Candidate phrases -> occurrence counts.
    if auto_ngram:
        vocab = _auto_ngram(string, stopwords)
    else:
        vocab = _base(string, vectorizer=vectorizer, **kwargs)

    phrase_list = list(vocab.keys())
    scores = rake_function.calculate_word_scores(phrase_list, attentions=d)
    keywordcandidates = rake_function.generate_candidate_keyword_scores(
        phrase_list, scores)

    sorted_keywords = sorted(keywordcandidates.items(),
                             key=operator.itemgetter(1),
                             reverse=True)

    # Normalize scores so the returned weights sum to (at most) 1.
    total = sum(score for _, score in sorted_keywords)

    ranked_sentences = [(score / total, phrase)
                        for phrase, score in sorted_keywords
                        if vocab[phrase] >= atleast]
    return ranked_sentences[:top_k]
# Example 2
def rake(string: str,
         model=None,
         top_k: int = 5,
         auto_ngram: bool = True,
         ngram_method: str = 'bow',
         ngram: Tuple[int, int] = (1, 1),
         atleast: int = 1,
         stop_words: List[str] = STOPWORDS,
         **kwargs):
    """
    Extract keywords using the Rake algorithm.

    Parameters
    ----------
    string: str
    model: Object, optional (default=None)
        Transformer model or any model that has an `attention` method.
    top_k: int, optional (default=5)
        return top-k results.
    auto_ngram: bool, optional (default=True)
        If True, will generate keyword candidates using N suitable ngram. Else use `ngram_method`.
    ngram_method: str, optional (default='bow')
        Only usable if `auto_ngram` is False. supported ngram generator:

        * ``'bow'`` - bag-of-word.
        * ``'skipgram'`` - bag-of-word with skip technique.
    ngram: tuple, optional (default=(1,1))
        n-grams size.
    atleast: int, optional (default=1)
        at least count appeared in the string to accept as candidate.
    stop_words: list, (default=malaya.text.function.STOPWORDS)
        list of stop words to remove.

    Returns
    -------
    result: Tuple[float, str]
    """

    # --- argument validation -------------------------------------------------
    if model is not None and not hasattr(model, 'attention'):
        # Fixed garbled message ("must has or `attention` method").
        raise ValueError('model must have `attention` method')
    if top_k < 1:
        raise ValueError('top_k must bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must bigger than 0')
    if ngram_method not in ('bow', 'skipgram'):
        # Message now matches the accepted literals (was "['bow', 'skip-gram']",
        # which told users to pass a value that would always be rejected).
        raise ValueError("ngram_method must be in ['bow', 'skipgram']")
    if auto_ngram and not len(stop_words):
        raise ValueError('insert stop_words if auto_ngram')

    # Optionally accumulate the model's per-token attention weights; they are
    # passed to the word-score computation below. Use `is not None` so a model
    # object with a falsy __bool__ is treated consistently with the validation
    # above (original code used a bare truthiness test here).
    if model is not None:
        string = transformer_textcleaning(string)
        attention = model.attention([string])[0]
        d = defaultdict(float)
        for token, weight in attention:
            d[token] += weight
    else:
        d = None

    # Candidate phrases -> occurrence counts.
    if auto_ngram:
        vocab = _auto_ngram(string, stop_words)
    else:
        vocab = _base(string,
                      ngram_method=ngram_method,
                      ngram=ngram,
                      stop_words=stop_words,
                      **kwargs)

    phrase_list = list(vocab.keys())
    scores = rake_function.calculate_word_scores(phrase_list, attentions=d)
    keywordcandidates = rake_function.generate_candidate_keyword_scores(
        phrase_list, scores)

    sorted_keywords = sorted(keywordcandidates.items(),
                             key=operator.itemgetter(1),
                             reverse=True)

    # Normalize scores so the returned weights sum to (at most) 1.
    total = sum(score for _, score in sorted_keywords)

    ranked_sentences = [(score / total, phrase)
                        for phrase, score in sorted_keywords
                        if vocab[phrase] >= atleast]
    return ranked_sentences[:top_k]