Example #1
import dask.dataframe as dd
import numpy as np
import pandas as pd

def round_series_up(s: dd.Series) -> dd.Series:
    """Apply the `roundup` function (defined elsewhere) to all elements of `s`."""
    return s.apply(roundup, meta=pd.Series(data=[], dtype=np.float32))
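
A minimal sketch of how this helper could be exercised, assuming `roundup` is a simple ceiling function (its real definition is not shown in the example):

import math

import dask.dataframe as dd
import pandas as pd

def roundup(x: float) -> float:
    # hypothetical stand-in for the `roundup` referenced above
    return float(math.ceil(x))

s = dd.from_pandas(pd.Series([1.2, 3.7, 5.0]), npartitions=1)
print(round_series_up(s).compute())  # -> 2.0, 4.0, 5.0

The `meta` argument tells Dask the name and dtype of the output series up front, so the graph can be built without executing `roundup` eagerly.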
Example #2
from typing import Any, Dict

import dask.dataframe as dd
from nltk.stem import PorterStemmer, WordNetLemmatizer

# `english_stopwords` (an iterable of English stop words) is assumed to be
# imported from elsewhere in the package.


def calc_word_freq(
    srs: dd.Series,
    top_words: int = 30,
    stopword: bool = True,
    lemmatize: bool = False,
    stem: bool = False,
) -> Dict[str, Any]:
    """
    Parse a categorical column of text data into words, and then
    compute the frequency distribution of words and the total
    number of words.

    Parameters
    ----------
    srs
        One categorical column
    top_words
        Number of highest frequency words to show in the
        wordcloud and word frequency bar chart
    stopword
        If True, remove stop words, else keep them
    lemmatize
        If True, lemmatize the words before computing
        the word frequencies, else don't
    stem
        If True, extract the stem of the words before
        computing the word frequencies, else don't
    """
    # pylint: disable=unnecessary-lambda
    if stopword:
        # use a regex to replace whole-word occurrences of stop words with
        # the empty string
        srs = srs.str.replace(
            r"\b(?:{})\b".format("|".join(english_stopwords)), "", regex=True
        )
    # drop every character that is not a word character, "+", or a space,
    # then convert to lowercase
    srs = srs.str.replace(r"[^\w+ ]", "", regex=True).str.lower()

    # split each string on whitespace into words, then apply explode() to
    # "stack" all the words into a single series
    # NOTE this is slow. One possibly better solution: after .split(), count
    # the words immediately rather than creating a new series with .explode()
    # and applying .value_counts()
    srs = srs.str.split().explode()

    # lemmatizing and stemming require non-null values, so drop NaNs first
    if lemmatize or stem:
        srs = srs.dropna()
    if lemmatize:
        lem = WordNetLemmatizer()
        srs = srs.apply(lambda x: lem.lemmatize(x), meta=(srs.name, "object"))
    if stem:
        porter = PorterStemmer()
        srs = srs.apply(lambda x: porter.stem(x), meta=(srs.name, "object"))

    # counts of words, excluding null values
    word_cnts = srs.value_counts(sort=False)
    # total number of words
    nwords = word_cnts.sum()
    # total number of unique words
    nuniq_words = word_cnts.shape[0]
    # words with the highest frequencies
    fnl_word_cnts = word_cnts.nlargest(n=top_words)

    return {
        "word_cnts": fnl_word_cnts,
        "nwords": nwords,
        "nuniq_words": nuniq_words,
    }
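
A minimal usage sketch. With `stopword=False` the `english_stopwords` dependency is never touched, and with `lemmatize`/`stem` left at `False` no NLTK corpora are needed; everything the function returns is lazy and still has to be computed:

import dask
import dask.dataframe as dd
import pandas as pd

texts = pd.Series([
    "The cat sat on the mat",
    "The dog chased the cat",
])
srs = dd.from_pandas(texts, npartitions=1)

res = calc_word_freq(srs, top_words=5, stopword=False)
# materialize all three lazy results in a single pass over the data
word_cnts, nwords, nuniq_words = dask.compute(
    res["word_cnts"], res["nwords"], res["nuniq_words"]
)
print(nwords, nuniq_words)  # 11 7
print(word_cnts)            # top-5 words by frequency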