from typing import Any, Dict

import dask.dataframe as dd
import numpy as np
import pandas as pd
from nltk.stem import PorterStemmer, WordNetLemmatizer

# NOTE: `roundup` (a scalar round-up helper) and `english_stopwords` (a
# collection of English stop words) are assumed to be defined elsewhere in
# this package.


def round_series_up(s: dd.Series) -> dd.Series:
    """Apply the roundup function to all elements of `s`."""
    return s.apply(roundup, meta=pd.Series(data=[], dtype=np.float32))
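# Usage sketch (illustrative only; assumes the module-level `roundup` rounds
# a float up to the nearest integer, e.g. via math.ceil):
#
#     srs = dd.from_pandas(pd.Series([0.2, 1.5, 3.0]), npartitions=1)
#     round_series_up(srs).compute()  # -> 1.0, 2.0, 3.0 (float32)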
def calc_word_freq(
    srs: dd.Series,
    top_words: int = 30,
    stopword: bool = True,
    lemmatize: bool = False,
    stem: bool = False,
) -> Dict[str, Any]:
    """
    Parse a categorical column of text data into words, then compute the
    frequency distribution of the words and the total number of words.

    Parameters
    ----------
    srs
        One categorical column
    top_words
        Number of highest-frequency words to show in the wordcloud and
        word frequency bar chart
    stopword
        If True, remove stop words, else keep them
    lemmatize
        If True, lemmatize the words before computing the word frequencies,
        else don't
    stem
        If True, extract the stem of the words before computing the word
        frequencies, else don't
    """
    # pylint: disable=unnecessary-lambda
    if stopword:
        # use a regex to replace stop words with the empty string
        srs = srs.str.replace(
            r"\b(?:{})\b".format("|".join(english_stopwords)), "", regex=True
        )

    # strip punctuation (keep word characters, "+" and spaces), then
    # convert to lowercase
    srs = srs.str.replace(r"[^\w+ ]", "", regex=True).str.lower()

    # split each string on whitespace into words, then apply "explode()" to
    # "stack" all the words into a single series
    # NOTE this is slow. One possibly better solution: after .split(), count
    # the words immediately rather than create a new series with .explode()
    # and apply .value_counts()
    srs = srs.str.split().explode()

    # lemmatize and stem
    if lemmatize or stem:
        srs = srs.dropna()
    if lemmatize:
        lem = WordNetLemmatizer()
        srs = srs.apply(lambda x: lem.lemmatize(x), meta=(srs.name, "object"))
    if stem:
        porter = PorterStemmer()
        srs = srs.apply(lambda x: porter.stem(x), meta=(srs.name, "object"))

    # frequency of each word, excluding null values
    word_cnts = srs.value_counts(sort=False)
    # total number of words
    nwords = word_cnts.sum()
    # total number of unique words
    nuniq_words = word_cnts.shape[0]
    # words with the highest frequency
    fnl_word_cnts = word_cnts.nlargest(n=top_words)

    return {"word_cnts": fnl_word_cnts, "nwords": nwords, "nuniq_words": nuniq_words}
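# Usage sketch (illustrative only; assumes `english_stopwords` contains common
# English stop words such as "the" and "on", and that the NLTK "wordnet"
# corpus has been downloaded if lemmatize=True). The returned values are lazy
# dask objects, so call .compute() to materialize them:
#
#     srs = dd.from_pandas(
#         pd.Series(["the cats sat on the mat", "cats chase the mice"]),
#         npartitions=1,
#     )
#     res = calc_word_freq(srs, top_words=5)
#     res["word_cnts"].compute()  # e.g. cats -> 2, sat/mat/chase/mice -> 1
#     res["nwords"].compute()     # 6 words in total after stop-word removal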