def test_frequencies_entries():
    # Full (value, count) list for col2, most frequent first.
    analysis = analyze.textAnalysis(df['col2'])
    expected = [
        ('random', 2),
        ('1 world', 1),
        ('hello world 2', 1),
        ('hello hello', 1),
        ('world world', 1),
    ]
    assert analysis["frequencies"] == expected
def test_frequencies_entries():
    # Row-value frequencies of col2, ordered by descending count.
    result = analyze.textAnalysis(df["col2"])["frequencies"]
    assert result == [
        ("random", 2),
        ("1 world", 1),
        ("hello world 2", 1),
        ("hello hello", 1),
        ("world world", 1),
    ]
def frequency(df, columnIndex, options=None):
    """Plot a horizontal bar chart of value (or word) frequencies for one column.

    Uses :meth:`pandas.Series.value_counts` — or the ``word_frequencies``
    result of :func:`textAnalysis` when ``options["useWords"]`` is true for a
    non-numeric, non-datetime column — then draws the top ``cutoff`` entries
    with matplotlib and returns the chart as a Base64-encoded PNG.

    Args:
        df (pandas.DataFrame): data frame
        columnIndex (int): index of the column to plot
        options (dict, optional): may contain ``useWords`` (bool) and
            ``cutoff`` (int, 1-50; default 50)

    Returns:
        dict: ``{"image": <Base64-encoded PNG string>}``
    """
    # FIX: default was a shared mutable ``{}``; use None and create per call.
    if options is None:
        options = {}
    cutoff = 50
    useWords = False
    column = df[df.columns[columnIndex]]
    if isinstance(options, dict):
        # Word mode only makes sense for string-like (non-numeric/date) columns.
        if (options.get("useWords", False) is True
                and not issubclass(column.dtype.type, np.datetime64)
                and not issubclass(column.dtype.type, np.number)):
            useWords = True
        if 0 < options.get("cutoff", -1) <= 50:
            cutoff = int(options["cutoff"])
    values = []
    counts = []
    if useWords:
        tuples = textAnalysis(column)["word_frequencies"]
        # reversed() so the most frequent entry ends up at the top of barh.
        for x in reversed(tuples[:cutoff]):
            values.append(x[0].decode("utf-8", "replace")
                          if isinstance(x[0], basestring) else x[0])
            counts.append(x[1])
    else:
        tuples = column.value_counts()
        for index in range(min(cutoff - 1, len(tuples) - 1), -1, -1):
            values.append(tuples.index[index].decode("utf-8", "replace")
                          if isinstance(tuples.index[index], basestring)
                          else tuples.index[index])
            counts.append(tuples.iloc[index])
    pyplot.style.use('ggplot')
    fig = pyplot.figure(figsize=(10, 8))
    ax = fig.add_subplot(111)
    ax.set_ylim(-0.5, len(values) - 0.5)
    ax.barh(np.arange(len(values)), counts, tick_label=values, align="center")
    ax.set_xlabel("Frequency")
    ax.set_ylabel("Value")
    stream = StringIO()
    fig.savefig(stream, format="png", dpi=300)
    pyplot.close(fig)
    return {'image': base64.b64encode(stream.getvalue()).decode('utf-8')}
def frequency(df, columnIndex, options=None):
    """Generate a horizontal frequency bar chart for one DataFrame column.

    Counts come from :meth:`pandas.Series.value_counts`, or from
    :func:`textAnalysis`'s ``word_frequencies`` when ``options["useWords"]``
    is true for a non-numeric, non-datetime column. The top ``cutoff``
    entries are plotted and the figure is returned Base64-encoded.

    Args:
        df (pandas.DataFrame): data frame
        columnIndex (int): index of the column to plot
        options (dict, optional): ``useWords`` (bool) and/or ``cutoff``
            (int, 1-50; default 50)

    Returns:
        dict: ``{"image": <Base64-encoded PNG string>}``
    """
    # FIX: mutable default argument replaced with None sentinel.
    if options is None:
        options = {}
    cutoff = 50
    useWords = False
    column = df[df.columns[columnIndex]]
    if isinstance(options, dict):
        # Word-frequency mode is restricted to string-like columns.
        if (options.get("useWords", False) is True
                and not issubclass(column.dtype.type, np.datetime64)
                and not issubclass(column.dtype.type, np.number)):
            useWords = True
        if 0 < options.get("cutoff", -1) <= 50:
            cutoff = int(options["cutoff"])
    values = []
    counts = []
    if useWords:
        tuples = textAnalysis(column)["word_frequencies"]
        # Reverse so the highest count lands at the top of the chart.
        for x in reversed(tuples[:cutoff]):
            values.append(x[0].decode("utf-8", "replace")
                          if isinstance(x[0], basestring) else x[0])
            counts.append(x[1])
    else:
        tuples = column.value_counts()
        for index in range(min(cutoff - 1, len(tuples) - 1), -1, -1):
            values.append(tuples.index[index].decode("utf-8", "replace")
                          if isinstance(tuples.index[index], basestring)
                          else tuples.index[index])
            counts.append(tuples.iloc[index])
    pyplot.style.use('ggplot')
    fig = pyplot.figure(figsize=(10, 8))
    ax = fig.add_subplot(111)
    ax.set_ylim(-0.5, len(values) - 0.5)
    ax.barh(np.arange(len(values)), counts, tick_label=values, align="center")
    ax.set_xlabel("Frequency")
    ax.set_ylabel("Value")
    stream = StringIO()
    fig.savefig(stream, format="png", dpi=300)
    pyplot.close(fig)
    return {'image': base64.b64encode(stream.getvalue()).decode('utf-8')}
def test_common_word_mode():
    # The most common word appears four times across col2.
    analysis = analyze.textAnalysis(df['col2'])
    assert analysis["word_mode_frequency"] == 4
def test_unique_words():
    # col2 contains five distinct words.
    analysis = analyze.textAnalysis(df['col2'])
    assert analysis["word_unique_count"] == 5
def test_words_average():
    # Average word count per entry rounds to 1.
    analysis = analyze.textAnalysis(df['col2'])
    assert analysis["word_count_average"] == 1
def test_string_average():
    # Average word length in col2 is 4 characters.
    analysis = analyze.textAnalysis(df['col2'])
    assert analysis["word_length_average"] == 4
def test_mode_count_entries():
    # The modal row value occurs twice.
    analysis = analyze.textAnalysis(df['col2'])
    assert analysis["mode_frequency"] == 2
def test_total_words():
    # col2 holds eleven words in total.
    result = analyze.textAnalysis(df["col2"])
    assert result["word_total"] == 11
def test_words_average():
    # Mean words per entry is 1.
    result = analyze.textAnalysis(df["col2"])
    assert result["word_count_average"] == 1
def test_words_max():
    # The longest entry has three words.
    result = analyze.textAnalysis(df["col2"])
    assert result["word_count_max"] == 3
def test_string_average():
    # Mean word length is 4 characters.
    result = analyze.textAnalysis(df["col2"])
    assert result["word_length_average"] == 4
def test_string_max_length():
    # The longest word is six characters long.
    result = analyze.textAnalysis(df["col2"])
    assert result["word_length_max"] == 6
def test_mode_count_entries():
    # The most frequent row value appears twice.
    result = analyze.textAnalysis(df["col2"])
    assert result["mode_frequency"] == 2
def test_mode_entries():
    # "random" is the sole modal row value.
    result = analyze.textAnalysis(df["col2"])
    assert result["mode"] == ["random"]
def test_mode_entries():
    # The only mode of col2 is 'random'.
    analysis = analyze.textAnalysis(df['col2'])
    assert analysis["mode"] == ['random']
def test_unique_words():
    # Five distinct words occur in col2.
    result = analyze.textAnalysis(df["col2"])
    assert result["word_unique_count"] == 5
def test_string_max_length():
    # Longest word in col2 is six characters.
    analysis = analyze.textAnalysis(df['col2'])
    assert analysis["word_length_max"] == 6
def test_common_word():
    # "world" is the single most common word.
    result = analyze.textAnalysis(df["col2"])
    assert result["word_mode"] == ["world"]
def test_words_max():
    # Maximum words in a single entry is three.
    analysis = analyze.textAnalysis(df['col2'])
    assert analysis["word_count_max"] == 3
def test_common_word_mode():
    # The modal word occurs four times.
    result = analyze.textAnalysis(df["col2"])
    assert result["word_mode_frequency"] == 4
def test_total_words():
    # Eleven words total across all entries of col2.
    analysis = analyze.textAnalysis(df['col2'])
    assert analysis["word_total"] == 11
def test_word_frequency():
    # Per-word counts, ordered by descending frequency.
    result = analyze.textAnalysis(df["col2"])["word_frequencies"]
    assert result == [
        ("world", 4),
        ("hello", 3),
        ("random", 2),
        ("1", 1),
        ("2", 1),
    ]
def test_common_word():
    # 'world' alone is the most frequent word.
    analysis = analyze.textAnalysis(df['col2'])
    assert analysis["word_mode"] == ['world']
def test_invalid_entries():
    # Two entries in col2 are counted as invalid.
    result = analyze.textAnalysis(df["col2"])
    assert result["invalid"] == 2
def test_word_frequency():
    # Word frequency pairs sorted most-common first.
    analysis = analyze.textAnalysis(df['col2'])
    expected = [
        ('world', 4),
        ('hello', 3),
        ('random', 2),
        ('1', 1),
        ('2', 1),
    ]
    assert analysis["word_frequencies"] == expected
def frequency(df, columnIndex, options=None):
    """Uses ``matplotlib`` to generate a horizontal frequency bar chart of the
    specified :class:`pandas.DataFrame` column

    This function uses the :meth:`pandas.Series.value_counts` method (or
    :func:`dcs.analyze.textAnalysis`['word_frequencies'] if plotting word
    frequency) to get the (value, frequency) tuples for the specified column.
    A horizontal bar chart is generated with the
    :func:`matplotlib.axes.Axes.barh` function, and the chart is exported to
    a PNG image and then encoded into a string using Base64.

    .. note::

        The *options* kwarg can be used to customize the plot and may have
        the following key-value pairs:

        * **useWords** : a ``bool`` flag which may be set to ``True`` to plot
          word frequencies instead of row value frequencies for a string column
        * **cutoff** : an ``int`` specifying the top *n* values by frequency
          to plot, default is 50, maximum is 50

    The function returns a dictionary with the following key-value pairs:

    * **image** : *str* -- Base64 encoded PNG image of generated plot

    Args:
        df (pandas.DataFrame): data frame
        columnIndex (int): index of the column to plot
        options (dict, optional): options dictionary

    Returns:
        dict: dictionary containing image
    """
    # FIX: default was a shared mutable ``{}``; use a None sentinel instead.
    if options is None:
        options = {}
    cutoff = 50
    useWords = False
    column = df[df.columns[columnIndex]]
    if isinstance(options, dict):
        # Word mode is only valid for non-numeric, non-datetime columns.
        if (options.get("useWords", False) is True
                and not issubclass(column.dtype.type, np.datetime64)
                and not issubclass(column.dtype.type, np.number)):
            useWords = True
        if 0 < options.get("cutoff", -1) <= 50:
            cutoff = int(options["cutoff"])
    values = []
    counts = []
    if useWords:
        tuples = textAnalysis(column)["word_frequencies"]
        # reversed() puts the most frequent entry at the top of the barh chart.
        for x in reversed(tuples[:cutoff]):
            values.append(x[0].decode("utf-8", "replace")
                          if isinstance(x[0], basestring) else x[0])
            counts.append(x[1])
    else:
        tuples = column.value_counts()
        for index in range(min(cutoff - 1, len(tuples) - 1), -1, -1):
            values.append(tuples.index[index].decode("utf-8", "replace")
                          if isinstance(tuples.index[index], basestring)
                          else tuples.index[index])
            counts.append(tuples.iloc[index])
    pyplot.style.use('ggplot')
    fig = pyplot.figure(figsize=(10, 8))
    ax = fig.add_subplot(111)
    ax.set_ylim(-0.5, len(values) - 0.5)
    ax.barh(np.arange(len(values)), counts, tick_label=values, align="center")
    ax.set_xlabel("Frequency")
    ax.set_ylabel("Value")
    stream = StringIO()
    fig.savefig(stream, format="png", dpi=300)
    pyplot.close(fig)
    return {'image': base64.b64encode(stream.getvalue()).decode('utf-8')}
def frequency(df, columnIndex, options=None):
    """Uses ``matplotlib`` to generate a horizontal frequency bar chart of the
    specified :class:`pandas.DataFrame` column

    This function uses the :meth:`pandas.Series.value_counts` method (or
    :func:`dcs.analyze.textAnalysis`['word_frequencies'] if plotting word
    frequency) to get the (value, frequency) tuples for the specified column.
    A horizontal bar chart is generated with the
    :func:`matplotlib.axes.Axes.barh` function, and the chart is exported to
    a PNG image and then encoded into a string using Base64.

    .. note::

        The *options* kwarg can be used to customize the plot and may have
        the following key-value pairs:

        * **useWords** : a ``bool`` flag which may be set to ``True`` to plot
          word frequencies instead of row value frequencies for a string column
        * **cutoff** : an ``int`` specifying the top *n* values by frequency
          to plot, default is 50, maximum is 50

    The function returns a dictionary with the following key-value pairs:

    * **image** : *str* -- Base64 encoded PNG image of generated plot

    Args:
        df (pandas.DataFrame): data frame
        columnIndex (int): index of the column to plot
        options (dict, optional): options dictionary

    Returns:
        dict: dictionary containing image
    """
    # FIX: mutable default argument replaced with a None sentinel.
    if options is None:
        options = {}
    cutoff = 50
    useWords = False
    column = df[df.columns[columnIndex]]
    if isinstance(options, dict):
        # Word-frequency mode is restricted to string-like columns.
        if (options.get("useWords", False) is True
                and not issubclass(column.dtype.type, np.datetime64)
                and not issubclass(column.dtype.type, np.number)):
            useWords = True
        if 0 < options.get("cutoff", -1) <= 50:
            cutoff = int(options["cutoff"])
    values = []
    counts = []
    if useWords:
        tuples = textAnalysis(column)["word_frequencies"]
        # Reverse so the highest count renders at the top of the chart.
        for x in reversed(tuples[:cutoff]):
            values.append(x[0].decode("utf-8", "replace")
                          if isinstance(x[0], basestring) else x[0])
            counts.append(x[1])
    else:
        tuples = column.value_counts()
        for index in range(min(cutoff - 1, len(tuples) - 1), -1, -1):
            values.append(tuples.index[index].decode("utf-8", "replace")
                          if isinstance(tuples.index[index], basestring)
                          else tuples.index[index])
            counts.append(tuples.iloc[index])
    pyplot.style.use('ggplot')
    fig = pyplot.figure(figsize=(10, 8))
    ax = fig.add_subplot(111)
    ax.set_ylim(-0.5, len(values) - 0.5)
    ax.barh(np.arange(len(values)), counts, tick_label=values, align="center")
    ax.set_xlabel("Frequency")
    ax.set_ylabel("Value")
    stream = StringIO()
    fig.savefig(stream, format="png", dpi=300)
    pyplot.close(fig)
    return {'image': base64.b64encode(stream.getvalue()).decode('utf-8')}
def test_invalid_entries():
    # col2 contains exactly two invalid entries.
    analysis = analyze.textAnalysis(df['col2'])
    assert analysis["invalid"] == 2