def get_shingles(self, input_text, prefix=None):
    """Return a vector of shingles from a source text

    The input is passed through the configured normalizer (if any) and,
    unless the result is already iterable, through the tokenizer.  In
    non-unique mode with a positive ``kmin``, the token sequence is extended
    by cycling over itself until ``kmin`` shingles can be taken.

    :param input_text: Input sequence
    :type input_text: collections.Iterable
    :param prefix: an object to prepend to token sequence
    :type prefix: object
    :return: A set of shingles (tuples) in unique mode, otherwise a list
    :rtype: set, list
    """
    normalizer = self._normalizer
    if normalizer is None:
        text = input_text
    else:
        text = normalizer.normalize(input_text)
    tokens = text if isiterable(text) else self._tokenize(text)

    span = self._span
    unique = self._unique
    kmin = self._kmin
    if not unique and kmin > 0:
        # Counting requires a sized container.  Generators and other
        # one-shot iterables have no len(), so materialize them first
        # instead of failing with TypeError.
        try:
            token_count = len(tokens)
        except TypeError:
            tokens = list(tokens)
            token_count = len(tokens)

        # cycle tokens until we can take kmin shingles
        prefix_token_count = 0 if prefix is None else 1
        num_shingles = token_count - span + prefix_token_count + 1
        append_num = kmin - num_shingles
        if append_num > 0:
            tokens = take(token_count + append_num, cycle(tokens))

    final_it = tokens if prefix is None else chain([prefix], tokens)
    shingles = self._shinglify(final_it, span, skip=self._skip)
    return set(shingles) if unique else list(shingles)
def create_plots(args, df):
    """Write one figure and one CSV per group of ``df``, plus two HTML indexes.

    ``df`` is grouped by the ``args.group_by`` column.  Within each group the
    rows are sorted by ``args.x_axis``, every metric column is scored with
    ``auc_xscaled`` and the metrics are plotted in descending score order
    with the score appended to the legend label.

    Files written into ``args.output``:

    * ``fig-<group>.<args.fig_format>`` -- the rendered figure
    * ``fig-<group>.csv`` -- the plotted data
    * ``fig_interactive.html`` and ``fig_static.html`` -- rendered from the
      Jinja2 templates ``template_fig_interactive.html`` and
      ``template_fig_static.html`` found in the parent directory of
      ``args.output``

    :param args: options object; the attributes read are ``group_by``,
        ``metrics``, ``x_axis``, ``output``, ``fig_title``, ``legend_loc``
        and ``fig_format``
    :param df: data holding the x-axis, group-by and metric columns
    :type df: pandas.DataFrame
    :raises ValueError: when a group is processed and none of
        ``args.metrics`` is a column of ``df``
    """
    import jinja2
    import matplotlib.pyplot as plt
    from palettable import colorbrewer
    from matplotlib.font_manager import FontProperties

    legend_font = FontProperties()
    legend_font.set_size("xx-small")

    groups = df.groupby([args.group_by])
    metrics = list(set(args.metrics) & set(df.keys()))

    # One fixed color per metric.  Kept in its own name (and as a list) so
    # that the per-group reordering below cannot change which color a metric
    # gets in later figures.
    palette = list(
        take(
            len(metrics),
            cycle(
                chain(
                    colorbrewer.qualitative.Dark2_8.mpl_colors,
                    colorbrewer.qualitative.Set2_8.mpl_colors,
                )
            ),
        )
    )

    template_loader = jinja2.FileSystemLoader(os.path.join(args.output, ".."))
    template_env = jinja2.Environment(loader=template_loader)
    template_interactive = template_env.get_template("template_fig_interactive.html")
    template_static = template_env.get_template("template_fig_static.html")

    table_interactive = []
    table_static = []
    for group_name, group in groups:
        # pandas >= 2.0 yields 1-tuples as keys when grouping by a
        # one-element list; unwrap so titles and file names stay plain.
        if isinstance(group_name, tuple) and len(group_name) == 1:
            group_name = group_name[0]

        # always sort by X values (DataFrame.sort() was removed in
        # pandas 0.20 in favour of sort_values())
        if hasattr(group, "sort_values"):
            group = group.sort_values([args.x_axis])
        else:
            group = group.sort([args.x_axis])

        group_label = "%s=%s" % (args.group_by, group_name)
        fig_title = group_label if args.fig_title is None else args.fig_title

        # compute AUC scores
        x_values = group[args.x_axis].values
        ys = []
        for metric, color in zip(metrics, palette):
            score = auc_xscaled(x_values, group[metric].values)
            label = "%s (%.4f)" % (metric, score)
            ys.append((score, metric, label, color))
        if not ys:
            raise ValueError(
                "none of the requested metrics %r is a column of the data"
                % (list(args.metrics),)
            )

        # best-scoring metric first; list() keeps this working where zip()
        # returns an iterator
        ys.sort(reverse=True)
        lbls_old, lbls_new, line_colors = list(zip(*ys))[1:4]
        group = (
            group[[args.x_axis] + list(lbls_old)]
            .set_index(args.x_axis)
            .rename(columns=dict(zip(lbls_old, lbls_new)))
        )

        # create plots
        fig, ax = plt.subplots()
        group.plot(ax=ax, title=fig_title, color=list(line_colors))
        ax.set_xlim(*minmaxr(group.index.values))
        ax.set_ylim(0.4, 1.0)
        ax.legend(loc=args.legend_loc, prop=legend_font)

        fig_name = "fig-%s.%s" % (group_name, args.fig_format)
        fig_path = os.path.join(args.output, fig_name)
        csv_name = "fig-%s.csv" % group_name
        csv_path = os.path.join(args.output, csv_name)
        group.to_csv(csv_path)
        table_interactive.append((csv_name, args.x_axis, group_label))
        table_static.append(fig_name)
        fig.savefig(fig_path, format=args.fig_format)
        plt.close(fig)

    with open(os.path.join(args.output, "fig_interactive.html"), "w") as fh:
        fh.write(template_interactive.render(table=table_interactive))
    with open(os.path.join(args.output, "fig_static.html"), "w") as fh:
        fh.write(template_static.render(table=table_static))
cat_filter=cat_filter) data = dataset.data[:n_samples] samples = data elif args.input is not None: if args.ground_tag is not None: get_ground_truth = partial(has_common_tags, TAG_MAP[args.ground_tag]) elif args.ground_attr is not None: get_ground_truth = ATTR_MAP[args.ground_attr] else: raise ValueError("neither ground_tag nor ground_attr specified") with open(args.input, 'r') as fh: dataset = imap(json.loads, fh) data = take(n_samples, dataset) samples = [s['object'] for s in data] else: raise ValueError("No input sources specified.") if n_samples == float('inf'): n_samples = len(data) assert n_samples >= 2 if n_topics is None: n_topics = 2 if args.n_features is None: n_features = n_topics + 1
def create_plots(args, df):
    """Plot every group of ``df`` and write figures, CSVs and HTML indexes.

    The frame is grouped by the ``args.group_by`` column.  Each group is
    sorted by ``args.x_axis``; its metric columns are scored with
    ``auc_xscaled``, ordered by descending score, relabelled as
    ``"<metric> (<score>)"`` and drawn on a single axis.

    Output, all under ``args.output``:

    * ``fig-<group>.<args.fig_format>`` and ``fig-<group>.csv`` per group
    * ``fig_interactive.html`` and ``fig_static.html``, rendered from the
      Jinja2 templates ``template_fig_interactive.html`` and
      ``template_fig_static.html`` loaded from the parent directory of
      ``args.output``

    :param args: options object; reads ``group_by``, ``metrics``,
        ``x_axis``, ``output``, ``fig_title``, ``legend_loc`` and
        ``fig_format``
    :param df: data with the x-axis, group-by and metric columns
    :type df: pandas.DataFrame
    :raises ValueError: when a group is processed and no entry of
        ``args.metrics`` is a column of ``df``
    """
    import jinja2
    import matplotlib.pyplot as plt
    from palettable import colorbrewer
    from matplotlib.font_manager import FontProperties

    fontP = FontProperties()
    fontP.set_size('xx-small')

    groups = df.groupby([args.group_by])
    metrics = list(set(args.metrics) & set(df.keys()))

    # Color assigned to each metric for the whole run.  The loop below must
    # not rebind this name, otherwise later groups would pair metrics with
    # the previous group's score-sorted colors.
    metric_colors = list(take(
        len(metrics),
        cycle(
            chain(
                colorbrewer.qualitative.Dark2_8.mpl_colors,
                colorbrewer.qualitative.Set2_8.mpl_colors,
            ))))

    template_loader = jinja2.FileSystemLoader(os.path.join(args.output, '..'))
    template_env = jinja2.Environment(loader=template_loader)
    template_interactive = template_env.get_template(
        'template_fig_interactive.html')
    template_static = template_env.get_template('template_fig_static.html')

    table_interactive = []
    table_static = []
    for group_name, group in groups:
        # Grouping by a one-element list makes pandas >= 2.0 return 1-tuple
        # keys; reduce them to the bare value used in titles and file names.
        if isinstance(group_name, tuple) and len(group_name) == 1:
            group_name = group_name[0]

        # always sort by X values; DataFrame.sort() no longer exists as of
        # pandas 0.20, so use sort_values() whenever it is available
        sort_rows = getattr(group, 'sort_values', None)
        if sort_rows is None:
            sort_rows = group.sort
        group = sort_rows([args.x_axis])

        if args.fig_title is None:
            fig_title = '%s=%s' % (args.group_by, group_name)
        else:
            fig_title = args.fig_title

        # compute AUC scores
        ys = []
        for metric, color in zip(metrics, metric_colors):
            series = group[metric]
            score = auc_xscaled(group[args.x_axis].values, series.values)
            label = "%s (%.4f)" % (metric, score)
            ys.append((score, metric, label, color))
        if not ys:
            raise ValueError(
                'no requested metric is a column of the data: %r'
                % (list(args.metrics),))

        # highest score first; zip() is wrapped in list() because it is
        # not subscriptable where it returns an iterator
        ys.sort(reverse=True)
        lbls_old, lbls_new, plot_colors = list(zip(*ys))[1:4]
        group = group[[args.x_axis] + list(lbls_old)] \
            .set_index(args.x_axis) \
            .rename(columns=dict(zip(lbls_old, lbls_new)))

        # create plots
        fig, ax = plt.subplots()
        group.plot(ax=ax, title=fig_title, color=list(plot_colors))
        ax.set_xlim(*minmaxr(group.index.values))
        ax.set_ylim(0.4, 1.0)
        ax.legend(loc=args.legend_loc, prop=fontP)

        fig_name = 'fig-%s.%s' % (group_name, args.fig_format)
        fig_path = os.path.join(args.output, fig_name)
        csv_name = 'fig-%s.csv' % group_name
        csv_path = os.path.join(args.output, csv_name)
        group.to_csv(csv_path)
        table_interactive.append((
            csv_name,
            args.x_axis,
            "%s=%s" % (args.group_by, group_name),
        ))
        table_static.append(fig_name)
        fig.savefig(fig_path, format=args.fig_format)
        plt.close(fig)

    with open(os.path.join(args.output, 'fig_interactive.html'), 'w') as fh:
        fh.write(template_interactive.render(table=table_interactive))
    with open(os.path.join(args.output, 'fig_static.html'), 'w') as fh:
        fh.write(template_static.render(table=table_static))