def dataset_stat_latex_print(dataset_name):
    """Print the avg precision, recall and F1 score in LaTeX table-row
    format to the console.

    One ``\\texttt{name} & p & r & f1`` row per extractor, sorted by
    F1 score, best first.
    """
    # get results
    txt_results = TextBasedResults()
    txt_results.load(dataset_name)
    txt_results.print_results()
    # package results: (name, avg precision, avg recall, avg F1) per extractor
    elist = extractor_list_filter(txt_results.text_eval_results.keys())
    extractor_slugs = tuple(e.SLUG for e in elist)
    result_list = []
    for e in extractor_slugs:
        result_tuple = (
            get_extractor_cls(e).NAME,
            txt_results.precision_statistics(e)[0],
            txt_results.recall_statistics(e)[0],
            txt_results.f1score_statistics(e)[0],
        )
        result_list.append(result_tuple)
    # highest F1 first
    result_list.sort(key=lambda r: r[3], reverse=True)
    for r in result_list:
        # print() form works on Python 2 and 3 (was a py2-only print statement)
        print("\\texttt{%s} & %.4f & %.4f & %.4f \\\\ \\hline" % r)
def dataset_stat_latex_print(dataset_name):
    '''Print the avg precision, recall and F1 score in LaTeX row format.

    NOTE(review): this duplicates an earlier definition of the same name
    in this file; the later definition wins at import time -- confirm
    which variant is intended.
    '''
    # get results
    txt_results = TextBasedResults()
    txt_results.load(dataset_name)
    txt_results.print_results()
    # collect one (name, precision, recall, f1) row per known extractor
    elist = extractor_list_filter(txt_results.text_eval_results.keys())
    rows = [
        (
            get_extractor_cls(slug).NAME,
            txt_results.precision_statistics(slug)[0],
            txt_results.recall_statistics(slug)[0],
            txt_results.f1score_statistics(slug)[0],
        )
        for slug in (e.SLUG for e in elist)
    ]
    # emit highest F1 first
    for row in sorted(rows, key=lambda r: r[3], reverse=True):
        # print() works on Python 2 and 3 (original used a py2 print statement)
        print('\\texttt{%s} & %.4f & %.4f & %.4f \\\\ \\hline' % row)
def local_evaluate(dataset_type, dataset_name, update_ext_slug=None):
    """Evaluate extractors on a local dataset, then save and print results.

    When *update_ext_slug* is given, previously stored results are loaded
    and only that single extractor is re-evaluated; otherwise every
    extractor in ``extractor_list`` is evaluated from scratch.
    """
    results = TextBasedResults()
    if update_ext_slug:
        # incremental update: refresh one extractor on top of stored results
        results.load(dataset_name)
        to_evaluate = [get_extractor_cls(update_ext_slug)]
    else:
        to_evaluate = extractor_list
    for cls in to_evaluate:
        single_evaluation(cls, results, dataset_type, dataset_name)
    results.dataset_len = len(LocalDatasetLoader(dataset_name))
    results.save(dataset_name)
    results.print_results()
def local_evaluate(dataset_type, dataset_name, update_ext_slug=None):
    """Run text-based evaluation over a local dataset and persist it.

    With *update_ext_slug* set, existing results are loaded and only that
    extractor is re-run; otherwise all registered extractors are evaluated.
    """
    txt_results = TextBasedResults()
    if not update_ext_slug:
        for cls in extractor_list:
            single_evaluation(cls, txt_results, dataset_type, dataset_name)
    else:
        # start from the previously saved results and refresh one extractor
        txt_results.load(dataset_name)
        single_evaluation(get_extractor_cls(update_ext_slug),
                          txt_results, dataset_type, dataset_name)
    txt_results.dataset_len = len(LocalDatasetLoader(dataset_name))
    txt_results.save(dataset_name)
    txt_results.print_results()
def local_extract(dataset_name, extractor_slug, timeout, retry_failed, skip_existing):
    """Run one extractor over every document of a local dataset.

    *timeout* is a per-document pause in seconds (falsy disables it);
    *retry_failed* re-loads previously failed documents and *skip_existing*
    skips documents that already have stored results for this extractor.
    """
    # init storage and loader
    extractor_cls = get_extractor_cls(extractor_slug)
    doc_loader = LocalDatasetLoader(
        dataset_name,
        load_failed=extractor_slug if retry_failed else None,
        skip_existing=extractor_slug if skip_existing else None,
    )
    result_store = LocalResultStorage(dataset_name, extractor_cls)
    logger.info('started extracting content from %s dataset using %s',
                dataset_name, extractor_cls.NAME)
    for document in doc_loader:
        result_store.push_result(document)
        if timeout:
            # throttle between documents
            time.sleep(timeout)
    result_store.dump_summary()
    logger.info('finished with %s dataset', dataset_name)
def dataset_stat_plot(dataset_name, img_name):
    """Render a three-panel bar chart (avg precision / recall / F1, with
    stddev error bars) for *dataset_name* and save it as *img_name*.
    """
    # get results
    txt_results = TextBasedResults()
    txt_results.load(dataset_name)
    txt_results.print_results()
    # package results: one (title, [((avg, stddev), slug), ...]) per metric
    elist = extractor_list_filter(txt_results.text_eval_results.keys())
    extractor_slugs = tuple([e.SLUG for e in elist])
    metric_panels = (
        ("Precision", [(txt_results.precision_statistics(e), e) for e in extractor_slugs]),
        ("Recall", [(txt_results.recall_statistics(e), e) for e in extractor_slugs]),
        ("F1 score", [(txt_results.f1score_statistics(e), e) for e in extractor_slugs]),
    )
    panel_colors = ("b", "c", "m")
    x_pos = np.arange(len(extractor_slugs))  # the x locations for the groups
    bar_width = 0.6  # the width of the bars
    for panel_idx, (title, stats) in enumerate(metric_panels):
        # order extractors best-first by the metric average
        ordered = sorted(stats, key=lambda item: item[0][0])[::-1]
        averages = [entry[0][0] for entry in ordered]
        stddevs = [entry[0][1] for entry in ordered]
        # plot
        plt.subplot(3, 1, panel_idx + 1)
        plt.grid(True, alpha=0.5)
        bars = plt.bar(x_pos, averages, bar_width, color=panel_colors[panel_idx],
                       ecolor="g", yerr=stddevs, linewidth=0.5, alpha=0.8)
        # labels and titles
        names = [get_extractor_cls(entry[1]).NAME for entry in ordered]
        plt.title(title)
        plt.xticks(x_pos + bar_width / 2.0, names, size="xx-small", rotation="vertical")
        plt.legend((bars[0],), ("avg",), fancybox=True,
                   prop=dict(size="x-small"), loc=4)  # lower right
        for bar in bars:
            # annotate each bar with its average value
            plt.text(bar.get_x() + bar.get_width() / 2.25,
                     bar.get_height() + 0.01,
                     "%1.2f" % bar.get_height(),
                     ha="center", va="bottom", size="x-small")
    # subplots adjusting
    plt.subplots_adjust(wspace=0.5, hspace=0.9)
    # adjust figure height
    fig = plt.gcf()
    w, h = fig.get_size_inches()
    fig.set_size_inches(w, h * 1.6)
    # output
    out_path = os.path.join(settings.PATH_LOCAL_DATA, "plot-output", img_name)
    plt.savefig(out_path)
def dataset_contents_plot(dataset_name, img_name):
    """Plot the error case analysis for *dataset_name*.

    Draws two stacked bar charts -- the left one including successful
    instances, the right one with only the boundary/error cases -- and
    saves the figure as *img_name*.
    """
    # get results
    txt_results = TextBasedResults()
    txt_results.load(dataset_name)
    txt_results.print_results()
    # package data: (label, bar color, per-extractor counts)
    elist = extractor_list_filter(txt_results.text_eval_results.keys())
    extractor_slugs = tuple([e.SLUG for e in elist])
    package = [
        ("|rel| = 0", "#9DFADE",
         [txt_results.result_contents(ex).rel_empty for ex in extractor_slugs]),
        ("|rel intersect ret| = 0", "#3C70A3",
         [txt_results.result_contents(ex).rel_ret_empty for ex in extractor_slugs]),
        ("|ret| = 0", "#5CCBED",
         [txt_results.result_contents(ex).ret_empty for ex in extractor_slugs]),
        ("mismatch", "#A76CF5",
         [txt_results.result_contents(ex).missmatch for ex in extractor_slugs]),
        ("failed", "#C43156",
         [txt_results.result_contents(ex).fail for ex in extractor_slugs]),
        ("successful", "#31C460",
         [txt_results.result_contents(ex).succ for ex in extractor_slugs]),
    ]
    n_extractors = len(extractor_slugs)
    x_pos = np.arange(n_extractors)  # the x locations for the groups
    bar_width = 0.6
    fig = plt.gcf()
    fig.legend(
        [plt.Rectangle((0, 0), 1, 1, fc=entry[1]) for entry in package],
        [entry[0] for entry in package],
        fancybox=True,
        prop=dict(size="x-small"),
    )

    def _stacked_bars(axis, categories):
        # draw one stacked-bar chart, one colored segment per category
        stack_base = np.zeros(n_extractors)
        for _label, color, counts in categories:
            axis.bar(x_pos, counts, bar_width, bottom=stack_base,
                     color=color, ecolor="g", linewidth=0.2, alpha=0.95)
            stack_base += counts

    # left: with successful instances
    ax1 = plt.subplot(121)
    _stacked_bars(ax1, package)
    # right: boundary/error cases only
    ax2 = plt.subplot(122)
    del package[-1]  # drop the "successful" category
    _stacked_bars(ax2, package)
    # xticks labels and grid settings
    extractor_names = [get_extractor_cls(e).NAME for e in extractor_slugs]
    for axis in (ax1, ax2):
        axis.set_xticks(x_pos + bar_width / 2.0)
        axis.set_xticklabels(extractor_names, size="xx-small", rotation="vertical")
        axis.grid(True, alpha=0.5)
    fig.suptitle("Boundary cases")
    # adjustment
    w, h = fig.get_size_inches()
    fig.set_size_inches(w * 1.5, h * 1.5)
    fig.subplots_adjust(bottom=0.2)
    # output
    out_path = os.path.join(settings.PATH_LOCAL_DATA, "plot-output", img_name)
    fig.savefig(out_path, bbox_inches="tight")
def dataset_stat_plot(dataset_name, img_name): ''' Plot the avg precision, recall and F1 score bar chart for the given dataset name. ''' # get results txt_results = TextBasedResults() txt_results.load(dataset_name) txt_results.print_results() #package results elist = extractor_list_filter(txt_results.text_eval_results.keys()) extractor_slugs = tuple([e.SLUG for e in elist]) packaged_data = ( ('Precision', [(txt_results.precision_statistics(e), e) for e in extractor_slugs]), ('Recall', [(txt_results.recall_statistics(e), e) for e in extractor_slugs]), ('F1 score', [(txt_results.f1score_statistics(e), e) for e in extractor_slugs]), ) bar_color = ('b', 'c', 'm') for i, pdata in enumerate(packaged_data): # package plotting values num_of_extractors = len(extractor_slugs) ind = np.arange(num_of_extractors) # the x locations for the groups width = 0.6 # the width of the bars result_list = pdata[1] result_list.sort(key=lambda i: i[0][0]) result_list.reverse() avg = [x[0][0] for x in result_list] stddev = [x[0][1] for x in result_list] # plot plt.subplot(3, 1, i + 1) plt.grid(True, alpha=0.5) rects_avg = plt.bar(ind, avg, width, color=bar_color[i], ecolor='g', yerr=stddev, linewidth=0.5, alpha=0.8) # lables and titles extractor_names = [get_extractor_cls(r[1]).NAME for r in result_list] plt.title(pdata[0]) plt.xticks(ind + width / 2., extractor_names, size='xx-small', rotation='vertical') plt.legend( (rects_avg[0], ), ('avg', ), fancybox=True, prop=dict(size='x-small'), loc=4 # lower right ) for rect in rects_avg: height = rect.get_height() plt.text(rect.get_x() + rect.get_width() / 2.25, rect.get_height() + 0.01, '%1.2f' % height, ha='center', va='bottom', size='x-small') #subplots adjusting plt.subplots_adjust(wspace=0.5, hspace=0.9) #adjust figure height fig = plt.gcf() w, h = fig.get_size_inches() fig.set_size_inches(w, h * 1.6) # output out_path = os.path.join(settings.PATH_LOCAL_DATA, 'plot-output', img_name) plt.savefig(out_path)
def dataset_contents_plot(dataset_name, img_name): '''Plot the error case analysis.''' # get results txt_results = TextBasedResults() txt_results.load(dataset_name) txt_results.print_results() # package data elist = extractor_list_filter(txt_results.text_eval_results.keys()) extractor_slugs = tuple([e.SLUG for e in elist]) package = [ ('|rel| = 0', '#9DFADE', [txt_results.result_contents(ex).rel_empty for ex in extractor_slugs]), ('|rel intersect ret| = 0', '#3C70A3', [ txt_results.result_contents(ex).rel_ret_empty for ex in extractor_slugs ]), ('|ret| = 0', '#5CCBED', [txt_results.result_contents(ex).ret_empty for ex in extractor_slugs]), ('mismatch', '#A76CF5', [txt_results.result_contents(ex).missmatch for ex in extractor_slugs]), ('failed', '#C43156', [txt_results.result_contents(ex).fail for ex in extractor_slugs]), ('successful', '#31C460', [txt_results.result_contents(ex).succ for ex in extractor_slugs]), ] num_of_extractors = len(extractor_slugs) ind = np.arange(num_of_extractors) # the x locations for the groups width = 0.6 fig = plt.gcf() fig.legend( [plt.Rectangle((0, 0), 1, 1, fc=p[1]) for p in package], [p[0] for p in package], fancybox=True, prop=dict(size='x-small'), ) # with successful instances ax1 = plt.subplot(121) bottom_y = np.zeros(num_of_extractors) for pdata in package: ax1.bar(ind, pdata[2], width, bottom=bottom_y, color=pdata[1], ecolor='g', linewidth=0.2, alpha=0.95) bottom_y += pdata[2] ax2 = plt.subplot(122) bottom_y = np.zeros(num_of_extractors) del package[-1] for pdata in package: ax2.bar(ind, pdata[2], width, bottom=bottom_y, color=pdata[1], ecolor='g', linewidth=0.2, alpha=0.95) bottom_y += pdata[2] # xticks labels extractor_names = [get_extractor_cls(e).NAME for e in extractor_slugs] ax1.set_xticks(ind + width / 2.) ax1.set_xticklabels(extractor_names, size='xx-small', rotation='vertical') ax2.set_xticks(ind + width / 2.) 
ax2.set_xticklabels(extractor_names, size='xx-small', rotation='vertical') # grid settings fig.suptitle('Boundary cases') ax1.grid(True, alpha=0.5) ax2.grid(True, alpha=0.5) # adjustment w, h = fig.get_size_inches() fig.set_size_inches(w * 1.5, h * 1.5) fig.subplots_adjust(bottom=0.2) # output out_path = os.path.join(settings.PATH_LOCAL_DATA, 'plot-output', img_name) fig.savefig(out_path, bbox_inches='tight')