def plot_crawldb_status(self, data, row_filter, img_file, ratio=1.0):
    """Plot CrawlDb size and status counts per crawl as a stacked horizontal bar chart.

    data       : pandas DataFrame with at least 'crawl', 'type' and 'size' columns
    row_filter : list of 'crawldb:status:db_*' type names to keep; the prefix
                 is stripped from the labels for display
    img_file   : output file name, resolved relative to PLOTDIR
    ratio      : ignored — the aspect ratio is recomputed below from the
                 number of distinct crawls so each bar keeps a constant height
    Returns the ggplot2 plot object after saving it.
    """
    if row_filter:
        data = data[data['type'].isin(row_filter)]
    categories = []
    for value in row_filter:
        if re.search('^crawldb:status:db_', value):
            replacement = re.sub('^crawldb:status:db_', '', value)
            categories.append(replacement)
            data.replace(to_replace=value, value=replacement, inplace=True)
    # BUG FIX: the original passed categories=categories.reverse() —
    # list.reverse() reverses in place and returns None, so the Categorical
    # received categories=None and the explicit ordering was silently lost.
    # Pass a reversed copy instead.
    data['type'] = pandas.Categorical(data['type'], ordered=True,
                                      categories=list(reversed(categories)))
    data['size'] = data['size'].astype(float)
    # scale the plot height with the number of crawls so bar height stays stable
    ratio = 0.1 + len(data['crawl'].unique()) * .03
    print(data)
    p = ggplot2.ggplot(data) \
        + ggplot2.aes_string(x='crawl', y='size', fill='type') \
        + ggplot2.geom_bar(stat='identity', position='stack', width=.9) \
        + ggplot2.coord_flip() \
        + ggplot2.scale_fill_brewer(palette='Pastel1', type='sequential',
                                    guide=ggplot2.guide_legend(reverse=False)) \
        + GGPLOT2_THEME \
        + ggplot2.theme(**{'legend.position': 'bottom',
                           'aspect.ratio': ratio}) \
        + ggplot2.labs(title='CrawlDb Size and Status Counts\n(before crawling)',
                       x='', y='', fill='')
    img_path = os.path.join(PLOTDIR, img_file)
    p.save(img_path, height=int(7 * ratio), width=7)
    return p
def plot_fetch_status(self, data, row_filter, img_file, ratio=1.0):
    """Plot the percentage of each fetch status per crawl as a stacked bar chart.

    data       : pandas DataFrame with 'crawl', 'percentage' and 'type' columns
    row_filter : list of 'fetcher:' / 'fetcher:aggr:' type names to keep; the
                 prefix is stripped from the labels for display
    img_file   : output file name, resolved relative to PLOTDIR
    ratio      : ignored — the aspect ratio is recomputed below from the
                 number of distinct crawls so each bar keeps a constant height
    Returns the ggplot2 plot object after saving it.
    """
    if row_filter:
        data = data[data['type'].isin(row_filter)]
    data = data[['crawl', 'percentage', 'type']]
    categories = []
    for value in row_filter:
        if re.search('^fetcher:(?:aggr:)?', value):
            replacement = re.sub('^fetcher:(?:aggr:)?', '', value)
            categories.append(replacement)
            data.replace(to_replace=value, value=replacement, inplace=True)
    # BUG FIX: the original passed categories=categories.reverse() —
    # list.reverse() reverses in place and returns None, so the Categorical
    # received categories=None and the explicit ordering was silently lost.
    # Pass a reversed copy instead.
    data['type'] = pandas.Categorical(data['type'], ordered=True,
                                      categories=list(reversed(categories)))
    # scale the plot height with the number of crawls so bar height stays stable
    ratio = 0.1 + len(data['crawl'].unique()) * .03
    p = ggplot2.ggplot(data) \
        + ggplot2.aes_string(x='crawl', y='percentage', fill='type') \
        + ggplot2.geom_bar(stat='identity', position='stack', width=.9) \
        + ggplot2.coord_flip() \
        + ggplot2.scale_fill_brewer(palette='RdYlGn', type='sequential',
                                    guide=ggplot2.guide_legend(reverse=True)) \
        + GGPLOT2_THEME \
        + ggplot2.theme(**{'legend.position': 'bottom',
                           'aspect.ratio': ratio}) \
        + ggplot2.labs(title='Percentage of Fetch Status', x='', y='', fill='')
    img_path = os.path.join(PLOTDIR, img_file)
    p.save(img_path, height=int(7 * ratio), width=7)
    return p
def plot_stacked_bar(self, data, row_filter, img_file, ratio=1.0):
    """Render a stacked horizontal bar chart of fetch-status percentages.

    data       : pandas DataFrame with 'crawl', 'percentage' and 'type' columns
    row_filter : list of 'type' values to keep (empty list keeps everything);
                 'fetcher:' / 'fetcher:aggr:' prefixes are stripped for display
    img_file   : output file name, resolved relative to PLOTDIR
    ratio      : aspect ratio forwarded to the ggplot2 theme
    Returns the ggplot2 plot object after saving it.
    """
    if len(row_filter) > 0:
        data = data[data['type'].isin(row_filter)]
    # strip the fetcher:(aggr:) prefix from status labels for readability
    prefix = re.compile('^fetcher:(?:aggr:)?')
    for label in row_filter:
        if prefix.search(label):
            data.replace(to_replace=label, value=prefix.sub('', label),
                         inplace=True)
    plot = (ggplot2.ggplot(data)
            + ggplot2.aes_string(x='crawl', y='percentage', fill='type')
            + ggplot2.geom_bar(stat='identity', position='stack', width=.9)
            + ggplot2.coord_flip()
            + ggplot2.scale_fill_brewer(
                palette='RdYlGn', type='sequential',
                guide=ggplot2.guide_legend(reverse=True))
            + GGPLOT2_THEME
            + ggplot2.theme(**{'legend.position': 'bottom',
                               'aspect.ratio': ratio})
            + ggplot2.labs(title='Percentage of Fetch Status',
                           x='', y='', fill=''))
    plot.save(os.path.join(PLOTDIR, img_file))
    return plot
def plot_fetch_status(self, data, row_filter, img_file, ratio=1.0):
    """Plot the percentage of each fetch status per crawl as a stacked bar chart.

    data       : pandas DataFrame with 'crawl', 'percentage' and 'type' columns
    row_filter : list of 'type' values to keep (empty list keeps everything);
                 'fetcher:' / 'fetcher:aggr:' prefixes are stripped for display
    img_file   : output file name, resolved relative to PLOTDIR
    ratio      : aspect ratio forwarded to the ggplot2 theme
    Returns the ggplot2 plot object after saving it.
    """
    if len(row_filter) > 0:
        data = data[data['type'].isin(row_filter)]
    for original in row_filter:
        # drop the fetcher:(aggr:) prefix; when nothing was stripped the
        # label did not match and is left untouched
        stripped = re.sub('^fetcher:(?:aggr:)?', '', original)
        if stripped != original:
            data.replace(to_replace=original, value=stripped, inplace=True)
    theme_opts = {'legend.position': 'bottom', 'aspect.ratio': ratio}
    fill_scale = ggplot2.scale_fill_brewer(
        palette='RdYlGn', type='sequential',
        guide=ggplot2.guide_legend(reverse=True))
    chart = ggplot2.ggplot(data)
    chart += ggplot2.aes_string(x='crawl', y='percentage', fill='type')
    chart += ggplot2.geom_bar(stat='identity', position='stack', width=.9)
    chart += ggplot2.coord_flip()
    chart += fill_scale
    chart += GGPLOT2_THEME
    chart += ggplot2.theme(**theme_opts)
    chart += ggplot2.labs(title='Percentage of Fetch Status',
                          x='', y='', fill='')
    chart.save(os.path.join(PLOTDIR, img_file))
    return chart
def plot_crawldb_status(self, data, row_filter, img_file, ratio=1.0):
    """Plot CrawlDb size and status counts per crawl as a stacked bar chart.

    data       : pandas DataFrame with 'crawl', 'type' and 'size' columns
    row_filter : list of 'type' values to keep (empty list keeps everything);
                 the 'crawldb:status:db_' prefix is stripped for display
    img_file   : output file name, resolved relative to PLOTDIR
    ratio      : aspect ratio forwarded to the ggplot2 theme
    Returns the ggplot2 plot object after saving it.
    """
    if len(row_filter) > 0:
        data = data[data['type'].isin(row_filter)]
    # shorten status labels by removing the crawldb:status:db_ prefix
    db_prefix = re.compile('^crawldb:status:db_')
    for name in row_filter:
        if db_prefix.search(name):
            data.replace(to_replace=name, value=db_prefix.sub('', name),
                         inplace=True)
    data['size'] = data['size'].astype(float)
    print(data)
    chart = (ggplot2.ggplot(data)
             + ggplot2.aes_string(x='crawl', y='size', fill='type')
             + ggplot2.geom_bar(stat='identity', position='stack', width=.9)
             + ggplot2.coord_flip()
             + ggplot2.scale_fill_brewer(
                 palette='Pastel1', type='sequential',
                 guide=ggplot2.guide_legend(reverse=False))
             + GGPLOT2_THEME
             + ggplot2.theme(**{'legend.position': 'bottom',
                                'aspect.ratio': ratio})
             + ggplot2.labs(
                 title='CrawlDb Size and Status Counts (before crawling)',
                 x='', y='', fill=''))
    chart.save(os.path.join(PLOTDIR, img_file))
    return chart