def _save_console_subpage(localpath, suffix, title_prefix, content_tag, ts):
  """Persist one extracted console fragment (surroundings/categories/summary).

  Args:
    localpath: datastore path prefix of the console page.
    suffix: sub-page name appended to localpath ('surroundings', ...).
    title_prefix: human-readable prefix for the stored page title.
    content_tag: BeautifulSoup tag whose rendered HTML becomes the content.
    ts: fetch timestamp recorded with the page.
  """
  path = localpath + '/' + suffix
  page = get_or_create_page(path, None, maxage=30)
  page_data = {
      'title': title_prefix + ' for ' + localpath,
      'content': utf8_convert(content_tag),
  }
  save_page(page, path, ts, page_data)


def parse_master(localpath, remoteurl, page_data=None):
  """Part of the new pipeline to store individual rows rather than whole
  pages of html. Parses the master data into a set of rows, and writes them
  out to the datastore in an easily retrievable format. Doesn't modify
  page_data dict.

  Args:
    localpath: datastore path prefix under which sub-pages and rows are saved.
    remoteurl: unused here; kept so the signature matches sibling pipeline
      callbacks.
    page_data: dict whose 'content' key holds the raw console HTML bytes.

  Returns:
    The page_data dict, unmodified (an empty dict if None was passed).
  """
  ts = datetime.datetime.now()
  page_data = page_data or {}
  content = page_data.get('content')
  if not content:
    return page_data
  content = content.decode('utf-8', 'replace')

  # Split page into surroundings (announce, legend, footer) and data (rows).
  surroundings = BeautifulSoup(content)
  data = surroundings.find('table', 'ConsoleData')
  if data is None:
    raise Exception('parse_master: data can not be None')

  # Swap the data table for an empty placeholder so the stored surroundings
  # page keeps the layout but carries none of the row content.
  new_data = Tag(surroundings, 'table',
                 [('class', 'ConsoleData'), ('width', '96%')])
  data.replaceWith(new_data)
  _save_console_subpage(localpath, 'surroundings', 'Surroundings',
                        surroundings, ts)

  rows = data.findAll('tr', recursive=False)

  # The first table row can be special: the list of categories.
  categories = None
  # Guard against an empty table; the old code indexed rows[0] blindly.
  if rows and rows[0].find('td', 'DevStatus') is not None:
    # Extract it into the categories...
    categories = rows[0]
    # ...and get rid of the next (spacer) row too.
    rows = rows[2:]
  if categories:
    _save_console_subpage(localpath, 'categories', 'Categories',
                          categories, ts)

  # The next table row is special, it's the summary one-box-per-builder.
  if not rows:
    # Malformed or truncated console page; nothing further to store.
    return page_data
  summary = rows[0]
  rows = rows[1:]
  _save_console_subpage(localpath, 'summary', 'Summary', summary, ts)

  curr_row = {}
  # Each table row is either a status row with a revision, name, and status,
  # a comment row with the commit message, a details row with flakiness info,
  # or a spacer row (in which case we finalize the row and save it).
  for row in rows:
    if row.find('td', 'DevComment'):
      curr_row['comment'] = ''.join(utf8_convert(tag).strip()
                                    for tag in row.td.contents)
    elif row.find('td', 'DevDetails'):
      curr_row['details'] = ''.join(utf8_convert(tag).strip()
                                    for tag in row.td.contents)
    elif row.find('td', 'DevStatus'):
      curr_row['rev'] = ''.join(
          utf8_convert(tag).strip()
          for tag in row.find('td', 'DevRev').contents)
      curr_row['name'] = ''.join(
          utf8_convert(tag).strip()
          for tag in row.find('td', 'DevName').contents)
      curr_row['status'] = ''.join(
          utf8_convert(box.table).strip()
          for box in row.findAll('td', 'DevStatus'))
    else:
      # Spacer row: finalize and persist whatever we have accumulated.
      if not curr_row:
        # A leading or doubled spacer; nothing accumulated yet. The old
        # code raised KeyError on curr_row['comment'] in this case.
        continue
      if 'details' not in curr_row:
        curr_row['details'] = ''
      curr_row['fetch_timestamp'] = ts
      # NOTE(review): assumes every completed row saw a comment row;
      # .get keeps malformed input from crashing the whole parse.
      curr_row['rev_number'] = get_position_number(
          curr_row.get('comment', ''))
      save_row(curr_row, localpath + '/' + curr_row['rev_number'])
      curr_row = {}

  return page_data
def console_merger(localpath, remoteurl, page_data,
                   masters_to_merge=None, num_rows_to_merge=None):
  """Merges the cached per-master console fragments into one console page.

  Reads the cached surroundings/categories/summary/row data produced by
  parse_master for each master, renders them through console_template, and
  saves the combined page at localpath.

  Args:
    localpath: datastore path where the merged console page is saved.
    remoteurl: unused except in error messages; kept so the signature matches
      sibling pipeline callbacks.
    page_data: previous page data; only echoed in error messages before being
      replaced by the cached page data for localpath.
    masters_to_merge: list of master names, or None for
      DEFAULT_MASTERS_TO_MERGE.
    num_rows_to_merge: number of console rows to include, or None for 25.

  Raises:
    Exception: if the surroundings page has no ConsoleData table to splice
      the merged rows into.
  """
  masters_to_merge = masters_to_merge or DEFAULT_MASTERS_TO_MERGE
  num_rows_to_merge = num_rows_to_merge or 25
  console_data = ConsoleData()

  # Use the first master's surroundings (announce, legend, footer) as the
  # skeleton into which the merged rows are spliced.
  surroundings = get_and_cache_pagedata(
      '%s/console/surroundings' % masters_to_merge[0])
  merged_page = BeautifulSoup(surroundings['content'])
  merged_tag = merged_page.find('table', 'ConsoleData')
  if merged_tag is None:
    msg = 'console_merger("%s", "%s", "%s"): merged_tag cannot be None.' % (
        localpath, remoteurl, page_data)
    logging.error(msg)
    raise Exception(msg)

  # Check the fetched row *before* indexing into it: the old code did
  # int(get_and_cache_rowdata('latest_rev')['rev_number']) first, which
  # crashed on a missing row and made the error branch below unreachable.
  latest_rev_row = get_and_cache_rowdata('latest_rev')
  latest_rev = int((latest_rev_row or {}).get('rev_number') or 0)
  if not latest_rev:
    logging.error('console_merger(\'%s\', \'%s\', \'%s\'): cannot get latest '
                  'revision number.' % (
                      localpath, remoteurl, page_data))
    return

  fetch_timestamp = datetime.datetime.now()
  for master in masters_to_merge:
    # Fetch the summary one-box-per-builder for the master.
    # If we don't get it, something is wrong, skip the master entirely.
    master_summary = get_and_cache_pagedata('%s/console/summary' % master)
    if not master_summary['content']:
      continue
    console_data.SawMaster(master)

    # Get the categories for this builder. If the builder doesn't have any
    # categories, just use the default empty-string category.
    category_list = []
    master_categories = get_and_cache_pagedata(
        '%s/console/categories' % master)
    if not master_categories['content']:
      category_list.append('')
    else:
      category_row = BeautifulSoup(master_categories['content'])
      category_list = [c.text
                       for c in category_row.findAll('td', 'DevStatus')]

    # Get the corresponding summary box(es).
    summary_row = BeautifulSoup(master_summary['content'])
    summary_list = summary_row.findAll('table')
    for category, summary in zip(category_list, summary_list):
      console_data.AddCategory(category, summary)

    # Fetch all of the rows that we need, walking backwards from the
    # latest known revision.
    rows_fetched = 0
    revs_skipped = 0
    current_rev = latest_rev
    while rows_fetched < num_rows_to_merge and current_rev >= 0:
      # Don't get stuck looping backwards forever into data we don't have.
      # How hard we try scales with how many rows the person wants.
      if revs_skipped > max(num_rows_to_merge, 10):
        break
      row_data = get_and_cache_rowdata(
          '%s/console/%s' % (master, current_rev))
      if not row_data:
        current_rev -= 1
        revs_skipped += 1
        continue
      console_data.AddRow(row_data)
      current_rev -= 1
      revs_skipped = 0
      rows_fetched += 1

  # Convert the merged content into console content.
  console_data.Finish()
  template_environment = Environment()
  template_environment.loader = FileSystemLoader('.')

  def notstarted(builder_status):
    """Convert a BeautifulSoup Tag from builder status to a notstarted
    line."""
    builder_status = re.sub(r'DevSlaveBox', 'DevStatusBox',
                            str(builder_status))
    builder_status = re.sub(r'class=\'([^\']*)\' target=',
                            'class=\'DevStatusBox notstarted\' target=',
                            builder_status)
    builder_status = re.sub(r'class="([^"]*)" target=',
                            'class="DevStatusBox notstarted" target=',
                            builder_status)
    return builder_status

  template_environment.filters['notstarted'] = notstarted
  merged_template = template_environment.from_string(console_template)
  merged_console = merged_template.render(data=console_data)
  # For debugging:
  # logging.info('%r' % merged_console)

  # Place merged console at |merged_tag|'s location in |merged_page|, and
  # put the result in |merged_content|.
  merged_tag.replaceWith(merged_console)
  merged_content = utf8_convert(merged_page)
  # NOTE(review): these rewrites target literal placeholder strings that the
  # rendered console emits (client-side script fills in attributes/url) —
  # confirm against console_template before changing any pattern.
  merged_content = re.sub(
      r'\'\<a href="\'', '\'<a \' + attributes + \' href="\'',
      merged_content)
  merged_content = re.sub(
      r'\'\<table\>\'', r"'<table ' + attributes + '>'", merged_content)
  merged_content = re.sub(
      r'\'\<div\>\'', r"'<div ' + attributes + '>'", merged_content)
  merged_content = re.sub(
      r'\'\<td\>\'', r"'<td ' + attributes + '>'", merged_content)
  merged_content = re.sub(
      r'\<iframe\>\</iframe\>',
      '<iframe \' + attributes + \' src="\' + url + \'"></iframe>',
      merged_content)

  # Update the merged console page.
  merged_page = get_or_create_page(localpath, None, maxage=30)
  logging.info('console_merger: saving merged console')
  page_data = get_and_cache_pagedata(localpath)
  page_data['title'] = 'BuildBot: Chromium'
  page_data['offsite_base'] = 'http://build.chromium.org/p/chromium'
  page_data['body_class'] = 'interface'
  page_data['content'] = merged_content
  save_page(merged_page, localpath, fetch_timestamp, page_data)
  return