def _save_console_subpage(localpath, suffix, title_prefix, content_tag, ts):
  """Persist one extracted console fragment (surroundings/categories/summary).

  Args:
    localpath: datastore path prefix of the console page.
    suffix: sub-page name appended to localpath ('surroundings', ...).
    title_prefix: human-readable prefix for the stored page title.
    content_tag: BeautifulSoup tag whose rendered HTML becomes the content.
    ts: fetch timestamp recorded with the page.
  """
  path = localpath + '/' + suffix
  page = get_or_create_page(path, None, maxage=30)
  page_data = {
      'title': title_prefix + ' for ' + localpath,
      'content': utf8_convert(content_tag),
  }
  save_page(page, path, ts, page_data)


def parse_master(localpath, remoteurl, page_data=None):
  """Part of the new pipeline to store individual rows rather than whole
  pages of html. Parses the master data into a set of rows, and writes them
  out to the datastore in an easily retrievable format. Doesn't modify
  page_data dict.

  Args:
    localpath: datastore path prefix under which sub-pages and rows are saved.
    remoteurl: unused here; kept so the signature matches sibling pipeline
      callbacks.
    page_data: dict whose 'content' key holds the raw console HTML bytes.

  Returns:
    The page_data dict, unmodified (an empty dict if None was passed).
  """
  ts = datetime.datetime.now()
  page_data = page_data or {}
  content = page_data.get('content')
  if not content:
    return page_data
  content = content.decode('utf-8', 'replace')

  # Split page into surroundings (announce, legend, footer) and data (rows).
  surroundings = BeautifulSoup(content)
  data = surroundings.find('table', 'ConsoleData')
  if data is None:
    raise Exception('parse_master: data can not be None')

  # Swap the data table for an empty placeholder so the stored surroundings
  # page keeps the layout but carries none of the row content.
  new_data = Tag(surroundings, 'table',
                 [('class', 'ConsoleData'), ('width', '96%')])
  data.replaceWith(new_data)
  _save_console_subpage(localpath, 'surroundings', 'Surroundings',
                        surroundings, ts)

  rows = data.findAll('tr', recursive=False)

  # The first table row can be special: the list of categories.
  categories = None
  # Guard against an empty table; the old code indexed rows[0] blindly.
  if rows and rows[0].find('td', 'DevStatus') is not None:
    # Extract it into the categories...
    categories = rows[0]
    # ...and get rid of the next (spacer) row too.
    rows = rows[2:]
  if categories:
    _save_console_subpage(localpath, 'categories', 'Categories',
                          categories, ts)

  # The next table row is special, it's the summary one-box-per-builder.
  if not rows:
    # Malformed or truncated console page; nothing further to store.
    return page_data
  summary = rows[0]
  rows = rows[1:]
  _save_console_subpage(localpath, 'summary', 'Summary', summary, ts)

  curr_row = {}
  # Each table row is either a status row with a revision, name, and status,
  # a comment row with the commit message, a details row with flakiness info,
  # or a spacer row (in which case we finalize the row and save it).
  for row in rows:
    if row.find('td', 'DevComment'):
      curr_row['comment'] = ''.join(utf8_convert(tag).strip()
                                    for tag in row.td.contents)
    elif row.find('td', 'DevDetails'):
      curr_row['details'] = ''.join(utf8_convert(tag).strip()
                                    for tag in row.td.contents)
    elif row.find('td', 'DevStatus'):
      curr_row['rev'] = ''.join(
          utf8_convert(tag).strip()
          for tag in row.find('td', 'DevRev').contents)
      curr_row['name'] = ''.join(
          utf8_convert(tag).strip()
          for tag in row.find('td', 'DevName').contents)
      curr_row['status'] = ''.join(
          utf8_convert(box.table).strip()
          for box in row.findAll('td', 'DevStatus'))
    else:
      # Spacer row: finalize and persist whatever we have accumulated.
      if not curr_row:
        # A leading or doubled spacer; nothing accumulated yet. The old
        # code raised KeyError on curr_row['comment'] in this case.
        continue
      if 'details' not in curr_row:
        curr_row['details'] = ''
      curr_row['fetch_timestamp'] = ts
      # NOTE(review): assumes every completed row saw a comment row;
      # .get keeps malformed input from crashing the whole parse.
      curr_row['rev_number'] = get_position_number(
          curr_row.get('comment', ''))
      save_row(curr_row, localpath + '/' + curr_row['rev_number'])
      curr_row = {}

  return page_data
def console_merger(localpath, remoteurl, page_data,
                   masters_to_merge=None, num_rows_to_merge=None):
  """Merges the cached per-master console fragments into one console page.

  Reads the cached surroundings/categories/summary/row data produced by
  parse_master for each master, renders them through console_template, and
  saves the combined page at localpath.

  Args:
    localpath: datastore path where the merged console page is saved.
    remoteurl: unused except in error messages; kept so the signature matches
      sibling pipeline callbacks.
    page_data: previous page data; only echoed in error messages before being
      replaced by the cached page data for localpath.
    masters_to_merge: list of master names, or None for
      DEFAULT_MASTERS_TO_MERGE.
    num_rows_to_merge: number of console rows to include, or None for 25.

  Raises:
    Exception: if the surroundings page has no ConsoleData table to splice
      the merged rows into.
  """
  masters_to_merge = masters_to_merge or DEFAULT_MASTERS_TO_MERGE
  num_rows_to_merge = num_rows_to_merge or 25
  console_data = ConsoleData()

  # Use the first master's surroundings (announce, legend, footer) as the
  # skeleton into which the merged rows are spliced.
  surroundings = get_and_cache_pagedata(
      '%s/console/surroundings' % masters_to_merge[0])
  merged_page = BeautifulSoup(surroundings['content'])
  merged_tag = merged_page.find('table', 'ConsoleData')
  if merged_tag is None:
    msg = 'console_merger("%s", "%s", "%s"): merged_tag cannot be None.' % (
        localpath, remoteurl, page_data)
    logging.error(msg)
    raise Exception(msg)

  # Check the fetched row *before* indexing into it: the old code did
  # int(get_and_cache_rowdata('latest_rev')['rev_number']) first, which
  # crashed on a missing row and made the error branch below unreachable.
  latest_rev_row = get_and_cache_rowdata('latest_rev')
  latest_rev = int((latest_rev_row or {}).get('rev_number') or 0)
  if not latest_rev:
    logging.error('console_merger(\'%s\', \'%s\', \'%s\'): cannot get latest '
                  'revision number.' % (
                      localpath, remoteurl, page_data))
    return

  fetch_timestamp = datetime.datetime.now()
  for master in masters_to_merge:
    # Fetch the summary one-box-per-builder for the master.
    # If we don't get it, something is wrong, skip the master entirely.
    master_summary = get_and_cache_pagedata('%s/console/summary' % master)
    if not master_summary['content']:
      continue
    console_data.SawMaster(master)

    # Get the categories for this builder. If the builder doesn't have any
    # categories, just use the default empty-string category.
    category_list = []
    master_categories = get_and_cache_pagedata(
        '%s/console/categories' % master)
    if not master_categories['content']:
      category_list.append('')
    else:
      category_row = BeautifulSoup(master_categories['content'])
      category_list = [c.text
                       for c in category_row.findAll('td', 'DevStatus')]

    # Get the corresponding summary box(es).
    summary_row = BeautifulSoup(master_summary['content'])
    summary_list = summary_row.findAll('table')
    for category, summary in zip(category_list, summary_list):
      console_data.AddCategory(category, summary)

    # Fetch all of the rows that we need, walking backwards from the
    # latest known revision.
    rows_fetched = 0
    revs_skipped = 0
    current_rev = latest_rev
    while rows_fetched < num_rows_to_merge and current_rev >= 0:
      # Don't get stuck looping backwards forever into data we don't have.
      # How hard we try scales with how many rows the person wants.
      if revs_skipped > max(num_rows_to_merge, 10):
        break
      row_data = get_and_cache_rowdata(
          '%s/console/%s' % (master, current_rev))
      if not row_data:
        current_rev -= 1
        revs_skipped += 1
        continue
      console_data.AddRow(row_data)
      current_rev -= 1
      revs_skipped = 0
      rows_fetched += 1

  # Convert the merged content into console content.
  console_data.Finish()
  template_environment = Environment()
  template_environment.loader = FileSystemLoader('.')

  def notstarted(builder_status):
    """Convert a BeautifulSoup Tag from builder status to a notstarted
    line."""
    builder_status = re.sub(r'DevSlaveBox', 'DevStatusBox',
                            str(builder_status))
    builder_status = re.sub(r'class=\'([^\']*)\' target=',
                            'class=\'DevStatusBox notstarted\' target=',
                            builder_status)
    builder_status = re.sub(r'class="([^"]*)" target=',
                            'class="DevStatusBox notstarted" target=',
                            builder_status)
    return builder_status

  template_environment.filters['notstarted'] = notstarted
  merged_template = template_environment.from_string(console_template)
  merged_console = merged_template.render(data=console_data)
  # For debugging:
  # logging.info('%r' % merged_console)

  # Place merged console at |merged_tag|'s location in |merged_page|, and
  # put the result in |merged_content|.
  merged_tag.replaceWith(merged_console)
  merged_content = utf8_convert(merged_page)
  # NOTE(review): these rewrites target literal placeholder strings that the
  # rendered console emits (client-side script fills in attributes/url) —
  # confirm against console_template before changing any pattern.
  merged_content = re.sub(
      r'\'\<a href="\'', '\'<a \' + attributes + \' href="\'',
      merged_content)
  merged_content = re.sub(
      r'\'\<table\>\'', r"'<table ' + attributes + '>'", merged_content)
  merged_content = re.sub(
      r'\'\<div\>\'', r"'<div ' + attributes + '>'", merged_content)
  merged_content = re.sub(
      r'\'\<td\>\'', r"'<td ' + attributes + '>'", merged_content)
  merged_content = re.sub(
      r'\<iframe\>\</iframe\>',
      '<iframe \' + attributes + \' src="\' + url + \'"></iframe>',
      merged_content)

  # Update the merged console page.
  merged_page = get_or_create_page(localpath, None, maxage=30)
  logging.info('console_merger: saving merged console')
  page_data = get_and_cache_pagedata(localpath)
  page_data['title'] = 'BuildBot: Chromium'
  page_data['offsite_base'] = 'http://build.chromium.org/p/chromium'
  page_data['body_class'] = 'interface'
  page_data['content'] = merged_content
  save_page(merged_page, localpath, fetch_timestamp, page_data)
  return