def findMetaHttpEquiv(first_block):
    """Look for a <meta http-equiv="content-type"> tag in ``first_block``.

    ``first_block`` is a string; it is wrapped in a StringIO and tokenized
    with generator_parser.generate_tokens().  For the first <meta> start tag
    whose http-equiv attribute equals 'content-type' (compared in lower
    case), the result of findCharSet() applied to that tag's content
    attribute is returned.

    Returns None when no such tag is seen, or when tokenizing stops with
    sgmllib.SGMLParseError (the error is logged, not raised).
    """
    fp = StringIO.StringIO(first_block)
    try:
        for token in generator_parser.generate_tokens(fp):
            # Only the first two items are compared; the third item of a
            # start-tag token holds its attributes.
            if token[:2] != (generator_parser.TAG, 'meta'):
                continue
            attrs = token[2]
            if _getvalue(attrs, 'http-equiv').lower() == 'content-type':
                return findCharSet(_getvalue(attrs, 'content'))
    # "except ... as" is accepted from Python 2.6 on; the older comma form
    # is a syntax error on Python 3.
    except sgmllib.SGMLParseError as e:
        log.warn('Error looking for <meta> encoding "%s"', str(e))   # ParseError not uncommon, just log
    return None
    def test_declaration_incomplete(self):
        """Verify that the lenient declaration can handle incomplete tags.

        The document is fed to the tokenizer as two chunks, once for each
        split offset in range(1, len(doc) - 1).  Whatever the split, the
        expected token list is the same and holds no token for the comment.
        """
        # NOTE: unrelated problem -- without the leading space there is a
        # problem in parsing the incomplete <html>.  Investigate?!
        doc = " <html>A<!-- bad comment -->B</html>"

        for cut in range(1, len(doc) - 1):
            head, tail = doc[:cut], doc[cut:]
            stream = gp.generate_tokens(ChunkedStringIO([head, tail]))

            # The expected list is built afresh on every pass.
            self._test_generator1(stream, [
                (gp.TAG, u'html', []),
                (gp.DATA, u'A'),
                (gp.DATA, u'B'),
                (gp.ENDTAG, u'html'),
            ])
def process(fp, out, meta):
    """Write the HTML token stream read from ``fp`` to ``out``.

    Tokens come from generator_parser.generate_tokens(fp).

    fp   -- source handed to generator_parser.generate_tokens().
    out  -- output sink; this function calls its out(), outTag(), outAlt(),
            notifyHtml(), outHeader() and close() methods.
    meta -- mapping updated in place with 'title', 'description' and
            'keywords' when the document supplies them; it is also passed
            to out.outHeader() and out.close().

    Return the tuple (has_html, has_frameset, has_common_tag), where
    has_common_tag is True once any start tag maps to a positive id in
    starttag_dict.
    """

    has_html        = False
    has_frameset    = False
    has_common_tag  = False

    first_td    = False     # state for iterating td inside tr

    iterator = generator_parser.generate_tokens(fp)

    # General HTML format
    # <html>
    #   <head>
    #   <body>
    #
    # However all elements are optional.
    # It is better to use a flat, stateless loop to process elements

    for token in iterator:

        if token[0] == DATA:
            out.out(token[1])

        elif token[0] == TAG:

            tag = token[1]
            tag_id = starttag_dict.get(tag, -1)

            if tag_id > 0:
                has_common_tag = True

            if tag_id == sOUTP:
                out.outTag('p')

            elif tag_id == sOUTTAG:
                out.outTag(tag)

            elif tag_id == sTR:
                first_td = True

            elif tag_id == sTDTH:
                # No separator before the first cell of a row.
                if first_td:
                    first_td = False
                else:
                    out.out('   ')

            elif tag_id == sINPUT:

                attrs = token[2]
                itype = _getvalue(attrs, 'type')

                if itype == 'checkbox':
                    if _hasattr(attrs, 'checked'):
                        out.out('[*] ')
                    else:
                        out.out('[ ] ')

                elif itype == 'radio':
                    if _hasattr(attrs, 'checked'):
                        out.out('(*) ')
                    else:
                        out.out('( ) ')

                elif itype == 'image':
                    alt = _getvalue(attrs, 'alt') or _getvalue(attrs, 'value')
                    out.outAlt(saxutils.unescape(alt))

                elif itype == 'password':
                    out.outAlt('***')

                elif itype == 'hidden':
                    pass

                else:
                    value = _getvalue(attrs, 'value')
                    out.outAlt(saxutils.unescape(value))

            elif tag_id == sIMG:
                attrs = token[2]
                alt = _getvalue(attrs, 'alt')
                if alt:
                    out.outAlt(saxutils.unescape(alt))

            elif tag_id == sHTML:
                has_html = True
                out.notifyHtml()

            elif tag_id == sBODY:
                out.outHeader(meta)

            elif tag_id == sFRAMESET:
                has_frameset = True

            elif tag_id == sTITLE:
                # Gather the title text up to </title>; </head> and <body>
                # are fallbacks in case there is no </title>.
                title_parts = []
                ended_by_body = False
                for token in iterator:
                    if token[0] == DATA:
                        title_parts.append(token[1])
                        continue
                    # Compare only (kind, name): a start-tag token carries
                    # its attributes as a third item, so testing the whole
                    # token against (TAG, 'body') could never match.
                    head = token[:2]
                    if head == (TAG, 'body'):
                        ended_by_body = True
                        break
                    if head == (ENDTAG, 'title') or head == (ENDTAG, 'head'):
                        break
                meta['title'] = _collapse(''.join(title_parts))
                if ended_by_body:
                    # The <body> token was consumed by the loop above, so
                    # emit the header here as the sBODY branch would have.
                    out.outHeader(meta)

            elif tag_id == sMETA:
                attrs = token[2]
                name = _getvalue(attrs, 'name').lower()
                content = _getvalue(attrs, 'content')
                if name == 'description':
                    meta['description'] = saxutils.unescape(_collapse(content))
                elif name == 'keywords':
                    meta['keywords'] = saxutils.unescape(_collapse(content))

            # The next three branches discard every token up to and
            # including the matching end tag.
            elif tag_id == sSCRIPT:
                for token in iterator:
                    if token == (ENDTAG, 'script'):
                        break

            elif tag_id == sSTYLE:
                for token in iterator:
                    if token == (ENDTAG, 'style'):
                        break

            elif tag_id == sSELECT:
                for token in iterator:
                    if token == (ENDTAG, 'select'):
                        break


        elif token[0] == ENDTAG:

            tag = token[1]
            tag_id = endtag_dict.get(tag, -1)

            if tag_id == eCLOSE_TAG:
                out.outTag('/'+tag)

            elif tag_id == eBREAK_LINE:
                out.outTag('br')

    out.close(meta)

    return has_html, has_frameset, has_common_tag
 def _test_generator(self, doc, expect):
     """Tokenize the string ``doc`` and check the tokens against ``expect``.

     ``doc`` is wrapped in a StringIO, passed to gp.generate_tokens(), and
     the resulting tokens are handed to self._test_generator1().
     """
     source = StringIO.StringIO(doc)
     token_stream = gp.generate_tokens(source)
     self._test_generator1(token_stream, expect)