Python decode_html示例，repaste.decode_html Python示例

示例#1

0

显示文件

文件： title.py 项目： Red-M/frogbot

def title(inp, db=None, chan=None, bot=None, input=None):
    ",title <url> - get title of <url>"
    if '^' in input.paraml[1]:
        inp = bot.chanseen[input.conn.server][input.chan][0]
        if not "http://" in str(input.paraml[1]):
            inp = str(inp).replace("www.","http://www.")
        re1='((http|https)://.* )'
        rg = re.compile(re1,re.IGNORECASE|re.DOTALL)
        m = rg.findall(inp)
        if m:
            m = (''.join(m[0])).split(" ")[0]
            inp = m
    if not ("http://" in inp or "https://" in inp):
        inp = ""
    if inp == "":
        return ",title <url> - get title of <url>"
    try:
        parts = urlparse.urlsplit(inp)
        conn = httplib.HTTPConnection(parts.hostname, timeout=10)
        path = parts.path

        if parts.query:
            path += "?" + parts.query

        conn.request('HEAD', path)
        resp = conn.getresponse()

        if not (200 <= resp.status < 400):
            return "Error: HEAD %s %s " % (resp.status, resp.reason)

        errors = check_response(dict(resp.getheaders()))

        if errors:
            return errors
    except Exception as e:
        return "Error: " + str(e)

    try:
        req = urllib2.urlopen(inp)
    except Exception as e:
        return "Error: GET %s " % e

    errors = check_response(req.headers)

    if errors:
        return errors

    text = req.read(maxlen).decode('utf8', 'ignore')

    match = titler.search(text)

    if not match:
        return "Error: no title "

    rawtitle = match.group(1)

    title = repaste.decode_html(rawtitle)
    title = " ".join(title.split())

    return u"%s <=> %s" % (title, inp)

示例#2

0

显示文件

文件： urltools.py 项目： Red-M/frogbot

def parse(match):
    url = urlnorm.normalize(match.encode('utf-8'))
    if url not in ignored_urls:
        url = url.decode('utf-8')
        try:
            parts = urlparse.urlsplit(url)
            conn = httplib.HTTPConnection(parts.hostname, timeout=10)
            path = parts.path
            if parts.query:
                path += "?" + parts.query
            conn.request('HEAD', path)
            resp = conn.getresponse()
            if not (200 <= resp.status < 400):
                return "Error: HEAD %s %s " % (resp.status, resp.reason)
            errors = check_response(dict(resp.getheaders()))
            if errors:
                return errors
        except Exception as e:
            return "Error: " + str(e)
        try:
            req = urllib2.urlopen(url)
        except Exception as e:
            return "Error: GET %s " % e
        errors = check_response(req.headers)
        if errors:
            return errors
        text = req.read(maxlen).decode('utf8', 'ignore')
        match = titler.search(text)
        if not match:
            return "Error: no title "
        rawtitle = match.group(1)
        title = repaste.decode_html(rawtitle)
        title = " ".join(title.split())
        return title

示例#3

0

显示文件

def title(inp, db=None, chan=None):
    ".title <url> - get title of <url>"
    if inp == '^':
        urlhistory.db_init(db)
        rows = db.execute(
            "select url from urlhistory where chan = ? order by time desc limit 1",
            (chan, ))
        if not rows.rowcount:
            return "No url in history."

        inp = rows.fetchone()[0]

    try:
        parts = urlparse.urlsplit(inp)
        conn = httplib.HTTPConnection(parts.hostname, timeout=10)
        path = parts.path

        if parts.query:
            path += "?" + parts.query

        conn.request('HEAD', path)
        resp = conn.getresponse()

        if not (200 <= resp.status < 400):
            return "Error: HEAD %s %s" % (resp.status, resp.reason)

        errors = check_response(dict(resp.getheaders()))

        if errors:
            return errors
    except Exception as e:
        return "Error: " + str(e)

    try:
        req = urllib2.urlopen(inp)
    except Exception as e:
        return "Error: GET %s" % e

    errors = check_response(req.headers)

    if errors:
        return errors

    text = req.read(maxlen).decode('utf8', 'ignore')

    match = titler.search(text)

    if not match:
        return "Error: no title"

    rawtitle = match.group(1)

    title = repaste.decode_html(rawtitle)
    title = " ".join(title.split())

    return title

示例#4

0

显示文件

文件： title.py 项目： GUIpsp/wololo

def title(inp, db=None, chan=None):
    ".title <url> - get title of <url>"
    if inp == '^':
        urlhistory.db_init(db)
        rows = db.execute("select url from urlhistory where chan = ? order by time desc limit 1", (chan,))
        if not rows.rowcount:
            return "No url in history. " + randout.fail()

        inp = rows.fetchone()[0]

    try:
        parts = urlparse.urlsplit(inp)
        conn = httplib.HTTPConnection(parts.hostname, timeout=10)
        path = parts.path

        if parts.query:
            path += "?" + parts.query

        conn.request('HEAD', path)
        resp = conn.getresponse()

        if not (200 <= resp.status < 400):
            return "Error: HEAD %s %s " + randout.fail() % (resp.status, resp.reason)

        errors = check_response(dict(resp.getheaders()))

        if errors:
            return errors
    except Exception as e:
        return "Error: " + str(e)

    try:
        req = urllib2.urlopen(inp)
    except Exception as e:
        return "Error: GET %s " + randout.fail() % e

    errors = check_response(req.headers)

    if errors:
        return errors

    text = req.read(maxlen).decode('utf8', 'ignore')

    match = titler.search(text)

    if not match:
        return "Error: no title " + randout.fail()

    rawtitle = match.group(1)

    title = repaste.decode_html(rawtitle)
    title = " ".join(title.split())

    return title

示例#5

0

显示文件

文件： title.py 项目： lukegb/skybot

def title(inp, db=None, chan=None):
    ".title <url> - get title of <url>"
    if inp == '^':
        urlhistory.db_init(db)
        rows = db.execute("select url from urlhistory where chan = ? order by time desc limit 1", (chan,))
        if not rows.rowcount:
            return "No url in history."

        inp = rows.fetchone()[0]

    match = titler.search(http.get(inp))

    if not match:
        return "Error: no title"

    rawtitle = match.group(1)

    title = repaste.decode_html(rawtitle)
    title = " ".join(title.split())

    return title

示例#6

0

显示文件

def title(inp, db=None, chan=None):
    ".title <url> - get title of <url>"
    if inp == '^':
        urlhistory.db_init(db)
        rows = db.execute(
            "select url from urlhistory where chan = ? order by time desc limit 1",
            (chan, ))
        if not rows.rowcount:
            return "No url in history."

        inp = rows.fetchone()[0]

    match = titler.search(http.get(inp))

    if not match:
        return "Error: no title"

    rawtitle = match.group(1)

    title = repaste.decode_html(rawtitle)
    title = " ".join(title.split())

    return title