예제 #1
0
    def POST(self):
        """Fetch the page named in the request body and store its title.

        The request body must look like ``<id>-<url>``, for example
        ``4366-http://yourdomain/a/0315.html``. Only the first ``-`` is a
        separator, so the URL itself may contain ``-``.

        Returns:
            On success, the ``result`` half of the ``(row_count, result)``
            pair returned by ``dba.msg_text_update_title_contet``.
            For a malformed body (no ``-`` or a non-integer id), a message
            describing the expected format.
            For an HTTP error status, ``"<code> <reason>, HTTP fetch failed"``.
            For any other error raised while fetching or parsing, the
            exception object itself (after logging it to stdout).
        """
        data = web.data()

        # partition() keeps everything after the first "-" as the URL.
        head, sep, url = data.partition("-")
        try:
            msg_id = int(head) if sep else None
        except ValueError:
            # A non-numeric id is a malformed request, not a server error.
            msg_id = None
        if msg_id is None:
            return "'{}' is invalid format, right is '4366-http://yourdomain/a/0315.html'".format(data)

        # Store a placeholder title while the fetch is in progress.
        dba.msg_text_update_title_contet(msg_id, u"标题抓取ing", "")

        try:
            # NOTE(review): url comes straight from the request body and
            # urllib2 also opens non-HTTP schemes (e.g. file://); consider
            # restricting it to http/https and adding a timeout.
            response = urllib2.urlopen(url)
            try:
                content = response.read()
                headers = response.headers
            finally:
                # Always release the connection, even if read() fails.
                response.close()

            print(data)
            print(headers)

            # Transparently inflate gzip-compressed bodies.
            if headers.get("Content-Encoding", "none") == "gzip":
                content = gzip.GzipFile(fileobj=cStringIO.StringIO(content)).read()

            charset = self.get_charset(content)
            if charset is None:
                charset = "gbk"

            # UTF-8 content is left as a byte string here; only the extracted
            # title is decoded below. Everything else is decoded up front.
            if charset != "utf-8":
                print("unicode convert({})".format(charset))
                content = unicode(content, charset)
            title = self.get_title(content)

            if title:
                if charset == "utf-8":
                    title = title.decode("utf-8")
                # Resolve HTML entities such as &amp; in the title text.
                title = HTMLParser.HTMLParser().unescape(title)
            else:
                title = u"月亮吃标题"

            _row_count, result = dba.msg_text_update_title_contet(msg_id, title, "")
            return result
        except urllib2.HTTPError as e:
            # The original formatted the unbound call ``e.read`` (a method
            # repr); report the status reason phrase instead of the raw body,
            # which may be large or compressed.
            return "{} {}, HTTP fetch failed".format(e.code, e.msg)
        except Exception as e:
            # Exception (not a bare except) so SystemExit/KeyboardInterrupt
            # still propagate.
            print("exception {0} happened on line {1}".format(e, sys.exc_info()[2].tb_lineno))
            return e
예제 #2
0
    def POST(self):
        """Fetch the page named in the request body; store its content and title.

        The request body must look like ``<id>-<url>``, for example
        ``4366-http://yourdomain/a/0315.html``. Only the first ``-`` is a
        separator, so the URL itself may contain ``-``.

        The charset is taken from the ``charset`` parameter of the
        Content-Type response header when present, otherwise from
        ``self.get_charset_from_content``, otherwise it defaults to ``gbk``.

        Returns:
            On success, the ``result`` half of the ``(row_count, result)``
            pair returned by the final ``dba.msg_text_update_title_contet``
            call.
            For a malformed body (no ``-`` or a non-integer id), a message
            describing the expected format.
            For an HTTP error status, ``"<code> <reason>, HTTP fetch failed"``.
            For any other error raised while fetching or parsing, the
            exception object itself (after logging it to stdout).
        """
        data = web.data()

        # partition() keeps everything after the first '-' as the URL.
        head, sep, url = data.partition('-')
        try:
            msg_id = int(head) if sep else None
        except ValueError:
            # A non-numeric id is a malformed request, not a server error.
            msg_id = None
        if msg_id is None:
            return "'{}' is invalid format, right is '4366-http://yourdomain/a/0315.html'".format(data)

        # mark fetch: store a placeholder title while the fetch is in progress
        dba.msg_text_update_title_contet(msg_id, u'标题抓取ing', '')

        try:
            # NOTE(review): url comes straight from the request body and
            # urllib2 also opens non-HTTP schemes (e.g. file://); consider
            # restricting it to http/https and adding a timeout.
            response = urllib2.urlopen(url)
            try:
                content = response.read()

                print(data)
                print(response.headers)

                # Read both headers while the response is still open.
                encoding = self.get_header_value(response, 'Content-Encoding')
                ctype = self.get_header_value(response, 'Content-Type')
            finally:
                # Always release the connection, even if read() fails.
                response.close()

            # http header->encoding
            if encoding == 'gzip':
                content = gzip.GzipFile(fileobj=cStringIO.StringIO(content)).read()

            # http header->type, for example: text/html; charset=utf-8
            # Parse parameter by parameter so that any media type, quoted
            # values and additional parameters after charset are handled.
            charset = None
            if ctype:
                for param in ctype.lower().split(';')[1:]:
                    key, _, value = param.partition('=')
                    if key.strip() == 'charset':
                        charset = value.strip().strip('"\'') or None
                        break

            # http body->charset
            if charset is None:
                charset = self.get_charset_from_content(content)
            if charset is None:
                charset = 'gbk'

            # convert: UTF-8 content stays a byte string here and only the
            # extracted title is decoded below.
            if charset != 'utf-8':
                print("unicode convert({})".format(charset))
                content = unicode(content, charset)

            # update content
            dba.msg_text_update_title_contet(msg_id, None, content)

            title = self.get_title_from_content(content)
            if title:
                if charset == 'utf-8':
                    title = title.decode('utf-8')
                # Resolve HTML entities such as &amp; in the title text.
                title = HTMLParser.HTMLParser().unescape(title)
            else:
                title = u'月亮吃标题'

            _row_count, result = dba.msg_text_update_title_contet(msg_id, title, '')
            return result
        except urllib2.HTTPError as e:
            # The original formatted the unbound call ``e.read`` (a method
            # repr); report the status reason phrase instead of the raw body,
            # which may be large or compressed.
            return "{} {}, HTTP fetch failed".format(e.code, e.msg)
        except Exception as e:
            # Exception (not a bare except) so SystemExit/KeyboardInterrupt
            # still propagate.
            print("exception {0} happened on line {1}".format(e, sys.exc_info()[2].tb_lineno))
            return e