Exemplo n.º 1
0
def fetchurl(url,encoding="utf8"):
    """
    Fetch *url* and return the response body as text.

    Sends a desktop-browser User-Agent, transparently decompresses a
    gzip-encoded response, and decodes the body using the charset found
    in the Content-Type header (or, failing that, in the document body
    via the module-level ``_regex_charset`` pattern).

    :param url: absolute URL to fetch
    :param encoding: target text encoding name (default ``"utf8"``)
    :return: decoded document text
    """
    req = urllib.request.Request(url)
    req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.64 Safari/537.31")
    req.add_header("Accept-Charset","utf-8;q=0.7,*;q=0.3")
    filehandle = urllib.request.urlopen(req)
    try:
        # BUG FIX: close the handle even when read() raises (original only
        # closed on the success path).
        content_encoding = filehandle.headers.get("Content-Encoding") or ""
        isgzip = re.search("gzip",content_encoding,re.I) is not None
        contentType = filehandle.headers["Content-Type"]
        matcher = re.search(_regex_charset,contentType,re.I)
        raw = filehandle.read()
    finally:
        filehandle.close()
    if isgzip:
        # BUG FIX: the compressed payload is bytes; the original wrapped it
        # in io.StringIO, which raises TypeError at runtime. BytesIO is the
        # correct in-memory buffer for gzip.GzipFile.
        raw = gzip.GzipFile(fileobj=io.BytesIO(raw)).read()
    # Provisional decode so the body can be searched for a charset
    # declaration when the header did not carry one. "ignore" avoids the
    # hard UnicodeDecodeError the original bare decode() could raise.
    ret = raw.decode(encoding,"ignore")
    if matcher is None:
        matcher = re.search(_regex_charset,ret,re.I)
    if matcher is not None:
        charset = matcher.group(2)
        log.info("源文件编码为:"+charset)
        # BUG FIX: the original tested membership in the single string
        # "utf8,utf-8" (substring check) instead of a tuple of names.
        utf8_names = ("utf8", "utf-8")
        if charset.lower() in utf8_names and encoding.lower() in utf8_names:
            pass
        elif charset.lower() != encoding.lower():
            log.info("进行转码,从"+charset+"转为"+encoding)
            # BUG FIX: the original re-encoded an already-decoded str via
            # encode(charset).decode(encoding), which garbles the text.
            # Decoding the raw bytes with the detected charset is correct.
            ret = raw.decode(charset,"ignore")
    return ret
Exemplo n.º 2
0
 def run(self):
     """Worker loop: pull tasks off this worker's queue forever and run them.

     Each task object is expected to expose ``_execute()``; any exception
     it raises is logged and the loop keeps going.
     """
     while True:
         try:
             job = self.queue.get()
             log.info("worker执行"+str(job))
             job._execute()
         except Exception as err:
             log.exception(err)
Exemplo n.º 3
0
def main():
    """Parse command-line options, mount the handlers, and serve on 18080."""
    import os
    tornado.options.parse_command_line()
    port = 18080
    static_dir = os.path.join(os.path.dirname(__file__), "static")
    routes = [
        (r"/static/(.*)", web.StaticFileHandler, {"path": static_dir}),
        (r"/hello", Hello),
    ]
    server = tornado.httpserver.HTTPServer(tornado.web.Application(routes))
    server.listen(port)
    log.info("Server start at %d", port)
    # Blocks here: hands control to the Tornado event loop.
    tornado.ioloop.IOLoop.instance().start()
Exemplo n.º 4
0
def main():
    """Start the RSS web application on port 80 and schedule the crawler."""
    import os
    tornado.options.parse_command_line()
    pages_dir = os.path.join(os.path.dirname(__file__), "./pages/")
    routes = [
        (r"/action/readrss", business.rss.ReadRss),
        (r"/action/readarticle/(\d+)", business.rss.ReadArticle),
        (r"/pages/(.*)", web.StaticFileHandler, {"path": pages_dir}),
        (r"/", RootRedirectHandler),
    ]
    server = tornado.httpserver.HTTPServer(tornado.web.Application(routes))
    server.listen(80, address="0.0.0.0")
    log.info("Server start at %d", 80)
    # RSS crawl: first run after 10s, then hourly, on a 1-thread scheduler.
    pool = executors.new_scheduled_threadpool(1)
    pool.schedule(business.rss.crawler.Crawler().crawl, delay=10, period=3600)
    # Blocks here: hands control to the Tornado event loop.
    tornado.ioloop.IOLoop.instance().start()
Exemplo n.º 5
0
 def fetchrss(self,url):
     """Download the RSS feed at *url* and persist previously-unseen items.

     Articles already present in ``rss_article`` (matched on source URL and
     publication date) are skipped.
     """
     def first_text(node, tag):
         # Same lookup as before: text of the first matching child element.
         return node.findall(tag)[0].text

     page = http.fetchurl(url)
     channel = etree.XML(page)[0]
     for entry in channel:
         if entry.tag != "item":
             continue
         title = first_text(entry, "title")
         desc = first_text(entry, "description")
         pubdate = first_text(entry, "pubDate")
         if re.match("^[a-zA-Z]{3}\\,", pubdate):
             # English RFC-822-style date, e.g. "Thu, 19 Feb 2009 16:00:07"
             pubdate = pubdate[0:len("Thu, 19 Feb 2009 16:00:07")]
             pubdate = dates.str2datetime(pubdate, '%a, %d %b %Y %H:%M:%S')
         elif re.match("^\d{4}-\d{2}-\d{2}T.{8,}", pubdate):
             # ISO-8601-style date; keep only "YYYY-MM-DDTHH:MM:SS".
             pubdate = pubdate[0:19]
             pubdate = dates.str2datetime(pubdate, '%Y-%m-%dT%H:%M:%S')
         if not self.dao.exists("select count(0) from rss_article where source=%s and pubdate=%s",url,pubdate):
             log.info("采集到文章:%s"%title)
             self.dao.insert("insert into rss_article(title,description,pubdate,source) values(%s,%s,%s,%s)",title,desc,pubdate,url)
Exemplo n.º 6
0
def main():
    """Bootstrap the RSS site: routes, HTTP server on :80, hourly crawl."""
    import os
    tornado.options.parse_command_line()
    static_root = os.path.join(os.path.dirname(__file__), "./pages/")
    app = tornado.web.Application([
        (r"/action/readrss", business.rss.ReadRss),
        (r"/action/readarticle/(\d+)", business.rss.ReadArticle),
        (r"/pages/(.*)", web.StaticFileHandler, {"path": static_root}),
        (r"/", RootRedirectHandler),
    ])
    tornado.httpserver.HTTPServer(app).listen(80, address="0.0.0.0")
    log.info("Server start at %d", 80)
    # Schedule the RSS crawler: 10s initial delay, then every hour.
    executors.new_scheduled_threadpool(1).schedule(
        business.rss.crawler.Crawler().crawl, delay=10, period=3600)
    # Blocks here: hands control to the Tornado event loop.
    tornado.ioloop.IOLoop.instance().start()
Exemplo n.º 7
0
def fetchurl(url, encoding="utf8"):
    """
    Fetch *url* and return the response body as text.

    Sends a desktop-browser User-Agent, transparently decompresses a
    gzip-encoded response, and decodes the body using the charset found
    in the Content-Type header (or, failing that, in the document body
    via the module-level ``_regex_charset`` pattern).

    :param url: absolute URL to fetch
    :param encoding: target text encoding name (default ``"utf8"``)
    :return: decoded document text
    """
    req = urllib.request.Request(url)
    req.add_header(
        "User-Agent",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.64 Safari/537.31"
    )
    req.add_header("Accept-Charset", "utf-8;q=0.7,*;q=0.3")
    filehandle = urllib.request.urlopen(req)
    try:
        # BUG FIX: close the handle even when read() raises (original only
        # closed on the success path).
        content_encoding = filehandle.headers.get("Content-Encoding") or ""
        isgzip = re.search("gzip", content_encoding, re.I) is not None
        contentType = filehandle.headers["Content-Type"]
        matcher = re.search(_regex_charset, contentType, re.I)
        raw = filehandle.read()
    finally:
        filehandle.close()
    if isgzip:
        # BUG FIX: the compressed payload is bytes; the original wrapped it
        # in io.StringIO, which raises TypeError at runtime. BytesIO is the
        # correct in-memory buffer for gzip.GzipFile.
        raw = gzip.GzipFile(fileobj=io.BytesIO(raw)).read()
    # Provisional decode so the body can be searched for a charset
    # declaration when the header did not carry one. "ignore" avoids the
    # hard UnicodeDecodeError the original bare decode() could raise.
    ret = raw.decode(encoding, "ignore")
    if matcher is None:
        matcher = re.search(_regex_charset, ret, re.I)
    if matcher is not None:
        charset = matcher.group(2)
        log.info("源文件编码为:" + charset)
        # BUG FIX: the original tested membership in the single string
        # "utf8,utf-8" (substring check) instead of a tuple of names.
        utf8_names = ("utf8", "utf-8")
        if charset.lower() in utf8_names and encoding.lower() in utf8_names:
            pass
        elif charset.lower() != encoding.lower():
            log.info("进行转码,从" + charset + "转为" + encoding)
            # BUG FIX: the original re-encoded an already-decoded str via
            # encode(charset).decode(encoding), which garbles the text.
            # Decoding the raw bytes with the detected charset is correct.
            ret = raw.decode(charset, "ignore")
    return ret
Exemplo n.º 8
0
 def fetchrss(self, url):
     """Download the RSS feed at *url* and persist previously-unseen items.

     An item is skipped when an article with the same source URL and
     publication date already exists in ``rss_article``.
     """
     feed = http.fetchurl(url)
     channel = etree.XML(feed)[0]
     items = (node for node in channel if node.tag == "item")
     for item in items:
         title = item.findall("title")[0].text
         desc = item.findall("description")[0].text
         pubdate = item.findall("pubDate")[0].text
         if re.match("^[a-zA-Z]{3}\\,", pubdate):
             # English RFC-822-style date, e.g. "Thu, 19 Feb 2009 16:00:07"
             cut = len("Thu, 19 Feb 2009 16:00:07")
             pubdate = dates.str2datetime(pubdate[0:cut],
                                          '%a, %d %b %Y %H:%M:%S')
         elif re.match("^\d{4}-\d{2}-\d{2}T.{8,}", pubdate):
             # ISO-8601-style date; keep only "YYYY-MM-DDTHH:MM:SS".
             pubdate = dates.str2datetime(pubdate[0:19],
                                          '%Y-%m-%dT%H:%M:%S')
         already_stored = self.dao.exists(
             "select count(0) from rss_article where source=%s and pubdate=%s",
             url, pubdate)
         if not already_stored:
             log.info("采集到文章:%s" % title)
             self.dao.insert(
                 "insert into rss_article(title,description,pubdate,source) values(%s,%s,%s,%s)",
                 title, desc, pubdate, url)