import gzip
import io
import re
import urllib.request


def fetchurl(url, encoding="utf8"):
    """Fetch the document at url and return its content as text."""
    req = urllib.request.Request(url)
    req.add_header(
        "User-Agent",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.31 (KHTML, like Gecko) "
        "Chrome/26.0.1410.64 Safari/537.31")
    req.add_header("Accept-Charset", "utf-8;q=0.7,*;q=0.3")
    filehandle = urllib.request.urlopen(req)
    content_encoding = filehandle.headers.get("Content-Encoding", "")
    isgzip = re.search("gzip", content_encoding, re.I) is not None
    contentType = filehandle.headers["Content-Type"]
    matcher = re.search(_regex_charset, contentType, re.I)
    if isgzip:
        # gzip.GzipFile needs a binary stream: wrap the response bytes in
        # BytesIO (StringIO would raise a TypeError on bytes input).
        compresseddata = filehandle.read()
        compressedstream = io.BytesIO(compresseddata)
        gzipper = gzip.GzipFile(fileobj=compressedstream)
        raw = gzipper.read()
    else:
        raw = filehandle.read()
    filehandle.close()
    # Tentative decode so the body can be searched for a charset declaration
    # when the Content-Type header did not carry one.
    ret = raw.decode("utf8", "ignore")
    if matcher is None:
        matcher = re.search(_regex_charset, ret, re.I)
    if matcher is not None:
        charset = matcher.group(2)
        log.info("Source document charset: " + charset)
        if charset.lower() in ("utf8", "utf-8") and encoding.lower() in ("utf8", "utf-8"):
            # Both are spellings of utf-8; the tentative decode already matches.
            pass
        elif charset.lower() != encoding.lower():
            log.info("Transcoding from " + charset + " to " + encoding)
            # Decode the raw bytes with the declared charset rather than
            # round-tripping the already-decoded string.
            ret = raw.decode(charset, "ignore")
    return ret
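# _regex_charset is referenced by fetchurl() but its definition is not part
# of this excerpt (in the original module it presumably sits above the
# function). A hypothetical pattern compatible with the matcher.group(2)
# call above, where group 2 captures the charset name, could be:
_regex_charset = r"""(charset)\s*=\s*["']?([\w-]+)"""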
def run(self):
    queue = self.queue
    while True:
        try:
            task = queue.get()
            log.info("worker executing " + str(task))
            task._execute()
        except Exception as e:
            # Log and keep the worker alive rather than letting one bad
            # task kill the thread.
            log.exception(e)
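# Usage sketch (hypothetical names): run() above assumes a threading.Thread
# subclass whose self.queue is a queue.Queue of task objects exposing an
# _execute() method. A minimal pairing, assuming the class is named Worker:
#
#   import queue
#
#   class EchoTask:
#       def _execute(self):
#           log.info("task executed")
#
#   worker = Worker()
#   worker.queue = queue.Queue()
#   worker.daemon = True      # don't block interpreter shutdown
#   worker.start()
#   worker.queue.put(EchoTask())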
def main():
    import os
    tornado.options.parse_command_line()
    static_path = os.path.join(os.path.dirname(__file__), "static")
    application = tornado.web.Application([
        (r"/static/(.*)", web.StaticFileHandler, {"path": static_path}),
        (r"/hello", Hello),
    ])
    http_server = tornado.httpserver.HTTPServer(application)
    http_server.listen(18080)
    log.info("Server started at %d", 18080)
    tornado.ioloop.IOLoop.instance().start()
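# The Hello handler routed above is not shown in this excerpt. A minimal
# hypothetical version compatible with the route table would be:
class Hello(tornado.web.RequestHandler):
    def get(self):
        self.write("hello")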
def main():
    #dao.init("127.0.0.1","yuntao",user="******",password="******",poolsize=5)
    import os
    tornado.options.parse_command_line()
    static_path = os.path.join(os.path.dirname(__file__), "./pages/")
    application = tornado.web.Application([
        (r"/action/readrss", business.rss.ReadRss),
        (r"/action/readarticle/(\d+)", business.rss.ReadArticle),
        (r"/pages/(.*)", web.StaticFileHandler, {"path": static_path}),
        (r"/", RootRedirectHandler),
    ])
    http_server = tornado.httpserver.HTTPServer(application)
    http_server.listen(80, address="0.0.0.0")
    log.info("Server started at %d", 80)
    # Crawl RSS feeds in the background: first run after 10s, then hourly.
    scheduler = executors.new_scheduled_threadpool(1)
    scheduler.schedule(business.rss.crawler.Crawler().crawl, delay=10, period=3600)
    tornado.ioloop.IOLoop.instance().start()
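# The project-local executors module is not part of this excerpt. A minimal
# hypothetical new_scheduled_threadpool() that would satisfy the
# schedule(fn, delay=..., period=...) call above (the pool size argument is
# accepted, but each task simply gets its own daemon thread in this sketch):
import threading
import time


class _ScheduledPool:
    def schedule(self, fn, delay, period):
        def loop():
            time.sleep(delay)
            while True:
                try:
                    fn()
                except Exception as e:
                    log.exception(e)
                time.sleep(period)
        threading.Thread(target=loop, daemon=True).start()


def new_scheduled_threadpool(size):
    return _ScheduledPool()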
def fetchrss(self, url):
    page = http.fetchurl(url)
    # An RSS document is <rss><channel>...</channel></rss>; take the channel.
    root = etree.XML(page)[0]
    for child in root:
        if child.tag == "item":
            title = child.findall("title")[0].text
            desc = child.findall("description")[0].text
            pubdate = child.findall("pubDate")[0].text
            if re.match(r"^[a-zA-Z]{3},", pubdate):
                # RFC 822 style English date, e.g. "Thu, 19 Feb 2009 16:00:07"
                pubdate = pubdate[0:len("Thu, 19 Feb 2009 16:00:07")]
                DATE_FORMAT = '%a, %d %b %Y %H:%M:%S'
                pubdate = dates.str2datetime(pubdate, DATE_FORMAT)
            elif re.match(r"^\d{4}-\d{2}-\d{2}T.{8,}", pubdate):
                # ISO 8601 style date; keep the "YYYY-MM-DDTHH:MM:SS" prefix.
                pubdate = pubdate[0:19]
                DATE_FORMAT = '%Y-%m-%dT%H:%M:%S'
                pubdate = dates.str2datetime(pubdate, DATE_FORMAT)
            if not self.dao.exists(
                    "select count(0) from rss_article where source=%s and pubdate=%s",
                    url, pubdate):
                log.info("Collected article: %s" % title)
                self.dao.insert(
                    "insert into rss_article(title,description,pubdate,source) values(%s,%s,%s,%s)",
                    title, desc, pubdate, url)
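# The dates helper module is not shown in this excerpt. A hypothetical
# str2datetime() consistent with the calls above would be a thin
# datetime.strptime wrapper:
from datetime import datetime


def str2datetime(datestr, fmt):
    """Parse datestr according to the strptime format string fmt."""
    return datetime.strptime(datestr, fmt)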