import json

import tornado.httpserver
import tornado.ioloop
import tornado.options
import tornado.web
from mongosupply import MongoSupply
from tornado.options import define, options

database = MongoSupply()
define("port", default=8000, help="run on the given port", type=int)


class CityTopHandler(tornado.web.RequestHandler):
    def post(self):
        str_json = self.get_argument("data")
        # json.loads instead of eval: never evaluate client-supplied input.
        dict_json = json.loads(str_json)
        dict_id = dict_json["id"]
        del dict_json["id"]
        # Match every remaining field as a regular expression.
        list_json = [{key: {"$regex": dict_json[key]}} for key in dict_json]
        obj = {"$or": list_json}
        if dict_id == "":
            dict_id = "54503ce46a63436a5088db00"  # a very tiny ObjectId
        self.write(database.query(obj))
        self.add_header("new", database.newer(dict_id, obj))
        self.flush()


class LatestHandler(tornado.web.RequestHandler):
    def get(self):
        self.write(database.latest())
        self.flush()

    def post(self):
        str_json = self.get_argument("data")
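# The fragment above never wires the handlers to routes or starts the
# server; a minimal sketch of the usual tornado startup, assuming the
# standard pattern -- the route paths here are hypothetical, not from the
# original source.
def main():
    tornado.options.parse_command_line()
    app = tornado.web.Application([
        (r"/citytop", CityTopHandler),  # hypothetical path
        (r"/latest", LatestHandler),    # hypothetical path
    ])
    http_server = tornado.httpserver.HTTPServer(app)
    http_server.listen(options.port)
    tornado.ioloop.IOLoop.current().start()


if __name__ == "__main__":
    main()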
import datetime

from cn0256 import Crawler0256
from cn56110 import Crawler56110
from com51yunli import Crawler51yunli
from com8glw import Crawler8glw
from comchinawutong import CrawlerChinawutong
from comfala56 import CrawlerFala56
from net56888 import Crawler56888
from mongosupply import MongoSupply

database = MongoSupply()

# Each sublist is a rotation group: the current minute selects one crawler
# per group, so the first pair alternates every other minute while the
# second group cycles once per six minutes (Crawler56888 is listed twice,
# so it runs twice per cycle).
frequency = [[Crawler0256, Crawler51yunli],
             [Crawler56888, CrawlerFala56, CrawlerChinawutong,
              Crawler56888, Crawler8glw, Crawler56110]]
minute = datetime.datetime.now().minute
crawlers = [item[minute % len(item)] for item in frequency]
for c in crawlers:
    data = c().crawl()
    # Pages list the newest entries first; reversed() inserts the oldest
    # first, presumably so insertion (_id) order follows listing time.
    for item in reversed(data):
        item["datetime"] = datetime.datetime.now()
        database.insert(item)
import datetime
import logging
import urllib.error
import urllib.request

from mongosupply import MongoSupply

# Module constants referenced below; their values are not shown in this
# fragment, so these defaults are assumptions.
WINDOW_SIZE = 20  # assumed: how many already-seen items end a crawl
TIMEOUT = 10      # assumed: seconds before an HTTP request is abandoned
LIFETIME = 7      # assumed: days a listing stays valid without a deadline


class Crawler():
    def __init__(self):
        self.database = MongoSupply()
        self.data = []
        self.window = WINDOW_SIZE
        self.HOST = ""
        self.prefix = ""
        self.suffix = ""
        self.MAX_PAGE = 47

    def crawl(self):
        # Walk the listing pages until the window of unseen items is
        # exhausted or the page limit is reached.
        count = 1
        while self.window > 0 and count < self.MAX_PAGE:
            page = self.get(self.request(count))
            if page is None:
                break
            self.uniform(page)
            count += 1
        logging.info("Successfully fetched %d items from %s",
                     len(self.data), self.HOST)
        return self.data

    def get(self, request):
        try:
            response = urllib.request.urlopen(request, timeout=TIMEOUT)
        except urllib.error.URLError:
            logging.warning("Failed to fetch: " + request.full_url)
            return None
        page = response.read()
        try:
            page = page.decode("utf8")
        except UnicodeDecodeError:
            page = page.decode("gbk")
        return page

    def request(self, num_page):
        url = self.HOST + self.prefix + str(num_page) + self.suffix
        return self.url2request(url)

    def url2request(self, url):
        return urllib.request.Request(url)

    def uniform(self, page):
        # Subclasses parse one raw page into normalized items here.
        pass

    def good(self, item):
        # Fill every expected field with an empty string, then overlay the
        # values the page actually provided.
        keys = ["site", "url", "from", "to", "date", "deadline", "title",
                "type", "volume", "quality", "packing", "vehicle", "length",
                "attention", "fee", "contact", "others", "datetime"]
        data = {}
        for key in keys:
            data[key] = ""
        for key in item:
            data[key] = item[key]
        return data

    def lifetime(self, begin, end=LIFETIME):
        date = self.list2date(begin)
        # chinawutong sends ['长期货源'] ("long-term supply"), i.e. no
        # explicit deadline, hence the len(end) < 2 check.
        if type(end) is int or len(end) < 2:
            if type(end) is not int:
                end = LIFETIME
            deadline = date + datetime.timedelta(end)
        else:
            deadline = self.list2date(end)
        if date > self.today():
            date = self.today()
        if deadline < date:
            deadline = date + datetime.timedelta(LIFETIME)
        return date, deadline

    def list2date(self, date):
        # Pad a partial date list with today's year before converting.
        today = self.today()
        date = [int(x) for x in ([today.year] + date)[-3:]]
        return today.replace(date[0], date[1], date[-1])

    def today(self):
        return datetime.datetime.combine(datetime.date.today(),
                                         datetime.time())

    def exist(self, obj):
        return self.database.find_one(obj) is not None
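# A minimal sketch of a concrete crawler built on the Crawler base class
# above; the class name, site, URL pieces, and markup regex are all
# hypothetical.  Only uniform() is required: it parses one raw page,
# appends normalized items to self.data, and (judging by the crawl() loop)
# presumably shrinks self.window when it re-encounters a stored item, so
# the crawl stops once it reaches listings it has already seen.
import re


class CrawlerExample(Crawler):  # hypothetical site
    def __init__(self):
        super().__init__()
        self.HOST = "http://www.example.com"  # hypothetical
        self.prefix = "/supply/list-"
        self.suffix = ".html"

    def uniform(self, page):
        # Hypothetical markup: one <li>from|to|title</li> per listing.
        for match in re.finditer(r"<li>(.*?)\|(.*?)\|(.*?)</li>", page):
            item = {"site": self.HOST, "from": match.group(1),
                    "to": match.group(2), "title": match.group(3)}
            if self.exist({"title": item["title"]}):
                self.window -= 1  # already stored: shrink the stop window
            else:
                self.data.append(self.good(item))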
import datetime

from mongosupply import MongoSupply

# Drop every supply listing whose deadline has already passed.
database = MongoSupply()
today = datetime.datetime.combine(datetime.date.today(), datetime.time())
obj = {"deadline": {"$lt": today}}
database.remove(obj)
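# MongoSupply itself is not shown in these fragments; a minimal sketch of
# what it might look like, assuming pymongo and inferring the method names
# from the call sites above.  The database/collection names, the page size,
# and the return types are assumptions.
from bson.json_util import dumps
from bson.objectid import ObjectId
from pymongo import DESCENDING, MongoClient


class MongoSupply():
    def __init__(self):
        # "supply"/"items" are assumed names, not taken from the source.
        self.collection = MongoClient()["supply"]["items"]

    def insert(self, item):
        self.collection.insert_one(item)

    def find_one(self, obj):
        return self.collection.find_one(obj)

    def query(self, obj):
        # Newest first; json_util.dumps renders ObjectId/datetime values as
        # JSON text that tornado's write() can send directly.
        cursor = self.collection.find(obj).sort("_id", DESCENDING).limit(20)
        return dumps(list(cursor))

    def latest(self):
        cursor = self.collection.find().sort("_id", DESCENDING).limit(20)
        return dumps(list(cursor))

    def newer(self, dict_id, obj):
        # How many matching documents arrived after the given ObjectId,
        # returned as a string so it can be used as a header value.
        query = {"$and": [obj, {"_id": {"$gt": ObjectId(dict_id)}}]}
        return str(self.collection.count_documents(query))

    def remove(self, obj):
        self.collection.delete_many(obj)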