Example #1
# Constructor of the Crawler base class (shown in full in Example #5):
# it wires up the MongoSupply store and the pagination state.
def __init__(self):
    self.database = MongoSupply()   # MongoDB-backed storage wrapper
    self.data = []                  # items collected during one crawl
    self.window = WINDOW_SIZE       # stop condition while paging
    self.HOST = ""                  # filled in by each concrete crawler
    self.prefix = ""
    self.suffix = ""
    self.MAX_PAGE = 47              # hard upper bound on pages fetched
Example #3
import json
import tornado.httpserver
import tornado.ioloop
import tornado.options
import tornado.web
from mongosupply import MongoSupply
from tornado.options import define, options

database = MongoSupply()
define("port", default = 8000, help = "run on the given port", type = int)

class CityTopHandler(tornado.web.RequestHandler):
    def post(self):
        str_json = self.get_argument("data")
        dict_json = json.loads(str_json)  # parse JSON safely instead of eval()
        dict_id = dict_json["id"]
        del dict_json["id"]
        list_json = [{key: {"$regex": dict_json[key]}} for key in dict_json]
        obj = {"$or": list_json}
        if dict_id == "":
            dict_id = "54503ce46a63436a5088db00"    # a very tiny ObjectId
        self.write(database.query(obj))
        self.add_header("new", database.newer(dict_id, obj))
        self.flush()

class LatestHandler(tornado.web.RequestHandler):
    def get(self):
        self.write(database.latest())
        self.flush()

    def post(self):
        str_json = self.get_argument("data")
Example #4
import datetime
from cn0256 import Crawler0256
from cn56110 import Crawler56110
from com51yunli import Crawler51yunli
from com8glw import Crawler8glw
from comchinawutong import CrawlerChinawutong
from comfala56 import CrawlerFala56
from net56888 import Crawler56888
from mongosupply import MongoSupply

database = MongoSupply()
# Two frequency tiers: the first pair rotates every other run; the second
# group rotates across six slots (Crawler56888 holds two of them, so it
# runs twice as often as its tier-mates).
frequency = [[Crawler0256, Crawler51yunli],
             [Crawler56888, CrawlerFala56, CrawlerChinawutong,
              Crawler56888, Crawler8glw, Crawler56110]]
minute = datetime.datetime.now().minute
# Pick one crawler from each tier based on the current minute.
crawlers = [item[minute % len(item)] for item in frequency]
for c in crawlers:
    data = c().crawl()
    for item in reversed(data):
        item["datetime"] = datetime.datetime.now()
        database.insert(item)
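
A quick sketch of how the minute-based rotation behaves, with stand-in names instead of the real crawler classes:

frequency = [["A", "B"], ["C", "D", "E"]]
for minute in range(4):
    # One pick per tier: the first alternates A/B, the second cycles C/D/E.
    print(minute, [tier[minute % len(tier)] for tier in frequency])
# 0 ['A', 'C']
# 1 ['B', 'D']
# 2 ['A', 'E']
# 3 ['B', 'C']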
Example #5
import datetime
import logging
import urllib.error
import urllib.request

from mongosupply import MongoSupply

# Module-level constants in the original project; the values here are
# assumptions so the snippet runs standalone.
WINDOW_SIZE = 20
TIMEOUT = 10
LIFETIME = 7

class Crawler:
    def __init__(self):
        self.database = MongoSupply()
        self.data = []
        self.window = WINDOW_SIZE
        self.HOST = ""
        self.prefix = ""
        self.suffix = ""
        self.MAX_PAGE = 47

    def crawl(self):
        count = 1
        while self.window > 0 and count < self.MAX_PAGE:
            page = self.get(self.request(count))
            if page is None:
                break
            self.uniform(page)
            count += 1
        logging.info("Successful to fetch %d items from %s", len(self.data),
                     self.HOST)
        return self.data

    def get(self, request):
        try:
            response = urllib.request.urlopen(request, timeout=TIMEOUT)
        except urllib.error.URLError:
            logging.warning("Failed to fetch: " + request.full_url)
            return None
        page = response.read()
        try:
            page = page.decode("utf8")
        except UnicodeDecodeError:
            page = page.decode("gbk")
        return page

    def request(self, num_page):
        url = self.HOST + self.prefix + str(num_page) + self.suffix
        return self.url2request(url)

    def url2request(self, url):
        return urllib.request.Request(url)

    def uniform(self, page):
        # Overridden by each concrete crawler to parse one listing page.
        pass

    def good(self, item):
        # Normalize an item: all known fields present, missing ones empty.
        keys = [
            "site", "url", "from", "to", "date", "deadline", "title", "type",
            "volume", "quality", "packing", "vehicle", "length", "attention",
            "fee", "contact", "others", "datetime"
        ]
        data = {}
        for key in keys:
            data[key] = ""
        for key in item:
            data[key] = item[key]
        return data

    def lifetime(self, begin, end=LIFETIME):
        date = self.list2date(begin)
        if type(end) is int or len(end) < 2:  # chinawutong ['长期货源'] ("long-term supply": no end date)
            if type(end) is not int:
                end = LIFETIME  # fall back to the default lifetime in days
            deadline = date + datetime.timedelta(end)
        else:
            deadline = self.list2date(end)
        if date > self.today():
            date = self.today()
        if deadline < date:
            deadline = date + datetime.timedelta(LIFETIME)
        return date, deadline

    def list2date(self, date):
        today = self.today()
        # Pad with the current year so both [month, day] and
        # [year, month, day] inputs work, then keep the last three fields.
        date = [int(x) for x in ([today.year] + date)[-3:]]
        return today.replace(date[0], date[1], date[-1])

    def today(self):
        return datetime.datetime.combine(datetime.date.today(),
                                         datetime.time())

    def exist(self, obj):
        return self.database.find_one(obj) is not None
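
Crawler is a template class: the concrete crawlers imported in Example #4 (Crawler0256 and friends) presumably set HOST, prefix and suffix and override uniform to parse one listing page. A minimal hypothetical subclass, with illustrative URL parts and parsing:

# Hypothetical subclass for illustration; the URL parts and the parsing
# logic are assumptions, not the real Crawler0256.
class CrawlerExample(Crawler):
    def __init__(self):
        super().__init__()
        self.HOST = "http://supply.example.com"
        self.prefix = "/list_"
        self.suffix = ".html"

    def uniform(self, page):
        # Turn one page into normalized items; shrink the window on
        # already-seen items so crawl() stops once nothing is new
        # (assumed semantics of self.window).
        for line in page.splitlines():
            title = line.strip()
            if not title:
                continue
            item = self.good({"site": self.HOST, "title": title,
                              "datetime": datetime.datetime.now()})
            if self.exist({"title": item["title"]}):
                self.window -= 1
            else:
                self.data.append(item)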
Example #6
import datetime
from mongosupply import MongoSupply

# Purge supply items whose deadline has already passed (midnight today).
database = MongoSupply()
today = datetime.datetime.combine(datetime.date.today(), datetime.time())
obj = {"deadline": {"$lt": today}}
database.remove(obj)
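
MongoSupply wraps a MongoDB collection; assuming pymongo underneath, the same cleanup can be written directly against the driver (the database and collection names below are guesses, the real ones live inside MongoSupply):

import datetime
from pymongo import MongoClient

collection = MongoClient()["supply"]["items"]   # assumed names
today = datetime.datetime.combine(datetime.date.today(), datetime.time())
result = collection.delete_many({"deadline": {"$lt": today}})
print("removed", result.deleted_count, "expired items")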