Example #1
class EightMySQLEngine(Connection):
    # Thin wrapper that delegates everything to tornado.database's MySQL
    # connection.  (The `Connection` base class comes from the surrounding
    # project and is not shown in this snippet.)
    def __init__(self, host, database, user=None, password=None):
        from tornado.database import Connection as MySQLEngine
        self._db = MySQLEngine(host, database, user, password)

    def close(self):
        if self._db is not None:
            self._db.close()
        self._db = None

    def iter(self, query, *parameters):
        return self._db.iter(query, *parameters)

    def query(self, query, *parameters):
        return self._db.query(query, *parameters)

    def get(self, query, *parameters):
        return self._db.get(query, *parameters)

    def execute(self, query, *parameters):
        return self._db.execute(query, *parameters)

    def executemany(self, query, *parameters):
        return self._db.executemany(query, *parameters)
Example #2
# Drop the existing tables
"""Commented out so that existing data is not lost from now on:
tables = ['UserCompleted', 'UserInterest', 'Category', 'Activity', 'User']
for table in tables:
    sql = "DROP TABLE IF EXISTS `{0}`".format(table)
    db.execute(sql)
"""


# Create the User table
sql = """CREATE TABLE IF NOT EXISTS User(\
    name varchar(15) NOT NULL PRIMARY KEY,\
    password varchar(100) NOT NULL\
);"""
db.execute(sql)
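
With the User table in place, rows go in through the same connection; a sketch using tornado.database-style %s placeholders (the name and hash below are made up):

# Parameters are passed separately so the driver escapes them,
# which avoids SQL injection.
db.execute("INSERT INTO User (name, password) VALUES (%s, %s)",
           "alice", "pbkdf2-hash-placeholder")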

# Create the Activity table
sql = """CREATE TABLE IF NOT EXISTS Activity(\
    ID int NOT NULL PRIMARY KEY AUTO_INCREMENT,\
    name varchar(100) NOT NULL,\
    description varchar(500),\
    creator varchar(15) NOT NULL,\
    rating int NOT NULL,\
    location varchar(100)\
);"""
db.execute(sql)

# Create the Category table
sql = """CREATE TABLE IF NOT EXISTS Category(\
    name varchar(100) NOT NULL,\
Example #3
else:
    print "Operation aborted."
    sys.exit(1)

# Drop existing tables; child tables go first so FOREIGN KEY
# constraints never block a drop.
cmd = """\
    DROP TABLE IF EXISTS `RelatesTo`;
    DROP TABLE IF EXISTS `SpecializesIn`;
    DROP TABLE IF EXISTS `Interests`;
    DROP TABLE IF EXISTS `FitnessTopics`;
    DROP TABLE IF EXISTS `Answer`;
    DROP TABLE IF EXISTS `Question`;
    DROP TABLE IF EXISTS `Trainer`;
    DROP TABLE IF EXISTS `User`;
"""
db.execute(cmd)
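
Executing one big string relies on the driver accepting several statements per call; a safer sketch issues the drops one at a time. Table names cannot be bound as %s parameters, so they are interpolated from a hard-coded tuple:

for table in ('RelatesTo', 'SpecializesIn', 'Interests', 'FitnessTopics',
              'Answer', 'Question', 'Trainer', 'User'):
    # Safe to interpolate: the names come from this literal tuple, not user input.
    db.execute("DROP TABLE IF EXISTS `%s`" % table)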

# Create User table.  (DEFAULT 'NULL' is the literal string "NULL",
# not SQL NULL; kept as in the original dump.)
cmd = """\
CREATE TABLE `User` (
  `user_id` INT NOT NULL AUTO_INCREMENT,
  `user_name` VARCHAR(100) NOT NULL DEFAULT 'NULL',
  `user_email` VARCHAR(50) NULL DEFAULT NULL,
  `age` INT NULL DEFAULT NULL,
  `weight` INT NULL DEFAULT NULL,
  `height` INT NULL DEFAULT NULL,
  `address` VARCHAR(200) NOT NULL DEFAULT 'NULL',
  `city` VARCHAR(150) NOT NULL DEFAULT 'NULL',
  `state` VARCHAR(5) NOT NULL DEFAULT 'NULL',
  PRIMARY KEY (`user_id`)
);
"""
db.execute(cmd)
Example #4
    print "Operation aborted."
    sys.exit(1)

# Create User table (DEFAULT 'NULL' is the literal string "NULL", not SQL NULL)
cmd = """\
CREATE TABLE `User` (
  `user_id` INT NOT NULL AUTO_INCREMENT,
  `user_name` VARCHAR(100) NOT NULL DEFAULT 'NULL',
  `user_email` VARCHAR(50) NULL DEFAULT NULL,
  `first_name` VARCHAR(100) NOT NULL DEFAULT 'NULL',
  `last_name` VARCHAR(100) NOT NULL DEFAULT 'NULL',
  `password` VARCHAR(1000) NOT NULL DEFAULT 'NULL',
  PRIMARY KEY (`user_id`)
);
"""
db.execute(cmd)

# Create Group table (GROUP is a reserved word in MySQL, hence the backticks)
cmd = """\
CREATE TABLE `Group` (
  `group_id` INT NOT NULL AUTO_INCREMENT,
  `user_id` INT NOT NULL,
  `group_name` VARCHAR(1000) NOT NULL,
  PRIMARY KEY (`group_id`),
  FOREIGN KEY (user_id) REFERENCES `User` (`user_id`)
);
"""
db.execute(cmd)
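
Because `Group.user_id` references `User.user_id`, a group row must point at an existing user. A hedged sketch of that flow, assuming tornado.database's execute() returns the new AUTO_INCREMENT id (all values are placeholders):

user_id = db.execute(
    "INSERT INTO `User` (user_name, user_email, first_name, last_name, password) "
    "VALUES (%s, %s, %s, %s, %s)",
    "alice", "alice@example.com", "Alice", "Liddell", "pw-hash-placeholder")
db.execute("INSERT INTO `Group` (user_id, group_name) VALUES (%s, %s)",
           user_id, "My first group")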

# Create Notification table
cmd = """\
Example #5
import datetime
import re
import time
import urllib2

import robotexclusionrulesparser

# Connection here matches tornado.database's (host, database, user, password)
# signature, as in the other examples on this page.
from tornado.database import Connection


class Robot:
    def __init__(self, root, charset):
        self.root = root
        self.charset = charset
        self.user_agent = 'zfz-bot/1.0'
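        # The patterns below scrape Chinese rental listings:
        # 租金 = monthly rent, 面积 = floor area, 房型/户型 = layout,
        # 地址 = address, 小区/楼盘名称 = housing-estate name.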
        self.link_pattern = re.compile(r'\s+href="([^\s\'">]+)"[\s>]',
                                       re.U | re.I)
        self.price_pattern = re.compile(
            ur'租(\s|&nbsp;)*金[^::]*[::]\s*(<[^<>]+>\s*)*(\d+)\s*(<[^<>]+>\s*)*元/月',
            re.U | re.I)
        self.area_pattern = re.compile(
            ur'(面(\s|&nbsp;)*积[::]\s*(<[^<>]+>\s*)*|室\s*|卫\s*|厅\s*)([\d\.]+)\s*(平米|㎡|平方米)',
            re.U | re.I)
        self.arch_pattern = re.compile(
            ur'[房户](\s|&nbsp;)*型[^::]*[::]\s*(<[^<>]+>\s*)*(\d[^<\s]+)[<\s]',
            re.U | re.I)
        self.title_pattern = re.compile(ur'<title>\s*([^<]+[^\s])\s*</title>',
                                        re.U | re.I)
        self.address_pattern = re.compile(
            ur'地(\s|&nbsp;)*址[::]\s*(<[^<>]+>\s*)*([^<>\s]+)[<\s]',
            re.U | re.I)
        self.district_pattern = re.compile(
            ur'(小(\s|&nbsp;)*区(名称)?|楼盘名称)[::]\s*(<[^<>]+>\s*)*([^<>\s]+)[<\s]',
            re.U | re.I)

        self.max_url_length = 200
        self.max_price_length = 10
        self.max_area_length = 10
        self.max_arch_length = 20
        self.max_title_length = 100
        self.max_address_length = 100
        self.max_district_length = 20

        self.db = Connection('127.0.0.1', 'zfz', 'zfz', 'zfz...891')
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(),
                                           urllib2.HTTPRedirectHandler())
        self.opener.addheaders = [('User-agent', self.user_agent)]

        self.rerp = robotexclusionrulesparser.RobotExclusionRulesParser()
        self.rerp.user_agent = self.user_agent
        try:
            self.rerp.fetch(self.root[:self.root.find('/', 7)] + "/robots.txt")
        except Exception:
            # robots.txt may be missing or unreachable; crawl anyway.
            pass

        self.min_delay_seconds = 120.0
        self.max_crawl_seconds_per_site = 2 * 24 * 3600  # 2 days

        self.max_allowed_urlopen_error = 20
        self.current_urlopen_error = 0

        self.debug = True

    def is_valid_url(self, url):
        if len(url) > self.max_url_length:
            return False
        # Skip fragment links, javascript: links, and local files.
        return ('#' not in url and 'javascript:' not in url
                and 'file://' not in url)

    def get_all_links(self, page):
        return self.link_pattern.findall(page)

    def get_price(self, page):
        m = self.price_pattern.search(page)
        if m is None or len(m.group(3)) > self.max_price_length:
            return None
        return m.group(3)

    def get_address(self, page):
        m = self.address_pattern.search(page)
        if m is None or len(m.group(3)) > self.max_address_length:
            return None
        return m.group(3)

    def get_area(self, page):
        m = self.area_pattern.search(page)
        if m is None or len(m.group(4)) > self.max_area_length:
            return None
        return m.group(4)

    def get_arch(self, page):
        m = self.arch_pattern.search(page)
        if m is None or len(m.group(3)) > self.max_arch_length:
            return None
        return m.group(3)

    def get_title(self, page):
        m = self.title_pattern.search(page)
        if m is None or len(m.group(1)) > self.max_title_length:
            return None
        return m.group(1)

    def get_district(self, page):
        m = self.district_pattern.search(page)
        if m is None or len(m.group(5)) > self.max_district_length:
            return None
        return m.group(5)

    def get_date(self, page):
        # `page` is unused; the "date" is the current Unix timestamp as a string.
        return str(int(time.mktime(datetime.datetime.now().timetuple())))

    def analyse(self, page):
        title = self.get_title(page)
        if title is None:
            print 'No title'
            return None
        price = self.get_price(page)
        if price is None:
            print 'No price'
            return None
        area = self.get_area(page)
        if area is None:
            print 'No area'
            return None
        arch = self.get_arch(page)
        if arch is None:
            print 'No arch'
            return None
        address = self.get_address(page)
        if address is None:
            print 'No address'
            return None
        district = self.get_district(page)
        if district is None:
            print 'No district'
            return None
        date = self.get_date(page)
        if date is None:
            print 'No date'
            return None

        return [title, price, area, arch, address, district, date]

    def add_page_to_index(self, url, page):
        print 'Adding %s to index...' % url
        result = self.analyse(page)

        if result is None:
            return

        self.add_result_to_db(url, result)

    def add_result_to_db(self, url, result):
        print '...Adding %s to db...' % url
        for i in range(len(result)):
            result[i] = result[i].encode('utf-8')
            if self.debug:
                print result[i]

        title, price, area, arch, address, district, date = result

        try:
            price = int(price)
            area = int(float(area) * 100)  # store area in hundredths of a square meter
        except ValueError:
            print 'price or area is not a number.'
            return

        dups = self.db.query("select * from pages where url=%s limit 1", url)
        if len(dups) == 1:
            dup = dups[0]
            print dup.title, dup.price, dup.area
            print dup.arch, dup.address, dup.district, dup.date

            if price == dup.price and area == dup.area:
                print 'Info already in database.'
                return

        print 'Insert into database...'

        self.db.execute(
            "insert into pages (url, price, address, area, arch, title, district, date) "
            "values (%s, %s, %s, %s, %s, %s, %s, %s) on duplicate key update "
            "price=%s, address=%s, area=%s, arch=%s, title=%s, district=%s, date=%s",
            url, price, address, area, arch, title, district, date, price,
            address, area, arch, title, district, date)

    def get_page(self, url):
        print 'Getting page %s...' % url

        if not url.startswith('http://'):
            print 'URL format error.'
            return None

        try:
            ans = self.opener.open(url).read().decode(self.charset)
        except Exception:
            print 'URL open error.'
            self.current_urlopen_error += 1
            return None

        self.current_urlopen_error = 0
        return ans

    def is_allowed(self, url):
        return self.rerp.is_allowed('*', url)

    def get_full_url(self, parent, url):
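        # Resolve a link found on `parent` to an absolute URL;
        # find('/', 7) skips past "http://" to the first path slash.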
        if parent.find('/', 7) == -1:
            parent += '/'
        if url.startswith('http'):
            ans = url
        elif url.startswith('/'):
            ans = parent[:parent.find('/', 7)] + url
        else:
            ans = parent[:parent.rfind('/')] + '/' + url
        return ans

    def crawl_web(self):
        tocrawl = set([self.root])
        crawled = set()

        while len(tocrawl) > 0:
            url = tocrawl.pop()
            if not self.is_allowed(url):
                print 'URL %s is not allowed.' % url
                continue

            crawled.add(url)
            page = self.get_page(url)
            if page is None:
                continue

            links = self.get_all_links(page)

            for link in links:
                #print url, link
                full_link = self.get_full_url(url, link)
                #print full_link
                if self.is_valid_url(full_link) and full_link not in crawled:
                    tocrawl.add(full_link)

            self.add_page_to_index(url, page)

            time.sleep(self.min_delay_seconds)

            if self.current_urlopen_error > self.max_allowed_urlopen_error:
                break

            if self.current_urlopen_error > self.max_allowed_urlopen_error / 2:
                time.sleep(10 * 60)  # Wait 10 minutes
                self.min_delay_seconds *= 1.1

    def start(self):
        while True:
            print 'BEGIN'
            self.crawl_web()
            print 'END'
            time.sleep(30 * 60)  # Wait 30 minutes
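
A minimal way to run the crawler above, assuming the imports shown and a MySQL `pages` table matching the insert in add_result_to_db; the root URL and charset are placeholders:

if __name__ == '__main__':
    bot = Robot('http://example.com/', 'utf-8')  # hypothetical site and page charset
    bot.start()  # crawls forever, sleeping 30 minutes between passes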