class EightMySQLEngine(Connection):
    """Thin adapter around tornado's MySQL ``Connection``.

    Exposes the same query surface (``iter``/``query``/``get``/``execute``/
    ``executemany``) by forwarding every call to a privately held tornado
    connection, and owns that connection's lifecycle via :meth:`close`.
    """

    def __init__(self, host, database, user=None, password=None):
        # Import is deferred to construction time, as in the original,
        # so merely importing this module does not require tornado.
        from tornado.database import Connection as MySQLEngine
        self._db = MySQLEngine(host, database, user, password)

    def close(self):
        """Close the wrapped connection; subsequent calls are no-ops."""
        if self._db is None:
            return
        self._db.close()
        self._db = None

    def iter(self, query, *parameters):
        """Forward to the wrapped connection's ``iter``."""
        return self._db.iter(query, *parameters)

    def query(self, query, *parameters):
        """Forward to the wrapped connection's ``query``."""
        return self._db.query(query, *parameters)

    def get(self, query, *parameters):
        """Forward to the wrapped connection's ``get``."""
        return self._db.get(query, *parameters)

    def execute(self, query, *parameters):
        """Forward to the wrapped connection's ``execute``."""
        return self._db.execute(query, *parameters)

    def executemany(self, query, *parameters):
        """Forward to the wrapped connection's ``executemany``."""
        return self._db.executemany(query, *parameters)
# NOTE(review): collapsed/truncated schema-setup script fragment (DDL for the
# User/Activity/Category tables). The original line structure was lost in
# extraction — the leading '#' now comments out the whole collapsed line and
# the final CREATE TABLE string is cut off mid-literal — so the code below is
# left byte-identical rather than guessed at; recover it from version control.
# Drop the existing tables """Comment this out so that we don't lose data from now on tables = ['UserCompleted', 'UserInterest', 'Category', 'Activity', 'User'] for table in tables: sql = "DROP TABLE IF EXISTS `{0}`".format(table) db.execute(sql) """ # Create the User table sql = """CREATE TABLE IF NOT EXISTS User(\ name varchar(15) NOT NULL PRIMARY KEY,\ password varchar(100) NOT NULL\ );""" db.execute(sql) # Create the Activity table sql = """CREATE TABLE IF NOT EXISTS Activity(\ ID int NOT NULL PRIMARY KEY AUTO_INCREMENT,\ name varchar(100) NOT NULL,\ description varchar(500),\ creator varchar(15) NOT NULL,\ rating int NOT NULL,\ location varchar(100)\ );""" db.execute(sql) # Create the Category table sql = """CREATE TABLE IF NOT EXISTS Category(\ name varchar(100) NOT NULL,\
# NOTE(review): collapsed/truncated script fragment — begins with an orphaned
# 'else:' (its matching 'if' is outside this view) and ends inside an
# unterminated triple-quoted DDL string (DROP TABLEs plus CREATE TABLE `User`
# for a fitness-Q&A schema). Left byte-identical; cannot be safely reformatted.
else: print "Operation aborted." sys.exit(1) # Drop existing tables cmd = """\ DROP TABLE IF EXISTS `RelatesTo`;\ DROP TABLE IF EXISTS `SpecializesIn`;\ DROP TABLE IF EXISTS `Interests`;\ DROP TABLE IF EXISTS `FitnessTopics`;\ DROP TABLE IF EXISTS `Answer`;\ DROP TABLE IF EXISTS `Question`;\ DROP TABLE IF EXISTS `Trainer`;\ DROP TABLE IF EXISTS `User`;\ """ db.execute(cmd) # Create User table cmd = """\ CREATE TABLE `User` (\ `user_id` INT NOT NULL AUTO_INCREMENT,\ `user_name` VARCHAR(100) NOT NULL DEFAULT 'NULL',\ `user_email` VARCHAR(50) NULL DEFAULT NULL,\ `age` INT NULL DEFAULT NULL,\ `weight` INT NULL DEFAULT NULL,\ `height` INT NULL DEFAULT NULL,\ `address` VARCHAR(200) NOT NULL DEFAULT 'NULL',\ `city` VARCHAR(150) NOT NULL DEFAULT 'NULL',\ `state` VARCHAR(5) NOT NULL DEFAULT 'NULL',\ PRIMARY KEY (`user_id`) );\
# NOTE(review): collapsed/truncated script fragment — begins mid-conditional
# ('print "Operation aborted."') and ends at an unterminated 'cmd = """\'
# for a Notification table (DDL for User and Group tables of a group-messaging
# schema). Left byte-identical; recover the original line breaks from VCS.
print "Operation aborted." sys.exit(1) # Create User table cmd = """\ CREATE TABLE `User` (\ `user_id` INT NOT NULL AUTO_INCREMENT,\ `user_name` VARCHAR(100) NOT NULL DEFAULT 'NULL',\ `user_email` VARCHAR(50) NULL DEFAULT NULL,\ `first_name` VARCHAR(100) NOT NULL DEFAULT 'NULL',\ `last_name` VARCHAR(100) NOT NULL DEFAULT 'NULL',\ `password` VARCHAR(1000) NOT NULL DEFAULT 'NULL',\ PRIMARY KEY (`user_id`) );\ """ db.execute(cmd) # Create Group table cmd = """\ CREATE TABLE `Group` (\ `group_id` INT NOT NULL AUTO_INCREMENT,\ `user_id` INT NOT NULL,\ `group_name` VARCHAR(1000) NOT NULL,\ PRIMARY KEY (`group_id`),\ FOREIGN KEY (user_id) REFERENCES `User` (`user_id`)\ );\ """ db.execute(cmd) # Create Notification table cmd = """\
class Robot:
    # Single-site crawler for Chinese rental-housing listings ("zfz"): starting
    # from `root` it follows href links, scrapes rent/area/layout/address/
    # district fields with regexes, and upserts results into a MySQL `pages`
    # table. Python 2 code (print statements, ur'' literals).
    # NOTE(review): source was whitespace-collapsed; a few statement indents
    # below were ambiguous and are marked where re-inferred.

    def __init__(self, root, charset):
        # root: start URL of the site to crawl.
        # charset: encoding used to decode fetched HTML pages.
        self.root = root
        self.charset = charset
        self.user_agent = 'zfz-bot/1.0'
        # Extracts href="..." targets; group(1) is the URL.
        self.link_pattern = re.compile(r'\s+href="([^\s\'">]+)"[\s>]',
                                       re.U | re.I)
        # "租金: ... 元/月" (monthly rent); group(3) holds the digits.
        self.price_pattern = re.compile(
            ur'租(\s| )*金[^::]*[::]\s*(<[^<>]+>\s*)*(\d+)\s*(<[^<>]+>\s*)*元/月',
            re.U | re.I)
        # "面积: N 平米/㎡/平方米" (floor area); group(4) holds the number.
        self.area_pattern = re.compile(
            ur'(面(\s| )*积[::]\s*(<[^<>]+>\s*)*|室\s*|卫\s*|厅\s*)([\d\.]+)\s*(平米|㎡|平方米)',
            re.U | re.I)
        # "房型/户型: ..." (room layout); group(3) holds the value.
        self.arch_pattern = re.compile(
            ur'[房户](\s| )*型[^::]*[::]\s*(<[^<>]+>\s*)*(\d[^<\s]+)[<\s]',
            re.U | re.I)
        # Page <title>; group(1) is the trimmed title text.
        self.title_pattern = re.compile(ur'<title>\s*([^<]+[^\s])\s*</title>',
                                        re.U | re.I)
        # "地址: ..." (street address); group(3) holds the value.
        self.address_pattern = re.compile(
            ur'地(\s| )*址[::]\s*(<[^<>]+>\s*)*([^<>\s]+)[<\s]',
            re.U | re.I)
        # "小区/楼盘名称: ..." (residential compound); group(5) holds the value.
        self.district_pattern = re.compile(
            ur'(小(\s| )*区(名称)?|楼盘名称)[::]\s*(<[^<>]+>\s*)*([^<>\s]+)[<\s]',
            re.U | re.I)
        # Sanity caps: extracted fields longer than these are treated as junk.
        self.max_url_length = 200
        self.max_price_length = 10
        self.max_area_length = 10
        self.max_arch_length = 20
        self.max_title_length = 100
        self.max_address_length = 100
        self.max_district_length = 20
        # NOTE(review): hard-coded database credentials; move to config.
        self.db = Connection('127.0.0.1', 'zfz', 'zfz', 'zfz...891')
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(),
                                           urllib2.HTTPRedirectHandler())
        self.opener.addheaders = [('User-agent', self.user_agent)]
        # robots.txt compliance; fetch failures are deliberately ignored
        # (crawl proceeds with no exclusion rules loaded).
        self.rerp = robotexclusionrulesparser.RobotExclusionRulesParser()
        self.rerp.user_agent = self.user_agent
        try:
            self.rerp.fetch(self.root[:self.root.find('/', 7)] + "/robots.txt")
        except:
            pass
        # Politeness delay between page fetches (seconds); grows on errors.
        self.min_delay_seconds = 120.0
        self.max_crawl_seconds_per_site = 2 * 24 * 3600  # 2 days
        # Abort the crawl after this many consecutive fetch failures.
        self.max_allowed_urlopen_error = 20
        self.current_urlopen_error = 0
        self.debug = True

    def is_valid_url(self, url):
        # Reject over-long URLs and fragment/javascript/file links.
        if len(url) > self.max_url_length:
            return False
        if url.find('#') != -1 or url.find('javascript:') != -1 or url.find(
                'file://') != -1:
            return False
        else:
            return True

    def get_all_links(self, page):
        # All href targets found in the page HTML.
        return self.link_pattern.findall(page)

    def get_price(self, page):
        # Monthly rent digits, or None if absent/over-long.
        m = self.price_pattern.search(page)
        if m == None or len(m.group(3)) > self.max_price_length:
            return None
        return m.group(3)

    def get_address(self, page):
        # Street address string, or None if absent/over-long.
        m = self.address_pattern.search(page)
        if m == None or len(m.group(3)) > self.max_address_length:
            return None
        return m.group(3)

    def get_area(self, page):
        # Floor-area number (may contain a decimal point), or None.
        m = self.area_pattern.search(page)
        if m == None or len(m.group(4)) > self.max_area_length:
            return None
        return m.group(4)

    def get_arch(self, page):
        # Room-layout string (e.g. "2室1厅"), or None.
        m = self.arch_pattern.search(page)
        if m == None or len(m.group(3)) > self.max_arch_length:
            return None
        return m.group(3)

    def get_title(self, page):
        # Page <title> text, or None.
        m = self.title_pattern.search(page)
        if m == None or len(m.group(1)) > self.max_title_length:
            return None
        return m.group(1)

    def get_district(self, page):
        # Residential-compound name, or None.
        m = self.district_pattern.search(page)
        if m == None or len(m.group(5)) > self.max_district_length:
            return None
        return m.group(5)

    def get_date(self, page):
        # Current local time as a Unix-timestamp string; `page` is unused.
        ts = str(int(time.mktime(datetime.datetime.now().timetuple())))
        return ts

    def analyse(self, page):
        # Extract all fields; any missing field aborts with None so only
        # complete listings are stored.
        title = self.get_title(page)
        if title == None:
            print 'No title'
            return None
        price = self.get_price(page)
        if price == None:
            print 'No price'
            return None
        area = self.get_area(page)
        if area == None:
            print 'No area'
            return None
        arch = self.get_arch(page)
        if arch == None:
            print 'No arch'
            return None
        address = self.get_address(page)
        if address == None:
            print 'No address'
            return None
        district = self.get_district(page)
        if district == None:
            print 'No district'
            return None
        date = self.get_date(page)
        if date == None:
            print 'Noe date'
            return None
        return [title, price, area, arch, address, district, date]

    def add_page_to_index(self, url, page):
        # Analyse the page and persist the result if complete.
        print 'Adding %s to index...' % url
        result = self.analyse(page)
        if result == None:
            return
        self.add_result_to_db(url, result)

    def add_result_to_db(self, url, result):
        # Upsert one listing row keyed by URL; area is stored as an int
        # scaled by 100 to keep two decimal places.
        print '...Adding %s to db...' % url
        for i in range(len(result)):
            result[i] = result[i].encode('utf-8')
            # NOTE(review): debug print re-inferred as inside the loop
            # (indentation lost in extraction) — confirm against VCS.
            if self.debug:
                print result[i]
        title, price, area, arch, address, district, date = result
        try:
            price = int(price)
            area = int(float(area) * 100)
        except:
            print 'price or area may not be a number.'
            return
        # Skip the write when price and area are unchanged for this URL.
        dups = self.db.query("select * from pages where url=%s limit 1", url)
        if len(dups) == 1:
            dup = dups[0]
            print dup.title, dup.price, dup.area
            print dup.arch, dup.address, dup.district, dup.date
            if price == dup.price and area == dup.area:
                print 'Info already in database.'
                return
        print 'Insert into database...'
        self.db.execute(
            "insert into pages (url, price, address, area, arch, title, district, date) "
            "values (%s, %s, %s, %s, %s, %s, %s, %s) on duplicate key update "
            "price=%s, address=%s, area=%s, arch=%s, title=%s, district=%s, date=%s",
            url, price, address, area, arch, title, district, date,
            price, address, area, arch, title, district, date)

    def get_page(self, url):
        # Fetch and decode one page; returns None (and counts the error)
        # on any failure. A success resets the consecutive-error counter.
        print 'Getting page %s...' % url
        if not url.startswith('http://'):
            print 'URL format error.'
            return None
        try:
            ans = self.opener.open(url).read().decode(self.charset)
        except:
            print 'URL open error.'
            self.current_urlopen_error += 1
            return None
        self.current_urlopen_error = 0
        return ans

    def is_allowed(self, url):
        # robots.txt check (wildcard agent).
        return self.rerp.is_allowed('*', url)

    def get_full_url(self, parent, url):
        # Resolve a possibly-relative link against its parent page URL.
        # The offset 7 skips the "http://" prefix when locating the host end.
        if parent.find('/', 7) == -1:
            parent += '/'
        if url.startswith('http'):
            ans = url
        elif url.startswith('/'):
            ans = parent[:parent.find('/', 7)] + url
        else:
            ans = parent[:parent.rfind('/')] + '/' + url
        return ans

    def crawl_web(self):
        # Breadth-agnostic frontier crawl from self.root, honouring
        # robots.txt, sleeping between fetches, and backing off / aborting
        # as consecutive fetch errors accumulate.
        tocrawl = set([self.root])
        crawled = set()
        while len(tocrawl) > 0:
            url = tocrawl.pop()
            if not self.is_allowed(url):
                print 'URL %s is not allowed.' % url
                continue
            crawled.add(url)
            page = self.get_page(url)
            if page == None:
                continue
            links = self.get_all_links(page)
            for link in links:
                #print url, link
                full_link = self.get_full_url(url, link)
                #print full_link
                if self.is_valid_url(full_link) and full_link not in crawled:
                    tocrawl.add(full_link)
            self.add_page_to_index(url, page)
            time.sleep(self.min_delay_seconds)
            if self.current_urlopen_error > self.max_allowed_urlopen_error:
                break
            if self.current_urlopen_error > self.max_allowed_urlopen_error / 2:
                time.sleep(10 * 60)  # Wait 10 minutes
                # NOTE(review): backoff growth re-inferred as inside this
                # branch (indentation lost in extraction) — confirm.
                self.min_delay_seconds *= 1.1

    def start(self):
        # Run crawl cycles forever, resting 30 minutes between passes.
        while True:
            print 'BEGIN'
            self.crawl_web()
            print 'END'
            time.sleep(30 * 60)  # Wait 30 minutes
class Robot:
    # Single-site crawler for Chinese rental-housing listings ("zfz"): starting
    # from `root` it follows href links, scrapes rent/area/layout/address/
    # district fields with regexes, and upserts results into a MySQL `pages`
    # table. Python 2 code (print statements, ur'' literals).
    # NOTE(review): this file contains two near-identical copies of this
    # class — consider deduplicating. Source was whitespace-collapsed; a few
    # statement indents below were ambiguous and are marked where re-inferred.

    def __init__(self, root, charset):
        # root: start URL of the site to crawl.
        # charset: encoding used to decode fetched HTML pages.
        self.root = root
        self.charset = charset
        self.user_agent = 'zfz-bot/1.0'
        # Extracts href="..." targets; group(1) is the URL.
        self.link_pattern = re.compile(r'\s+href="([^\s\'">]+)"[\s>]',
                                       re.U | re.I)
        # "租金: ... 元/月" (monthly rent); group(3) holds the digits.
        self.price_pattern = re.compile(
            ur'租(\s| )*金[^::]*[::]\s*(<[^<>]+>\s*)*(\d+)\s*(<[^<>]+>\s*)*元/月',
            re.U | re.I)
        # "面积: N 平米/㎡/平方米" (floor area); group(4) holds the number.
        self.area_pattern = re.compile(
            ur'(面(\s| )*积[::]\s*(<[^<>]+>\s*)*|室\s*|卫\s*|厅\s*)([\d\.]+)\s*(平米|㎡|平方米)',
            re.U | re.I)
        # "房型/户型: ..." (room layout); group(3) holds the value.
        self.arch_pattern = re.compile(
            ur'[房户](\s| )*型[^::]*[::]\s*(<[^<>]+>\s*)*(\d[^<\s]+)[<\s]',
            re.U | re.I)
        # Page <title>; group(1) is the trimmed title text.
        self.title_pattern = re.compile(ur'<title>\s*([^<]+[^\s])\s*</title>',
                                        re.U | re.I)
        # "地址: ..." (street address); group(3) holds the value.
        self.address_pattern = re.compile(
            ur'地(\s| )*址[::]\s*(<[^<>]+>\s*)*([^<>\s]+)[<\s]',
            re.U | re.I)
        # "小区/楼盘名称: ..." (residential compound); group(5) holds the value.
        self.district_pattern = re.compile(
            ur'(小(\s| )*区(名称)?|楼盘名称)[::]\s*(<[^<>]+>\s*)*([^<>\s]+)[<\s]',
            re.U | re.I)
        # Sanity caps: extracted fields longer than these are treated as junk.
        self.max_url_length = 200
        self.max_price_length = 10
        self.max_area_length = 10
        self.max_arch_length = 20
        self.max_title_length = 100
        self.max_address_length = 100
        self.max_district_length = 20
        # NOTE(review): hard-coded database credentials; move to config.
        self.db = Connection('127.0.0.1', 'zfz', 'zfz', 'zfz...891')
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(),
                                           urllib2.HTTPRedirectHandler())
        self.opener.addheaders = [('User-agent', self.user_agent)]
        # robots.txt compliance; fetch failures are deliberately ignored
        # (crawl proceeds with no exclusion rules loaded).
        self.rerp = robotexclusionrulesparser.RobotExclusionRulesParser()
        self.rerp.user_agent = self.user_agent
        try:
            self.rerp.fetch(self.root[:self.root.find('/', 7)] + "/robots.txt")
        except:
            pass
        # Politeness delay between page fetches (seconds); grows on errors.
        self.min_delay_seconds = 120.0
        self.max_crawl_seconds_per_site = 2 * 24 * 3600  # 2 days
        # Abort the crawl after this many consecutive fetch failures.
        self.max_allowed_urlopen_error = 20
        self.current_urlopen_error = 0
        self.debug = True

    def is_valid_url(self, url):
        # Reject over-long URLs and fragment/javascript/file links.
        if len(url) > self.max_url_length:
            return False
        if url.find('#') != -1 or url.find('javascript:') != -1 or url.find(
                'file://') != -1:
            return False
        else:
            return True

    def get_all_links(self, page):
        # All href targets found in the page HTML.
        return self.link_pattern.findall(page)

    def get_price(self, page):
        # Monthly rent digits, or None if absent/over-long.
        m = self.price_pattern.search(page)
        if m == None or len(m.group(3)) > self.max_price_length:
            return None
        return m.group(3)

    def get_address(self, page):
        # Street address string, or None if absent/over-long.
        m = self.address_pattern.search(page)
        if m == None or len(m.group(3)) > self.max_address_length:
            return None
        return m.group(3)

    def get_area(self, page):
        # Floor-area number (may contain a decimal point), or None.
        m = self.area_pattern.search(page)
        if m == None or len(m.group(4)) > self.max_area_length:
            return None
        return m.group(4)

    def get_arch(self, page):
        # Room-layout string (e.g. "2室1厅"), or None.
        m = self.arch_pattern.search(page)
        if m == None or len(m.group(3)) > self.max_arch_length:
            return None
        return m.group(3)

    def get_title(self, page):
        # Page <title> text, or None.
        m = self.title_pattern.search(page)
        if m == None or len(m.group(1)) > self.max_title_length:
            return None
        return m.group(1)

    def get_district(self, page):
        # Residential-compound name, or None.
        m = self.district_pattern.search(page)
        if m == None or len(m.group(5)) > self.max_district_length:
            return None
        return m.group(5)

    def get_date(self, page):
        # Current local time as a Unix-timestamp string; `page` is unused.
        ts = str(int(time.mktime(datetime.datetime.now().timetuple())))
        return ts

    def analyse(self, page):
        # Extract all fields; any missing field aborts with None so only
        # complete listings are stored.
        title = self.get_title(page)
        if title == None:
            print 'No title'
            return None
        price = self.get_price(page)
        if price == None:
            print 'No price'
            return None
        area = self.get_area(page)
        if area == None:
            print 'No area'
            return None
        arch = self.get_arch(page)
        if arch == None:
            print 'No arch'
            return None
        address = self.get_address(page)
        if address == None:
            print 'No address'
            return None
        district = self.get_district(page)
        if district == None:
            print 'No district'
            return None
        date = self.get_date(page)
        if date == None:
            print 'Noe date'
            return None
        return [title, price, area, arch, address, district, date]

    def add_page_to_index(self, url, page):
        # Analyse the page and persist the result if complete.
        print 'Adding %s to index...' % url
        result = self.analyse(page)
        if result == None:
            return
        self.add_result_to_db(url, result)

    def add_result_to_db(self, url, result):
        # Upsert one listing row keyed by URL; area is stored as an int
        # scaled by 100 to keep two decimal places.
        print '...Adding %s to db...' % url
        for i in range(len(result)):
            result[i] = result[i].encode('utf-8')
            # NOTE(review): debug print re-inferred as inside the loop
            # (indentation lost in extraction) — confirm against VCS.
            if self.debug:
                print result[i]
        title, price, area, arch, address, district, date = result
        try:
            price = int(price)
            area = int(float(area) * 100)
        except:
            print 'price or area may not be a number.'
            return
        # Skip the write when price and area are unchanged for this URL.
        dups = self.db.query("select * from pages where url=%s limit 1", url)
        if len(dups) == 1:
            dup = dups[0]
            print dup.title, dup.price, dup.area
            print dup.arch, dup.address, dup.district, dup.date
            if price == dup.price and area == dup.area:
                print 'Info already in database.'
                return
        print 'Insert into database...'
        self.db.execute(
            "insert into pages (url, price, address, area, arch, title, district, date) "
            "values (%s, %s, %s, %s, %s, %s, %s, %s) on duplicate key update "
            "price=%s, address=%s, area=%s, arch=%s, title=%s, district=%s, date=%s",
            url, price, address, area, arch, title, district, date,
            price, address, area, arch, title, district, date)

    def get_page(self, url):
        # Fetch and decode one page; returns None (and counts the error)
        # on any failure. A success resets the consecutive-error counter.
        print 'Getting page %s...' % url
        if not url.startswith('http://'):
            print 'URL format error.'
            return None
        try:
            ans = self.opener.open(url).read().decode(self.charset)
        except:
            print 'URL open error.'
            self.current_urlopen_error += 1
            return None
        self.current_urlopen_error = 0
        return ans

    def is_allowed(self, url):
        # robots.txt check (wildcard agent).
        return self.rerp.is_allowed('*', url)

    def get_full_url(self, parent, url):
        # Resolve a possibly-relative link against its parent page URL.
        # The offset 7 skips the "http://" prefix when locating the host end.
        if parent.find('/', 7) == -1:
            parent += '/'
        if url.startswith('http'):
            ans = url
        elif url.startswith('/'):
            ans = parent[:parent.find('/', 7)] + url
        else:
            ans = parent[:parent.rfind('/')] + '/' + url
        return ans

    def crawl_web(self):
        # Frontier crawl from self.root, honouring robots.txt, sleeping
        # between fetches, and backing off / aborting as consecutive fetch
        # errors accumulate.
        tocrawl = set([self.root])
        crawled = set()
        while len(tocrawl) > 0:
            url = tocrawl.pop()
            if not self.is_allowed(url):
                print 'URL %s is not allowed.' % url
                continue
            crawled.add(url)
            page = self.get_page(url)
            if page == None:
                continue
            links = self.get_all_links(page)
            for link in links:
                #print url, link
                full_link = self.get_full_url(url, link)
                #print full_link
                if self.is_valid_url(full_link) and full_link not in crawled:
                    tocrawl.add(full_link)
            self.add_page_to_index(url, page)
            time.sleep(self.min_delay_seconds)
            if self.current_urlopen_error > self.max_allowed_urlopen_error:
                break
            if self.current_urlopen_error > self.max_allowed_urlopen_error / 2:
                time.sleep(10 * 60)  # Wait 10 minutes
                # NOTE(review): backoff growth re-inferred as inside this
                # branch (indentation lost in extraction) — confirm.
                self.min_delay_seconds *= 1.1

    def start(self):
        # Run crawl cycles forever, resting 30 minutes between passes.
        while True:
            print 'BEGIN'
            self.crawl_web()
            print 'END'
            time.sleep(30 * 60)  # Wait 30 minutes