# Hedged reconstruction of the imports this Flask view needs; ``app`` and
# the get_db_creds() helper are assumed to be defined elsewhere.
from flask import request
from tornado.database import Connection


def do_query(database=None):
    # Pick up the database credentials
    app.logger.warning("%s requesting access to %s database" %
                       (request.remote_addr, database))
    creds = get_db_creds(database)

    # If we couldn't find corresponding credentials, report the failure
    if creds is False:
        return {"ERROR": "Unable to find credentials matching %s." % database}

    # Prepare the database connection
    app.logger.debug("Connecting to %s database (%s)" %
                     (database, request.remote_addr))
    db = Connection(**creds)

    # See if we received a query
    sql = request.form.get('sql')
    if not sql:
        return {"ERROR": "SQL query missing from request."}

    # If the query has a percent sign, we need to escape it
    if '%' in sql:
        sql = sql.replace('%', '%%')

    # Attempt to run the query
    try:
        app.logger.info("%s attempting to run \" %s \" against %s database" %
                        (request.remote_addr, sql, database))
        results = db.query(sql)
    except Exception as e:
        return {"ERROR": ": ".join(str(i) for i in e.args)}
    return {'result': results}
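A client-side sketch of exercising a view like this one, assuming it is routed at something like /query/<database> (the route decorator is not shown in the snippet); the URL, port, and database name are all hypothetical:

import requests  # third-party HTTP client, not part of the snippet

# The view reads the statement from request.form, so send it as form data.
resp = requests.post('http://localhost:5000/query/inventory',
                     data={'sql': 'SELECT 1'})
print resp.json()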
# Same hedged Flask/Tornado imports and helpers as the snippet above
# (app, request, get_db_creds, Connection).
def do_query1(method=None, database=None, sql=None):
    # The query arrives URL-encoded with '+' for spaces; undo that first.
    sql = sql.replace('+', ' ')
    app.logger.info("aft.sql: %s" % sql)
    app.logger.warning("%s requesting access to %s database" % (
        request.remote_addr, database))
    creds = get_db_creds(database)
    if creds is False:
        return {"ERROR": "Unable to find credentials matching %s." % database}
    app.logger.debug("Connecting to %s database (%s)" % (
        database, request.remote_addr))
    # Escape literal percent signs so the driver doesn't read them as
    # format placeholders.
    if '%' in sql:
        sql = sql.replace('%', '%%')
    if method == 'json':
        try:
            app.logger.info("%s attempting to run \" %s \" against %s database" % (
                request.remote_addr, sql, database))
            db = Connection(**creds)
            results = db.query(sql)
            db.close()
        except Exception as e:
            results = {"ERROR": ": ".join(str(i) for i in e.args)}
        return {'result': results}
# This snippet assumes the project's handler classes and Trie are defined
# elsewhere; the imports below are a hedged reconstruction, not part of
# the original.
import os
import logging

import tornado.web
from tornado.database import Connection
from tornado.options import options


class Application(tornado.web.Application):
    def __init__(self):
        # Refer to the db with self.application.db; maintains one db
        # connection to the cPanel MySQL host.
        self.db = Connection(host="engr-cpanel-mysql.engr.illinois.edu",
                             user="******", password="******",
                             database="cubucket_db")

        # Preload every activity name into a trie (used for search).
        sql = "SELECT name FROM Activity"
        self.trie = Trie()
        results = self.db.query(sql)
        self.activityNames = {}
        trie_words = []
        for result in results:
            trie_words.append(result["name"])
        self.trie.add_token_words(*trie_words)

        # local mysql host -- will later need to change this for heroku
        #self.db = Connection(host='localhost:3306', user='******',
        #                     password='', database='cucket')

        handlers = [
            tornado.web.URLSpec(r'/', LoginHandler),
            tornado.web.URLSpec(r'/login', LoginHandler),
            tornado.web.URLSpec(r'/logout', LogoutHandler),
            tornado.web.URLSpec(r'/signup', SignupHandler),
            tornado.web.URLSpec(r'/about', AboutHandler),
            tornado.web.URLSpec(r'/activity/new', ActivityHandler),
            tornado.web.URLSpec(r'/user/([a-zA-Z0-9-_]*)', UserHandler),
            tornado.web.URLSpec(r'/home', HomeHandler),
            tornado.web.URLSpec(r'/activity/add/([0-9]+)', RatingHandler),
            tornado.web.URLSpec(r'/activity/delete/([0-9]+)', DeleteActivityHandler),
            tornado.web.URLSpec(r'/search', SearchHandler),
            tornado.web.URLSpec(r'/activity/remove/([0-9]+)', DeleteBucketActivityHandler),
            tornado.web.URLSpec(r'/top', TopHandler),
            tornado.web.URLSpec(r'/search/results', SearchResultsHandler),
            tornado.web.URLSpec(r'/activity/complete/([0-9]+)', CompleteActivityHandler),
            tornado.web.URLSpec(r'/mobile/login', MobileLoginHandler),
            tornado.web.URLSpec(r'/mobile/bucket', MobileUserBucketHandler),
            tornado.web.URLSpec(r'/mobile/complete', MobileCompleteActivityHandler),
            tornado.web.URLSpec(r'/mobile/add', MobileAddActivityHandler),
            tornado.web.URLSpec(r'/category/([a-zA-Z0-9-_]*)', CategoryHandler),
        ]
        current_dir = os.path.dirname(__file__)
        settings = dict(
            template_path=os.path.join(current_dir, 'templates'),
            static_path=os.path.join(current_dir, 'static'),
            debug=options.debug,
            autoescape='xhtml_escape',
            cookie_secret='Dxj43jWAKSag/JbQTmIbBWvpSlBkazj6YGo0A0mo5tyZkb4sTUvT3UH4GU9SXgFuy=',
            xsrf_cookies=True,
        )
        super(Application, self).__init__(handlers, **settings)
        logging.info('Server started on port {0}'.format(options.port))
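For context, a class like this is conventionally paired with the standard Tornado 2.x entry point below; a minimal sketch, assuming options.port is registered via tornado.options elsewhere in the project:

import tornado.ioloop
import tornado.options
from tornado.options import options

if __name__ == '__main__':
    tornado.options.parse_command_line()
    # Bind the application and hand control to the IOLoop (blocks forever).
    Application().listen(options.port)
    tornado.ioloop.IOLoop.instance().start()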
def index(environment=None, database=None):
    # Pick up the database credentials
    app.logger.warning("%s requesting access to %s db in %s environment" %
                       (request.remote_addr, database, environment))
    creds = get_db_creds(environment, database)

    # If we couldn't find corresponding credentials, throw a 404
    if creds is False:
        abort(404)

    # Connect to the database
    try:
        app.logger.debug("Connecting to %s db in %s environment (%s)" %
                         (database, environment, request.remote_addr))
        db = Connection(**creds)
    except Exception:
        abort(500)

    # Escape literal percent signs, then run the query
    try:
        sql = request.form['sql'].replace(r'%', r'%%')
        app.logger.info("%s attempting to run \" %s \" against %s in %s" %
                        (request.remote_addr, sql, database, environment))
        results = db.query(sql)
    except Exception as e:
        # e.args is typically the (errno, errstr) pair from the MySQL driver.
        return e.args
class EightMySQLEngine(Connection):
    """Thin MySQL backend that delegates to tornado.database.Connection.

    The tornado class is imported under an alias so it does not shadow
    the framework's own ``Connection`` base class.
    """

    def __init__(self, host, database, user=None, password=None):
        from tornado.database import Connection as MySQLEngine
        self._db = MySQLEngine(host, database, user, password)

    def close(self):
        if self._db is not None:
            self._db.close()
            self._db = None

    def iter(self, query, *parameters):
        return self._db.iter(query, *parameters)

    def query(self, query, *parameters):
        return self._db.query(query, *parameters)

    def get(self, query, *parameters):
        return self._db.get(query, *parameters)

    def execute(self, query, *parameters):
        return self._db.execute(query, *parameters)

    def executemany(self, query, *parameters):
        return self._db.executemany(query, *parameters)
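A minimal usage sketch for the wrapper above, with a hypothetical local server, credentials, and table (none of these appear in the original):

# Hypothetical connection details and schema -- adjust to your setup.
engine = EightMySQLEngine('127.0.0.1:3306', 'example_db',
                          user='example_user', password='example_pass')
for row in engine.query('SELECT id, name FROM users LIMIT %s', 10):
    print row.id, row.name  # tornado.database rows allow attribute access
engine.close()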
# -*- coding: utf-8 -*-
# Python 2 crawler for rental-listing pages. The imports below are a hedged
# reconstruction of what the snippet needs.
import re
import time
import datetime
import urllib2

import robotexclusionrulesparser
from tornado.database import Connection


class Robot:
    def __init__(self, root, charset):
        self.root = root
        self.charset = charset
        self.user_agent = 'zfz-bot/1.0'
        # Patterns for pulling listing fields out of raw rental-page HTML.
        self.link_pattern = re.compile(r'\s+href="([^\s\'">]+)"[\s>]',
                                       re.U | re.I)
        self.price_pattern = re.compile(
            ur'租(\s| )*金[^::]*[::]\s*(<[^<>]+>\s*)*(\d+)\s*(<[^<>]+>\s*)*元/月',
            re.U | re.I)
        self.area_pattern = re.compile(
            ur'(面(\s| )*积[::]\s*(<[^<>]+>\s*)*|室\s*|卫\s*|厅\s*)([\d\.]+)\s*(平米|㎡|平方米)',
            re.U | re.I)
        self.arch_pattern = re.compile(
            ur'[房户](\s| )*型[^::]*[::]\s*(<[^<>]+>\s*)*(\d[^<\s]+)[<\s]',
            re.U | re.I)
        self.title_pattern = re.compile(ur'<title>\s*([^<]+[^\s])\s*</title>',
                                        re.U | re.I)
        self.address_pattern = re.compile(
            ur'地(\s| )*址[::]\s*(<[^<>]+>\s*)*([^<>\s]+)[<\s]',
            re.U | re.I)
        self.district_pattern = re.compile(
            ur'(小(\s| )*区(名称)?|楼盘名称)[::]\s*(<[^<>]+>\s*)*([^<>\s]+)[<\s]',
            re.U | re.I)

        # Sanity limits on extracted field lengths.
        self.max_url_length = 200
        self.max_price_length = 10
        self.max_area_length = 10
        self.max_arch_length = 20
        self.max_title_length = 100
        self.max_address_length = 100
        self.max_district_length = 20

        self.db = Connection('127.0.0.1', 'zfz', 'zfz', 'zfz...891')

        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(),
                                           urllib2.HTTPRedirectHandler())
        self.opener.addheaders = [('User-agent', self.user_agent)]

        # Honour the site's robots.txt if we can fetch it.
        self.rerp = robotexclusionrulesparser.RobotExclusionRulesParser()
        self.rerp.user_agent = self.user_agent
        try:
            self.rerp.fetch(self.root[:self.root.find('/', 7)] + "/robots.txt")
        except Exception:
            pass

        self.min_delay_seconds = 120.0
        self.max_crawl_seconds_per_site = 2 * 24 * 3600  # 2 days
        self.max_allowed_urlopen_error = 20
        self.current_urlopen_error = 0
        self.debug = True

    def is_valid_url(self, url):
        if len(url) > self.max_url_length:
            return False
        if url.find('#') != -1 or url.find('javascript:') != -1 \
                or url.find('file://') != -1:
            return False
        else:
            return True

    def get_all_links(self, page):
        return self.link_pattern.findall(page)

    def get_price(self, page):
        m = self.price_pattern.search(page)
        if m is None or len(m.group(3)) > self.max_price_length:
            return None
        return m.group(3)

    def get_address(self, page):
        m = self.address_pattern.search(page)
        if m is None or len(m.group(3)) > self.max_address_length:
            return None
        return m.group(3)

    def get_area(self, page):
        m = self.area_pattern.search(page)
        if m is None or len(m.group(4)) > self.max_area_length:
            return None
        return m.group(4)

    def get_arch(self, page):
        m = self.arch_pattern.search(page)
        if m is None or len(m.group(3)) > self.max_arch_length:
            return None
        return m.group(3)

    def get_title(self, page):
        m = self.title_pattern.search(page)
        if m is None or len(m.group(1)) > self.max_title_length:
            return None
        return m.group(1)

    def get_district(self, page):
        m = self.district_pattern.search(page)
        if m is None or len(m.group(5)) > self.max_district_length:
            return None
        return m.group(5)

    def get_date(self, page):
        # Current time as a Unix-timestamp string.
        ts = str(int(time.mktime(datetime.datetime.now().timetuple())))
        return ts

    def analyse(self, page):
        title = self.get_title(page)
        if title is None:
            print 'No title'
            return None
        price = self.get_price(page)
        if price is None:
            print 'No price'
            return None
        area = self.get_area(page)
        if area is None:
            print 'No area'
            return None
        arch = self.get_arch(page)
        if arch is None:
            print 'No arch'
            return None
        address = self.get_address(page)
        if address is None:
            print 'No address'
            return None
        district = self.get_district(page)
        if district is None:
            print 'No district'
            return None
        date = self.get_date(page)
        if date is None:
            print 'No date'
            return None
        return [title, price, area, arch, address, district, date]

    def add_page_to_index(self, url, page):
        print 'Adding %s to index...' % url
        result = self.analyse(page)
        if result is None:
            return
        self.add_result_to_db(url, result)

    def add_result_to_db(self, url, result):
        print '...Adding %s to db...' % url
        for i in range(len(result)):
            result[i] = result[i].encode('utf-8')
            if self.debug:
                print result[i]
        title, price, area, arch, address, district, date = result
        try:
            price = int(price)
            area = int(float(area) * 100)  # store area in hundredths
        except ValueError:
            print 'price or area may not be a number.'
            return
        dups = self.db.query("select * from pages where url=%s limit 1", url)
        if len(dups) == 1:
            dup = dups[0]
            print dup.title, dup.price, dup.area
            print dup.arch, dup.address, dup.district, dup.date
            if price == dup.price and area == dup.area:
                print 'Info already in database.'
                return
        print 'Insert into database...'
        self.db.execute(
            "insert into pages (url, price, address, area, arch, title, district, date) "
            "values (%s, %s, %s, %s, %s, %s, %s, %s) on duplicate key update "
            "price=%s, address=%s, area=%s, arch=%s, title=%s, district=%s, date=%s",
            url, price, address, area, arch, title, district, date,
            price, address, area, arch, title, district, date)

    def get_page(self, url):
        print 'Getting page %s...' % url
        if not url.startswith('http://'):
            print 'URL format error.'
            return None
        try:
            ans = self.opener.open(url).read().decode(self.charset)
        except Exception:
            print 'URL open error.'
            self.current_urlopen_error += 1
            return None
        self.current_urlopen_error = 0
        return ans

    def is_allowed(self, url):
        return self.rerp.is_allowed('*', url)

    def get_full_url(self, parent, url):
        # Resolve ``url`` against ``parent`` (index 7 skips past 'http://').
        if parent.find('/', 7) == -1:
            parent += '/'
        if url.startswith('http'):
            ans = url
        elif url.startswith('/'):
            ans = parent[:parent.find('/', 7)] + url
        else:
            ans = parent[:parent.rfind('/')] + '/' + url
        return ans

    def crawl_web(self):
        tocrawl = set([self.root])
        crawled = set()
        while len(tocrawl) > 0:
            url = tocrawl.pop()
            if not self.is_allowed(url):
                print 'URL %s is not allowed.' % url
                continue
            crawled.add(url)
            page = self.get_page(url)
            if page is None:
                continue
            links = self.get_all_links(page)
            for link in links:
                #print url, link
                full_link = self.get_full_url(url, link)
                #print full_link
                if self.is_valid_url(full_link) and full_link not in crawled:
                    tocrawl.add(full_link)
            self.add_page_to_index(url, page)
            time.sleep(self.min_delay_seconds)
            if self.current_urlopen_error > self.max_allowed_urlopen_error:
                break
            if self.current_urlopen_error > self.max_allowed_urlopen_error / 2:
                time.sleep(10 * 60)  # Wait 10 minutes
                self.min_delay_seconds *= 1.1

    def start(self):
        while True:
            print 'BEGIN'
            self.crawl_web()
            print 'END'
            time.sleep(30 * 60)  # Wait 30 minutes
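A sketch of kicking off the crawler above; the seed URL and charset are hypothetical (sites like these commonly served GBK-encoded pages):

# Hypothetical seed URL and page charset.
bot = Robot('http://www.example.com/zufang/', 'gbk')
bot.start()  # crawls, sleeps 30 minutes, and repeats indefinitely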