def get_db():
    """Return the shared database connection, creating it on first use.

    Lazily initializes the module-level ``__db__`` singleton; later calls
    reuse the same connection.
    """
    global __db__
    if not __db__:
        # NOTE(review): masked user and password=None suggest the real
        # credentials are injected elsewhere -- confirm before deploying.
        __db__ = Connection("localhost", "chat", user="******", password=None)
        # Drop the MySQL init_command before reconnecting.  pop() with a
        # default avoids the KeyError the original `del` raised whenever
        # the key was absent.
        __db__._db_args.pop('init_command', None)
        __db__.reconnect()
    return __db__
def do_query1(method=None, database=None, sql=None):
    """Run *sql* against *database*.

    Only ``method == 'json'`` is handled in this chunk; the outcome is
    stored as ``{'result': ...}`` with errors reported in-band as an
    ``{"ERROR": ...}`` dict rather than raised.
    """
    desc = None  # unused in the visible chunk -- possibly used further down
    # '+' is the URL-encoded form of a space.
    sql = sql.replace('+', ' ')
    app.logger.info("aft.sql: %s" % sql)
    app.logger.warning("%s requesting access to %s database" % (
        request.remote_addr, database))
    creds = get_db_creds(database)
    if creds == False:
        # Report the failure in-band.  (The original followed this return
        # with an unreachable abort(404), now removed.)
        return {"ERROR": "Unable to find credentials matching %s." % database}
    app.logger.debug("Connecting to %s database (%s)" % (
        database, request.remote_addr))
    # Escape literal percent signs so the driver does not treat them as
    # parameter placeholders.
    if '%' in sql:
        sql = sql.replace('%', '%%')
    if method == 'json':
        db = None
        try:
            app.logger.info("%s attempting to run \" %s \" against %s database" % (
                request.remote_addr, sql, database))
            db = Connection(**creds)
            results = db.query(sql)
        except Exception as e:
            results = {"ERROR": ": ".join(str(i) for i in e.args)}
        finally:
            # The original closed only on success, leaking the connection
            # whenever the query raised.
            if db is not None:
                db.close()
        results = {'result': results}
def do_query(database=None):
    """Run the POSTed SQL against *database*.

    Errors are returned in-band as an ``{"ERROR": ...}`` dict rather
    than raised to the framework.
    """
    # Pick up the database credentials
    app.logger.warning("%s requesting access to %s database" % (
        request.remote_addr, database))
    creds = get_db_creds(database)
    # If we couldn't find corresponding credentials, report it in-band.
    # (The original followed this return with an unreachable abort(404),
    # now removed.)
    if creds == False:
        return {"ERROR": "Unable to find credentials matching %s." % database}
    # Prepare the database connection
    app.logger.debug("Connecting to %s database (%s)" % (
        database, request.remote_addr))
    db = Connection(**creds)
    # See if we received a query
    sql = request.form.get('sql')
    if not sql:
        return {"ERROR": "SQL query missing from request."}
    # Escape literal percent signs so the driver does not treat them as
    # parameter placeholders.
    if '%' in sql:
        sql = sql.replace('%', '%%')
    # Attempt to run the query
    try:
        app.logger.info("%s attempting to run \" %s \" against %s database" % (
            request.remote_addr, sql, database))
        results = db.query(sql)
    except Exception as e:
        return {"ERROR": ": ".join(str(i) for i in e.args)}
def do_query(database=None):
    """Run the POSTed SQL against *database*.

    Errors are returned in-band as an ``{"ERROR": ...}`` dict rather
    than raised to the framework.
    """
    # Pick up the database credentials
    app.logger.warning("%s requesting access to %s database"
                       % (request.remote_addr, database))
    creds = get_db_creds(database)
    # If we couldn't find corresponding credentials, report it in-band.
    # (The original followed this return with an unreachable abort(404),
    # now removed.)
    if creds == False:
        return {"ERROR": "Unable to find credentials matching %s." % database}
    # Prepare the database connection
    app.logger.debug("Connecting to %s database (%s)"
                     % (database, request.remote_addr))
    db = Connection(**creds)
    # See if we received a query
    sql = request.form.get('sql')
    if not sql:
        return {"ERROR": "SQL query missing from request."}
    # Escape literal percent signs so the driver does not treat them as
    # parameter placeholders.
    if '%' in sql:
        sql = sql.replace('%', '%%')
    # Attempt to run the query
    try:
        app.logger.info("%s attempting to run \" %s \" against %s database"
                        % (request.remote_addr, sql, database))
        results = db.query(sql)
    except Exception as e:
        return {"ERROR": ": ".join(str(i) for i in e.args)}
def __init__(self, root, charset):
    """Set up the crawler: field-extraction regexes, limits, DB and opener.

    root    -- crawl entry URL
    charset -- encoding used to decode fetched pages
    """
    self.root = root
    self.charset = charset
    self.user_agent = 'zfz-bot/1.0'
    # href value extractor used for link discovery.
    self.link_pattern = re.compile(r'\s+href="([^\s\'">]+)"[\s>]', re.U | re.I)
    # Listing-field extractors.  Each tolerates markup between the label
    # and the value and accepts ASCII or full-width colons.
    self.price_pattern = re.compile(
        ur'租(\s| )*金[^::]*[::]\s*(<[^<>]+>\s*)*(\d+)\s*(<[^<>]+>\s*)*元/月', re.U | re.I)
    self.area_pattern = re.compile(
        ur'(面(\s| )*积[::]\s*(<[^<>]+>\s*)*|室\s*|卫\s*|厅\s*)([\d\.]+)\s*(平米|㎡|平方米)', re.U | re.I)
    self.arch_pattern = re.compile(
        ur'[房户](\s| )*型[^::]*[::]\s*(<[^<>]+>\s*)*(\d[^<\s]+)[<\s]', re.U | re.I)
    self.title_pattern = re.compile(ur'<title>\s*([^<]+[^\s])\s*</title>', re.U | re.I)
    self.address_pattern = re.compile(
        ur'地(\s| )*址[::]\s*(<[^<>]+>\s*)*([^<>\s]+)[<\s]', re.U | re.I)
    self.district_pattern = re.compile(
        ur'(小(\s| )*区(名称)?|楼盘名称)[::]\s*(<[^<>]+>\s*)*([^<>\s]+)[<\s]', re.U | re.I)
    # Sanity limits: extracted values longer than these are rejected.
    self.max_url_length = 200
    self.max_price_length = 10
    self.max_area_length = 10
    self.max_arch_length = 20
    self.max_title_length = 100
    self.max_address_length = 100
    self.max_district_length = 20
    # NOTE(review): plain-text DB credentials in source -- move to config.
    self.db = Connection('127.0.0.1', 'zfz', 'zfz', 'zfz...891')
    self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(),
                                       urllib2.HTTPRedirectHandler())
    self.opener.addheaders = [('User-agent', self.user_agent)]
    # robots.txt support; failure to fetch it is deliberately ignored.
    self.rerp = robotexclusionrulesparser.RobotExclusionRulesParser()
    self.rerp.user_agent = self.user_agent
    try:
        self.rerp.fetch(self.root[:self.root.find('/', 7)] + "/robots.txt")
    except:
        pass
    # Politeness / failure-handling knobs.
    self.min_delay_seconds = 120.0
    self.max_crawl_seconds_per_site = 2 * 24 * 3600  # 2 days
    self.max_allowed_urlopen_error = 20
    self.current_urlopen_error = 0
    self.debug = True
def UserLogin(self, controller, request, done):
    """Look up the request's email/password pair in user_login.

    Sets ``ret`` to the count row on success; ``False`` when the lookup
    could not be performed (error is logged, not raised).
    """
    #print request
    # Extract name from the message received
    name = request.user_name
    passwd = self.enc_pass(request.user_pass)
    ret = False
    db = Connection('localhost', 'xituan-thinkphp', 'root')
    try:
        # Parameterized query: the driver escapes name/passwd, closing
        # the SQL-injection hole in the original %-formatted statement.
        query = ("select count(*) as cnt from user_login "
                 "where `user_email` = %s and `user_password` = %s")
        logging.debug(query)
        ret = db.get(query, name, passwd)
    except Exception as e:
        logging.error(e)
def UserLogin(self, controller, request, done):
    """Look up the request's email/password pair in user_login.

    Sets ``ret`` to the count row on success; ``False`` when the lookup
    could not be performed (error is logged, not raised).
    """
    #print request
    # Extract name from the message received
    name = request.user_name
    passwd = self.enc_pass(request.user_pass)
    ret = False
    db = Connection('localhost', 'xituan-thinkphp', 'root')
    try:
        # Parameterized query: the driver escapes name/passwd, closing
        # the SQL-injection hole in the original %-formatted statement.
        query = ("select count(*) as cnt from user_login "
                 "where `user_email` = %s and `user_password` = %s")
        logging.debug(query)
        ret = db.get(query, name, passwd)
    except Exception as e:
        logging.error(e)
def __init__(self):
    """Build the d3status tornado Application: settings, DB, model wiring."""
    from d3status.urls import handlers, ui_modules
    from d3status.db import Model
    settings = dict(debug=options.debug,
                    template_path=os.path.join(os.path.dirname(__file__), "templates"),
                    static_path=os.path.join(os.path.dirname(__file__), "static"),
                    login_url=options.login_url,
                    xsrf_cookies=options.xsrf_cookies,
                    cookie_secret=options.cookie_secret,
                    ui_modules=ui_modules,
                    #autoescape=None,
                    )
    # d3status db connection
    self.db = Connection(host=options.mysql["host"] + ":" + options.mysql["port"],
                         database=options.mysql["database"],
                         user=options.mysql["user"],
                         password=options.mysql["password"],
                         )
    # Register the shared connection so Model subclasses can reach it.
    Model.setup_dbs({"db": self.db})
    super(Application, self).__init__(handlers, **settings)
class Application(tornado.web.Application):
    """cubucket web app: routes, settings and the shared MySQL connection."""

    def __init__(self):
        # refer to db with self.application.db, maintains one db connection
        # cPanel mysql host
        # NOTE(review): credentials hard-coded (masked here) -- move to config.
        self.db = Connection(host="engr-cpanel-mysql.engr.illinois.edu", user="******", password="******", database="cubucket_db")
        sql = "SELECT name FROM Activity"
        # Preload every activity name into a trie (used for search/completion).
        self.trie = Trie()
        results = self.db.query(sql)
        self.activityNames = {}
        trie_words = []
        for result in results:
            trie_words.append(result["name"])
        self.trie.add_token_words(*trie_words)
        # local mysql host
        #self.db = Connection(host='localhost:3306', user='******', password='', database='cucket')
        # will later need to change this for heroku
        handlers = [
            tornado.web.URLSpec(r'/', LoginHandler),
            tornado.web.URLSpec(r'/login', LoginHandler),
            tornado.web.URLSpec(r'/logout', LogoutHandler),
            tornado.web.URLSpec(r'/signup', SignupHandler),
            tornado.web.URLSpec(r'/about', AboutHandler),
            tornado.web.URLSpec(r'/activity/new', ActivityHandler),
            tornado.web.URLSpec(r'/user/([a-zA-Z0-9-_]*)', UserHandler),
            tornado.web.URLSpec(r'/home', HomeHandler),
            tornado.web.URLSpec(r'/activity/add/([0-9]+)', RatingHandler),
            tornado.web.URLSpec(r'/activity/delete/([0-9]+)', DeleteActivityHandler),
            tornado.web.URLSpec(r'/search', SearchHandler),
            tornado.web.URLSpec(r'/activity/remove/([0-9]+)', DeleteBucketActivityHandler),
            tornado.web.URLSpec(r'/top', TopHandler),
            tornado.web.URLSpec(r'/search/results', SearchResultsHandler),
            tornado.web.URLSpec(r'/activity/complete/([0-9]+)', CompleteActivityHandler),
            tornado.web.URLSpec(r'/mobile/login', MobileLoginHandler),
            tornado.web.URLSpec(r'/mobile/bucket', MobileUserBucketHandler),
            tornado.web.URLSpec(r'/mobile/complete', MobileCompleteActivityHandler),
            tornado.web.URLSpec(r'/mobile/add', MobileAddActivityHandler),
            tornado.web.URLSpec(r'/category/([a-zA-Z0-9-_]*)', CategoryHandler)
        ]
        current_dir = os.path.dirname(__file__)
        settings = dict(
            template_path=os.path.join(current_dir, 'templates'),
            static_path=os.path.join(current_dir, 'static'),
            debug=options.debug,
            autoescape='xhtml_escape',
            # NOTE(review): secret checked into source -- rotate and load from env.
            cookie_secret='Dxj43jWAKSag/JbQTmIbBWvpSlBkazj6YGo0A0mo5tyZkb4sTUvT3UH4GU9SXgFuy=',
            xsrf_cookies='True'
        )
        super(Application, self).__init__(handlers, **settings)
        logging.info('Server started on port {0}'.format(options.port))
def db(self):
    """Lazily create and cache the uploadr database connection.

    Bug fix: the original tested ``hasattr(self, "_db")`` but stored the
    connection as ``_db_connection``, so the cache never hit and a fresh
    Connection was built on every access.  The attribute names now match.
    """
    # Todo: Get from config
    if not hasattr(self, "_db_connection"):
        self._db_connection = Connection(host="localhost",
                                         database="uploadr",
                                         user="******")
    return self._db_connection
def connect():
    """Open the shared MySQL connection and schedule keep-alive pings."""
    address = options.mysql["host"] + ":" + options.mysql["port"]
    conn.mysql = Connection(
        host=address,
        database=options.mysql["database"],
        user=options.mysql["user"],
        password=options.mysql["password"])
    # ping db periodically to avoid mysql go away
    interval_ms = int(options.mysql["recycle"]) * 1000
    PeriodicCallback(_ping_db, interval_ms).start()
def index(environment=None, database=None):
    """Run the POSTed SQL against *database* in *environment*.

    Returns the query results on success, or the exception's args
    (typically an (errno, errstr) pair for DB errors) on failure.
    """
    # Pick up the database credentials
    app.logger.warning("%s requesting access to %s db in %s environment"
                       % (request.remote_addr, database, environment))
    creds = get_db_creds(environment, database)
    # If we couldn't find corresponding credentials, throw a 404
    if creds == False:
        abort(404)
    # Connect to the database and run the query
    try:
        app.logger.debug("Connecting to %s db in %s environment (%s)"
                         % (database, environment, request.remote_addr))
        db = Connection(**creds)
    except Exception:
        # Narrowed from a bare `except:` so SystemExit etc. still propagate.
        abort(500)
    try:
        # Escape literal percent signs so the driver does not treat them
        # as parameter placeholders.
        sql = request.form['sql'].replace(r'%', r'%%')
        app.logger.info("%s attempting to run \" %s \" against %s in %s"
                        % (request.remote_addr, sql, database, environment))
        results = db.query(sql)
    except Exception as e:
        # The original `except Exception as (errno, errstr)` tuple-unpack
        # raised a secondary TypeError for any exception without exactly
        # two args (e.g. the KeyError from a missing 'sql' form field).
        # e.args preserves the (errno, errstr) shape for DB errors.
        return e.args
def __init__(self, root, charset):
    """Set up the crawler: field-extraction regexes, limits, DB and opener.

    root    -- crawl entry URL
    charset -- encoding used to decode fetched pages
    """
    self.root = root
    self.charset = charset
    self.user_agent = 'zfz-bot/1.0'
    # href value extractor used for link discovery.
    self.link_pattern = re.compile(r'\s+href="([^\s\'">]+)"[\s>]', re.U | re.I)
    # Listing-field extractors.  Each tolerates markup between the label
    # and the value and accepts ASCII or full-width colons.
    self.price_pattern = re.compile(ur'租(\s| )*金[^::]*[::]\s*(<[^<>]+>\s*)*(\d+)\s*(<[^<>]+>\s*)*元/月', re.U | re.I)
    self.area_pattern = re.compile(ur'(面(\s| )*积[::]\s*(<[^<>]+>\s*)*|室\s*|卫\s*|厅\s*)([\d\.]+)\s*(平米|㎡|平方米)', re.U | re.I)
    self.arch_pattern = re.compile(ur'[房户](\s| )*型[^::]*[::]\s*(<[^<>]+>\s*)*(\d[^<\s]+)[<\s]', re.U | re.I)
    self.title_pattern = re.compile(ur'<title>\s*([^<]+[^\s])\s*</title>', re.U | re.I)
    self.address_pattern = re.compile(ur'地(\s| )*址[::]\s*(<[^<>]+>\s*)*([^<>\s]+)[<\s]', re.U | re.I)
    self.district_pattern = re.compile(ur'(小(\s| )*区(名称)?|楼盘名称)[::]\s*(<[^<>]+>\s*)*([^<>\s]+)[<\s]', re.U | re.I)
    # Sanity limits: extracted values longer than these are rejected.
    self.max_url_length = 200
    self.max_price_length = 10
    self.max_area_length = 10
    self.max_arch_length = 20
    self.max_title_length = 100
    self.max_address_length = 100
    self.max_district_length = 20
    # NOTE(review): plain-text DB credentials in source -- move to config.
    self.db = Connection('127.0.0.1', 'zfz', 'zfz', 'zfz...891')
    self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(), urllib2.HTTPRedirectHandler())
    self.opener.addheaders = [('User-agent', self.user_agent)]
    # robots.txt support; failure to fetch it is deliberately ignored.
    self.rerp = robotexclusionrulesparser.RobotExclusionRulesParser()
    self.rerp.user_agent = self.user_agent
    try:
        self.rerp.fetch(self.root[:self.root.find('/', 7)] + "/robots.txt")
    except:
        pass
    # Politeness / failure-handling knobs.
    self.min_delay_seconds = 120.0
    self.max_crawl_seconds_per_site = 2 * 24 * 3600  # 2 days
    self.max_allowed_urlopen_error = 20
    self.current_urlopen_error = 0
    self.debug = True
class EightMySQLEngine(Connection):
    """Engine that delegates every operation to a wrapped tornado
    MySQL connection rather than using the base class directly."""

    def __init__(self, host, database, user=None, password=None):
        # Imported lazily so tornado is only required when an engine
        # is actually instantiated.
        from tornado.database import Connection as MySQLEngine
        self._db = MySQLEngine(host, database, user, password)

    def close(self):
        """Release the underlying connection, if still open."""
        db, self._db = self._db, None
        if db is not None:
            db.close()

    def iter(self, query, *parameters):
        return self._db.iter(query, *parameters)

    def query(self, query, *parameters):
        return self._db.query(query, *parameters)

    def get(self, query, *parameters):
        return self._db.get(query, *parameters)

    def execute(self, query, *parameters):
        return self._db.execute(query, *parameters)

    def executemany(self, query, *parameters):
        return self._db.executemany(query, *parameters)
def __init__(self):
    """Wire up the cubucket application: DB, search trie, routes, settings."""
    # refer to db with self.application.db, maintains one db connection
    # cPanel mysql host
    # NOTE(review): credentials hard-coded (masked here) -- move to config.
    self.db = Connection(host="engr-cpanel-mysql.engr.illinois.edu", user="******", password="******", database="cubucket_db")
    sql = "SELECT name FROM Activity"
    # Preload every activity name into a trie (used for search/completion).
    self.trie = Trie()
    results = self.db.query(sql)
    self.activityNames = {}
    trie_words = []
    for result in results:
        trie_words.append(result["name"])
    self.trie.add_token_words(*trie_words)
    # local mysql host
    #self.db = Connection(host='localhost:3306', user='******', password='', database='cucket')
    # will later need to change this for heroku
    handlers = [
        tornado.web.URLSpec(r'/', LoginHandler),
        tornado.web.URLSpec(r'/login', LoginHandler),
        tornado.web.URLSpec(r'/logout', LogoutHandler),
        tornado.web.URLSpec(r'/signup', SignupHandler),
        tornado.web.URLSpec(r'/about', AboutHandler),
        tornado.web.URLSpec(r'/activity/new', ActivityHandler),
        tornado.web.URLSpec(r'/user/([a-zA-Z0-9-_]*)', UserHandler),
        tornado.web.URLSpec(r'/home', HomeHandler),
        tornado.web.URLSpec(r'/activity/add/([0-9]+)', RatingHandler),
        tornado.web.URLSpec(r'/activity/delete/([0-9]+)', DeleteActivityHandler),
        tornado.web.URLSpec(r'/search', SearchHandler),
        tornado.web.URLSpec(r'/activity/remove/([0-9]+)', DeleteBucketActivityHandler),
        tornado.web.URLSpec(r'/top', TopHandler),
        tornado.web.URLSpec(r'/search/results', SearchResultsHandler),
        tornado.web.URLSpec(r'/activity/complete/([0-9]+)', CompleteActivityHandler),
        tornado.web.URLSpec(r'/mobile/login', MobileLoginHandler),
        tornado.web.URLSpec(r'/mobile/bucket', MobileUserBucketHandler),
        tornado.web.URLSpec(r'/mobile/complete', MobileCompleteActivityHandler),
        tornado.web.URLSpec(r'/mobile/add', MobileAddActivityHandler),
        tornado.web.URLSpec(r'/category/([a-zA-Z0-9-_]*)', CategoryHandler)
    ]
    current_dir = os.path.dirname(__file__)
    settings = dict(
        template_path=os.path.join(current_dir, 'templates'),
        static_path=os.path.join(current_dir, 'static'),
        debug=options.debug,
        autoescape='xhtml_escape',
        # NOTE(review): secret checked into source -- rotate and load from env.
        cookie_secret='Dxj43jWAKSag/JbQTmIbBWvpSlBkazj6YGo0A0mo5tyZkb4sTUvT3UH4GU9SXgFuy=',
        xsrf_cookies='True')
    super(Application, self).__init__(handlers, **settings)
    logging.info('Server started on port {0}'.format(options.port))
from tornado.database import Connection from urlparse import urlparse import sys # This file is a script that setups the database for cucket if the tables don't already exist #DATABASE_URL = sys.argv[1] #url = urlparse(DATABASE_URL) #db = Connection(host=url.hostname, user=url.username, password=url.password, database=url.path[1:]) db = Connection(host="engr-cpanel-mysql.engr.illinois.edu", user="******", password="******", database="cubucket_db") #db = Connection(host='localhost:3306', user='******', password='', database='cucket') # will later need to change this for heroku # Drop the existing tables """Comment this out so that we don't lose data from now on tables = ['UserCompleted', 'UserInterest', 'Category', 'Activity', 'User'] for table in tables: sql = "DROP TABLE IF EXISTS `{0}`".format(table) db.execute(sql) """ # Create the User table sql = """CREATE TABLE IF NOT EXISTS User(\ name varchar(15) NOT NULL PRIMARY KEY,\ password varchar(100) NOT NULL\ );""" db.execute(sql) # Create the Activity table sql = """CREATE TABLE IF NOT EXISTS Activity(\
from tornado.options import options
from tornado.database import Connection

from d3status.libs.options import parse_options

# Options must be parsed before the modules below read them at import time.
parse_options()

from d3status.db import Model
from d3status.db import load_model
from d3status.mail import send_email
from d3status.tasks import status_tasks

# db connection
db = Connection(host=options.mysql["host"] + ":" + options.mysql["port"],
                database=options.mysql["database"],
                user=options.mysql["user"],
                password=options.mysql["password"],
                )
# Register the shared connection so Model subclasses can reach it.
Model.setup_dbs({"db": db})


def update_server_status():
    """Fetch the D3 server-status page and record any status change.

    NOTE(review): HTTPRequest/HTTPClient/_parse_server_status are presumably
    imported/defined elsewhere in the full file -- confirm.
    """
    url = options.d3_server_status_url
    req = HTTPRequest(url=url)
    client = HTTPClient()
    response = client.fetch(req)
    if response.code == 200:
        status = _parse_server_status(response.body)
        changed_status = load_model("status").update_status(status)
prompt = """\ The script will now try to connect to... database: '%s' on host: '%s' using user: '******' """ % (database, host, user) print prompt host, db, user, password = get_db_credentials() print_connection_prompt(db, host, user) sure = raw_input('Are you sure? (yes/no) ') if sure in ('yes', 'Yes', 'y', 'Y'): db = Connection(host=host, database=db, user=user, password=password) else: print "Operation aborted." sys.exit(1) # Drop existing tables cmd = """\ DROP TABLE IF EXISTS `RelatesTo`;\ DROP TABLE IF EXISTS `SpecializesIn`;\ DROP TABLE IF EXISTS `Interests`;\ DROP TABLE IF EXISTS `FitnessTopics`;\ DROP TABLE IF EXISTS `Answer`;\ DROP TABLE IF EXISTS `Question`;\ DROP TABLE IF EXISTS `Trainer`;\ DROP TABLE IF EXISTS `User`;\ """
class Robot:
    """Crawler for rental-listing pages: fetches pages, extracts listing
    fields with regexes, and upserts the results into a MySQL table."""

    def __init__(self, root, charset):
        """root -- crawl entry URL; charset -- page decoding encoding."""
        self.root = root
        self.charset = charset
        self.user_agent = 'zfz-bot/1.0'
        # href value extractor used for link discovery.
        self.link_pattern = re.compile(r'\s+href="([^\s\'">]+)"[\s>]', re.U | re.I)
        # Listing-field extractors.  Each tolerates markup between the label
        # and the value and accepts ASCII or full-width colons.
        self.price_pattern = re.compile(ur'租(\s| )*金[^::]*[::]\s*(<[^<>]+>\s*)*(\d+)\s*(<[^<>]+>\s*)*元/月', re.U | re.I)
        self.area_pattern = re.compile(ur'(面(\s| )*积[::]\s*(<[^<>]+>\s*)*|室\s*|卫\s*|厅\s*)([\d\.]+)\s*(平米|㎡|平方米)', re.U | re.I)
        self.arch_pattern = re.compile(ur'[房户](\s| )*型[^::]*[::]\s*(<[^<>]+>\s*)*(\d[^<\s]+)[<\s]', re.U | re.I)
        self.title_pattern = re.compile(ur'<title>\s*([^<]+[^\s])\s*</title>', re.U | re.I)
        self.address_pattern = re.compile(ur'地(\s| )*址[::]\s*(<[^<>]+>\s*)*([^<>\s]+)[<\s]', re.U | re.I)
        self.district_pattern = re.compile(ur'(小(\s| )*区(名称)?|楼盘名称)[::]\s*(<[^<>]+>\s*)*([^<>\s]+)[<\s]', re.U | re.I)
        # Sanity limits: extracted values longer than these are rejected.
        self.max_url_length = 200
        self.max_price_length = 10
        self.max_area_length = 10
        self.max_arch_length = 20
        self.max_title_length = 100
        self.max_address_length = 100
        self.max_district_length = 20
        # NOTE(review): plain-text DB credentials in source -- move to config.
        self.db = Connection('127.0.0.1', 'zfz', 'zfz', 'zfz...891')
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(), urllib2.HTTPRedirectHandler())
        self.opener.addheaders = [('User-agent', self.user_agent)]
        # robots.txt support; failure to fetch it is deliberately ignored.
        self.rerp = robotexclusionrulesparser.RobotExclusionRulesParser()
        self.rerp.user_agent = self.user_agent
        try:
            self.rerp.fetch(self.root[:self.root.find('/', 7)] + "/robots.txt")
        except:
            pass
        # Politeness / failure-handling knobs.
        self.min_delay_seconds = 120.0
        self.max_crawl_seconds_per_site = 2 * 24 * 3600  # 2 days
        self.max_allowed_urlopen_error = 20
        self.current_urlopen_error = 0
        self.debug = True

    def is_valid_url(self, url):
        """Reject over-long URLs and fragment/javascript/file links."""
        if len(url) > self.max_url_length:
            return False
        if url.find('#') != -1 or url.find('javascript:') != -1 or url.find('file://') != -1:
            return False
        else:
            return True

    def get_all_links(self, page):
        """Return every href value found in *page*."""
        return self.link_pattern.findall(page)

    def get_price(self, page):
        """Extract the monthly rent, or None if absent/over-long."""
        m = self.price_pattern.search(page)
        if m == None or len(m.group(3)) > self.max_price_length:
            return None
        return m.group(3)

    def get_address(self, page):
        """Extract the address, or None if absent/over-long."""
        m = self.address_pattern.search(page)
        if m == None or len(m.group(3)) > self.max_address_length:
            return None
        return m.group(3)

    def get_area(self, page):
        """Extract the floor area, or None if absent/over-long."""
        m = self.area_pattern.search(page)
        if m == None or len(m.group(4)) > self.max_area_length:
            return None
        return m.group(4)

    def get_arch(self, page):
        """Extract the layout/floor-plan description, or None."""
        m = self.arch_pattern.search(page)
        if m == None or len(m.group(3)) > self.max_arch_length:
            return None
        return m.group(3)

    def get_title(self, page):
        """Extract the page <title>, or None if absent/over-long."""
        m = self.title_pattern.search(page)
        if m == None or len(m.group(1)) > self.max_title_length:
            return None
        return m.group(1)

    def get_district(self, page):
        """Extract the housing-estate/district name, or None."""
        m = self.district_pattern.search(page)
        if m == None or len(m.group(5)) > self.max_district_length:
            return None
        return m.group(5)

    def get_date(self, page):
        """Return the current unix timestamp as a string (crawl date)."""
        ts = str(int(time.mktime(datetime.datetime.now().timetuple())))
        return ts

    def analyse(self, page):
        """Extract all listing fields; None if any required field is missing."""
        title = self.get_title(page)
        if title == None:
            print 'No title'
            return None
        price = self.get_price(page)
        if price == None:
            print 'No price'
            return None
        area = self.get_area(page)
        if area == None:
            print 'No area'
            return None
        arch = self.get_arch(page)
        if arch == None:
            print 'No arch'
            return None
        address = self.get_address(page)
        if address == None:
            print 'No address'
            return None
        district = self.get_district(page)
        if district == None:
            print 'No district'
            return None
        date = self.get_date(page)
        if date == None:
            print 'Noe date'
            return None
        return [title, price, area, arch, address, district, date]

    def add_page_to_index(self, url, page):
        """Analyse *page* and persist the extracted listing for *url*."""
        print 'Adding %s to index...' % url
        result = self.analyse(page)
        if result == None:
            return
        self.add_result_to_db(url, result)

    def add_result_to_db(self, url, result):
        """Upsert an extracted listing; skips exact price/area duplicates."""
        print '...Adding %s to db...' % url
        for i in range(len(result)):
            result[i] = result[i].encode('utf-8')
            if self.debug:
                print result[i]
        title, price, area, arch, address, district, date = result
        try:
            price = int(price)
            # Area is stored as an integer in hundredths of a square metre.
            area = int(float(area) * 100)
        except:
            print 'price or area may not be a number.'
            return
        dups = self.db.query("select * from pages where url=%s limit 1", url)
        if len(dups) == 1:
            dup = dups[0]
            print dup.title, dup.price, dup.area
            print dup.arch, dup.address, dup.district, dup.date
            if price == dup.price and area == dup.area:
                print 'Info already in database.'
                return
        print 'Insert into database...'
        self.db.execute("insert into pages (url, price, address, area, arch, title, district, date) "
                        "values (%s, %s, %s, %s, %s, %s, %s, %s) on duplicate key update "
                        "price=%s, address=%s, area=%s, arch=%s, title=%s, district=%s, date=%s",
                        url, price, address, area, arch, title, district, date,
                        price, address, area, arch, title, district, date)

    def get_page(self, url):
        """Fetch *url*; return the decoded body, or None on any failure."""
        print 'Getting page %s...' % url
        if not url.startswith('http://'):
            print 'URL format error.'
            return None
        try:
            ans = self.opener.open(url).read().decode(self.charset)
        except:
            print 'URL open error.'
            # Consecutive failures are counted so crawl_web can back off.
            self.current_urlopen_error += 1
            return None
        self.current_urlopen_error = 0
        return ans

    def is_allowed(self, url):
        """robots.txt check for the wildcard user agent."""
        return self.rerp.is_allowed('*', url)

    def get_full_url(self, parent, url):
        """Resolve *url* (absolute, root-relative or relative) against *parent*."""
        if parent.find('/', 7) == -1:
            parent += '/'
        if url.startswith('http'):
            ans = url
        elif url.startswith('/'):
            ans = parent[:parent.find('/', 7)] + url
        else:
            ans = parent[:parent.rfind('/')] + '/' + url
        return ans

    def crawl_web(self):
        """Crawl from self.root, indexing each allowed page politely."""
        tocrawl = set([self.root])
        crawled = set()
        while len(tocrawl) > 0:
            url = tocrawl.pop()
            if not self.is_allowed(url):
                print 'URL %s is not allowed.' % url
                continue
            crawled.add(url)
            page = self.get_page(url)
            if page == None:
                continue
            links = self.get_all_links(page)
            for link in links:
                #print url, link
                full_link = self.get_full_url(url, link)
                #print full_link
                if self.is_valid_url(full_link) and full_link not in crawled:
                    tocrawl.add(full_link)
            self.add_page_to_index(url, page)
            time.sleep(self.min_delay_seconds)
            if self.current_urlopen_error > self.max_allowed_urlopen_error:
                break
            if self.current_urlopen_error > self.max_allowed_urlopen_error / 2:
                # Too many recent failures: pause and slow the crawl down.
                time.sleep(10 * 60)  # wait 10 minutes
                self.min_delay_seconds *= 1.1

    def start(self):
        """Crawl forever, resting 30 minutes between passes."""
        while True:
            print 'BEGIN'
            self.crawl_web()
            print 'END'
            time.sleep(30 * 60)  # wait 30 minutes
def __init__(self, *args, **kwargs): Connection.__init__(self, *args, **kwargs) # MySQL connection will go away if no any queries within 8 hours by default PeriodicCallback(self._ping_db, 4 * 3600 * 1000).start()
def connect_db():
    """Open the spotifont database and attach it to the request context."""
    host = config.SPOTIFONT_DB_HOST
    name = config.SPOTIFONT_DB_NAME
    user = config.SPOTIFONT_DB_USER
    passwd = config.SPOTIFONT_DB_PASSWD
    g.db = Connection(host, name, user, passwd)
class Robot:
    """Crawler for rental-listing pages: fetches pages, extracts listing
    fields with regexes, and upserts the results into a MySQL table."""

    def __init__(self, root, charset):
        """root -- crawl entry URL; charset -- page decoding encoding."""
        self.root = root
        self.charset = charset
        self.user_agent = 'zfz-bot/1.0'
        # href value extractor used for link discovery.
        self.link_pattern = re.compile(r'\s+href="([^\s\'">]+)"[\s>]', re.U | re.I)
        # Listing-field extractors.  Each tolerates markup between the label
        # and the value and accepts ASCII or full-width colons.
        self.price_pattern = re.compile(
            ur'租(\s| )*金[^::]*[::]\s*(<[^<>]+>\s*)*(\d+)\s*(<[^<>]+>\s*)*元/月', re.U | re.I)
        self.area_pattern = re.compile(
            ur'(面(\s| )*积[::]\s*(<[^<>]+>\s*)*|室\s*|卫\s*|厅\s*)([\d\.]+)\s*(平米|㎡|平方米)', re.U | re.I)
        self.arch_pattern = re.compile(
            ur'[房户](\s| )*型[^::]*[::]\s*(<[^<>]+>\s*)*(\d[^<\s]+)[<\s]', re.U | re.I)
        self.title_pattern = re.compile(ur'<title>\s*([^<]+[^\s])\s*</title>', re.U | re.I)
        self.address_pattern = re.compile(
            ur'地(\s| )*址[::]\s*(<[^<>]+>\s*)*([^<>\s]+)[<\s]', re.U | re.I)
        self.district_pattern = re.compile(
            ur'(小(\s| )*区(名称)?|楼盘名称)[::]\s*(<[^<>]+>\s*)*([^<>\s]+)[<\s]', re.U | re.I)
        # Sanity limits: extracted values longer than these are rejected.
        self.max_url_length = 200
        self.max_price_length = 10
        self.max_area_length = 10
        self.max_arch_length = 20
        self.max_title_length = 100
        self.max_address_length = 100
        self.max_district_length = 20
        # NOTE(review): plain-text DB credentials in source -- move to config.
        self.db = Connection('127.0.0.1', 'zfz', 'zfz', 'zfz...891')
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(), urllib2.HTTPRedirectHandler())
        self.opener.addheaders = [('User-agent', self.user_agent)]
        # robots.txt support; failure to fetch it is deliberately ignored.
        self.rerp = robotexclusionrulesparser.RobotExclusionRulesParser()
        self.rerp.user_agent = self.user_agent
        try:
            self.rerp.fetch(self.root[:self.root.find('/', 7)] + "/robots.txt")
        except:
            pass
        # Politeness / failure-handling knobs.
        self.min_delay_seconds = 120.0
        self.max_crawl_seconds_per_site = 2 * 24 * 3600  # 2 days
        self.max_allowed_urlopen_error = 20
        self.current_urlopen_error = 0
        self.debug = True

    def is_valid_url(self, url):
        """Reject over-long URLs and fragment/javascript/file links."""
        if len(url) > self.max_url_length:
            return False
        if url.find('#') != -1 or url.find('javascript:') != -1 or url.find(
                'file://') != -1:
            return False
        else:
            return True

    def get_all_links(self, page):
        """Return every href value found in *page*."""
        return self.link_pattern.findall(page)

    def get_price(self, page):
        """Extract the monthly rent, or None if absent/over-long."""
        m = self.price_pattern.search(page)
        if m == None or len(m.group(3)) > self.max_price_length:
            return None
        return m.group(3)

    def get_address(self, page):
        """Extract the address, or None if absent/over-long."""
        m = self.address_pattern.search(page)
        if m == None or len(m.group(3)) > self.max_address_length:
            return None
        return m.group(3)

    def get_area(self, page):
        """Extract the floor area, or None if absent/over-long."""
        m = self.area_pattern.search(page)
        if m == None or len(m.group(4)) > self.max_area_length:
            return None
        return m.group(4)

    def get_arch(self, page):
        """Extract the layout/floor-plan description, or None."""
        m = self.arch_pattern.search(page)
        if m == None or len(m.group(3)) > self.max_arch_length:
            return None
        return m.group(3)

    def get_title(self, page):
        """Extract the page <title>, or None if absent/over-long."""
        m = self.title_pattern.search(page)
        if m == None or len(m.group(1)) > self.max_title_length:
            return None
        return m.group(1)

    def get_district(self, page):
        """Extract the housing-estate/district name, or None."""
        m = self.district_pattern.search(page)
        if m == None or len(m.group(5)) > self.max_district_length:
            return None
        return m.group(5)

    def get_date(self, page):
        """Return the current unix timestamp as a string (crawl date)."""
        ts = str(int(time.mktime(datetime.datetime.now().timetuple())))
        return ts

    def analyse(self, page):
        """Extract all listing fields; None if any required field is missing."""
        title = self.get_title(page)
        if title == None:
            print 'No title'
            return None
        price = self.get_price(page)
        if price == None:
            print 'No price'
            return None
        area = self.get_area(page)
        if area == None:
            print 'No area'
            return None
        arch = self.get_arch(page)
        if arch == None:
            print 'No arch'
            return None
        address = self.get_address(page)
        if address == None:
            print 'No address'
            return None
        district = self.get_district(page)
        if district == None:
            print 'No district'
            return None
        date = self.get_date(page)
        if date == None:
            print 'Noe date'
            return None
        return [title, price, area, arch, address, district, date]

    def add_page_to_index(self, url, page):
        """Analyse *page* and persist the extracted listing for *url*."""
        print 'Adding %s to index...' % url
        result = self.analyse(page)
        if result == None:
            return
        self.add_result_to_db(url, result)

    def add_result_to_db(self, url, result):
        """Upsert an extracted listing; skips exact price/area duplicates."""
        print '...Adding %s to db...' % url
        for i in range(len(result)):
            result[i] = result[i].encode('utf-8')
            if self.debug:
                print result[i]
        title, price, area, arch, address, district, date = result
        try:
            price = int(price)
            # Area is stored as an integer in hundredths of a square metre.
            area = int(float(area) * 100)
        except:
            print 'price or area may not be a number.'
            return
        dups = self.db.query("select * from pages where url=%s limit 1", url)
        if len(dups) == 1:
            dup = dups[0]
            print dup.title, dup.price, dup.area
            print dup.arch, dup.address, dup.district, dup.date
            if price == dup.price and area == dup.area:
                print 'Info already in database.'
                return
        print 'Insert into database...'
        self.db.execute(
            "insert into pages (url, price, address, area, arch, title, district, date) "
            "values (%s, %s, %s, %s, %s, %s, %s, %s) on duplicate key update "
            "price=%s, address=%s, area=%s, arch=%s, title=%s, district=%s, date=%s",
            url, price, address, area, arch, title, district, date,
            price, address, area, arch, title, district, date)

    def get_page(self, url):
        """Fetch *url*; return the decoded body, or None on any failure."""
        print 'Getting page %s...' % url
        if not url.startswith('http://'):
            print 'URL format error.'
            return None
        try:
            ans = self.opener.open(url).read().decode(self.charset)
        except:
            print 'URL open error.'
            # Consecutive failures are counted so crawl_web can back off.
            self.current_urlopen_error += 1
            return None
        self.current_urlopen_error = 0
        return ans

    def is_allowed(self, url):
        """robots.txt check for the wildcard user agent."""
        return self.rerp.is_allowed('*', url)

    def get_full_url(self, parent, url):
        """Resolve *url* (absolute, root-relative or relative) against *parent*."""
        if parent.find('/', 7) == -1:
            parent += '/'
        if url.startswith('http'):
            ans = url
        elif url.startswith('/'):
            ans = parent[:parent.find('/', 7)] + url
        else:
            ans = parent[:parent.rfind('/')] + '/' + url
        return ans

    def crawl_web(self):
        """Crawl from self.root, indexing each allowed page politely."""
        tocrawl = set([self.root])
        crawled = set()
        while len(tocrawl) > 0:
            url = tocrawl.pop()
            if not self.is_allowed(url):
                print 'URL %s is not allowed.' % url
                continue
            crawled.add(url)
            page = self.get_page(url)
            if page == None:
                continue
            links = self.get_all_links(page)
            for link in links:
                #print url, link
                full_link = self.get_full_url(url, link)
                #print full_link
                if self.is_valid_url(full_link) and full_link not in crawled:
                    tocrawl.add(full_link)
            self.add_page_to_index(url, page)
            time.sleep(self.min_delay_seconds)
            if self.current_urlopen_error > self.max_allowed_urlopen_error:
                break
            if self.current_urlopen_error > self.max_allowed_urlopen_error / 2:
                # Too many recent failures: pause and slow the crawl down.
                time.sleep(10 * 60)  # wait 10 minutes
                self.min_delay_seconds *= 1.1

    def start(self):
        """Crawl forever, resting 30 minutes between passes."""
        while True:
            print 'BEGIN'
            self.crawl_web()
            print 'END'
            time.sleep(30 * 60)  # wait 30 minutes
def print_connection_prompt(host, database, user): prompt = """\ The script will now try to connect to... database: '%s' on host: '%s' using user: '******' """ % (database, host, user) print prompt host, db, user, password = get_db_credentials() print_connection_prompt(db, host, user) sure = raw_input('Are you sure? (yes/no) ') if sure in ('yes', 'Yes', 'y', 'Y'): db = Connection(host=host, database=db, user=user, password=password) else: print "Operation aborted." sys.exit(1) # Create User table cmd = """\ CREATE TABLE `User` (\ `user_id` INT NOT NULL AUTO_INCREMENT,\ `user_name` VARCHAR(100) NOT NULL DEFAULT 'NULL',\ `user_email` VARCHAR(50) NULL DEFAULT NULL,\ `first_name` VARCHAR(100) NOT NULL DEFAULT 'NULL',\ `last_name` VARCHAR(100) NOT NULL DEFAULT 'NULL',\ `password` VARCHAR(1000) NOT NULL DEFAULT 'NULL',\ PRIMARY KEY (`user_id`) );\
def __init__(self, host, database, user=None, password=None):
    """Open the backing tornado MySQL connection for this engine."""
    # Lazy import keeps tornado optional until an engine is built.
    from tornado.database import Connection as MySQLEngine
    self._db = MySQLEngine(host, database, user, password)