def get_db():
    """Return the process-wide chat DB connection, creating it lazily."""
    global __db__
    if __db__:
        return __db__
    __db__ = Connection("localhost", "chat", user="******", password=None)
    # Strip the driver's default init_command before reconnecting; this
    # pokes at torndb's private _db_args dict — fragile across versions.
    del __db__._db_args['init_command']
    __db__.reconnect()
    return __db__
示例#2
0
def do_query1(method=None, database=None, sql=None):
    """Execute a caller-supplied SQL string against the named database.

    ``sql`` arrives with '+' standing in for spaces and is run verbatim —
    SQL injection is inherent to this endpoint, so it must only be exposed
    to trusted callers.  Returns a dict (with an "ERROR" key on failure)
    when ``method == 'json'``.
    """
    desc = None  # NOTE(review): unused in the visible code; kept for the unseen remainder
    sql = sql.replace('+', ' ')  # undo '+'-for-space encoding
    app.logger.info("aft.sql: %s" % sql)
    app.logger.warning("%s requesting access to %s database" % (
        request.remote_addr, database))
    creds = get_db_creds(database)
    if creds == False:
        # Credential lookup failed; report it.  (An abort(404) used to
        # follow this return — unreachable dead code, now removed.)
        return {"ERROR": "Unable to find credentials matching %s." % database}
    app.logger.debug("Connecting to %s database (%s)" % (
        database, request.remote_addr))
    # Double literal percent signs so the driver does not mistake them
    # for parameter placeholders.
    if '%' in sql:
        sql = sql.replace('%', '%%')

    if method == 'json':
        try:
            app.logger.info("%s attempting to run \" %s \" against %s database" % (
                request.remote_addr, sql, database))
            db = Connection(**creds)
            results = db.query(sql)
            db.close()
        except Exception as e:  # was Py2-only "except Exception, e"
            results = {"ERROR": ": ".join(str(i) for i in e.args)}
        results = {'result': results}
示例#3
0
def do_query(database=None):
    """Run the SQL string from the request form against ``database``.

    The query text is executed verbatim (injection is inherent here), so
    this endpoint must only be reachable by trusted callers.  Returns a
    dict with an "ERROR" key on any failure.
    """
    # Pick up the database credentials
    app.logger.warning("%s requesting access to %s database" % (
        request.remote_addr, database))
    creds = get_db_creds(database)

    # If we couldn't find corresponding credentials, report the failure.
    # (An abort(404) used to follow the return — unreachable dead code,
    # now removed.)
    if creds == False:
        return {"ERROR": "Unable to find credentials matching %s." % database}

    # Prepare the database connection
    app.logger.debug("Connecting to %s database (%s)" % (
        database, request.remote_addr))
    db = Connection(**creds)

    # See if we received a query
    sql = request.form.get('sql')
    if not sql:
        return {"ERROR": "SQL query missing from request."}

    # If the query has a percent sign, double it so the driver does not
    # treat it as a parameter placeholder.
    if '%' in sql:
        sql = sql.replace('%', '%%')

    # Attempt to run the query
    try:
        app.logger.info("%s attempting to run \" %s \" against %s database" % (
            request.remote_addr, sql, database))
        results = db.query(sql)
    except Exception as e:  # was Py2-only "except Exception, e"
        return {"ERROR": ": ".join(str(i) for i in e.args)}
    # NOTE(review): the visible snippet ends without returning `results`;
    # presumably truncated by the scrape — confirm against the original file.
示例#4
0
def do_query(database=None):
    """Run the SQL string from the request form against ``database``.

    Executes caller-supplied SQL verbatim (injection is inherent), so this
    endpoint must only be reachable by trusted callers.  Returns a dict
    with an "ERROR" key on failure.
    """
    # Pick up the database credentials
    app.logger.warning("%s requesting access to %s database" %
                       (request.remote_addr, database))
    creds = get_db_creds(database)

    # If we couldn't find corresponding credentials, report the failure.
    # (An abort(404) used to follow the return — unreachable dead code,
    # now removed.)
    if creds == False:
        return {"ERROR": "Unable to find credentials matching %s." % database}

    # Prepare the database connection
    app.logger.debug("Connecting to %s database (%s)" %
                     (database, request.remote_addr))
    db = Connection(**creds)

    # See if we received a query
    sql = request.form.get('sql')
    if not sql:
        return {"ERROR": "SQL query missing from request."}

    # If the query has a percent sign, double it so the driver does not
    # treat it as a parameter placeholder.
    if '%' in sql:
        sql = sql.replace('%', '%%')

    # Attempt to run the query
    try:
        app.logger.info("%s attempting to run \" %s \" against %s database" %
                        (request.remote_addr, sql, database))
        results = db.query(sql)
    except Exception as e:  # was Py2-only "except Exception, e"
        return {"ERROR": ": ".join(str(i) for i in e.args)}
    # NOTE(review): the visible snippet ends without returning `results`;
    # presumably truncated by the scrape — confirm against the original file.
示例#5
0
    def __init__(self, root, charset):
        """Set up the zfz rental-listing crawler for one site.

        root    -- start URL of the site to crawl
        charset -- page encoding used to decode fetched HTML
        """
        self.root = root
        self.charset = charset
        self.user_agent = 'zfz-bot/1.0'
        # Regexes that scrape listing fields (price, area, layout, title,
        # address, district) out of raw HTML.  Python 2 ur'' literals.
        self.link_pattern = re.compile(r'\s+href="([^\s\'">]+)"[\s>]',
                                       re.U | re.I)
        self.price_pattern = re.compile(
            ur'租(\s|&nbsp;)*金[^::]*[::]\s*(<[^<>]+>\s*)*(\d+)\s*(<[^<>]+>\s*)*元/月',
            re.U | re.I)
        self.area_pattern = re.compile(
            ur'(面(\s|&nbsp;)*积[::]\s*(<[^<>]+>\s*)*|室\s*|卫\s*|厅\s*)([\d\.]+)\s*(平米|㎡|平方米)',
            re.U | re.I)
        self.arch_pattern = re.compile(
            ur'[房户](\s|&nbsp;)*型[^::]*[::]\s*(<[^<>]+>\s*)*(\d[^<\s]+)[<\s]',
            re.U | re.I)
        self.title_pattern = re.compile(ur'<title>\s*([^<]+[^\s])\s*</title>',
                                        re.U | re.I)
        self.address_pattern = re.compile(
            ur'地(\s|&nbsp;)*址[::]\s*(<[^<>]+>\s*)*([^<>\s]+)[<\s]',
            re.U | re.I)
        self.district_pattern = re.compile(
            ur'(小(\s|&nbsp;)*区(名称)?|楼盘名称)[::]\s*(<[^<>]+>\s*)*([^<>\s]+)[<\s]',
            re.U | re.I)

        # Sanity limits on extracted field lengths (reject garbage matches).
        self.max_url_length = 200
        self.max_price_length = 10
        self.max_area_length = 10
        self.max_arch_length = 20
        self.max_title_length = 100
        self.max_address_length = 100
        self.max_district_length = 20

        # NOTE(review): hardcoded DB credentials — should come from config.
        self.db = Connection('127.0.0.1', 'zfz', 'zfz', 'zfz...891')
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(),
                                           urllib2.HTTPRedirectHandler())
        self.opener.addheaders = [('User-agent', self.user_agent)]

        # Honor robots.txt; a fetch failure just means "no restrictions".
        self.rerp = robotexclusionrulesparser.RobotExclusionRulesParser()
        self.rerp.user_agent = self.user_agent
        try:
            self.rerp.fetch(self.root[:self.root.find('/', 7)] + "/robots.txt")
        except:
            pass

        # Politeness delay between fetches and overall crawl budget.
        self.min_delay_seconds = 120.0
        self.max_crawl_seconds_per_site = 2 * 24 * 3600  # 2 days

        # Abort the crawl after this many consecutive urlopen failures.
        self.max_allowed_urlopen_error = 20
        self.current_urlopen_error = 0

        self.debug = True
    def UserLogin(self, controller, request, done):
        """Check the request's e-mail/password pair against user_login.

        Leaves the matching-row count (a torndb Row) in ``ret``; logs and
        swallows DB errors, leaving ``ret`` as False.
        """
        # Extract name from the message received
        name = request.user_name
        passwd = self.enc_pass(request.user_pass)
        ret = False
        db = Connection('localhost', 'xituan-thinkphp', 'root')
        try:
            # Parameterized query: the previous version interpolated the
            # user-supplied values with %, which was an SQL injection hole.
            query = ("select count(*) as cnt from user_login "
                     "where `user_email` = %s and `user_password` = %s")
            logging.debug(query)
            ret = db.get(query, name, passwd)
        except Exception as e:  # was Py2-only "except Exception, e"
            logging.error(e)
    def UserLogin(self, controller, request, done):
        """Check the request's e-mail/password pair against user_login.

        Leaves the matching-row count (a torndb Row) in ``ret``; logs and
        swallows DB errors, leaving ``ret`` as False.
        """
        # Extract name from the message received
        name = request.user_name
        passwd = self.enc_pass(request.user_pass)
        ret = False
        db = Connection('localhost', 'xituan-thinkphp', 'root')
        try:
            # Parameterized query: the previous version interpolated the
            # user-supplied values with %, which was an SQL injection hole.
            query = ("select count(*) as cnt from user_login "
                     "where `user_email` = %s and `user_password` = %s")
            logging.debug(query)
            ret = db.get(query, name, passwd)
        except Exception as e:  # was Py2-only "except Exception, e"
            logging.error(e)
示例#8
0
    def __init__(self):
        """Build the d3status Tornado application: settings, DB, routes."""
        from d3status.urls import handlers, ui_modules
        from d3status.db import Model

        here = os.path.dirname(__file__)
        settings = {
            "debug": options.debug,
            "template_path": os.path.join(here, "templates"),
            "static_path": os.path.join(here, "static"),
            "login_url": options.login_url,
            "xsrf_cookies": options.xsrf_cookies,
            "cookie_secret": options.cookie_secret,
            "ui_modules": ui_modules,
        }

        # d3status db connection; torndb expects the host as "host:port".
        mysql = options.mysql
        self.db = Connection(host=mysql["host"] + ":" + mysql["port"],
                             database=mysql["database"],
                             user=mysql["user"],
                             password=mysql["password"])

        Model.setup_dbs({"db": self.db})

        super(Application, self).__init__(handlers, **settings)
示例#9
0
class Application(tornado.web.Application):
    """Cubucket Tornado application: one shared DB connection + routes."""

    def __init__(self):

        # refer to db with self.application.db, maintains one db connection

        # cPanel mysql host
        self.db = Connection(host="engr-cpanel-mysql.engr.illinois.edu", user="******", password="******", database="cubucket_db")

        # Preload every activity name into the search trie.
        sql = "SELECT name FROM Activity"
        self.trie = Trie()
        results = self.db.query(sql)
        self.activityNames = {}

        trie_words = []
        for result in results:
            trie_words.append(result["name"])
        self.trie.add_token_words(*trie_words)

        # local mysql host
        #self.db = Connection(host='localhost:3306', user='******', password='', database='cucket')  # will later need to change this for heroku

        handlers = [
            tornado.web.URLSpec(r'/', LoginHandler),
            tornado.web.URLSpec(r'/login', LoginHandler),
            tornado.web.URLSpec(r'/logout', LogoutHandler),
            tornado.web.URLSpec(r'/signup', SignupHandler),
            tornado.web.URLSpec(r'/about', AboutHandler),
            tornado.web.URLSpec(r'/activity/new', ActivityHandler),
            tornado.web.URLSpec(r'/user/([a-zA-Z0-9-_]*)', UserHandler),
            tornado.web.URLSpec(r'/home', HomeHandler),
            tornado.web.URLSpec(r'/activity/add/([0-9]+)', RatingHandler),
            tornado.web.URLSpec(r'/activity/delete/([0-9]+)', DeleteActivityHandler),
            tornado.web.URLSpec(r'/search', SearchHandler),
            tornado.web.URLSpec(r'/activity/remove/([0-9]+)', DeleteBucketActivityHandler),
            tornado.web.URLSpec(r'/top', TopHandler),
            tornado.web.URLSpec(r'/search/results', SearchResultsHandler),
            tornado.web.URLSpec(r'/activity/complete/([0-9]+)', CompleteActivityHandler),
            tornado.web.URLSpec(r'/mobile/login', MobileLoginHandler),
            tornado.web.URLSpec(r'/mobile/bucket', MobileUserBucketHandler),
            tornado.web.URLSpec(r'/mobile/complete', MobileCompleteActivityHandler),
            tornado.web.URLSpec(r'/mobile/add', MobileAddActivityHandler),
            tornado.web.URLSpec(r'/category/([a-zA-Z0-9-_]*)', CategoryHandler)
        ]

        current_dir = os.path.dirname(__file__)

        # NOTE(review): the cookie secret is hardcoded in source; it should
        # be loaded from config/environment instead.
        settings = dict(
            template_path=os.path.join(current_dir, 'templates'),
            static_path=os.path.join(current_dir, 'static'),
            debug=options.debug,
            autoescape='xhtml_escape',
            cookie_secret='Dxj43jWAKSag/JbQTmIbBWvpSlBkazj6YGo0A0mo5tyZkb4sTUvT3UH4GU9SXgFuy=',
            # Fixed: was the string 'True', which is truthy by accident —
            # so was every other non-empty string, including 'False'.
            xsrf_cookies=True
        )

        super(Application, self).__init__(handlers, **settings)

        logging.info('Server started on port {0}'.format(options.port))
示例#10
0
文件: base.py 项目: barneyzhu/uploadr
    def db(self):
        """Lazily create and return the shared uploadr DB connection.

        Bug fix: the guard used to test ``hasattr(self, "_db")`` while the
        connection was stored as ``_db_connection``, so the attribute was
        never found and a brand-new Connection was opened on every call.
        The guard now checks the attribute that is actually set.
        """
        # Todo: Get from config
        if not hasattr(self, "_db_connection"):
            self._db_connection = Connection(host="localhost",
                                             database="uploadr",
                                             user="******")

        return self._db_connection
示例#11
0
def connect():
    """Open the shared MySQL connection and start the keep-alive timer."""
    # torndb takes the host as "host:port", so join the two options here.
    conn.mysql = Connection(
        host=options.mysql["host"] + ":" + options.mysql["port"],
        database=options.mysql["database"],
        user=options.mysql["user"],
        password=options.mysql["password"])

    # Ping the DB periodically (every `recycle` seconds) so MySQL does not
    # drop the idle connection ("MySQL server has gone away").
    PeriodicCallback(_ping_db, int(options.mysql["recycle"]) * 1000).start()
示例#12
0
def index(environment=None, database=None):
    """Run the SQL string from the request form against the named
    environment/database pair.

    Aborts with 404 when no credentials match and 500 when the connection
    fails.  The query text is executed verbatim (injection is inherent),
    so this endpoint must only be reachable by trusted callers.
    """
    # Pick up the database credentials
    app.logger.warning("%s requesting access to %s db in %s environment" % (request.remote_addr, database, environment))
    creds = get_db_creds(environment, database)

    # If we couldn't find corresponding credentials, throw a 404
    if creds == False:
        abort(404)

    # Connect to the database and run the query
    try:
        app.logger.debug("Connecting to %s db in %s environment (%s)" % (database, environment, request.remote_addr))
        db = Connection(**creds)
    except Exception:  # was a bare except: — don't swallow SystemExit etc.
        abort(500)
    try:
        # Double percent signs so the driver doesn't read placeholders.
        sql = request.form['sql'].replace(r'%', r'%%')
        app.logger.info("%s attempting to run \" %s \" against %s in %s" % (request.remote_addr, sql, database, environment))
        results = db.query(sql)
    except Exception as e:
        # Was Py2-only "except Exception as (errno, errstr)" tuple
        # unpacking; e.args carries the same (errno, errstr) pair for
        # MySQL errors.
        return e.args
示例#13
0
    def __init__(self, root, charset):
        """Set up the zfz rental-listing crawler for one site.

        root    -- start URL of the site to crawl
        charset -- page encoding used to decode fetched HTML
        """
        self.root = root
        self.charset = charset
        self.user_agent = 'zfz-bot/1.0'
        # Regexes that scrape listing fields (price, area, layout, title,
        # address, district) out of raw HTML.  Python 2 ur'' literals.
        self.link_pattern = re.compile(r'\s+href="([^\s\'">]+)"[\s>]', re.U | re.I)
        self.price_pattern = re.compile(ur'租(\s|&nbsp;)*金[^::]*[::]\s*(<[^<>]+>\s*)*(\d+)\s*(<[^<>]+>\s*)*元/月', re.U | re.I)
        self.area_pattern = re.compile(ur'(面(\s|&nbsp;)*积[::]\s*(<[^<>]+>\s*)*|室\s*|卫\s*|厅\s*)([\d\.]+)\s*(平米|㎡|平方米)', re.U | re.I)
        self.arch_pattern = re.compile(ur'[房户](\s|&nbsp;)*型[^::]*[::]\s*(<[^<>]+>\s*)*(\d[^<\s]+)[<\s]', re.U | re.I)
        self.title_pattern = re.compile(ur'<title>\s*([^<]+[^\s])\s*</title>', re.U | re.I)
        self.address_pattern = re.compile(ur'地(\s|&nbsp;)*址[::]\s*(<[^<>]+>\s*)*([^<>\s]+)[<\s]', re.U | re.I)
        self.district_pattern = re.compile(ur'(小(\s|&nbsp;)*区(名称)?|楼盘名称)[::]\s*(<[^<>]+>\s*)*([^<>\s]+)[<\s]', re.U | re.I)

        # Sanity limits on extracted field lengths (reject garbage matches).
        self.max_url_length = 200
        self.max_price_length = 10
        self.max_area_length = 10
        self.max_arch_length = 20
        self.max_title_length = 100
        self.max_address_length = 100
        self.max_district_length = 20

        # NOTE(review): hardcoded DB credentials — should come from config.
        self.db = Connection('127.0.0.1', 'zfz', 'zfz', 'zfz...891')
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(), urllib2.HTTPRedirectHandler())
        self.opener.addheaders = [('User-agent', self.user_agent)]

        # Honor robots.txt; a fetch failure just means "no restrictions".
        self.rerp = robotexclusionrulesparser.RobotExclusionRulesParser()
        self.rerp.user_agent = self.user_agent
        try:
            self.rerp.fetch(self.root[:self.root.find('/', 7)]  + "/robots.txt")
        except:
            pass

        # Politeness delay between fetches and overall crawl budget.
        self.min_delay_seconds = 120.0
        self.max_crawl_seconds_per_site = 2 * 24 * 3600 # 2 days

        # Abort the crawl after this many consecutive urlopen failures.
        self.max_allowed_urlopen_error = 20
        self.current_urlopen_error = 0

        self.debug = True
示例#14
0
class EightMySQLEngine(Connection):
    """Delegating wrapper that forwards the torndb Connection API
    (iter/query/get/execute/executemany) to an internal connection.

    Note it subclasses Connection but never calls the parent __init__;
    all work is done by the wrapped tornado.database.Connection.
    """

    def __init__(self, host, database, user=None, password=None):
        from tornado.database import Connection as MySQLEngine
        self._conn = MySQLEngine(host, database, user, password)

    def close(self):
        """Close and discard the wrapped connection (idempotent)."""
        if self._conn is not None:
            self._conn.close()
        self._conn = None

    def iter(self, query, *parameters):
        return self._conn.iter(query, *parameters)

    def query(self, query, *parameters):
        return self._conn.query(query, *parameters)

    def get(self, query, *parameters):
        return self._conn.get(query, *parameters)

    def execute(self, query, *parameters):
        return self._conn.execute(query, *parameters)

    def executemany(self, query, *parameters):
        return self._conn.executemany(query, *parameters)
示例#15
0
    def __init__(self):
        """Build the cubucket application: DB connection, search trie,
        URL routes, and Tornado settings."""

        # refer to db with self.application.db, maintains one db connection

        # cPanel mysql host
        self.db = Connection(host="engr-cpanel-mysql.engr.illinois.edu",
                             user="******",
                             password="******",
                             database="cubucket_db")

        # Preload every activity name into the search trie.
        sql = "SELECT name FROM Activity"
        self.trie = Trie()
        results = self.db.query(sql)
        self.activityNames = {}

        trie_words = []
        for result in results:
            trie_words.append(result["name"])
        self.trie.add_token_words(*trie_words)

        # local mysql host
        #self.db = Connection(host='localhost:3306', user='******', password='', database='cucket')  # will later need to change this for heroku

        handlers = [
            tornado.web.URLSpec(r'/', LoginHandler),
            tornado.web.URLSpec(r'/login', LoginHandler),
            tornado.web.URLSpec(r'/logout', LogoutHandler),
            tornado.web.URLSpec(r'/signup', SignupHandler),
            tornado.web.URLSpec(r'/about', AboutHandler),
            tornado.web.URLSpec(r'/activity/new', ActivityHandler),
            tornado.web.URLSpec(r'/user/([a-zA-Z0-9-_]*)', UserHandler),
            tornado.web.URLSpec(r'/home', HomeHandler),
            tornado.web.URLSpec(r'/activity/add/([0-9]+)', RatingHandler),
            tornado.web.URLSpec(r'/activity/delete/([0-9]+)',
                                DeleteActivityHandler),
            tornado.web.URLSpec(r'/search', SearchHandler),
            tornado.web.URLSpec(r'/activity/remove/([0-9]+)',
                                DeleteBucketActivityHandler),
            tornado.web.URLSpec(r'/top', TopHandler),
            tornado.web.URLSpec(r'/search/results', SearchResultsHandler),
            tornado.web.URLSpec(r'/activity/complete/([0-9]+)',
                                CompleteActivityHandler),
            tornado.web.URLSpec(r'/mobile/login', MobileLoginHandler),
            tornado.web.URLSpec(r'/mobile/bucket', MobileUserBucketHandler),
            tornado.web.URLSpec(r'/mobile/complete',
                                MobileCompleteActivityHandler),
            tornado.web.URLSpec(r'/mobile/add', MobileAddActivityHandler),
            tornado.web.URLSpec(r'/category/([a-zA-Z0-9-_]*)', CategoryHandler)
        ]

        current_dir = os.path.dirname(__file__)

        # NOTE(review): the cookie secret is hardcoded in source; it should
        # be loaded from config/environment instead.
        settings = dict(
            template_path=os.path.join(current_dir, 'templates'),
            static_path=os.path.join(current_dir, 'static'),
            debug=options.debug,
            autoescape='xhtml_escape',
            cookie_secret=
            'Dxj43jWAKSag/JbQTmIbBWvpSlBkazj6YGo0A0mo5tyZkb4sTUvT3UH4GU9SXgFuy=',
            # Fixed: was the string 'True', which is truthy by accident —
            # so was every other non-empty string, including 'False'.
            xsrf_cookies=True)

        super(Application, self).__init__(handlers, **settings)

        logging.info('Server started on port {0}'.format(options.port))
示例#16
0
from tornado.database import Connection
from urlparse import urlparse
import sys
# This file is a script that sets up the database for cucket if the tables don't already exist

#DATABASE_URL = sys.argv[1]
#url = urlparse(DATABASE_URL)

# NOTE(review): credentials are hardcoded (and scrubbed to ****** here);
# they should come from the environment or a config file.
#db = Connection(host=url.hostname, user=url.username, password=url.password, database=url.path[1:])
db = Connection(host="engr-cpanel-mysql.engr.illinois.edu", user="******", password="******", database="cubucket_db")
#db = Connection(host='localhost:3306', user='******', password='', database='cucket')  # will later need to change this for heroku


# Drop the existing tables
# NOTE: the triple-quoted block below is a bare string expression used as
# a block comment — it is a no-op at runtime, not executed SQL.
"""Comment this out so that we don't lose data from now on
tables = ['UserCompleted', 'UserInterest', 'Category', 'Activity', 'User']
for table in tables:
    sql = "DROP TABLE IF EXISTS `{0}`".format(table)
    db.execute(sql)
"""


# Create the User table (idempotent: IF NOT EXISTS)
sql = """CREATE TABLE IF NOT EXISTS User(\
    name varchar(15) NOT NULL PRIMARY KEY,\
    password varchar(100) NOT NULL\
);"""
db.execute(sql)

# Create the Activity table
sql = """CREATE TABLE IF NOT EXISTS Activity(\
示例#17
0
from tornado.options import options
from tornado.database import Connection

from d3status.libs.options import parse_options

parse_options()

from d3status.db import Model
from d3status.db import load_model
from d3status.mail import send_email
from d3status.tasks import status_tasks

# db connection — torndb takes the host as "host:port", so the two
# options are joined here; the connection is registered with the Model
# layer under the name "db".
db = Connection(
    host=options.mysql["host"] + ":" + options.mysql["port"],
    database=options.mysql["database"],
    user=options.mysql["user"],
    password=options.mysql["password"],
)

Model.setup_dbs({"db": db})


def update_server_status():
    """Fetch the D3 server-status page and feed it to the status model.

    Synchronous: blocks on the HTTP fetch.  NOTE(review): the visible
    snippet ends right after ``changed_status`` is assigned — presumably
    truncated by the scrape; confirm against the original file.
    """
    url = options.d3_server_status_url
    req = HTTPRequest(url=url)

    client = HTTPClient()
    response = client.fetch(req)
    if response.code == 200:
        # Parse the page and let the model record any status change.
        status = _parse_server_status(response.body)
        changed_status = load_model("status").update_status(status)
示例#18
0
文件: setup_db.py 项目: cicean/fitquo
    prompt = """\

The script will now try to connect to...
    database:   '%s'
    on host:    '%s'
    using user: '******'

""" % (database, host, user)
    print prompt


# Interactive, destructive setup: confirm with the operator before
# dropping and recreating the schema.
host, db, user, password = get_db_credentials()
print_connection_prompt(db, host, user)
sure = raw_input('Are you sure? (yes/no) ')
if sure in ('yes', 'Yes', 'y', 'Y'):
    # NOTE(review): `db` is rebound here from the database *name* (str)
    # to the Connection object — confusing but intentional in this script.
    db = Connection(host=host, database=db, user=user, password=password)
else:
    print "Operation aborted."
    sys.exit(1)

# Drop existing tables (children first, to respect foreign keys)
cmd = """\
    DROP TABLE IF EXISTS `RelatesTo`;\
    DROP TABLE IF EXISTS `SpecializesIn`;\
    DROP TABLE IF EXISTS `Interests`;\
    DROP TABLE IF EXISTS `FitnessTopics`;\
    DROP TABLE IF EXISTS `Answer`;\
    DROP TABLE IF EXISTS `Question`;\
    DROP TABLE IF EXISTS `Trainer`;\
    DROP TABLE IF EXISTS `User`;\
"""
示例#19
0
class Robot:
    """Polite single-threaded crawler that scrapes rental-listing fields
    (price, area, layout, address, district) from a site and upserts them
    into the `pages` MySQL table."""

    def __init__(self, root, charset):
        """root: start URL; charset: encoding used to decode pages."""
        self.root = root
        self.charset = charset
        self.user_agent = 'zfz-bot/1.0'
        # Field-extraction regexes over raw HTML (Python 2 ur'' literals).
        self.link_pattern = re.compile(r'\s+href="([^\s\'">]+)"[\s>]', re.U | re.I)
        self.price_pattern = re.compile(ur'租(\s|&nbsp;)*金[^::]*[::]\s*(<[^<>]+>\s*)*(\d+)\s*(<[^<>]+>\s*)*元/月', re.U | re.I)
        self.area_pattern = re.compile(ur'(面(\s|&nbsp;)*积[::]\s*(<[^<>]+>\s*)*|室\s*|卫\s*|厅\s*)([\d\.]+)\s*(平米|㎡|平方米)', re.U | re.I)
        self.arch_pattern = re.compile(ur'[房户](\s|&nbsp;)*型[^::]*[::]\s*(<[^<>]+>\s*)*(\d[^<\s]+)[<\s]', re.U | re.I)
        self.title_pattern = re.compile(ur'<title>\s*([^<]+[^\s])\s*</title>', re.U | re.I)
        self.address_pattern = re.compile(ur'地(\s|&nbsp;)*址[::]\s*(<[^<>]+>\s*)*([^<>\s]+)[<\s]', re.U | re.I)
        self.district_pattern = re.compile(ur'(小(\s|&nbsp;)*区(名称)?|楼盘名称)[::]\s*(<[^<>]+>\s*)*([^<>\s]+)[<\s]', re.U | re.I)

        # Sanity limits on extracted field lengths (reject garbage matches).
        self.max_url_length = 200
        self.max_price_length = 10
        self.max_area_length = 10
        self.max_arch_length = 20
        self.max_title_length = 100
        self.max_address_length = 100
        self.max_district_length = 20

        # NOTE(review): hardcoded DB credentials — should come from config.
        self.db = Connection('127.0.0.1', 'zfz', 'zfz', 'zfz...891')
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(), urllib2.HTTPRedirectHandler())
        self.opener.addheaders = [('User-agent', self.user_agent)]

        # Honor robots.txt; a fetch failure just means "no restrictions".
        self.rerp = robotexclusionrulesparser.RobotExclusionRulesParser()
        self.rerp.user_agent = self.user_agent
        try:
            self.rerp.fetch(self.root[:self.root.find('/', 7)]  + "/robots.txt")
        except:
            pass

        # Politeness delay between fetches and overall crawl budget.
        self.min_delay_seconds = 120.0
        self.max_crawl_seconds_per_site = 2 * 24 * 3600 # 2 days

        # Abort the crawl after this many consecutive urlopen failures.
        self.max_allowed_urlopen_error = 20
        self.current_urlopen_error = 0

        self.debug = True

    def is_valid_url(self, url):
        """Reject over-long URLs, fragments, javascript: and file: links."""
        if len(url) > self.max_url_length:
            return False
        if url.find('#') != -1 or url.find('javascript:') != -1 or url.find('file://') != -1:
            return False
        else:
            return True

    def get_all_links(self, page):
        """Return every href target found in the page HTML."""
        return self.link_pattern.findall(page)

    def get_price(self, page):
        """Extract the monthly rent, or None if absent/implausibly long."""
        m = self.price_pattern.search(page)
        if m == None or len(m.group(3)) > self.max_price_length:
            return None
        return m.group(3)

    def get_address(self, page):
        """Extract the street address, or None if absent/too long."""
        m = self.address_pattern.search(page)
        if m == None or len(m.group(3)) > self.max_address_length:
            return None
        return m.group(3)

    def get_area(self, page):
        """Extract the floor area (group 4 of the pattern), or None."""
        m = self.area_pattern.search(page)
        if m == None or len(m.group(4)) > self.max_area_length:
            return None
        return m.group(4)

    def get_arch(self, page):
        """Extract the room layout ("户型"), or None."""
        m = self.arch_pattern.search(page)
        if m == None or len(m.group(3)) > self.max_arch_length:
            return None
        return m.group(3)

    def get_title(self, page):
        """Extract the page <title>, or None."""
        m = self.title_pattern.search(page)
        if m == None or len(m.group(1)) > self.max_title_length:
            return None
        return m.group(1)

    def get_district(self, page):
        """Extract the residential-district name (group 5), or None."""
        m = self.district_pattern.search(page)
        if m == None or len(m.group(5)) > self.max_district_length:
            return None
        return m.group(5)

    def get_date(self, page):
        """Return the current time as a Unix-timestamp string.

        The `page` argument is ignored; kept for interface symmetry with
        the other get_* extractors.
        """
        ts = str(int(time.mktime(datetime.datetime.now().timetuple())))
        return ts

    def analyse(self, page):
        """Extract all listing fields; None if any required field is missing."""
        title = self.get_title(page)
        if title == None:
            print 'No title'
            return None
        price = self.get_price(page)
        if price == None:
            print 'No price'
            return None
        area = self.get_area(page)
        if area == None:
            print 'No area'
            return None
        arch = self.get_arch(page)
        if arch == None:
            print 'No arch'
            return None
        address = self.get_address(page)
        if address == None:
            print 'No address'
            return None
        district = self.get_district(page)
        if district == None:
            print 'No district'
            return None
        date = self.get_date(page)
        if date == None:
            print 'Noe date'
            return None

        return [title, price, area, arch, address, district, date]

    def add_page_to_index(self, url, page):
        """Analyse the page and store the result if extraction succeeded."""
        print 'Adding %s to index...' % url
        result = self.analyse(page)

        if result == None:
            return

        self.add_result_to_db(url, result)

    def add_result_to_db(self, url, result):
        """Upsert one listing row; skips if price+area already match."""
        print '...Adding %s to db...' % url
        for i in range(len(result)):
            result[i] = result[i].encode('utf-8')
            if self.debug:
                print result[i]

        title, price, area, arch, address, district, date = result

        try:
            # area is stored as an int in hundredths of a square metre.
            price = int(price)
            area = int(float(area) * 100)
        except:
            print 'price or area may not be a number.'
            return

        dups = self.db.query("select * from pages where url=%s limit 1", url)
        if len(dups) == 1:
            dup = dups[0]
            print dup.title, dup.price, dup.area
            print dup.arch, dup.address, dup.district, dup.date

            if price == dup.price and area == dup.area:
                print 'Info already in database.'
                return

        print 'Insert into database...'

        self.db.execute("insert into pages (url, price, address, area, arch, title, district, date) "
                 "values (%s, %s, %s, %s, %s, %s, %s, %s) on duplicate key update "
                 "price=%s, address=%s, area=%s, arch=%s, title=%s, district=%s, date=%s",
                 url, price, address, area, arch, title, district, date,
                 price, address, area, arch, title, district, date)

    def get_page(self, url):
        """Fetch and decode one page; None on bad URL or fetch error."""
        print 'Getting page %s...' % url

        if not url.startswith('http://'):
            print 'URL format error.'
            return None

        try:
            ans = self.opener.open(url).read().decode(self.charset)
        except:
            print 'URL open error.'
            self.current_urlopen_error += 1
            return None

        # Successful fetch resets the consecutive-error counter.
        self.current_urlopen_error = 0
        return ans

    def is_allowed(self, url):
        """robots.txt check (wildcard agent)."""
        return self.rerp.is_allowed('*', url)

    def get_full_url(self, parent, url):
        """Resolve a (possibly relative) link against its parent URL.

        The magic index 7 skips past "http://" when locating the host part.
        """
        if parent.find('/', 7) == -1:
            parent += '/'
        if url.startswith('http'):
            ans = url
        elif url.startswith('/'):
            ans = parent[:parent.find('/', 7)] + url
        else:
            ans = parent[:parent.rfind('/')] + '/' + url
        return ans

    def crawl_web(self):
        """BFS-style crawl from self.root, indexing each allowed page.

        Backs off (longer delays) when fetch errors accumulate and stops
        entirely past max_allowed_urlopen_error consecutive failures.
        """
        tocrawl = set([self.root])
        crawled = set()

        while len(tocrawl) > 0:
            url = tocrawl.pop()
            if not self.is_allowed(url):
                print 'URL %s is not allowed.' % url
                continue

            crawled.add(url)
            page = self.get_page(url)
            if page == None:
                continue

            links = self.get_all_links(page)

            for link in links:
                #print url, link
                full_link = self.get_full_url(url, link)
                #print full_link
                if self.is_valid_url(full_link) and full_link not in crawled:
                    tocrawl.add(full_link)

            self.add_page_to_index(url, page)

            time.sleep(self.min_delay_seconds)

            if self.current_urlopen_error > self.max_allowed_urlopen_error:
                break

            if self.current_urlopen_error > self.max_allowed_urlopen_error / 2:
                time.sleep(10 * 60) # Wait 10 minutes
                self.min_delay_seconds *= 1.1



    def start(self):
        """Crawl forever, re-walking the site with a pause between passes."""
        while True:
            print 'BEGIN'
            self.crawl_web()
            print 'END'
            time.sleep(30 * 60) # Wait 30 minutes
示例#20
0
 def __init__(self, *args, **kwargs):
     """Open the DB connection and start a keep-alive ping timer."""
     Connection.__init__(self, *args, **kwargs)
     # MySQL drops idle connections after 8 hours by default, so ping
     # every 4 hours (interval is in milliseconds) to keep this one alive.
     PeriodicCallback(self._ping_db, 4 * 3600 * 1000).start()
示例#21
0
def connect_db():
    """Open the Spotifont MySQL connection and stash it on the request
    context (``g.db``)."""
    g.db = Connection(
        config.SPOTIFONT_DB_HOST,
        config.SPOTIFONT_DB_NAME,
        config.SPOTIFONT_DB_USER,
        config.SPOTIFONT_DB_PASSWD,
    )
示例#22
0
class Robot:
    def __init__(self, root, charset):
        self.root = root
        self.charset = charset
        self.user_agent = 'zfz-bot/1.0'
        self.link_pattern = re.compile(r'\s+href="([^\s\'">]+)"[\s>]',
                                       re.U | re.I)
        self.price_pattern = re.compile(
            ur'租(\s|&nbsp;)*金[^::]*[::]\s*(<[^<>]+>\s*)*(\d+)\s*(<[^<>]+>\s*)*元/月',
            re.U | re.I)
        self.area_pattern = re.compile(
            ur'(面(\s|&nbsp;)*积[::]\s*(<[^<>]+>\s*)*|室\s*|卫\s*|厅\s*)([\d\.]+)\s*(平米|㎡|平方米)',
            re.U | re.I)
        self.arch_pattern = re.compile(
            ur'[房户](\s|&nbsp;)*型[^::]*[::]\s*(<[^<>]+>\s*)*(\d[^<\s]+)[<\s]',
            re.U | re.I)
        self.title_pattern = re.compile(ur'<title>\s*([^<]+[^\s])\s*</title>',
                                        re.U | re.I)
        self.address_pattern = re.compile(
            ur'地(\s|&nbsp;)*址[::]\s*(<[^<>]+>\s*)*([^<>\s]+)[<\s]',
            re.U | re.I)
        self.district_pattern = re.compile(
            ur'(小(\s|&nbsp;)*区(名称)?|楼盘名称)[::]\s*(<[^<>]+>\s*)*([^<>\s]+)[<\s]',
            re.U | re.I)

        self.max_url_length = 200
        self.max_price_length = 10
        self.max_area_length = 10
        self.max_arch_length = 20
        self.max_title_length = 100
        self.max_address_length = 100
        self.max_district_length = 20

        self.db = Connection('127.0.0.1', 'zfz', 'zfz', 'zfz...891')
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(),
                                           urllib2.HTTPRedirectHandler())
        self.opener.addheaders = [('User-agent', self.user_agent)]

        self.rerp = robotexclusionrulesparser.RobotExclusionRulesParser()
        self.rerp.user_agent = self.user_agent
        try:
            self.rerp.fetch(self.root[:self.root.find('/', 7)] + "/robots.txt")
        except:
            pass

        self.min_delay_seconds = 120.0
        self.max_crawl_seconds_per_site = 2 * 24 * 3600  # 2 days

        self.max_allowed_urlopen_error = 20
        self.current_urlopen_error = 0

        self.debug = True

    def is_valid_url(self, url):
        if len(url) > self.max_url_length:
            return False
        if url.find('#') != -1 or url.find('javascript:') != -1 or url.find(
                'file://') != -1:
            return False
        else:
            return True

    def get_all_links(self, page):
        return self.link_pattern.findall(page)

    def get_price(self, page):
        m = self.price_pattern.search(page)
        if m == None or len(m.group(3)) > self.max_price_length:
            return None
        return m.group(3)

    def get_address(self, page):
        m = self.address_pattern.search(page)
        if m == None or len(m.group(3)) > self.max_address_length:
            return None
        return m.group(3)

    def get_area(self, page):
        m = self.area_pattern.search(page)
        if m == None or len(m.group(4)) > self.max_area_length:
            return None
        return m.group(4)

    def get_arch(self, page):
        m = self.arch_pattern.search(page)
        if m == None or len(m.group(3)) > self.max_arch_length:
            return None
        return m.group(3)

    def get_title(self, page):
        m = self.title_pattern.search(page)
        if m == None or len(m.group(1)) > self.max_title_length:
            return None
        return m.group(1)

    def get_district(self, page):
        m = self.district_pattern.search(page)
        if m == None or len(m.group(5)) > self.max_district_length:
            return None
        return m.group(5)

    def get_date(self, page):
        ts = str(int(time.mktime(datetime.datetime.now().timetuple())))
        return ts

    def analyse(self, page):
        title = self.get_title(page)
        if title == None:
            print 'No title'
            return None
        price = self.get_price(page)
        if price == None:
            print 'No price'
            return None
        area = self.get_area(page)
        if area == None:
            print 'No area'
            return None
        arch = self.get_arch(page)
        if arch == None:
            print 'No arch'
            return None
        address = self.get_address(page)
        if address == None:
            print 'No address'
            return None
        district = self.get_district(page)
        if district == None:
            print 'No district'
            return None
        date = self.get_date(page)
        if date == None:
            print 'Noe date'
            return None

        return [title, price, area, arch, address, district, date]

    def add_page_to_index(self, url, page):
        print 'Adding %s to index...' % url
        result = self.analyse(page)

        if result == None:
            return

        self.add_result_to_db(url, result)

    def add_result_to_db(self, url, result):
        print '...Adding %s to db...' % url
        for i in range(len(result)):
            result[i] = result[i].encode('utf-8')
            if self.debug:
                print result[i]

        title, price, area, arch, address, district, date = result

        try:
            price = int(price)
            area = int(float(area) * 100)
        except:
            print 'price or area may not be a number.'
            return

        dups = self.db.query("select * from pages where url=%s limit 1", url)
        if len(dups) == 1:
            dup = dups[0]
            print dup.title, dup.price, dup.area
            print dup.arch, dup.address, dup.district, dup.date

            if price == dup.price and area == dup.area:
                print 'Info already in database.'
                return

        print 'Insert into database...'

        self.db.execute(
            "insert into pages (url, price, address, area, arch, title, district, date) "
            "values (%s, %s, %s, %s, %s, %s, %s, %s) on duplicate key update "
            "price=%s, address=%s, area=%s, arch=%s, title=%s, district=%s, date=%s",
            url, price, address, area, arch, title, district, date, price,
            address, area, arch, title, district, date)

    def get_page(self, url):
        print 'Getting page %s...' % url

        if not url.startswith('http://'):
            print 'URL format error.'
            return None

        try:
            ans = self.opener.open(url).read().decode(self.charset)
        except:
            print 'URL open error.'
            self.current_urlopen_error += 1
            return None

        self.current_urlopen_error = 0
        return ans

    def is_allowed(self, url):
        return self.rerp.is_allowed('*', url)

    def get_full_url(self, parent, url):
        if parent.find('/', 7) == -1:
            parent += '/'
        if url.startswith('http'):
            ans = url
        elif url.startswith('/'):
            ans = parent[:parent.find('/', 7)] + url
        else:
            ans = parent[:parent.rfind('/')] + '/' + url
        return ans

    def crawl_web(self):
        tocrawl = set([self.root])
        crawled = set()

        while len(tocrawl) > 0:
            url = tocrawl.pop()
            if not self.is_allowed(url):
                print 'URL %s is not allowed.' % url
                continue

            crawled.add(url)
            page = self.get_page(url)
            if page == None:
                continue

            links = self.get_all_links(page)

            for link in links:
                #print url, link
                full_link = self.get_full_url(url, link)
                #print full_link
                if self.is_valid_url(full_link) and full_link not in crawled:
                    tocrawl.add(full_link)

            self.add_page_to_index(url, page)

            time.sleep(self.min_delay_seconds)

            if self.current_urlopen_error > self.max_allowed_urlopen_error:
                break

            if self.current_urlopen_error > self.max_allowed_urlopen_error / 2:
                time.sleep(10 * 60)  # Wait 10 minites
                self.min_delay_seconds *= 1.1

    def start(self):
        while True:
            print 'BEGIN'
            self.crawl_web()
            print 'END'
            time.sleep(30 * 60)  # Wait 30 minites
示例#23
0
def print_connection_prompt(host, database, user):
    """Print a confirmation prompt describing the upcoming DB connection.

    Bug fix: the user placeholder had been replaced with a literal
    '******' (credential-masking artifact), leaving three format
    arguments for two '%s' placeholders, which raises TypeError.
    Restored the third '%s'.
    """
    prompt = """\

The script will now try to connect to...
    database:   '%s'
    on host:    '%s'
    using user: '%s'

""" % (database, host, user)
    print(prompt)

host, db, user, password = get_db_credentials()
print_connection_prompt(db, host, user)
sure = raw_input('Are you sure? (yes/no) ')
if sure in ('yes', 'Yes', 'y', 'Y'):
    db = Connection(host=host, database=db, user=user, password=password)
else:
    print "Operation aborted."
    sys.exit(1)

# Create User table
cmd = """\
CREATE TABLE `User` (\
  `user_id` INT NOT NULL AUTO_INCREMENT,\
  `user_name` VARCHAR(100) NOT NULL DEFAULT 'NULL',\
  `user_email` VARCHAR(50) NULL DEFAULT NULL,\
  `first_name` VARCHAR(100) NOT NULL DEFAULT 'NULL',\
  `last_name` VARCHAR(100) NOT NULL DEFAULT 'NULL',\
  `password` VARCHAR(1000) NOT NULL DEFAULT 'NULL',\
  PRIMARY KEY (`user_id`)
);\
示例#24
0
	def __init__(self,host,database, user=None, password=None):
		from tornado.database import Connection as MySQLEngine
		self._db = MySQLEngine(host,database,user,password)