# Python 2 module: uses print statements, the `thread` module, and
# `except Exception, e` syntax. DB and Transfer are local helpers in this
# repo; DB wraps a MySQL connection.
import os
import shutil
import socket
import thread
import time


class Nebula:
    def __init__(self):
        self.DUMP_DIR = "dump"
        self.STRUCTURE_DIR = "structure"
        self.DIR_SEP = "/"
        self.NEBULA = False
        hostname = socket.gethostname()
        ip_addr = socket.gethostbyname(hostname)
        self.config = {"host": ip_addr, "user": "******", "password": "******"}
        self.server_sql = DB(self.config)
        # Copy the config so adding 'database' does not mutate self.config.
        nebula_config = dict(self.config)
        nebula_config['database'] = "nebula"
        self.nebula_sql = DB(nebula_config)
        self.update_main_ip(ip_addr)

    def update_main_ip(self, host):
        # Record this machine as the 'main' node, updating the row if one exists.
        q = "select * from cloud where type='%s'" % 'main'
        if len(self.nebula_sql.select(q)) > 0:
            q = "update cloud set host='%s' where type='main'" % host
            self.nebula_sql.update(q)
        else:
            q = "insert into cloud (host,username,password,type) values ('%s','%s','%s','%s')" % (
                host, self.config['user'], self.config['password'], 'main')
            print q
            self.nebula_sql.insert(q)

    def register_sub_server(self, host, username, password):
        # Register a sub server only if a test connection succeeds.
        config = {"host": host, "user": username, "password": password}
        db = DB(config)
        if db.connection_status():
            q = "insert into cloud (host,username,password,type) values ('%s','%s','%s','%s')" % (
                host, username, password, "sub")
            self.nebula_sql.insert(q)
            return True
        else:
            return False

    def initialize_dump(self):
        # Import every dump/*.sql into the main server, archive the dump file
        # under structure/, and log the database against the 'main' node.
        # self.stop_thread()
        for dbname in os.listdir(self.DUMP_DIR):
            if dbname.endswith('.sql'):
                dbname = dbname.replace(".sql", "")
                self.server_sql.create_db(dbname)
                # Copy the config so adding 'database' does not mutate self.config.
                db_config = dict(self.config)
                db_config['database'] = dbname
                db = DB(db_config)
                db.upload_sql(self.DUMP_DIR + self.DIR_SEP + dbname + ".sql")
                shutil.move(self.DUMP_DIR + self.DIR_SEP + dbname + ".sql",
                            self.STRUCTURE_DIR + self.DIR_SEP)
                del db
                q = "select * from db where db_name='%s'" % dbname
                result = self.nebula_sql.select(q)
                if len(result) == 0:
                    q = "insert into db (db_name) values ('%s')" % dbname
                    db_id = self.nebula_sql.insert(q)
                else:
                    db_id = result[0]['db_id']
                q = "INSERT INTO `log` (`cloud_id`, `db_id`) SELECT cloud_id, '%s' FROM cloud WHERE type = 'main'" % db_id
                self.nebula_sql.insert(q)
                # self.log()

    def get_config(self, row):
        # Build a DB config dict from a cloud row (optionally joined with db).
        if "db_name" in row:
            config = {
                "host": row['host'],
                "user": row['username'],
                "password": row['password'],
                "database": row['db_name'],
            }
        else:
            config = {
                "host": row['host'],
                "user": row['username'],
                "password": row['password'],
            }
        return config

    def log(self, cloud_id, db_id):
        q = "insert into log (cloud_id,db_id) values ('%s','%s')" % (cloud_id, db_id)
        self.nebula_sql.insert(q)

    def random_host(self, cloud_id):
        # Pick a random host other than the current one and verify it is reachable.
        q = "SELECT * FROM cloud WHERE cloud_id <> '%s' ORDER BY RAND() LIMIT 1;" % cloud_id
        result = self.nebula_sql.select(q)
        db = DB(self.get_config(result[0]))
        if db.connection_status():
            return result[0]
        else:
            print "Connection to %s failed" % result[0]['host']
            return False

    def get_last_host(self, db_id):
        # The most recent log row tells us which host currently holds the database.
        q = "select * from log inner join cloud using(cloud_id) inner join db using(db_id) where db_id='%s' order by log_id DESC LIMIT 1" % db_id
        return self.nebula_sql.select(q)[0]

    # def make_busy(self, database):
    #     q = "update db set db_status = 'busy' where db_name='database' "
    #     self.nebula_sql.query(q)

    # def make_available(self, database):
    #     q = "update db set db_status = 'available' where db_name='database' "
    #     self.nebula_sql.query(q)

    def nebula_thread(self, name, delay):
        # Every `delay` seconds, move each database from its current host to
        # a random other host and log the new location.
        while True:
            if self.NEBULA:
                time.sleep(delay)
                self.initialize_dump()
                q = "select * from db"
                result = self.nebula_sql.select(q)
                for row in result:
                    last_host = self.get_last_host(row['db_id'])
                    print last_host
                    if last_host:
                        new_host = self.random_host(last_host['cloud_id'])
                        if not new_host:
                            continue  # no reachable host this round; try again later
                        tf = Transfer(self.get_config(last_host),
                                      self.get_config(new_host),
                                      self.STRUCTURE_DIR + self.DIR_SEP + row['db_name'] + ".sql")
                        # self.make_busy(row['db_name'])
                        tf.transfer(row['db_name'])
                        self.log(new_host['cloud_id'], row['db_id'])
                        # self.make_available(row['db_name'])
                        # print time.ctime(time.time())
            else:
                break
        return

    def start_nebula(self):
        self.NEBULA = True
        try:
            thread.start_new_thread(self.nebula_thread, ("Nebula", 300))
        except Exception, e:
            pass
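# Minimal usage sketch for Nebula (not from the source): assumes the `nebula`
# bookkeeping database exists and dump/ holds .sql files to distribute; the
# host and credentials below are illustrative.
#
#   nebula = Nebula()
#   nebula.register_sub_server("192.168.0.20", "user", "secret")
#   nebula.initialize_dump()  # import every dump/*.sql on the main server
#   nebula.start_nebula()     # migrate databases to random hosts every 300 s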
class Transfer:
    def __init__(self, from_conf, to_conf, db_sql_path):
        self.from_db = DB(from_conf)
        self.to_db = DB(to_conf)
        # Read the dump file into a list of complete SQL statements,
        # skipping blank lines and "--" comments.
        self.sql_array = []
        sql_file = open(db_sql_path, 'r')
        q = ""
        for line in sql_file:
            line = line.strip()
            if line != "" and not line.startswith("--"):
                q += " " + line
                if q.endswith(";"):
                    self.sql_array.append(q)
                    q = ""
        sql_file.close()

    def from_db_tables(self):
        q = "show tables"
        return self.from_db.select(q)

    def create_db_to_db(self, database):
        q = "DROP DATABASE IF EXISTS `%s`" % database
        self.to_db.query(q)
        q = "CREATE DATABASE %s" % database
        self.to_db.query(q)
        self.use_db_to_db(database)

    def use_db_to_db(self, database):
        q = "use %s" % database
        self.to_db.query(q)

    def drop_to_db(self, table_name):
        q = "DROP TABLE IF EXISTS `%s`" % table_name
        self.to_db.query(q)

    def create_table_to_db(self, database, table_name):
        # Pull the CREATE TABLE, PRIMARY KEY, and AUTO_INCREMENT statements
        # for this table out of the parsed dump, then replay them on the target.
        create_q = ""
        alter_q = ""
        primary_q = ""
        for q in self.sql_array:
            search = "CREATE TABLE `%s`" % table_name
            if q.find(search) > -1:
                create_q = q
                break
        for q in self.sql_array:
            alter = "ALTER TABLE `%s`" % table_name
            if q.find(alter) > -1 and q.find("PRIMARY KEY") > -1:
                primary_q = q
                break
        for q in self.sql_array:
            alter = "ALTER TABLE `%s`" % table_name
            if q.find(alter) > -1 and q.find("AUTO_INCREMENT") > -1:
                alter_q = q
                break
        # Carry the source table's current AUTO_INCREMENT value over to the target.
        auto_inc = ("SELECT `AUTO_INCREMENT` FROM INFORMATION_SCHEMA.TABLES "
                    "WHERE TABLE_SCHEMA = '" + database + "' AND TABLE_NAME = '" + table_name + "'")
        auto_inc_val = self.from_db.select(auto_inc)[0]['AUTO_INCREMENT']
        alter_q = re.sub(r'AUTO_INCREMENT=[0-9]*',
                         "AUTO_INCREMENT=%s" % auto_inc_val, alter_q)
        # print create_q
        # print primary_q
        # print alter_q
        self.to_db.query(create_q)
        self.to_db.query(primary_q)
        self.to_db.query(alter_q)

    def copy_table_data(self, table_name):
        # Copy every row from the source table into the target table.
        q = "select * from %s" % table_name
        result = self.from_db.select(q)
        for row in result:
            values = []
            for key in row.keys():
                values.append(str(row[key]))
            values = "','".join(values)
            # Turn the dict keys into a backtick-quoted column list,
            # stripping Python 2 unicode prefixes.
            columns = str(tuple(row.keys())).replace("'", "`").replace("u`", "`")
            q = ("insert into " + str(table_name) + " " + columns +
                 " values ('" + values + "')")
            # print q
            self.to_db.insert(q)

    def drop_db_from_db(self, database):
        q = "drop database %s" % database
        self.from_db.query(q)

    def transfer(self, database):
        # Recreate the schema on the target host, copy each table's rows,
        # then drop the database from the source host.
        self.create_db_to_db(database)
        result = self.from_db_tables()
        for table in result:
            table_name = table['Tables_in_' + database]
            self.drop_to_db(table_name)
            self.create_table_to_db(database, table_name)
            self.copy_table_data(table_name)
        self.drop_db_from_db(database)
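# Minimal usage sketch for Transfer (not from the source): hosts, credentials,
# and the dump path are illustrative. Note that transfer() is destructive on
# both ends: it drops the database on the target before recreating it, and
# drops it on the source once the copy finishes.
#
#   from_conf = {"host": "192.168.0.10", "user": "u", "password": "p", "database": "shop"}
#   to_conf = {"host": "192.168.0.20", "user": "u", "password": "p"}
#   tf = Transfer(from_conf, to_conf, "structure/shop.sql")
#   tf.transfer("shop")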
# Parser belongs to a separate weibo.cn crawler module; its imports are shown
# here. DB persists results and Queue holds the URLs still to be crawled
# (both are local helpers in this repo).
import re

from bs4 import BeautifulSoup


class Parser:
    def __init__(self):
        self.list = list()
        self.comments = []          # crawled comments
        self.comments_userurl = []  # user URLs scraped from the comments
        self.likes = []             # crawled likes
        self.queue = Queue()        # crawl queue object
        self.db = DB()

    def parse_numberOfTweet(self, html):
        # Get the URL of every page of this user's tweets.
        dic = {}
        soup = BeautifulSoup(html, 'lxml')
        base_url = soup.find('td').contents[0]['href']
        url = re.search(r'/\d+', base_url).group() + "?page="
        base_page = soup.find('span', attrs={"class": "tc"}).text
        page_amount = int(re.search(r'\d+', base_page, re.M | re.I).group()) / 10 + 1
        dic["url"] = "https://weibo.cn/u" + url
        dic["page_amount"] = page_amount
        self.queue.add_url_tweets(dic)

    # TODO: parse the whole page
    def parse_fpage(self, html):
        # Collect the comment-page URL of every tweet on the page.
        soup = BeautifulSoup(html, 'lxml')
        for i in soup.find_all('a', attrs={"class": "cc"}):
            if "原文评论" in i.text:
                pass
            else:
                dic = {}
                dic["url"] = re.search(r'https://[a-z]+.cn/comment/.*[?]',
                                       i["href"], re.M | re.I).group() + "page="
                num = int(re.search(r'\d+', i.text).group())
                if num == 0:
                    pass
                else:
                    dic["page_amount"] = num / 10 + 1
                    print(dic)
                    self.queue.add_url_comments(dic)

    # TODO: parse the comment page
    def parse_comments(self, html):
        # Get the contents of the comment page: store every comment, queue the
        # likes page, and queue each commenter's profile for crawling.
        soup = BeautifulSoup(html, "lxml")
        self.comments = []          # clear the comment list
        self.comments_userurl = []  # clear the user URL list
        like_dic = {}
        comments_html = soup.find_all('div', id=re.compile("C_"))
        host_name = soup.find("a", attrs={"class": ""}).text[2:-3]
        like_soup = soup.find('a', attrs={'href': re.compile('/atti')})
        like_dic['url'] = "https://weibo.cn" + str(
            re.search(r"/attitu\w+/\w+\?", like_soup['href']).group()) + "page="
        like_dic['page_amount'] = int(re.search(r'\d+', like_soup.text).group()) / 10 + 1
        print(like_dic)
        self.queue.add_url_likes(like_dic)
        for content in comments_html:
            comment_dict = dict()
            comment_dict["comment_content"] = content.find(attrs={"class": "ctt"}).text  # comment text
            comment_dict["comment_username"] = content.find("a").text  # commenter's nickname
            comment_user_url = content.find("a").attrs["href"]  # commenter's profile link
            # Queue the commenter's profile URL for crawling.
            self.queue.add_url_host("https://weibo.cn" + comment_user_url)
            print(comment_dict, comment_user_url)
            self.comments.append(comment_dict)
        self.db.insert(host_name, self.comments)  # insert into the database

    # TODO: parse the likes page
    def parse_likes(self, html):
        # Get the contents of the likes page; skip the first "ct" node, which
        # belongs to the tweet itself rather than to a liker.
        self.likes = []
        likes = []
        soup = BeautifulSoup(html, "lxml")
        host_name = soup.find("div", attrs={"class": ""}).text.split(":")[0]
        n = 0
        for i in soup.find_all(attrs={"class": "ct"}):
            n += 1
            if n == 1:
                continue
            else:
                likes.append(i.previous_sibling.previous_sibling.text)
        self.db.insert_likes(host_name, likes)
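# Minimal usage sketch for Parser (not from the source): the *_html variables
# stand for pages the crawler has already fetched from weibo.cn.
#
#   parser = Parser()
#   parser.parse_numberOfTweet(profile_html)   # queue the user's tweet pages
#   parser.parse_fpage(tweet_page_html)        # queue each tweet's comment pages
#   parser.parse_comments(comment_page_html)   # store comments, queue likes pages
#   parser.parse_likes(likes_page_html)        # store who liked the tweet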