def write_log(deploy_task_info, log):
    submit_uuid = deploy_task_info[0]["submit_uuid"]
    host_ip = deploy_task_info[0]["host_ip"]
    port = deploy_task_info[0]["port"]
    insert_sql = ("insert into deploy_mysql_instance_log"
                  "(submit_uuid,host_ip,port,deploy_log) "
                  "values('{}','{}','{}','{}')").format(submit_uuid, host_ip,
                                                        port, log)
    DbHelper.dml(insert_sql)
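# Hedged sketch (not part of the original agent): the string-formatted INSERT above
# interpolates `log` directly, so a quote in the log text would break the statement.
# Assuming a raw pymysql connection is at hand (DbHelper's own API may differ), the
# same row could be written with bound parameters instead:
import pymysql  # assumed available; the agent already uses pymysql elsewhere


def write_log_parameterized(conn, deploy_task_info, log):
    # %s placeholders let the driver escape the values safely
    insert_sql = ("insert into deploy_mysql_instance_log"
                  "(submit_uuid,host_ip,port,deploy_log) values(%s,%s,%s,%s)")
    with conn.cursor() as cursor:
        cursor.execute(insert_sql, (deploy_task_info[0]["submit_uuid"],
                                    deploy_task_info[0]["host_ip"],
                                    deploy_task_info[0]["port"], log))
    conn.commit()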
def report_mysql_port(host_ip):
    # Discover ports from running mysqld processes
    running_port_list = []
    pids = [pid for pid in os.listdir('/proc') if pid.isdigit()]
    for pid in pids:
        try:
            process_cmd_info_list = open(
                os.path.join('/proc', pid, 'cmdline'), 'rb').read().split(b'\0')
            match_cmd = process_cmd_info_list[0].decode('utf-8')
            if re.findall('(mysqld)', match_cmd) and re.findall(
                    '(--port)', process_cmd_info_list[-2].decode('utf-8')):
                running_port = int(
                    process_cmd_info_list[-2].decode('utf-8').split('=')[-1])
                if running_port not in running_port_list:
                    running_port_list.append(running_port)
        except Exception as e:
            # Do not write this to the log: some PIDs are short-lived and the
            # resulting errors would be misleading.
            print(e)
    for running_port in running_port_list:
        sql = "replace into deployed_mysql_port(host_ip,port) values('{}',{})".format(
            host_ip, running_port)
        DbHelper.dml(sql)
    # Discover ports from the installation directories
    port_list = [3306, 3307, 3308, 3309, 3310, 3311, 3312, 3313, 3314, 3315]
    for check_port in port_list:
        if os.path.exists('/data/{}'.format(check_port)) or os.path.exists(
                '/data/mysql/multi/{}'.format(check_port)):
            sql = "replace into deployed_mysql_port(host_ip,port) values('{}',{})".format(
                host_ip, check_port)
            DbHelper.dml(sql)
class CommentsPipeline(object):
    def open_spider(self, spider):
        self.data = []
        self.dbUtils = DbHelper()
        self.logger = get_logger(self.__class__.__name__)

    def close_spider(self, spider):
        try:
            if len(self.data) > 0:
                latest_comment_id = self.__get_latest_comment_id(
                    self.data[0]['news_id'])
                # Handle data (update, filter, clean)
                columns = ['comment_id', 'news_id', 'comment_time', 'comment']
                df = pd.DataFrame(self.data, columns=columns)
                df = df.dropna().drop_duplicates().query(
                    f'comment_id > {latest_comment_id}')
                df['sentiment'] = df['comment'].apply(self.__sentiment)
                data = [
                    Comments(comment=item['comment'],
                             news_id=item['news_id'],
                             comment_id=item['comment_id'],
                             comment_time=item['comment_time'].to_pydatetime(),
                             sentiment=item['sentiment'])
                    for item in df.to_dict('records')
                ]
                self.dbUtils.insert(data)
        except Exception as ex:
            self.logger.error(
                "Exception occurred handling data when spider is closed.", ex)

    def process_item(self, item, spider):
        if item:
            self.data.append(dict(item))
        return item

    def __get_latest_comment_id(self, news_id):
        session = self.dbUtils.Session()
        try:
            res = session.query(func.max(
                Comments.comment_id)).filter(Comments.news_id == news_id)
            return res[0][0] if res[0] and res[0][0] and res[0][0] > 0 else 0
        except Exception as ex:
            raise ex
        finally:
            session.close()

    def __sentiment(self, text):
        return SnowNLP(text).sentiments
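# Hedged usage note: for the pipeline to receive items it has to be enabled in the
# project's Scrapy settings. The module path below is hypothetical; adjust it to
# wherever CommentsPipeline actually lives in this repository.
#
# ITEM_PIPELINES = {
#     'backend.spiders.pipelines.CommentsPipeline': 300,
# }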
class SentimentService:
    def __init__(self):
        self.db_helper = DbHelper()

    def __init_data(self):
        session = self.db_helper.Session()
        query = session.query(Sentiment)
        ret = self.db_helper.query(query, page_size=1).count()
        if ret == 0:
            print("No data found, loading data...")
            # insert data
            items = []
            for data_item in self.__load_data():
                item_name = '嫌疑人'  # data label kept verbatim ("suspect")
                sentiment = self.__sentiment(data_item['comment'])
                item = Sentiment(item_name=item_name,
                                 score=data_item['score'],
                                 trend=data_item['trend'],
                                 comment=data_item['comment'],
                                 sentiment=sentiment)
                items.append(item)
            session.add_all(items)
            session.commit()
        session.close()

    def top_sentiments(self, page_size=10):
        self.__init_data()
        session = self.db_helper.Session()
        query = session.query(Sentiment).filter(
            Sentiment.sentiment < 0.99).order_by(Sentiment.sentiment.desc())
        results = self.db_helper.query(query, page_size=page_size)
        session.close()
        return [result.to_dict() for result in results]

    def __sentiment(self, text):
        return SnowNLP(text).sentiments

    def __load_data(self):
        backend_dir = Path(os.path.dirname(os.path.abspath(__file__))).parent
        source = backend_dir.joinpath("resources/data.csv")
        df = pd.read_csv(source)
        return df.to_dict('records')
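# Hedged usage sketch: top_sentiments() seeds the Sentiment table from
# resources/data.csv on first use and then returns the highest-scoring rows, so a
# caller only needs the one public method. The page_size value is illustrative.
if __name__ == '__main__':
    service = SentimentService()
    for row in service.top_sentiments(page_size=5):
        print(row)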
def get_task_info(host_ip):
    # Fetch one pending deployment task (status 0, submitted within the last day)
    deploy_info_sql = (
        "select submit_uuid,host_ip,port,deploy_status,deploy_archit,"
        "deploy_env,deploy_other_param from deploy_mysql_instance "
        "where host_ip='{}' and deploy_status=0 "
        "and timestampdiff(second,ctime,now())<86400 limit 1").format(host_ip)
    ret = DbHelper.find_all(deploy_info_sql)
    if ret['status'] != "ok":
        return False
    elif len(ret['data']) == 0:
        return False
    else:
        deploy_task_info = ret['data']
        log = "Deployment task acquired"
        write_log(deploy_task_info, log)
        package_info_sql = ("select pacakage_url,package_name,package_md5 "
                            "from deploy_package_info "
                            "where package_name='nucc_mysql.tar.gz'")
        ret = DbHelper.find_all(package_info_sql)
        if ret['status'] == "ok":
            if len(ret['data']) <= 0:
                # raising a bare string is invalid in Python 3
                raise Exception("Deployment package not found")
            package_info = ret['data']
            return {
                "task": "yes",
                "deploy_task_info": deploy_task_info,
                'package_info': package_info
            }
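# Hedged sketch of how the helpers above could be driven together. The polling
# interval and the host_ip lookup are illustrative only, and the actual deployment
# step plus the status code passed to update_status() follow whatever convention
# the rest of the agent uses.
import socket
import time


def poll_for_tasks():
    host_ip = socket.gethostbyname(socket.gethostname())  # illustrative lookup
    while True:
        task = get_task_info(host_ip)
        if task and task.get("task") == "yes":
            write_log(task["deploy_task_info"], "task picked up by agent")
            # ... perform the deployment here, then report the final status
            # via update_status(task["deploy_task_info"], <status code>)
        time.sleep(30)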
def update_status(deploy_task_info, deploy_status):
    submit_uuid = deploy_task_info[0]["submit_uuid"]
    update_sql = "update deploy_mysql_instance set deploy_status={} where submit_uuid='{}'".format(
        deploy_status, submit_uuid)
    # Only statuses 1, 2 and 3 trigger the update; the statement is the same for each
    if deploy_status in (1, 2, 3):
        DbHelper.dml(update_sql)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# gaochao
import time
import os
import collections
import platform as pf
import psutil as ps
import pymysql as db
import logging

logger = logging.getLogger('agent_logger')

from utils.db_helper import DbHelper

db_op_obj = DbHelper()


def conn_mysql_instance(host, port, user, password, database):
    try:
        return db.connect(host=host,
                          port=port,
                          user=user,
                          passwd=password,
                          db=database,
                          charset='utf8mb4',
                          cursorclass=db.cursors.DictCursor)
    except Exception as e:
        # str(e) is required: concatenating the exception object itself raises TypeError
        raise Exception('Can not build available connection!' + str(e))


def domain_is_valid(domain):
    if '.' in domain:
class NewsCommentsSpider(scrapy.Spider):
    url = os.getenv("NEWS_URL")
    comments_url = os.getenv("COMMENTS_URL")
    name = "news_comments"
    start_urls = [url]
    comments_per_page = int(os.getenv("COMMENTS_PER_PAGE", "20"))
    dbUtils = DbHelper()
    logger = get_logger('news_comments_spider')
    max_pages = int(os.getenv("MAX_PAGES", "10"))

    def parse(self, response):
        try:
            news_name = response.xpath(
                "//div[@id='wrapper']/h1[1]/span[1]/text()").extract_first().strip()
            news_id = Helper.md5(news_name)
            self.__add_news(
                News(news_id=news_id, news_name=news_name, source=self.url))
            item = CommentsItem()
            item['news_id'] = news_id
            total_comments = int(
                re.findall(
                    r'\d+',
                    response.xpath(
                        "//div[@id='content']/div/div[@class='article']/div[@class='related_info']/div[@class='mod-hd']/h2[1]/span[@class='pl']/a/text()"
                    ).extract_first().strip())[0])
            pages = int(
                total_comments / self.comments_per_page
            ) if total_comments % self.comments_per_page == 0 else int(
                total_comments / self.comments_per_page) + 1
            # Get all comments in pages, but crawl up to max_pages
            if pages > self.max_pages:
                pages = self.max_pages
            urls = [f'{self.comments_url}?p={p+1}' for p in range(pages)]
            for c_url in urls:
                yield scrapy.Request(c_url,
                                     meta={'item': item},
                                     callback=self.__parse_comments)
        except Exception as ex:
            self.logger.error(
                f"Exception occurred when parsing page {self.url}", ex)

    def __parse_comments(self, response):
        for sel in response.xpath("//div[@id='comments']/ul/li"):
            try:
                item = response.meta['item']
                item['comment_id'] = int(
                    sel.xpath("@data-cid").extract_first().strip())
                item['comment'] = sel.xpath(
                    "div[@class='comment']/p[1]/span[1]//text()"
                ).extract_first().strip()
                item['comment_time'] = Helper.parse_comment_time(
                    sel.xpath(
                        "div[@class='comment']/h3[1]/span[2]/span[2]/text()"
                    ).extract_first().strip())
                yield item
            except Exception as ex:
                self.logger.error(
                    f"Exception occurred when parsing comment with response {response}",
                    ex)
                yield None

    def __add_news(self, news_item):
        if self.__check_news(news_item) == 0:
            self.dbUtils.insert([news_item])

    def __check_news(self, news_item):
        session = self.dbUtils.Session()
        try:
            result = session.query(News).filter(
                News.news_id == news_item.news_id).count()
            return result
        except Exception as ex:
            raise ex
        finally:
            session.close()
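# Hedged sketch: one way to run the spider outside of `scrapy crawl`, using Scrapy's
# CrawlerProcess. get_project_settings() picks up the project settings if a
# scrapy.cfg is present; the environment variables the spider expects
# (NEWS_URL, COMMENTS_URL, ...) still have to be set beforehand.
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    process.crawl(NewsCommentsSpider)
    process.start()  # blocks until the crawl finishes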
class NewsCommentsService:
    DEFAULT_PAGE_SIZE = int(os.getenv("PAGE_SIZE", "25"))
    DEFAULT_POSITIVE_THRESHOLD = float(os.getenv("POSITIVE_THRESHOLD", "0.6"))

    def __init__(self):
        self.db_helper = DbHelper()
        self.news_id = self.__get_news_id()
        self.logger = get_logger(self.__class__.__name__)

    def __get_news_id(self):
        session = self.db_helper.Session()
        try:
            res = session.query(News.news_id).first()
            return res[0]
        except Exception as ex:
            raise ex
        finally:
            session.close()

    def search_comments(self, q, page=0, page_size=DEFAULT_PAGE_SIZE,
                        startdate=None, enddate=None):
        session = self.db_helper.Session()
        try:
            news = session.query(News).filter(
                News.news_id == self.news_id).first()
            query = session.query(Comments).filter(
                Comments.news_id == self.news_id).filter(
                    Comments.comment.contains(q))
            t_query = session.query(func.count('*').label('total')).filter(
                Comments.news_id == self.news_id).filter(
                    Comments.comment.contains(q))
            if startdate and startdate != '':
                query = query.filter(
                    Comments.comment_time >= Helper.get_date(startdate))
                t_query = t_query.filter(
                    Comments.comment_time >= Helper.get_date(startdate))
            if enddate and enddate != '':
                query = query.filter(
                    Comments.comment_time <= Helper.get_date(enddate))
                t_query = t_query.filter(
                    Comments.comment_time <= Helper.get_date(enddate))
            query = query.order_by(Comments.comment_time.desc())
            results = self.db_helper.query(query, page=page, page_size=page_size)
            total_comments = int(t_query[0][0])
            pages = int(
                total_comments / page_size
            ) if total_comments % page_size == 0 else int(
                total_comments / page_size) + 1
            return {
                'dates': {
                    'start': (Helper.get_date(startdate).strftime('%Y-%m-%d')
                              if startdate and startdate != '' else ''),
                    'end': (Helper.get_date(enddate).strftime('%Y-%m-%d')
                            if enddate and enddate != '' else '')
                },
                'pages': pages,
                'comments': [result.to_dict() for result in results],
                'news': news.to_dict()
            }
        except Exception as ex:
            self.logger.error("Exception occurred when searching comments.", ex)
            return {'pages': 0, 'comments': []}
        finally:
            session.close()

    def get_data(self, page=0, page_size=DEFAULT_PAGE_SIZE):
        session = self.db_helper.Session()
        try:
            news = session.query(News).filter(
                News.news_id == self.news_id).first()
            comment_query = session.query(Comments).filter(
                Comments.news_id == self.news_id)
            comments = self.db_helper.query(comment_query,
                                            page=page,
                                            page_size=page_size).order_by(
                                                Comments.comment_id.desc())
            comment_nums = session.query(
                Comments.comment_time,
                func.count('*').label('comments_num')).filter(
                    Comments.news_id == self.news_id).group_by(
                        Comments.comment_time)
            dates = [result[0].strftime("%Y-%m-%d") for result in comment_nums]
            total_comments = int(
                session.query(func.count('*').label('total')).filter(
                    Comments.news_id == self.news_id)[0][0])
            pages = int(
                total_comments / page_size
            ) if total_comments % page_size == 0 else int(
                total_comments / page_size) + 1
            return {
                'news': news.to_dict(),
                'dates': dates,
                'comments': [comment.to_dict() for comment in comments],
                'comment_nums': [{
                    'date': result[0].strftime("%Y-%m-%d"),
                    'count': result[1]
                } for result in comment_nums],
                'pages': pages
            }
        except Exception as ex:
            self.logger.error("Exception occurred when getting data.", ex)
            return {
                'news': {},
                'dates': [],
                'comments': [],
                'comment_nums': []
            }
        finally:
            session.close()

    def get_data_by_date(self, page=0, page_size=DEFAULT_PAGE_SIZE, datestr=None):
        session = self.db_helper.Session()
        try:
            news = session.query(News).filter(
                News.news_id == self.news_id).first()
            comment_query = session.query(Comments).filter(
                Comments.news_id == self.news_id).filter(
                    Comments.comment_time == Helper.get_date(
                        datestr)).order_by(Comments.comment_id.desc())
            comments = self.db_helper.query(comment_query,
                                            page=page,
                                            page_size=page_size)
            sentiment_nums = session.query(
                Comments.comment_time,
                func.sum(
                    sql.case([(sql.column('sentiment') >=
                               self.DEFAULT_POSITIVE_THRESHOLD, 1)],
                             else_=0)).label('positive'),
                func.sum(
                    sql.case([(sql.column('sentiment') <
                               self.DEFAULT_POSITIVE_THRESHOLD, 1)],
                             else_=0)).label('negative')
            ).filter(Comments.news_id == self.news_id).filter(
                Comments.comment_time == Helper.get_date(datestr)).group_by(
                    Comments.comment_time)
            total_comments = int(
                session.query(func.count('*').label('total')).filter(
                    Comments.news_id == self.news_id).filter(
                        Comments.comment_time == Helper.get_date(datestr))[0][0])
            pages = int(
                total_comments / page_size
            ) if total_comments % page_size == 0 else int(
                total_comments / page_size) + 1
            return {
                'news': news.to_dict(),
                'comments': [comment.to_dict() for comment in comments],
                'date': sentiment_nums[0][0].strftime("%Y-%m-%d"),
                'positive': int(sentiment_nums[0][1]),
                'negative': int(sentiment_nums[0][2]),
                'total': int(sentiment_nums[0][1]) + int(sentiment_nums[0][2]),
                'pages': pages
            }
        except Exception as ex:
            self.logger.error("Exception occurred when getting data by date.", ex)
            return {
                'news': {},
                'comments': [],
                'date': '',
                'positive': 0,
                'negative': 0,
                'total': 0
            }
        finally:
            session.close()