def consumer_offset(topic, time_str):
    """Return the offset to resume from for ``topic`` at a given time.

    Queries ``saas_server.product_record`` for rows of ``topic`` whose ``tm``
    lies in the whole minute containing ``time_str`` minus 60 seconds, and
    returns ``MAX(offset) + 1`` taken from the last row returned. The result
    is meant to be used to replay data from a given point in time.

    :param topic: topic name to look up.
    :param time_str: a Unix timestamp (int or float), or a local-time string
        in ``%Y-%m-%d %H:%M:%S`` format.
    :return: recorded offset plus one.
    :raises LookupError: if no row exists for that topic and minute
        (previously this surfaced as an ``UnboundLocalError``).
    """
    if isinstance(time_str, (int, float)):
        timestamp_sec = time_str - 60
    else:
        timestamp_sec = time.mktime(
            time.strptime(time_str, '%Y-%m-%d %H:%M:%S')) - 60

    # Truncate to minute boundaries: [begin, begin + 60s).
    time_begin = time.strftime(
        "%Y-%m-%d %H:%M", time.localtime(timestamp_sec)) + ":00"
    time_end = time.strftime(
        "%Y-%m-%d %H:%M", time.localtime(timestamp_sec + 60)) + ":00"

    # Parameterized like the other product_record statements in this file,
    # so quoting in ``topic`` can no longer break the query.
    sql = ("SELECT topic, tm, MAX(offset) FROM saas_server.product_record "
           "WHERE topic = %s AND (tm >= %s AND tm < %s) GROUP BY topic, tm")

    client = MysqlClient("saas_server")
    try:
        con, cur = client.connection
        cur.execute(sql, (topic, time_begin, time_end))
        rows = cur.fetchall()
    finally:
        # Release the connection even when the query fails.
        client.closeMysql()

    if not rows:
        raise LookupError(
            "no product_record offset for topic %r in [%s, %s)"
            % (topic, time_begin, time_end))
    # Same choice as before: the last returned row wins.
    return rows[-1][2] + 1
class Friendmaps_to_list:
    """Merge per-row friend maps from one table into per-user relation rows.

    Each source row holds, in column ``v``, the text form of a dict shaped
    ``{person: {friend: count}}``. ``query_map_to_dict`` folds all rows into a
    single dict; ``insert_to_list`` writes one row per person.
    """

    def __init__(self):
        self.mc = MysqlClient()

    def query_map_to_dict(self, table_name):
        """Read ``table_name`` page by page and merge every ``v`` dict.

        :param table_name: name of the source table (interpolated into SQL).
        :return: dict ``{person: {friend: summed_count}}``.
        """
        sys.stdout.write(u'开始查询表%s。\n' % table_name)
        sqlimit = 50   # rows fetched per query
        sqlstart = 0   # offset of the current page, advanced by sqlimit
        dict_all = {}
        table_rows = self.query_table_rows(table_name)
        # Strict comparison: an offset equal to the row count is past the end,
        # so the former ``>=`` only produced one extra, empty query.
        while table_rows > sqlstart:
            sql_select = "select v from %s limit %d,%d;" % (
                table_name, sqlstart, sqlimit)
            for r in self.mc.query(sql_select):
                print(len(r[0]))
                # NOTE(security): eval() executes whatever text is stored in
                # the table. Only safe while the table is written exclusively
                # by trusted code; consider json.loads/ast.literal_eval.
                dict_loc = eval(r[0])
                for k, v in dict_loc.items():
                    if k not in dict_all:
                        dict_all[k] = v
                        continue
                    # Merge this person's relation dict into the running one.
                    person = dict_all[k]
                    for k1, v1 in v.items():
                        # A friend seen for the first time contributes its
                        # full count (was hard-coded to 1, losing counts).
                        person[k1] = person.get(k1, 0) + v1
            sqlstart += sqlimit
        sys.stdout.write(u'返回结果字典。\n')
        sys.stdout.write(str(len(dict_all)))
        return dict_all

    def query_table_rows(self, table_name):
        """Return ``count(*)`` of ``table_name`` and report it on stdout."""
        sql_rows = "select count(*) from %s;" % (table_name)
        rs = self.mc.query(sql_rows)
        table_rows = rs[0][0]
        sys.stdout.write("Table %s has %d rows." % (table_name, table_rows))
        return table_rows

    def insert_to_list(self, dict_all, table_name):
        """Insert one ``(user_id, dict_relation)`` row per entry of ``dict_all``.

        :param dict_all: mapping as returned by ``query_map_to_dict``.
        :param table_name: name of the target table (interpolated into SQL).
        """
        # A person's relation dict is only complete once every source row has
        # been read, so insertion can only start afterwards.
        # Known problem: dict_all can get too large for memory.
        # Possible alternative: for each raw row, look every person up in the
        # target table and merge into the existing row (not doable in MySQL
        # alone?).
        # TODO: MemoryError on the consumption table.
        sys.stdout.write(u'开始插入到表%s。\n' % table_name)
        for k, v in dict_all.items():
            # NOTE(review): only single quotes are escaped here; a backslash
            # in the data would still corrupt the statement. Switch to a
            # parameterized call if the client's query() supports one.
            v = str(v).replace("'", "\\'")
            sql_insert = "insert into %s(user_id, dict_relation) values('%s', '%s'); " % (table_name, k, v)
            self.mc.query(sql_insert)
        sys.stdout.write(u'表%s插入完成。\n' % table_name)
class VerifyProxy(object):
    """Checks the connectivity of proxies stored in MySQL."""

    # Seconds to wait for the probe request; without a timeout a dead proxy
    # can block the whole verification run indefinitely.
    REQUEST_TIMEOUT = 5

    def __init__(self):
        self.mysql = MysqlClient()

    def verify_proxy(self, scheme, ip, port):
        """Probe Baidu through a proxy and report the response time.

        :param scheme: URL scheme of the proxy, also used for the probe URL.
        :param ip: proxy host.
        :param port: proxy port (string or integer).
        :return: dict with ``response_time`` (milliseconds, 0 on failure) and
            ``status`` (``'1'`` when the response is truthy, else ``'0'``).
        """
        # %-formatting accepts an integer port, which plain concatenation
        # did not (the TypeError used to be swallowed silently).
        proxies = {scheme: '%s://%s:%s/' % (scheme, ip, port)}
        response_time = 0
        status = '0'
        try:
            response = requests.get(
                scheme + '://www.baidu.com/get',
                proxies=proxies,
                headers=header,
                timeout=self.REQUEST_TIMEOUT)
            if response:
                response_time = round(response.elapsed.total_seconds() * 1000)
                status = '1'
                print(status)
        except Exception:
            # Best effort: any failure just means the proxy is unusable.
            # Unlike the former bare ``except`` this no longer swallows
            # KeyboardInterrupt/SystemExit.
            pass
        return {'response_time': response_time, 'status': status}

    def verify_all(self):
        """Re-verify every proxy returned by the database and store the result."""
        results = self.mysql.find_all()
        for result in results:
            res = self.verify_proxy(result[1], result[2], result[3])
            proxy = {
                "id": result[0],
                "scheme": result[1],
                "ip": result[2],
                "port": result[3],
                "status": res["status"],
                "response_time": res["response_time"],
            }
            self.mysql.update_proxy(proxy)
        print('代理验证成功')
def insert_d_log_collector(default_groupid=("JHSAASGroup_1", ), execute_groupid=(6, )):
    """Register newly enabled appkeys in ``saas_server.d_log_collector``.

    For every enabled android/ios/h5 appkey that has no collector row yet,
    one row is inserted per group id already present in the collector table,
    except the group ids listed in ``execute_groupid``.

    :param default_groupid: not used by this function; kept for callers.
    :param execute_groupid: group ids for which no row is inserted.
    """
    # Appkeys that are enabled but not yet present in the collector table.
    sql_appkeys = "SELECT appkey, plat FROM saas_server.d_appkey WHERE enable = 1 AND appkey NOT IN(SELECT DISTINCT(appkey) FROM saas_server.d_log_collector) AND plat IN ('android', 'ios', 'h5')"
    # All group ids currently known to the collector table.
    sql_groupids = "SELECT DISTINCT(group_id) FROM saas_server.d_log_collector"
    # Parameterized so that quoting in appkey/plat cannot break the statement.
    sql_insert = "INSERT INTO saas_server.d_log_collector (insert_tm , group_id, client_id, appkey, plat, logpath, enable) VALUES(NOW(), %s, %s, %s, %s, %s, 1)"

    client = MysqlClient("saas_server")
    try:
        con, cur = client.connection
        cur.execute(sql_groupids)
        groupids = set(item[0] for item in cur.fetchall())
        # Apply the skip list once instead of once per appkey.
        target_groupids = [g for g in groupids if g not in execute_groupid]

        cur.execute(sql_appkeys)
        # fetchall() materializes the rows, so the cursor can be reused below.
        for appkey, plat in cur.fetchall():
            for groupid in target_groupids:
                # client_id is left empty; logpath is set to the platform.
                params = (groupid, '', appkey, plat, plat)
                print("%s %r" % (sql_insert, params))
                cur.execute(sql_insert, params)
        con.commit()
    finally:
        # Release the connection even when a statement fails.
        client.closeMysql()
class CrawlProxy(object):
    """Interactive crawler for the free-proxy list at ip.jiangxianli.com."""

    def __init__(self):
        self.mysql = MysqlClient()
        self.verify = VerifyProxy()

    def get_free_proxy(self):
        """Crawl a user-chosen number of pages and store proxies that verify.

        Prompts for the page count, scrapes every table row of each page,
        verifies the proxy and saves it to MySQL when the status is ``'1'``.
        """
        page_count = int(input("请输入需爬取的页数:"))
        for page in range(page_count):
            page_source = requests.get(
                f'https://ip.jiangxianli.com/?page={page}').text
            rows = etree.HTML(page_source).xpath(
                '//*[@class="layui-table"]/tbody/tr')
            # Pause between page downloads.
            time.sleep(1)
            for row in rows:
                # Columns read: 1 = ip, 2 = port, 3 = anonymity, 4 = scheme.
                scheme = row.xpath('./td[4]/text()')[0].lower()
                ip = row.xpath('./td[1]/text()')[0]
                port = row.xpath('./td[2]/text()')[0]
                anonymity = row.xpath('./td[3]/text()')[0]

                outcome = self.verify.verify_proxy(scheme, ip, port, anonymity)
                if outcome['status'] != '1':
                    print(f'代理{ip}链接测试未通过')
                    continue

                self.mysql.add_proxy({
                    "scheme": scheme,
                    "ip": ip,
                    "port": port,
                    "Anonymous_degrees": anonymity,
                    "status": outcome["status"],
                    "response_time": outcome["response_time"],
                })
                print(f"代理{ip}链接测试已通过,已保存Mysql")
def get_appkeys(self, ty):
    """Return the list of appkeys (data types) handled for job type ``ty``.

    :param ty: job type. ``"transformsaaslogs"`` and ``"transformh5"`` read
        the appkeys from the ``saas_meta`` database; ``"feeling_H5"`` and
        ``"transformNew"`` use fixed lists; anything else yields ``[]``.
    :return: list of appkey strings. Database failures are logged and do not
        raise; for ``"transformsaaslogs"`` the list always contains ``"hbtv"``.
    """
    datatype_list = []
    # collectorsaas
    if ty == "transformsaaslogs":
        try:
            from MysqlClient import MysqlClient
            client = MysqlClient("saas_meta")
            try:
                # Every appkey whose third column is not "h5".
                datatype_list = [item[1] for item in client.getAppkey()
                                 if item[2] != "h5"]
            finally:
                # Close the connection even if the query fails.
                client.closeMysql()
        except Exception:
            import traceback
            # print_exc() already writes the traceback; printing its return
            # value only added a stray "None" line.
            traceback.print_exc()
        if "hbtv" not in datatype_list:
            datatype_list.append("hbtv")
    # collectorsaash5
    elif ty == "transformh5":
        try:
            from MysqlClient import MysqlClient
            client = MysqlClient("saas_meta")
            try:
                datatype_list = [item["appkey"]
                                 for item in client.getAppkey_H5()]
            finally:
                client.closeMysql()
        except Exception:
            import traceback
            traceback.print_exc()
    # collectorh5
    elif ty == "feeling_H5":
        datatype_list.append("feeling_H5")
    elif ty == "transformNew":
        datatype_list += ['guaeng', 'guagua', 'feeling']
    return datatype_list
class FriendMap:
    # Builds "friend maps" from one table: for each run of consecutive records
    # that share the same data[2] value, counts how often pairs of data[0]
    # values occur together, and stores the counts as JSON in the table
    # '<first three characters of tableName>_friendmap'.
    # NOTE(review): presumably data[0] is a person id, data[1] a datetime and
    # data[2] a place - confirm against the restructured table's columns.
    def __init__(self, tableName, idItem, fixedItem, orderBy, maxRange):
        # tableName: source table; maxRange: passed as the seconds argument of
        # datetime.timedelta in calculate(). idItem/fixedItem/orderBy are only
        # stored in self.searchItem by the code visible here.
        self.tableName = tableName
        self.searchItem = (idItem, fixedItem, orderBy)
        self.maxRange = maxRange
        self.mc = MysqlClient()
        self.prepare_environment()
    def prepare_environment(self):
        # Counts the source rows, creates the progress recorder, restructures
        # the source table and makes sure the friend-map table exists.
        print 'Calculating the total work...'
        self.totalNum = self.mc.query('select count(*) from %s'%self.tableName)[0][0]
        self.restructTableName = 'restructed_%s'%self.tableName
        # localDataSet carries the working state of calculate(): the pair
        # counts, the current data[2] value and the records seen so far.
        self.pr = ProcessRecorder(processName = self.tableName, localDataSet = {'nameDict':{}, 'currentPlace':'', 'idListFindingFriends':[]}, total = self.totalNum, warningMessage = 'Calculating friend map of %s'%self.tableName)
        try:
            self.mc.restruct_table(self.tableName, [('node_des',''), ('ac_datetime','')], self.restructTableName)
        except:
            # NOTE(review): every exception is reported as "already
            # restructured"; real failures are hidden by this bare except.
            print 'Storage has been restructed to %s'%self.restructTableName
        self.mc.query('create table if not exists %s_friendmap (k text, v longtext)'%self.tableName[:3])
    def calculate(self):
        # Streams the restructured table starting at self.pr.count and
        # accumulates pair counts per group of equal data[2] values.
        try:
            dataSource = self.mc.data_source('select * from %s limit %s,100'%(self.restructTableName, self.pr.count))
            def add_friend_point(personA, personB):
                # Increment the counter nameDict[personA][personB], creating
                # the nested entries on first use.
                if not self.pr.localDataSet['nameDict'].has_key(personA):
                    self.pr.localDataSet['nameDict'][personA] = {}
                if not self.pr.localDataSet['nameDict'][personA].has_key(personB):
                    self.pr.localDataSet['nameDict'][personA][personB] = 0
                self.pr.localDataSet['nameDict'][personA][personB] += 1
            while 1:
                data = dataSource()
                if data is None: break
                self.pr.add()
                if self.pr.localDataSet['currentPlace'] != data[2]:
                    # data[2] changed: flush the finished group (if any) and
                    # reset the working state for the new one.
                    if self.pr.localDataSet['currentPlace'] != '' and self.pr.localDataSet['nameDict']:
                        self.mc.insert_data('%s_friendmap'%self.tableName[:3], items = [self.pr.localDataSet['currentPlace'], MySQLdb.escape_string(json.dumps(self.pr.localDataSet['nameDict']))])
                    self.pr.localDataSet['currentPlace'] = data[2]
                    self.pr.localDataSet['nameDict'] = {}
                    self.pr.localDataSet['idListFindingFriends'] = []
                for iff in self.pr.localDataSet['idListFindingFriends']:
                    # Earlier records within maxRange seconds count as friends
                    # in both directions; others are dropped from the front.
                    # NOTE(review): this deletes from the list being iterated,
                    # which makes the loop skip elements - confirm intent.
                    if iff[1] - data[1] <= datetime.timedelta(0, self.maxRange):
                        add_friend_point(data[0], iff[0])
                        add_friend_point(iff[0], data[0])
                    else:
                        del self.pr.localDataSet['idListFindingFriends'][0]
                self.pr.localDataSet['idListFindingFriends'].append(data)
        except:
            # store the current process
            print '\nProcess stopped when processing %s'%self.pr.localDataSet['currentPlace']
            traceback.print_exc()
            self.pr.store_process()
        # NOTE(review): if data_source() itself raised, 'data' is unbound here
        # and this line raises NameError - confirm and guard if needed.
        if data is None:
            # Source exhausted: flush the last group.
            self.mc.insert_data('%s_friendmap'%self.tableName[:3], items = [self.pr.localDataSet['currentPlace'], MySQLdb.escape_string(json.dumps(self.pr.localDataSet['nameDict']))])
            print '\nProcessing Finished'
def getAppkey():
    """Return the app appkeys from ``saas_meta``, always including ``"hbtv"``.

    Database or import failures are logged with a traceback and do not raise;
    in that case the result is ``["hbtv"]``.

    :return: list of appkey strings.
    """
    datatype_list = []
    try:
        from MysqlClient import MysqlClient
        client = MysqlClient("saas_meta")
        try:
            datatype_list = [item["appkey"]
                             for item in client.getAppkey_app()]
        finally:
            # Close the connection even if the query fails.
            client.closeMysql()
    except Exception:
        import traceback
        # print_exc() already writes the traceback; printing its return value
        # only added a stray "None" line.
        traceback.print_exc()
    if "hbtv" not in datatype_list:
        datatype_list.append("hbtv")
    return datatype_list
def getAppkey():
    """Return the H5 appkeys from ``saas_meta``.

    Database or import failures are logged with a traceback and do not raise;
    in that case the result is an empty list.

    :return: list of appkey strings.
    """
    datatype_list = []
    try:
        from MysqlClient import MysqlClient
        client = MysqlClient("saas_meta")
        try:
            datatype_list = [item["appkey"]
                             for item in client.getAppkey_H5()]
        finally:
            # Close the connection even if the query fails.
            client.closeMysql()
    except Exception:
        import traceback
        # print_exc() already writes the traceback; printing its return value
        # only added a stray "None" line.
        traceback.print_exc()
    return datatype_list
class CrawlProxy(object):
    """Crawls kuaidaili.com free proxies and stores those that verify."""

    def __init__(self):
        self.mysql = MysqlClient()
        self.verify = VerifyProxy()

    def get_page(self, url, charset):
        """Download ``url`` and return its text decoded with ``charset``."""
        response = requests.get(url, headers=header)
        response.encoding = charset
        return response.text

    def crawl_ip(self, page_num=3, start_page=141):
        """Crawl list pages ``start_page`` .. ``page_num`` and save good proxies.

        :param page_num: last page number to crawl (inclusive).
        :param start_page: first page number to crawl. The default keeps the
            previously hard-coded value 141.
            NOTE(review): with both defaults the page range is empty and
            nothing is crawled - pass ``start_page=1`` to crawl from the
            first page.
        :return: None
        """
        start_url = 'https://www.kuaidaili.com/free/inha/{}/'
        urls = [start_url.format(page)
                for page in range(start_page, page_num + 1)]
        for url in urls:
            print('crawl:', url)
            html = self.get_page(url, 'gb2312')
            if not html:
                continue
            for tr in PyQuery(html)('table tbody tr').items():
                # Columns read: 1 = ip, 2 = port, 4 = scheme.
                scheme = tr.find('td:nth-child(4)').text().lower()
                ip = tr.find('td:nth-child(1)').text()
                port = tr.find('td:nth-child(2)').text()
                print(scheme, ip, port)
                verify_result = self.verify.verify_proxy(scheme, ip, port)
                if verify_result["status"] == '1':
                    proxy = {
                        "scheme": scheme,
                        "ip": ip,
                        "port": port,
                        "status": verify_result["status"],
                        "response_time": verify_result["response_time"],
                    }
                    # Persist the working proxy.
                    self.mysql.add_proxy(proxy)
                    print('代理', ip, '连通测试已通过,已保存 Mysql')
                else:
                    print('代理', ip, '连通测试未通过')
def producter_record(infos):
    """Store producer (parse) records in ``saas_server.product_record``.

    :param infos: mapping ``topic -> {"timestamp": <epoch seconds>,
        "offset": <offset>}``; one row is inserted per topic, tagged with the
        ``product_id`` read from the ``kafka`` configuration section.
    :return: None
    """
    config = GetConfigure()
    product_id = config.get("kafka", "product_id")
    sql_format = "INSERT INTO saas_server.product_record (product_id, topic, tm, offset) VALUES (%s, %s, %s, %s)"

    client = MysqlClient("saas_server")
    try:
        con, cur = client.connection
        for topic, values in infos.items():
            tm = datetime.datetime.fromtimestamp(values["timestamp"])
            cur.execute(sql_format, (product_id, topic, tm, values["offset"]))
        # Single commit: all rows are stored together.
        con.commit()
    finally:
        # Release the connection even when an insert or the commit fails.
        client.closeMysql()
def insert_demojr():
    """Rebuild the demo table ``jhd_demo.jhd_demojr_ios_charge_overall_custom``.

    Drops the table and recreates it from the last 90 days of
    ``ncf.ncf_360_h5_charge_overall_custom``. ``tm``, ``pub`` and ``ver`` are
    copied as they are; every metric column ``c`` becomes
    ``c + FLOOR(c * RAND())``.
    """
    # Metric columns that get the random uplift, in output order.
    metric_columns = (
        "uv", "reg", "charge", "today_reg", "today_charge", "first_charge",
        "reg_charge", "recharge", "charge_7days", "checkfin", "auth", "cert",
        "bankcard", "charge_newer", "dig", "dig_uc", "dig_homepage",
        "dig_jump", "dig_cur", "dig_curpay", "uc_invite", "check_ucfinc",
        "paypage", "charge_success",
    )
    metric_sql = ", ".join(
        "%s+FLOOR(%s * RAND()) %s" % (col, col, col) for col in metric_columns)
    earliest_date = (datetime.datetime.now()
                     - datetime.timedelta(days=90)).strftime("%Y-%m-%d")
    # Generates the same statement text as the former hand-written literal.
    create_sql = (
        "create table jhd_demo.jhd_demojr_ios_charge_overall_custom select "
        "tm, pub, ver, " + metric_sql + " "
        "from ncf.ncf_360_h5_charge_overall_custom where tm >= '%s'"
        % earliest_date)

    client = MysqlClient()
    try:
        con, cur = client.connection
        cur.execute(
            "drop table if exists jhd_demo.jhd_demojr_ios_charge_overall_custom")
        con.commit()
        cur.execute(create_sql)
    finally:
        # Release the connection even when a statement fails.
        client.closeMysql()
def consumer_record(infos):
    """Store consumer records in ``saas_server.custom_record``.

    :param infos: mapping ``topic -> {timestamp -> {"record_num": n}}`` where
        ``timestamp`` is in epoch seconds; one row is inserted per
        (topic, timestamp), tagged with the ``group_id`` read from the
        ``kafka`` configuration section and the current local time.
    :return: None
    """
    time_format = "%Y-%m-%d %H:%M:%S"
    config = GetConfigure()
    group_id = config.get("kafka", "group_id")
    sql_format = "INSERT INTO saas_server.custom_record(group_id, topic, tm, record_num, update_tm) VALUES (%s, %s, %s, %s, %s)"

    client = MysqlClient("saas_server")
    try:
        con, cur = client.connection
        for topic in infos:
            for timestamp in infos[topic]:
                tm = time.strftime(time_format, time.localtime(timestamp))
                update_tm = time.strftime(
                    time_format, time.localtime(time.time()))
                record_num = infos[topic][timestamp]["record_num"]
                cur.execute(sql_format,
                            (group_id, topic, tm, record_num, update_tm))
        # Single commit: all rows are stored together.
        con.commit()
    finally:
        # Release the connection even when an insert or the commit fails.
        client.closeMysql()
class CrawlProxy(object):
    # Crawls free-proxy list pages and stores every listed proxy in MySQL
    # without verifying it.
    def __init__(self):
        self.mysql = MysqlClient()
        self.verify = VerifyProxy()
    def get_page(self, url, charset):
        # Download url and return the body decoded with the given charset.
        response = requests.get(url, headers=header)
        response.encoding = charset
        return response.text
    def crawl_ip(self, page_num=3):
        """
        Fetch proxy IPs (original note: ip3366) from pages 1..page_num and
        save each table row through self.mysql.add_proxy.
        NOTE(review): the note mentions ip3366 but the URL below points to
        kuaidaili.com - confirm which site is intended.
        :param page_num: last page number to crawl (inclusive).
        :return: None
        """
        # Every proxy is stored with status 0, i.e. without a verification.
        verify_result = 0
        # Despite its name this holds today's date (YYYY-MM-DD); it is stored
        # under the "response_time" key below.
        response_time = time.strftime('%Y-%m-%d', time.localtime(time.time()))
        proxy = []
        start_url = 'https://www.kuaidaili.com/free/inha/{}/'
        urls = [start_url.format(page) for page in range(1, page_num + 1)]
        for url in urls:
            print('crawl:', url)
            html = self.get_page(url, 'gb2312')
            if html:
                d = PyQuery(html)
                trs = d('table tbody tr').items()
                for tr in trs:
                    # Columns read: 1 = ip, 2 = port, 4 = scheme.
                    scheme = tr.find('td:nth-child(4)').text().lower()
                    ip = tr.find('td:nth-child(1)').text()
                    port = tr.find('td:nth-child(2)').text()
                    print(scheme, ip, port)
                    #print(response_time)
                    proxy = {
                        "scheme": scheme,
                        "ip": ip,
                        "port": port,
                        "status": verify_result,
                        "response_time": response_time,
                    }
                    self.mysql.add_proxy(proxy)
        # NOTE(review): the triple quote below opens a string literal that is
        # not closed within this block - presumably the start of disabled
        # code; confirm where it ends.
        '''
class VerifyProxy(object):
    """Checks the connectivity of proxies stored in MySQL via httpbin.org."""

    def __init__(self):
        self.mysql = MysqlClient()

    def verify_proxy(self, scheme, ip, port, Anonymous_degrees):
        """Request ``httpbin.org/ip`` through a proxy and time the response.

        :param scheme: URL scheme of the proxy, also used for the probe URL.
        :param ip: proxy host.
        :param port: proxy port (string or integer).
        :param Anonymous_degrees: not used by the check; kept for callers.
        :return: dict with ``response_time`` (milliseconds, 0 on failure) and
            ``status`` (``'1'`` on HTTP 200, else ``'0'``).
        """
        # The f-string accepts an integer port, which plain concatenation did
        # not (the TypeError used to be swallowed silently).
        proxies = {scheme: f'{scheme}://{ip}:{port}'}
        response_time = 0
        status = '0'
        try:
            response = requests.get(
                scheme + '://httpbin.org/ip', proxies=proxies, timeout=5)
            if response.status_code == 200:
                response_time = round(response.elapsed.total_seconds() * 1000)
                status = '1'
                print(f'{proxies}能使用!')
            else:
                print(f'{proxies}不能使用!')
        except Exception:
            # Best effort: any failure just means the proxy is unusable.
            # Unlike the former bare ``except`` this no longer swallows
            # KeyboardInterrupt/SystemExit.
            pass
        return {"response_time": response_time, "status": status}

    def verify_all(self):
        """Re-verify every proxy returned by the database and store the result."""
        results = self.mysql.find_all()
        for result in results:
            res = self.verify_proxy(
                result[1], result[2], result[3], result[4])
            proxy = {
                "id": result[0],
                "scheme": result[1],
                "ip": result[2],
                "port": result[3],
                "Anonymous_degrees": result[4],
                "status": res["status"],
                "response_time": res["response_time"],
            }
            self.mysql.update_proxy(proxy)
        print('检查完毕..')
def update_d_appkey():
    """Refresh ``saas_server.d_appkey`` from the app/account metadata tables.

    Runs one ``REPLACE INTO`` fed by the UNION of two selects:

    * enabled rows of ``saas_meta.d_app`` joined to enabled
      ``saas_meta.d_account`` rows on ``own = name_uid``;
    * all rows of ``customize.d_app`` joined to enabled
      ``customize.d_account`` rows on ``own = name_uid``.

    Both are left-joined to the existing ``saas_server.d_appkey`` rows so an
    existing ``mongo_id`` is kept (otherwise ``MAX(id)`` of
    ``saas_server.d_mongo_server`` is used) and the existing ``enable`` flag
    is multiplied into the new one.
    """
    client = MysqlClient("saas_server")
    con, cur = client.connection
    # sql_1 = '''SELECT a.appkey, a.plat, b.cdkey, a.enable*b.enable FROM (SELECT * FROM saas_meta.d_app) a LEFT JOIN (SELECT * FROM saas_meta.d_account) b on a.own = b.name_uid'''
    # sql_2 = '''SELECT a.appkey, a.plat, b.cdkey, b.enable FROM (SELECT * FROM customize.d_app) a LEFT JOIN (SELECT * FROM customize.d_account) b on a.own = b.name_uid'''
    # Merge into d_appkey.
    # Primary key of saas_server.d_appkey: appkey, plat, mongo_id, cdkey
    # NOTE(review): the connection is not closed if execute/commit raises.
    sql_replace = '''REPLACE INTO saas_server.d_appkey(inserttm, appkey, plat, mongo_id, cdkey, enable) \
    SELECT NOW(), a.appkey, a.plat, \
    CASE WHEN (c.mongo_id IS NULL OR c.mongo_id = '') THEN (SELECT MAX(id) FROM saas_server.d_mongo_server) ELSE c.mongo_id END AS mongo_id, b.cdkey, \
    CASE WHEN c.enable IS NULL THEN a.enable*b.enable ELSE a.enable*b.enable*c.enable END as enable \
    FROM (SELECT * FROM saas_meta.d_app WHERE enable = 1) a \
    LEFT JOIN (SELECT * FROM saas_meta.d_account WHERE enable = 1) b on a.own = b.name_uid \
    LEFT JOIN (SELECT * FROM saas_server.d_appkey) c ON a.appkey = c.appkey AND a.plat = c.plat AND c.cdkey = b.cdkey \
    UNION \
    SELECT NOW(), a.appkey, a.plat, \
    CASE WHEN c.mongo_id IS NULL OR c.mongo_id = '' THEN (SELECT MAX(id) FROM saas_server.d_mongo_server) ELSE c.mongo_id END AS mongo_id, b.cdkey, \
    CASE WHEN c.enable IS NULL THEN b.enable ELSE b.enable*c.enable END as enable \
    FROM (SELECT * FROM customize.d_app) a \
    LEFT JOIN (SELECT * FROM customize.d_account WHERE enable = 1) b on a.own = b.name_uid \
    LEFT JOIN (SELECT * FROM saas_server.d_appkey) c ON a.appkey = c.appkey AND a.plat = c.plat AND c.cdkey = b.cdkey'''
    cur.execute(sql_replace)
    con.commit()
    client.closeMysql()
def __init__(self):
    """Create the MySQL client and keep it on the instance as ``mc``."""
    mysql_client = MysqlClient()
    self.mc = mysql_client
def __init__(self):
    """Create the MySQL client and keep it on the instance as ``mysql``."""
    mysql_client = MysqlClient()
    self.mysql = mysql_client
def __init__(self, tableName, idItem, fixedItem, orderBy, maxRange):
    """Store the settings, open the MySQL client and prepare the environment.

    ``idItem``, ``fixedItem`` and ``orderBy`` are kept together as the tuple
    ``searchItem``; ``prepare_environment()`` runs last, once every attribute
    and the client are in place.
    """
    self.maxRange = maxRange
    self.searchItem = (idItem, fixedItem, orderBy)
    self.tableName = tableName
    mysql_client = MysqlClient()
    self.mc = mysql_client
    self.prepare_environment()
def productor_miss_record(num=None, timestamp_begin=None, timestamp_end=None, tm_begin=None, tm_end=None, last=None):
    """Return the minutes in a time window that have no product record.

    The result feeds the automatic missing-data detection and re-send. The
    window is chosen by the first selector that is not ``None``:

    * ``num`` - the whole day ``num`` days ago (00:00:00 .. 23:59:00), with
      the end capped at ten minutes before now;
    * ``timestamp_begin`` with ``timestamp_end`` or ``last`` - epoch seconds;
    * ``tm_begin`` with ``tm_end`` or ``last`` - ``%Y-%m-%d %H:%M:%S`` strings.

    :param num: number of days back.
    :param timestamp_begin: window start, epoch seconds.
    :param timestamp_end: window end, epoch seconds.
    :param tm_begin: window start as a string.
    :param tm_end: window end as a string.
    :param last: window length in minutes, used when no end is given.
    :return: ``get_tms(begin, end)`` minus the set of minutes (formatted
        ``%Y-%m-%d %H:%M:00``) found in ``saas_server.product_record`` for the
        configured ``product_id``.
    :raises Exception: ``"kwargs error!"`` when no usable selector is given.
    """
    time_format = "%Y-%m-%d %H:%M:%S"
    config = GetConfigure()
    product_id = config.get("kafka", "product_id")

    if num is not None:
        # One clock reading for both ends, so they always name the same day.
        day = time.strftime(
            "%Y-%m-%d", time.localtime(time.time() - 86400 * num))
        _tm_begin = day + " 00:00:00"
        _tm_now = time.strftime(
            time_format, time.localtime(time.time() - 60 * 10))
        # Same-format strings compare chronologically.
        _tm_end = min(day + " 23:59:00", _tm_now)
    elif timestamp_begin is not None:
        # Full date-time resolution: formatting these as a bare date threw
        # away the time of day and made the ``last`` (minutes) window
        # meaningless.
        _tm_begin = time.strftime(time_format, time.localtime(timestamp_begin))
        if timestamp_end is not None:
            _tm_end = time.strftime(time_format, time.localtime(timestamp_end))
        elif last is not None:
            _tm_end = time.strftime(
                time_format, time.localtime(timestamp_begin + last * 60))
        else:
            raise Exception("kwargs error!")
    elif tm_begin is not None:
        _tm_begin = tm_begin
        if tm_end is not None:
            _tm_end = tm_end
        elif last is not None:
            _tm_end = (
                datetime.datetime.strptime(tm_begin, time_format)
                + datetime.timedelta(minutes=last)).strftime(time_format)
        else:
            raise Exception("kwargs error!")
    else:
        # No selector at all used to fail later with an UnboundLocalError.
        raise Exception("kwargs error!")

    # Parameterized like the other product_record statements in this file.
    sql = ("SELECT tm FROM saas_server.product_record "
           "WHERE (tm BETWEEN %s AND %s) AND product_id = %s "
           "GROUP BY tm ORDER BY tm DESC")
    client = MysqlClient("saas_server")
    try:
        con, cur = client.connection
        cur.execute(sql, (_tm_begin, _tm_end, product_id))
        tms = set(item[0].strftime("%Y-%m-%d %H:%M") + ":00"
                  for item in cur.fetchall())
    finally:
        # The connection used to be left open by this function.
        client.closeMysql()

    print("mysql", sorted(tms))
    return get_tms(_tm_begin, _tm_end) - tms
def __init__(self):
    """Create the MySQL client and the proxy verifier used by the crawler."""
    mysql_client = MysqlClient()
    self.mysql = mysql_client
    proxy_verifier = VerifyProxy()
    self.verify = proxy_verifier
# -*- coding: utf-8 -*-
# Manual check: insert one sample row into saas_server.product_record.
from MysqlClient import MysqlClient
import datetime
import time

sql_format = "INSERT INTO saas_server.product_record (topic, tm, offset) VALUES (%s, %s, %s)"

client = MysqlClient("saas_server")
con, cur = client.connection

# Sample record: fixed topic and offset, stamped with the current local time.
topic, offset = "test", 666
tm = datetime.datetime.fromtimestamp(time.time())

cur.execute(sql_format, (topic, tm, offset))
con.commit()
client.closeMysql()