def merge(start, end):
    """Sum QuestMobile APP metrics per (app_name, starting_date) and insert them.

    Rows between *start* and *end* are aggregated in memory, then written to
    questmobile_merge in a single batch.
    """
    # NOTE(review): start/end are spliced into the SQL text with %-formatting;
    # presumably they are trusted date strings — confirm query() offers no
    # parameter binding before leaving this as-is.
    sql = (
        "SELECT app_name, sector, sub_sector, os, active_users, daily_active_user,"
        " total_use_time, starting_date FROM app WHERE source = 'QuestMobile' AND data_type='APP'"
        " AND starting_date >= '%s' AND starting_date <= '%s'") % (start, end)
    # key -> [app_name, sector, sub_sector, starting_date, mau, dau, time]
    merged = {}
    for row in query(sql):
        key = row["app_name"] + str(row["starting_date"])
        if key in merged:
            entry = merged[key]
            entry[4] += row["active_users"]
            entry[5] += row["daily_active_user"]
            entry[6] += row["total_use_time"]
        else:
            merged[key] = [
                row["app_name"], row["sector"], row["sub_sector"],
                row["starting_date"], row["active_users"],
                row["daily_active_user"], row["total_use_time"]]
    insert_sql = (
        "INSERT INTO questmobile_merge(app_name, sector, sub_sector, starting_date,"
        " mau, dau, time) VALUES(%s, %s, %s, %s, %s, %s, %s)")
    insert_batch(insert_sql, list(merged.values()))
def main(fpath, createddate, pattern=None):
    """Load soufang CSV exports, drop filtered rows and duplicate URLs, insert.

    Each file's header line is skipped; *createddate* is appended to every row.
    """
    sql = (
        "INSERT INTO soufang(title, serial_num, AREA, release_date, is_individual, city, "
        " url, picture_num, Tag, capture_datetime, createddate) "
        " VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
    for filename in list_files(fpath, pattern):
        with open(filename, "r", encoding="utf8") as a_file:
            next(a_file)  # skip the header row
            insert_list, seen_urls = [], set()
            for line in a_file:
                fields = line.split(",")
                # keep only rows that pass the project-level filter
                if filters(fields):
                    fields.append(createddate)
                    url = fields[6]
                    # de-duplicate on URL, first occurrence wins
                    if url not in seen_urls:
                        insert_list.append(fields)
                        seen_urls.add(url)
            print("总共有:" + str(len(insert_list)) + "条记录")
            insert_batch(sql, insert_list)
def gj_haoche(path, pattern=None):
    """Import Ganji Haoche CSV files in batches of 100000 rows.

    Uses ON DUPLICATE KEY UPDATE so re-runs refresh price/updatedAt in place.
    """
    sql = ("INSERT INTO Haoche(city,title,carddate,mileage,price,newcarprice,code,checker,"
           " checkdate,url,createdAt,updatedAt) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
           " ON DUPLICATE KEY UPDATE updatedAt=VALUES(updatedAt), price=VALUES(price)")
    for file_name in list_files(path, pattern):
        with open(file_name, "r", encoding="utf8") as a_file:
            print(file_name + " starts to insert...")
            insert_list, row_count = [], 0
            for line in a_file:
                fields = line.strip().split(",")
                if len(fields) != 12:
                    continue
                row_count += 1
                # registration month; the column may be empty
                carddate = datetime.strptime(fields[2], "%Y-%m") if fields[2] else None
                # NOTE(review): createdAt and updatedAt both come from fields[10]
                # and fields[11] is never used — confirm this is intentional.
                insert_list.append([
                    fields[0], fields[1], carddate, fields[3], fields[4],
                    fields[5], fields[6], fields[7], fields[8], fields[9],
                    fields[10], fields[10]])
                # flush every 100000 rows to bound memory
                if row_count >= 100000:
                    insert_batch(sql, insert_list)
                    print("100000 inserted...")
                    insert_list, row_count = [], 0
            # insert whatever is left below the 100000 threshold
            insert_batch(sql, insert_list)
            print("Last ", len(insert_list), "inserted...")
def main(path, pattern=None):
    """Import Haoche CSVs, stamping rows with the date parsed from the file name."""
    sql = (
        "INSERT INTO Haoche(city,title,carddate,mileage,price,newcarprice,code,checker,"
        " checkdate,url,createdAt,updatedAt) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        " ON DUPLICATE KEY UPDATE updatedAt=VALUES(updatedAt), price=VALUES(price)")
    for file_name in list_files(path, pattern):
        with open(file_name, "r", encoding="utf8") as a_file:
            # snapshot date is encoded in the file name
            match = re.search(r"(\d{4}-\d{2}-\d{2})", file_name)
            if match:
                updatedAt = match.group(0)
            # NOTE(review): if a name carries no date, updatedAt is unbound
            # (first file) or stale (later files) — confirm names always match.
            next(a_file)  # skip the header row
            insert_list = []
            for line in a_file:
                fields = line.strip().split(",")
                if len(fields) != 12:
                    continue
                # registration month; the column may be empty
                carddate = datetime.strptime(fields[2], "%Y-%m") if fields[2] else None
                city, title = fields[0], fields[1]
                mileage, price, newcarprice = fields[3], fields[4], fields[5]
                code, checker, checkdate, url = fields[6], fields[7], fields[8], fields[9]
                # drop rows the project-level filter rejects
                if filters(url, code, checkdate):
                    insert_list.append([
                        city, title, carddate, mileage, price, newcarprice,
                        code, checker, checkdate, url, updatedAt, updatedAt])
            # filtering done — write the whole file in one batch
            insert_batch(sql, insert_list)
def kuaidi(path, pattern=None):
    """Import kuaidi driver-location CSVs in batches of 100000 rows."""
    sql = (
        "INSERT INTO kuaidi(capture_dtm, city, longitude_request, latitude_request, "
        " longitude_car, latitude_car, driver_id, driver_type, car_type, create_dt, flag_dt)"
        " values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
    for file_name in list_files(path, pattern):
        with open(file_name, "r", encoding="utf8") as a_file:
            print(file_name + " starts to insert....")
            insert_list, row_count = [], 0
            for line in a_file:
                fields = line.strip().split(",")
                if len(fields) != 10:
                    continue
                row_count += 1
                capture_dtm = fields[0]
                # normalise the city name: keep the first two chars, then strip
                # digits, commas, direction/suffix chars and line breaks
                city = fields[2][:2] + re.sub(r"\d|,|市|南|北|上|下|\r|\n", "", fields[2][2:])
                insert_list.append([
                    capture_dtm, city, fields[3], fields[4], fields[5],
                    fields[6], fields[7], fields[8], fields[9], TODAY,
                    capture_dtm.split(" ")[0]])
                # flush every 100000 rows to bound memory
                if row_count > 100000:
                    insert_batch(sql, insert_list)
                    insert_list, row_count = [], 0
                    print("100000 inserted...")
            # insert whatever is left below the threshold
            insert_batch(sql, insert_list)
            print("Last ", len(insert_list), "inserted...")
def local_ganji(fpath, pattern=None):
    """Parse a Ganji MySQL dump file, de-duplicate the rows and insert them.

    The dump mixes an older 14-column layout with a newer 15-column one (an
    extra column was added starting Nov 4); old rows are padded with None.
    """
    for filename in list_files(fpath, pattern):
        sql = (
            "INSERT INTO {table} VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,"
            " %s, %s, %s, %s)")
        # matches one quoted tuple inside a dump INSERT statement
        regex = re.compile("\('([^\)]+)\)[,;]+")
        a_dict = {}  # de-duplication map: key -> fields
        with open(filename, "r", encoding="utf8") as a_file:
            print(filename + " starts to insert...", flush=True)
            for line in a_file:
                # table name comes from the LOCK TABLES statement
                if line.startswith("LOCK TABLES"):
                    table = re.search(r"`(.+)`", line).group(0).strip("`")
                    sql = sql.format(table=table)
                if not line.startswith("INSERT INTO"):
                    continue
                for row in regex.findall(line):
                    fields = row.replace("'", "").split(",")
                    # drop tt == 1 records (cuts the volume substantially)
                    if fields[9] == "1":
                        continue
                    if len(fields) == 14:
                        # old layout: pad the column added on Nov 4
                        fields.insert(11, None)
                        # key: mid - fields[0], createdDate - fields[14]
                        key = fields[0] + "-" + fields[14] + "-" + fields[9]
                        if key not in a_dict:
                            a_dict[key] = fields
                    elif len(fields) == 15:
                        # key: puid - fields[11], createdDate - fields[14]
                        key = fields[11] + "-" + fields[14] + "-" + fields[9]
                        if key not in a_dict:
                            a_dict[key] = fields
        insert_batch(sql, list(a_dict.values()))
        print("Last ", len(a_dict), " inserted...", flush=True)
def ershoufang(fpath, pattern=None):
    """Parse an ershoufang MySQL dump, resolve puid, de-duplicate and insert.

    Fixes: the entinfo fallback used to run the identical ``re.search`` twice
    and strip the prefix from the whole match; it now reuses the first match's
    capture group. It also rebound the loop variable ``match`` while iterating
    over it; the inner result now has its own name.
    """
    for filename in list_files(fpath, pattern):
        sql = (
            "INSERT INTO {table} VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,"
            " %s, %s, %s, %s)")
        # matches one tuple inside a dump INSERT statement
        regex = re.compile("\(([^\)]+)\)[,;]+")
        a_dict = {}  # de-duplication map: key -> fields
        with open(filename, "r", encoding="utf8") as a_file:
            print(filename + " starts to insert...", flush=True)
            for line in a_file:
                # table name comes from the LOCK TABLES statement
                if line.startswith("LOCK TABLES"):
                    table = re.search(r"`(.+)`", line).group(0).strip("`")
                    sql = sql.format(table=table)
                    print(sql)
                if not line.startswith("INSERT INTO"):
                    continue
                for row in regex.findall(line):
                    fields = row.replace("'", "").split(",")
                    # discard records the tuple regex failed to parse (rare)
                    if len(fields) != 15:
                        print(fields, flush=True)
                        continue
                    # drop tt == 1 records (cuts the volume substantially)
                    if fields[9] == "1":
                        continue
                    if fields[11] == "NULL":
                        # recover puid from the URL's entinfo parameter
                        entinfo = re.search("&entinfo=(\d+)", fields[8])
                        if entinfo:
                            puid = entinfo.group(1)
                        else:
                            print("puid和URL都为空")
                            continue
                    else:
                        puid = fields[11]
                    # de-dupe on puid + createdDate (fields[14]) + tt (fields[9])
                    key = puid + "-" + fields[14] + "-" + fields[9]
                    if key not in a_dict:
                        a_dict[key] = fields
        insert_batch(sql, list(a_dict.values()))
        print("Last ", len(a_dict), " inserted...", flush=True)
def wuba_job(fpath, pattern=None):
    """Parse a 58 job MySQL dump, filter merchant records, de-dupe and insert.

    Fix: rows are length-validated before any field is indexed; previously the
    merchant filter read fields[0]/fields[9] on rows that had not been checked,
    so a misparsed tuple could be filtered on misaligned columns.
    """
    for filename in list_files(fpath, pattern):
        sql = (
            "INSERT INTO {table} VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,"
            " %s, %s, %s, %s)")
        # matches one tuple inside a dump INSERT statement
        regex = re.compile("\(([^\)]+)\)[,;]+")
        merchants_ids_set = get_merchant_ids()
        print(len(merchants_ids_set))
        a_dict = {}  # de-duplication map: key -> fields
        with open(filename, "r", encoding="utf8") as a_file:
            print(filename + " starts to insert...", flush=True)
            insert_list, line_count = [], 0
            for line in a_file:
                # table name comes from the LOCK TABLES statement
                if line.startswith("LOCK TABLES"):
                    table = re.search(r"`(.+)`", line).group(0).strip("`")
                    sql = sql.format(table=table)
                    print(sql)
                if not line.startswith("INSERT INTO"):
                    continue
                line_count += 1
                for row in regex.findall(line):
                    fields = row.replace("'", "").split(",")
                    # discard records the tuple regex failed to parse (rare)
                    if len(fields) != 15:
                        print(fields, flush=True)
                        continue
                    # skip known merchants with tt == 1
                    if fields[0] in merchants_ids_set and fields[9] == "1":
                        continue
                    # de-dupe on puid (fields[11]) + createdDate (fields[14]) + tt
                    key = fields[11] + "-" + fields[14] + "-" + fields[9]
                    if key in a_dict:
                        continue
                    a_dict[key] = fields
                    insert_list.append(fields)
                # flush every 3000 INSERT lines to bound memory
                if line_count >= 3000:
                    print("To insert " + str(len(insert_list)) + " records")
                    insert_batch(sql, insert_list)
                    insert_list, line_count = [], 0
            insert_batch(sql, insert_list)
            print("Last ", len(insert_list), " inserted...", flush=True)
def main(fpath, pattern=None):
    """Bulk-load cheyipai B2B auction CSVs (header skipped, no filtering)."""
    for filename in list_files(fpath, pattern):
        sql = (
            "INSERT INTO cheyipai_b2b(auc_Id, car_Id, brand, manufacturer, model, final_offer,"
            " winner_id, winner_id_nonlocal, mileage, reg_dt, reg_area,reg_area_total, "
            " car_source_id, root_id, root_name, capture_dtm)"
            " values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
        with open(filename, "r", encoding="utf8") as a_file:
            print(filename)
            next(a_file)  # skip the header row
            # NOTE(review): line.split(",") keeps the trailing newline on the
            # last field — confirm downstream tolerates it.
            rows = [line.split(",") for line in a_file]
            insert_batch(sql, rows)
def main(fpath, capture_dt, pattern=None):
    """Load iwjw listing CSVs, mapping city names and de-duplicating on URL."""
    sql = ("INSERT INTO iwjw(url, city, NAME, pics, video, capture_dt) "
           " VALUES(%s, %s, %s, %s, %s, %s)")
    for filename in list_files(fpath, pattern):
        with open(filename, "r", encoding="utf8") as a_file:
            next(a_file)  # skip the header row
            insert_list, seen = [], set()
            for line in a_file:
                fields = line.split(",")[:5]
                url = fields[0]
                # de-duplicate on URL, first occurrence wins
                if url in seen:
                    continue
                # translate the raw city name via the lookup table
                fields[1] = CITY_DICT.get(fields[1])
                fields.append(capture_dt)
                insert_list.append(fields)
                seen.add(url)
            insert_batch(sql, insert_list)
def new_version(path, table_name, pattern=None):
    """Import driver-status CSVs into *table_name*, stamped with the file's date."""
    new_sql = (
        "INSERT INTO " + table_name
        + "(driver_id, city, status, driver_type, flag_dt) values(%s, %s, %s, %s, %s)")
    for file_name in list_files(path, pattern):
        # snapshot date is encoded in the file name
        match = re.search(r"(\d{4}-\d{2}-\d{2})", file_name)
        if match:
            starting_date = match.group(0)
        # NOTE(review): a file name without a date reuses the previous file's
        # starting_date (or raises NameError on the first file) — confirm
        # the names always match.
        with open(file_name, "r", encoding="utf8") as a_file:
            print(file_name + " starts to insert....", flush=True)
            insert_list = []
            for line in a_file:
                fields = line.strip().split(",")
                if len(fields) != 4:
                    continue
                # normalise the city name: keep the first two chars, then
                # strip digits, commas, direction/suffix chars and line breaks
                fields[1] = fields[1][:2] + re.sub(r"\d|,|市|南|北|上|下|\r|\n", "", fields[1][2:])
                fields.append(starting_date)
                insert_list.append(fields)
            insert_batch(new_sql, insert_list)
def main(fpath, capture_dt, pattern=None):
    """Load lianjia listing CSVs, deriving the city from the listing URL."""
    sql = (
        "INSERT INTO lianjia(id, url, title, pics, is_exclusive, community, house_type, AREA, "
        " orientation, price, down_payment, monthly_payment, deals_done, agent_comments, "
        " customer_visits, route, city, capture_dt) "
        " VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
    for filename in list_files(fpath, pattern):
        with open(filename, "r", encoding="utf8") as a_file:
            next(a_file)  # skip the header row
            insert_list = []
            for line in a_file:
                fields = line.split(",")
                if len(fields) != FIELDS_LEN:
                    continue
                # chars 7:9 of the URL — presumably the city code in the
                # host part; confirm against the CITYS lookup table
                city = CITYS.get(fields[1][7:9])
                fields.extend([city, capture_dt])
                insert_list.append(fields)
            print("总共有:" + str(len(insert_list)) + "条记录")
            insert_batch(sql, insert_list)
def insert_ayi(a_path, pattern=None):
    """Import ayi CSVs: register new ayis (entrytime inferred) plus daily details.

    Known ayis keep their stored entrytime; for a new uid the entrytime is
    estimated as capturedate minus one day per completed service.
    """
    # uid -> entrytime for ayis already in the database
    ayi_dict = {row["uid"]: row["entrytime"] for row in query(AYI_SELECT_SQL)}
    for file_name in list_files(a_path, pattern):
        logging.debug("读取文件" + file_name)
        with open(file_name, "r", encoding="utf8") as a_file:
            next(a_file)  # skip the header row
            new_ayis, new_details = [], []
            for line in a_file:
                fields = line.strip().split(",")
                if len(fields) != FIELDS_LEN:
                    continue
                (address, age, comment, commentcountofthismonth, distance,
                 distancewithunit, entrytime, goodrate, idcard, mobile, name,
                 pic, province, servicecount, servicecountofthismonth, sex,
                 star, uid, valid, workage, worktime, city, capturedate,
                 appointmentdate) = fields
                # String -> Date
                capturedate = datetime.strptime(capturedate, "%Y-%m-%d").date()
                if uid not in ayi_dict:
                    # new ayi: estimate entrytime from the service count
                    entrytime = capturedate - timedelta(days=int(servicecount))
                    new_ayis.append([
                        uid, province, sex, city, name, mobile, str(entrytime),
                        idcard, age, pic])
                    ayi_dict[uid] = entrytime
                # always use the recorded entrytime, not the CSV's
                entrytime = ayi_dict[uid]
                new_details.append([
                    address, age, comment, commentcountofthismonth, distance,
                    distancewithunit, entrytime, goodrate, idcard, mobile,
                    name, pic, province, servicecount, servicecountofthismonth,
                    sex, star, uid, valid, workage, worktime, city,
                    capturedate, appointmentdate])
            logging.debug(" 今日新阿姨数量: " + str(len(new_ayis)))
            insert_batch(AYI_INSERT_SQL, new_ayis)
            insert_batch(DETAILS_INSERT_SQL, new_details)
def read_sql_file(path):
    """Parse *.sql dump files and upsert the category/sales tuples per INSERT line."""
    for filename in list_files(path, "*.sql"):
        sql = ("INSERT INTO {table}(cate1, cate2, cate3, cate4, cate5, cate6, cate7, time, plat,"
               " name, sales, amount, deals) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
               "ON DUPLICATE KEY UPDATE sales=VALUES(sales), amount=VALUES(amount), deals=VALUES(deals)")
        # one tuple: ten quoted strings followed by three bare integers
        regex = re.compile("\(((\'[^\']+\',){10}\d+,\d+,\d+)\)")
        with open(filename, "r", encoding="utf8") as a_file:
            for line in a_file:
                insert_list = []
                # table name comes from the LOCK TABLES statement
                if line.startswith("LOCK TABLES"):
                    table = re.search(r"`(.+)`", line).group(0).strip("`")
                    sql = sql.format(table=table)
                    print(sql)
                if line.startswith("INSERT INTO"):
                    for row in regex.findall(line):
                        fields = row[0].replace("'", "").split(",")
                        if len(fields) == 13:
                            insert_list.append(fields)
                    insert_batch(sql, insert_list)
        print(filename + " 更新完毕。。。")
def main(fpath, pattern=None):
    """Parse a MySQL dump, drop tt == 1 records, insert in 2000-line batches."""
    for filename in list_files(fpath, pattern):
        sql = (
            "INSERT INTO {table} VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,"
            " %s, %s, %s, %s)")
        # matches one tuple inside a dump INSERT statement
        regex = re.compile("\(([^\)]+)\)[,;]+")
        with open(filename, "r", encoding="utf8") as a_file:
            print(filename + " starts to insert...", flush=True)
            insert_list, line_count = [], 0
            for line in a_file:
                # table name comes from the LOCK TABLES statement
                if line.startswith("LOCK TABLES"):
                    table = re.search(r"`(.+)`", line).group(0).strip("`")
                    sql = sql.format(table=table)
                    print(sql)
                if not line.startswith("INSERT INTO"):
                    continue
                line_count += 1
                for row in regex.findall(line):
                    fields = row.replace("'", "").split(",")
                    # discard records the tuple regex failed to parse (rare)
                    if len(fields) != 15:
                        print(fields, flush=True)
                        continue
                    # drop tt == 1 records (cuts the volume substantially)
                    if fields[9] == "1":
                        continue
                    insert_list.append(fields)
                # flush every 2000 INSERT lines to bound memory
                if line_count >= 2000:
                    print("To insert " + str(len(insert_list)) + " records")
                    insert_batch(sql, insert_list)
                    insert_list, line_count = [], 0
            insert_batch(sql, insert_list)
            print("Last ", len(insert_list), " inserted...", flush=True)
def wuba_ayi(path):
    """Import 58 ayi CSVs in batches of 100000 rows.

    The CSV column order already matches the INSERT column order, so rows are
    appended directly instead of being unpacked and re-packed field by field.
    """
    sql = ("INSERT INTO 58_ayi(address, age, comment, commentcountofthismonth, distance, "
           " distancewithunit, entrytime, goodrate, idcard, mobile, NAME, pic, province, "
           " servicecount, servicecountofthismonth, sex, star, uid, valid, workage, worktime, "
           " city, capturedate, appointmentdate) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, "
           " %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
    for file_name in list_files(path):
        with open(file_name, "r", encoding="utf8") as a_file:
            print(file_name + " starts to insert...")
            insert_list, row_count = [], 0
            next(a_file)  # skip the header row
            for line in a_file:
                fields = line.strip().split(",")
                if len(fields) != 24:
                    continue
                row_count += 1
                insert_list.append(fields)
                # flush every 100000 rows to bound memory
                if row_count >= 100000:
                    print("100000 inserted...")
                    insert_batch(sql, insert_list)
                    insert_list, row_count = [], 0
            # insert whatever is left below the threshold
            print("Last ", len(insert_list), "inserted...")
            insert_batch(sql, insert_list)
def didizhuanche(path, table_name, pattern=None):
    """Import didi zhuanche driver snapshots, flushing every 100000 source rows.

    Refactor: the three byte-identical record-extraction blocks are collapsed
    into the ``_row`` helper. A 14-field line holds one record at offset 0; a
    27-field line holds two concatenated records at offsets 0 and 13.
    """
    sql = (
        "INSERT INTO " + table_name + "(driver_name, longitude, latitude, "
        " cnt_order, license, car_type, driver_id, capture_dtm, city, created_dt, flag_dt) values("
        " %s,%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")

    def _row(fields, base):
        # Build one insert row from the record starting at offset *base*.
        capture_dtm = fields[base + 12]
        raw_city = fields[base + 13]
        # normalise the city name: keep the first two chars, then strip
        # digits, commas, direction/suffix chars and line breaks
        city = raw_city[:2] + re.sub(r"\d|,|市|南|北|上|下|\r|\n", "", raw_city[2:])
        return [
            fields[base + 1], fields[base + 3], fields[base + 4],
            fields[base + 8], fields[base + 9], fields[base + 10],
            fields[base + 11], capture_dtm, city, TODAY,
            capture_dtm.split(" ")[0]]

    for file_name in list_files(path, pattern):
        with open(file_name, "r", encoding="utf8") as a_file:
            print(file_name + " starts to insert....", flush=True)
            insert_list, row_count = [], 0
            for line in a_file:
                fields = line.strip().split("\t")
                if len(fields) == 14:
                    row_count += 1
                    insert_list.append(_row(fields, 0))
                elif len(fields) == 27:
                    row_count += 1
                    insert_list.append(_row(fields, 0))
                    insert_list.append(_row(fields, 13))
                else:
                    continue
                # flush every 100000 source rows to bound memory
                if row_count > 100000:
                    print("100000 inserted...", flush=True)
                    insert_batch(sql, insert_list)
                    insert_list, row_count = [], 0
            # insert whatever is left below the threshold
            print("Last ", len(insert_list), "inserted...", flush=True)
            insert_batch(sql, insert_list)
def main(path, pattern=None):
    """Load Appannie monthly iOS ranking CSVs into app_download.

    The snapshot month comes from the file name; each CSV row is reordered
    into the app_download column order (see the inline column comments).
    """
    insert_sql = (
        "INSERT INTO app_download(app_id, app_name, app_url, app_iap, region, os,"
        " device, source, store, sector, sub_sector, type, rank, starting_date, date_type,"
        " download_or_revenue, unit, app_average_price_usd, avg_rating_all, app_version, "
        " app_release_date, publisher_id, publisher_name, company_name, parent_company_name)"
        " values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,"
        " %s, %s, %s, %s, %s, %s)")
    for filename in list_files(path, pattern):
        # first day of the month encoded in the file name
        match = re.search(r"(\d{4}-\d{2})", filename)
        if match:
            starting_date = match.group(0) + "-01"
        all_lines = []
        with open(filename, encoding="utf8") as a_csv:
            spam_reader = csv.reader(a_csv, delimiter=",")
            next(spam_reader)  # skip the title row
            for row in spam_reader:
                all_lines.append([
                    row[8],         # app_id
                    row[9],         # app_name
                    row[13],        # app_url
                    row[14],        # app_iap
                    row[5],         # region
                    "iOS",          # os
                    row[3],         # device
                    "Appannie",     # source
                    "Apple",        # store
                    row[1],         # sector
                    row[15],        # sub_sector
                    row[4],         # type
                    row[0],         # rank
                    starting_date,  # starting_date
                    "30",           # date_type
                    row[10],        # download_or_revenue
                    row[11],        # unit
                    row[19],        # app_average_price_usd
                    row[27],        # avg_rating_all
                    row[7],         # app_version
                    row[20],        # app_release_date
                    row[21],        # publisher_id
                    row[22],        # publisher_name
                    row[23],        # company_name
                    row[24],        # parent_company_name
                ])
        insert_batch(insert_sql, all_lines)
        print(filename + "*************************Done.")
def main(new_date, old_date):
    """Compute MAU/DAU/Time growth between two snapshots and store the results.

    Fixes: the DAU pass now requires ``old_value > 0`` like the MAU and Time
    passes already did, so a zero baseline can no longer raise
    ZeroDivisionError. The three near-identical passes are deduplicated into
    the ``_growth_lines`` helper.
    """
    old = fetch_data(old_date)
    new = fetch_data(new_date)

    def _growth_lines(banner, new_dict, old_dict, type_label):
        # Collect [app, sector, sub_sector, type, growth, new, old, date,
        # source] rows for every app that grew against a positive baseline,
        # sorted by growth descending.
        print(banner)
        lines = []
        for key, value in new_dict.items():
            parts = new.sector_dict.get(key).split("-")
            sector, sub_sector = parts[0], parts[1]
            if key not in old_dict:
                continue
            old_value = old_dict.get(key)
            # only positive growth over a positive baseline (avoids div-by-zero)
            if value - old_value > 0 and old_value > 0:
                percent = (value - old_value) / old_value
                print("%s\t%s\t%s\t%s\t%s\t%s" % (
                    key, sector, sub_sector, percent, value, old_value))
                lines.append([
                    key, sector, sub_sector, type_label, percent, value,
                    old_value, new_date, "QuestMobile"])
        # sort by the growth percentage, largest first
        return sorted(lines, key=lambda l: l[4], reverse=True)

    mau_lines = _growth_lines(
        "*******************************MAU*************************************",
        new.mau_dict, old.mau_dict, "MAU")
    dau_lines = _growth_lines(
        "*******************************DAU*************************************",
        new.dau_dict, old.dau_dict, "DAU")
    time_lines = _growth_lines(
        "*******************************TIME*************************************",
        new.time_dict, old.time_dict, "Time")

    # insert calculation into database
    insert_sql = (
        "INSERT INTO top_growth_apps(app_name, sector, sub_sector, type, growth,"
        " new_data, old_data, starting_date, source) values(%s, %s, %s, %s, %s, %s, %s, %s, %s)")
    insert_batch(insert_sql, mau_lines + time_lines + dau_lines)
def didi(path):
    """Import didi driver snapshot files (tab separated, several line layouts),
    flushing to the database every 100000 source lines.

    Layouts: 10/11/12 fields hold one record (with or without a company
    column); 19/20/21 fields hold two concatenated records, where the second
    record's driver_id is glued onto the first record's city field and is
    recovered with a digit regex.

    Refactor: the seven near-identical extraction blocks are collapsed into
    the ``_record`` helper; the leftover per-line debug ``print(line)`` was
    removed (it dominated runtime on large files).
    """
    sql = ("INSERT INTO didi(driver_id, driver_name, license, longitude, latitude, "
           " order_num, company, capture_dtm, city, flag_dt, createddate)"
           " values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")

    def _record(driver_id, driver_name, license_no, longitude, latitude,
                order_num, company, capture_dtm, city):
        # One insert row; flag_dt is the date part of the capture timestamp.
        return [driver_id, driver_name, license_no, longitude, latitude,
                order_num, company, capture_dtm, city,
                capture_dtm.split(" ")[0], TODAY]

    for file_name in list_files(path):
        print(file_name)
        with open(file_name, "r", encoding="utf8") as a_file:
            insert_list, row_count = [], 0
            for line in a_file:
                fields = line.split("\t")
                length = len(fields)
                if length == 10:
                    insert_list.append(_record(
                        fields[0], fields[1], fields[2], fields[5], fields[6],
                        fields[7], None, fields[8], fields[9]))
                elif length == 11:
                    insert_list.append(_record(
                        fields[0], fields[1], fields[2], fields[5], fields[6],
                        fields[7], fields[8], fields[9], fields[10]))
                elif length == 12:
                    # fields[9] is skipped in this layout
                    insert_list.append(_record(
                        fields[0], fields[1], fields[2], fields[5], fields[6],
                        fields[7], fields[8], fields[10], fields[11]))
                elif length == 19:
                    insert_list.append(_record(
                        fields[0], fields[1], fields[2], fields[5], fields[6],
                        fields[7], None, fields[8], fields[9]))
                    insert_list.append(_record(
                        re.findall(r"\d+", fields[9])[0], fields[10],
                        fields[11], fields[14], fields[15], fields[16], None,
                        fields[17], fields[18]))
                elif length == 20:
                    insert_list.append(_record(
                        fields[0], fields[1], fields[2], fields[5], fields[6],
                        fields[7], None, fields[8], fields[9]))
                    insert_list.append(_record(
                        re.findall(r"\d+", fields[9])[0], fields[10],
                        fields[11], fields[14], fields[15], fields[16],
                        fields[17], fields[18], fields[19]))
                elif length == 21:
                    insert_list.append(_record(
                        fields[0], fields[1], fields[2], fields[5], fields[6],
                        fields[7], fields[8], fields[9], fields[10]))
                    insert_list.append(_record(
                        re.findall(r"\d+", fields[10])[0], fields[11],
                        fields[12], fields[15], fields[16], fields[17],
                        fields[18], fields[19], fields[20]))
                else:
                    continue
                row_count += 1
                # flush every 100000 source lines to bound memory
                if row_count > 100000:
                    print("100000 inserted...")
                    insert_batch(sql, insert_list)
                    insert_list, row_count = [], 0
            # insert whatever is left below the threshold
            print("Last ", len(insert_list), "inserted...")
            insert_batch(sql, insert_list)