Example #1
def gj_haoche(path, pattern=None):
    sql = ("INSERT INTO Haoche(city,title,carddate,mileage,price,newcarprice,code,checker,"
        " checkdate,url,createdAt,updatedAt) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        " ON DUPLICATE KEY UPDATE updatedAt=VALUES(updatedAt), price=VALUES(price)")

    for file_name in list_files(path, pattern):
        with open(file_name, "r", encoding="utf8") as a_file:
            print(file_name + " starts to insert...")
            insert_list, row_count = [], 0
            for line in a_file:
                fields = line.strip().split(",")
                if len(fields) == 12:
                    # increment the row counter
                    row_count += 1
                    # extract the field values
                    carddate = datetime.strptime(fields[2], "%Y-%m") if fields[2] else None
                    (city, title, mileage, price, newcarprice, code, checker,
                        checkdate, url, createdAt, updatedAt) = (fields[0], fields[1],
                        fields[3], fields[4], fields[5], fields[6], fields[7], fields[8],
                        fields[9], fields[10], fields[11])
                    insert_list.append([city, title, carddate, mileage, price, newcarprice, code,
                        checker, checkdate, url, createdAt, updatedAt])
                    # flush to the database every 100,000 rows and reset the counter
                    if row_count >= 100000:
                        insert_batch(sql, insert_list)
                        print("100000 inserted...")
                        insert_list, row_count = [], 0
            # insert the remaining records (fewer than 100,000)
            insert_batch(sql, insert_list)
            print("Last ", len(insert_list), "inserted...")
Example #2
def main(path, pattern=None):
    sql = (
        "INSERT INTO Haoche(city,title,carddate,mileage,price,newcarprice,code,checker,"
        " checkdate,url,createdAt,updatedAt) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        " ON DUPLICATE KEY UPDATE updatedAt=VALUES(updatedAt), price=VALUES(price)")

    for file_name in list_files(path, pattern):
        with open(file_name, "r", encoding="utf8") as a_file:
            # extract the date from the file name
            match = re.search(r"(\d{4}-\d{2}-\d{2})", file_name)
            updatedAt = match.group(0) if match else None
            # skip the header row
            next(a_file)
            insert_list = []
            for line in a_file:
                fields = line.strip().split(",")
                if len(fields) == 12:
                    # extract the field values
                    carddate = datetime.strptime(fields[2], "%Y-%m") if fields[2] else None
                    city, title, mileage, price, newcarprice, code, checker, \
                        checkdate, url = fields[0], fields[1], \
                        fields[3], fields[4], fields[5], fields[6], fields[7], fields[8], \
                        fields[9]
                    # filter out invalid records
                    if filters(url, code, checkdate):
                        insert_list.append([
                            city, title, carddate, mileage, price, newcarprice, code,
                            checker, checkdate, url, updatedAt, updatedAt])
            # filtering done; load into the database
            insert_batch(sql, insert_list)
Example #3
def main(fpath, createddate, pattern=None):
    sql = (
        "INSERT INTO soufang(title, serial_num, AREA, release_date, is_individual, city, "
        " url, picture_num, Tag, capture_datetime, createddate) "
        " VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")

    for filename in list_files(fpath, pattern):
        with open(filename, "r", encoding="utf8") as a_file:
            # skip the file header
            next(a_file)
            insert_list, deduplicate_set = [], set()
            for line in a_file:
                fields = line.strip().split(",")
                # filter
                if filters(fields):
                    fields.append(createddate)
                    url = fields[6]
                    # deduplicate by url
                    if url in deduplicate_set:
                        continue
                    insert_list.append(fields)
                    deduplicate_set.add(url)
            # load into the database
            print("Total: " + str(len(insert_list)) + " records")
            insert_batch(sql, insert_list)
Example #4
def main(path, target):

    total_set = set()
    for fpath in list_files(path):
        file_name = os.path.basename(fpath)

        if file_name.startswith(('kuaiche', 'zhuanche')):
            with open(fpath, "r", encoding="utf8") as a_file:
                print("开始读取: " + fpath)
                file_set = set()
                for line in a_file:
                    fields = line.strip().split("\t")
                    if len(fields) == 14:
                        license, car_model, driver_id = fields[9].strip(), fields[10].strip(), fields[11].strip()
                        city = fields[13][:2] + re.sub(r"\d|,|市|南|北|上|下|\r|\n", "", fields[13][2:])
                        if city in CITIES:
                            key = driver_id + "|-|" + city + "|-|" + car_model + "|-|" + license
                            file_set.add(key)
                new_set = file_set - total_set
                total_set.update(new_set)
                print("至此有: " + str(len(total_set)))

                with open(target, "a", encoding="utf8") as b_file:
                    for key in new_set:
                        b_file.write(key + "\n")
Example #5
def kuaidi(path, pattern=None):
    sql = (
        "INSERT INTO kuaidi(capture_dtm, city, longitude_request, latitude_request, "
        " longitude_car, latitude_car, driver_id, driver_type, car_type, create_dt, flag_dt)"
        " values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")

    for file_name in list_files(path, pattern):
        with open(file_name, "r", encoding="utf8") as a_file:
            print(file_name + " starts to insert....")
            insert_list, row_count = [], 0
            for line in a_file:
                fields = line.strip().split(",")
                if len(fields) == 10:
                    # increment the row counter
                    row_count += 1
                    # extract the field values
                    capture_dtm, city, longitude_request, latitude_request, longitude_car, \
                        latitude_car, driver_id, driver_type, car_type = fields[0], fields[2], \
                        fields[3], fields[4], fields[5], fields[6], fields[7], fields[8], fields[9]
                    city = fields[2][:2] + re.sub(r"\d|,|市|南|北|上|下|\r|\n", "", fields[2][2:])
                    insert_list.append([
                        capture_dtm, city, longitude_request, latitude_request, longitude_car,
                        latitude_car, driver_id, driver_type, car_type, TODAY,
                        capture_dtm.split(" ")[0]])
                    # flush to the database every 100,000 rows and reset the counter
                    if row_count > 100000:
                        insert_batch(sql, insert_list)
                        insert_list, row_count = [], 0
                        print("100000 inserted...")
            # insert the remaining records (fewer than 100,000)
            insert_batch(sql, insert_list)
            print("Last ", len(insert_list), "inserted...")
Example #6
def main(fpath, pattern=None, seperator="\t"):
    for file_name in list_files(fpath, pattern):
        with open(file_name, "r", encoding="utf8") as a_file:
            next(a_file)
            for line in a_file:
                fields = line.split(seperator)
                print(fields)
Example #7
def merge_reader(path):
    with open("D:/Appannie_ios_201512.txt", "a", encoding="utf8") as target:
        for filename in list_files(path, "*.csv"):
            with open(filename, "r", encoding="utf8") as a_file:
                for line in a_file:
                    fields = line.split(",")
                    if len(fields) > 5 and fields[5] == "China":
                        target.write(line)
Example #8
def local_ganji(fpath, pattern=None):
    for filename in list_files(fpath, pattern):
        sql = (
            "INSERT INTO {table} VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,"
            " %s, %s, %s, %s)")
        # regex that pulls each value tuple out of a dump line
        regex = re.compile(r"\('([^)]+)\)[,;]+")
        # dict used for deduplication
        a_dict = {}
        with open(filename, "r", encoding="utf8") as a_file:
            print(filename + " starts to insert...", flush=True)
            for line in a_file:
                # extract the table name
                if line.startswith("LOCK TABLES"):
                    table = re.search(r"`(.+)`", line).group(0).strip("`")
                    sql = sql.format(table=table)
                # parse data rows
                if line.startswith("INSERT INTO"):
                    match = regex.findall(line)
                    for row in match:
                        fields = row.replace("'", "").split(",")
                        # discard records the regex failed to parse (very few)
                        # if len(fields) != 14:
                        #     print(fields, flush=True)
                        #     continue

                        # a field was added to the dump starting Nov 4
                        # discard records with tt == 1 (greatly reduces the record count)
                        if fields[9] == "1":
                            continue
                        if len(fields) == 14:
                            fields.insert(11, None)
                            # mid - fields[0], createdDate - fields[13] (shifted to [14] by the insert)
                            key = fields[0] + "-" + fields[14] + "-" + fields[9]
                            if key not in a_dict:
                                a_dict[key] = fields
                        elif len(fields) == 15:
                            # puid - fields[11], createdDate - fields[14]
                            key = fields[11] + "-" + fields[14] + "-" + fields[9]
                            if key not in a_dict:
                                a_dict[key] = fields

            insert_batch(sql, list(a_dict.values()))
            print("Last ", len(a_dict), " inserted...", flush=True)
def didizhuanche(path, table_name, pattern=None):
    sql = (
        "INSERT INTO " + table_name + "(driver_name, longitude, latitude, "
        " cnt_order, license, car_type, driver_id, capture_dtm, city, created_dt, flag_dt) values("
        " %s,%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")

    for file_name in list_files(path, pattern):
        with open(file_name, "r", encoding="utf8") as a_file:
            print(file_name + " starts to insert....", flush=True)
            insert_list, row_count = [], 0
            for line in a_file:
                fields = line.strip().split("\t")
                if len(fields) == 14:
                    # increment the row counter
                    row_count += 1
                    # extract the field values
                    driver_name, longitude, latitude, cnt_order, license, car_type, driver_id, \
                        capture_dtm = fields[1], fields[3], fields[4], fields[8], fields[9], \
                        fields[10], fields[11], fields[12]
                    city = fields[13][:2] + re.sub(r"\d|,|市|南|北|上|下|\r|\n", "", fields[13][2:])
                    insert_list.append([
                        driver_name, longitude, latitude, cnt_order, license,
                        car_type, driver_id, capture_dtm, city, TODAY, capture_dtm.split(" ")[0]])
                    # flush to the database every 100,000 rows and reset the counter
                    if row_count > 100000:
                        print("100000 inserted...", flush=True)
                        insert_batch(sql, insert_list)
                        insert_list, row_count = [], 0
                elif len(fields) == 27:
                    # increment the row counter
                    row_count += 1
                    # extract the field values
                    driver_name, longitude, latitude, cnt_order, license, car_type, driver_id, \
                        capture_dtm = fields[1], fields[3], fields[4], fields[8], fields[9], \
                        fields[10], fields[11], fields[12]
                    city = fields[13][:2] + re.sub(r"\d|,|市|南|北|上|下|\r|\n", "", fields[13][2:])
                    insert_list.append([
                        driver_name, longitude, latitude, cnt_order, license, car_type, driver_id,
                        capture_dtm, city, TODAY, capture_dtm.split(" ")[0]])

                    driver_name, longitude, latitude, cnt_order, license, car_type, driver_id, \
                        capture_dtm = fields[14], fields[16], fields[17], fields[21], fields[22], \
                        fields[23], fields[24], fields[25]
                    city = fields[26][:2] + re.sub(r"\d|,|市|南|北|上|下|\r|\n", "", fields[26][2:])
                    insert_list.append([
                        driver_name, longitude, latitude, cnt_order, license, car_type, driver_id,
                        capture_dtm, city, TODAY, capture_dtm.split(" ")[0]])
                    # flush to the database every 100,000 rows and reset the counter
                    if row_count > 100000:
                        print("100000 inserted...", flush=True)
                        insert_batch(sql, insert_list)
                        insert_list, row_count = [], 0
            # insert the remaining records (fewer than 100,000)
            print("Last ", len(insert_list), "inserted...", flush=True)
            insert_batch(sql, insert_list)
Example #10
def load_files():
    """Read daily baomu files to memory"""
    global user_dict
    for filename in list_files(fpath, pattern):
        print(filename)
        with open(filename, 'r', encoding='utf8') as a_file:
            # skip the file header
            next(a_file)
            # the capture date comes from the file name, so compute it once per file
            capture_dt = re.search(r"(\d{4}-\d{2})-\d{2}", filename).group(0)
            for line in a_file:
                bm_type, city, sid, name = line.strip().split(',')[:4]
                user_dict[sid] = [bm_type, city, sid, name, capture_dt]
Example #11
def main(fpath, pattern=None):
    with open("D:/ITjuzi_investments_final.txt", "w", encoding="utf8") as a_target:
        line_with_http = ""
        for filename in list_files(fpath, pattern):
            with open(filename, "r", encoding="utf8") as a_file:
                for line in a_file:
                    if line.startswith("http"):
                        line_with_http = line
                    else:
                        # glue continuation lines onto the preceding http line
                        line_with_http = line_with_http.strip() + " " + line
                    print(line_with_http, end="")
                    a_target.write(line_with_http)
Example #12
def check_field_nums(fpath, pattern=None):
    field_dict = {}
    for file_name in list_files(fpath, pattern):
        with open(file_name, "r", encoding="utf8") as a_file:
            print(file_name)
            for line in a_file:
                fields = line.strip().split(",")
                if len(fields) in field_dict:
                    field_dict[len(fields)] += 1
                else:
                    print(line)
                    field_dict[len(fields)] = 1
    print(field_dict)
Example #13
def rm_zips(backup_type):

	date_set = set()
	for file_name in list_files(BACKUP_PATH, "*.zip"):
		match = re.search(r"\d{4}-\d{2}-\d{2}", file_name)
		if match:
			date_set.add(match.group())
	max_date = max(date_set)
	rm_file = max_date + "_" + backup_type + ".zip"
	rm_cmd = "cd {2}; ls | grep -v '{0}' | grep '{1}' | xargs rm".format(max_date, backup_type,
		BACKUP_PATH)
	if os.system(rm_cmd) == 0:
		print("历史数据清除成功!", flush=True)
Example #14
def file_to_db(path, pattern=None):
    # query the drivers that already exist
    driver_data = query(DRIVER_SELECT_SQL)
    driver_set = set(map(lambda row: row["driver_id"], driver_data))
    # query the drivers already seen this month
    driver_month_data = query(DRIVER_MONTH_SELECT_SQL % (CURRENT_MONTH))
    driver_month_set = set(map(lambda row: row["driver_id"], driver_month_data))

    for file_name in list_files(path, pattern):
        print(file_name)
        with open(file_name, "r", encoding="utf8") as a_file:
            detail_dict, driver_dict, retention_dict = {}, {}, {}
            for line in a_file:
                fields = line.strip().split("\t")
                length = len(fields)

                if 9 < length < 13:
                    # read the field values and convert formats
                    driver_id, driver_name, license, photo_url, level, longitude, latitude, \
                        order_num_str = fields[:8]
                    capture_dtm, city = fields[-2:]
                    order_num = int(order_num_str)
                    flag_dt = capture_dtm.split(" ")[0]
                    city = city[:2] + re.sub(CITY_RE, "", city[2:])

                    # identify new drivers
                    if driver_id not in driver_set:
                        driver_dict[driver_id] = [driver_id, driver_name, license, photo_url, level, flag_dt]
                        driver_set.add(driver_id)
                    # identify drivers first seen this month
                    if driver_id not in driver_month_set:
                        retention_dict[driver_id] = [driver_id, city, CURRENT_MONTH]
                        driver_month_set.add(driver_id)
                    # per-driver detail aggregation
                    if driver_id in detail_dict:
                        detail_dict[driver_id][2] = min(detail_dict[driver_id][2], order_num)
                        detail_dict[driver_id][3] = max(detail_dict[driver_id][3], order_num)
                        detail_dict[driver_id][4] = min(detail_dict[driver_id][4], capture_dtm)
                        detail_dict[driver_id][5] = max(detail_dict[driver_id][5], capture_dtm)
                    else:
                        detail_dict[driver_id] = [driver_id, city, order_num, order_num,
                            capture_dtm, capture_dtm, flag_dt]

            upsert_batch(NEW_DRIVER_SQL, list(driver_dict.values()))
            print("导入: " + str(len(driver_dict)) + " 个新司机")
            upsert_batch(NEW_RETENTION_SQL, list(retention_dict.values()))
            print("导入: " + str(len(retention_dict)) + " 个本月出现司机")
            upsert_batch(NEW_DETAIL_SQL, list(detail_dict.values()))
            print("导入: " + str(len(detail_dict)) + " details信息")
Example #15
def ershoufang(fpath, pattern=None):
    for filename in list_files(fpath, pattern):
        sql = (
            "INSERT INTO {table} VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,"
            " %s, %s, %s, %s)")
        # regex that pulls each value tuple out of a dump line
        regex = re.compile(r"\(([^)]+)\)[,;]+")
        # dict used for deduplication
        a_dict = {}
        with open(filename, "r", encoding="utf8") as a_file:
            print(filename + " starts to insert...", flush=True)
            for line in a_file:
                # extract the table name
                if line.startswith("LOCK TABLES"):
                    table = re.search(r"`(.+)`", line).group(0).strip("`")
                    sql = sql.format(table=table)
                    print(sql)
                # parse data rows
                if line.startswith("INSERT INTO"):
                    match = regex.findall(line)
                    for row in match:
                        fields = row.replace("'", "").split(",")
                        # discard records the regex failed to parse (very few)
                        if len(fields) != 15:
                            print(fields, flush=True)
                            continue
                        # discard records with tt == 1 (greatly reduces the record count)
                        if fields[9] == "1":
                            continue
                        else:
                            # if puid is NULL, try to recover it from the URL
                            if fields[11] == "NULL":
                                entinfo = re.search(r"&entinfo=(\d+)", fields[8])
                                if entinfo:
                                    puid = entinfo.group(1)
                                else:
                                    print("both puid and the URL are empty")
                                    continue
                            else:
                                puid = fields[11]
                            # puid - fields[11], createdDate - fields[14]
                            key = puid + "-" + fields[14] + "-" + fields[9]
                            if key not in a_dict:
                                a_dict[key] = fields
            insert_batch(sql, list(a_dict.values()))
            print("Last ", len(a_dict), " inserted...", flush=True)
Example #16
def wuba_job(fpath, pattern=None):
    for filename in list_files(fpath, pattern):
        sql = (
            "INSERT INTO {table} VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,"
            " %s, %s, %s, %s)")
        # regex that pulls each value tuple out of a dump line
        regex = re.compile(r"\(([^)]+)\)[,;]+")
        merchants_ids_set = get_merchant_ids()
        print(len(merchants_ids_set))
        # dict used for deduplication
        a_dict = {}
        with open(filename, "r", encoding="utf8") as a_file:
            print(filename + " starts to insert...", flush=True)
            # parse data rows
            insert_list, line_count = [], 0
            for line in a_file:
                # extract the table name
                if line.startswith("LOCK TABLES"):
                    table = re.search(r"`(.+)`", line).group(0).strip("`")
                    sql = sql.format(table=table)
                    print(sql)
                if line.startswith("INSERT INTO"):
                    # increment the line counter
                    line_count += 1
                    match = regex.findall(line)
                    for row in match:
                        fields = row.replace("'", "").split(",")
                        # filter by merchant ids
                        if fields[0] in merchants_ids_set and fields[9] == "1":
                            continue
                        # discard records the regex failed to parse (very few)
                        if len(fields) != 15:
                            print(fields, flush=True)
                            continue
                        # puid - fields[11], createdDate - fields[14]
                        key = fields[11] + "-" + fields[14] + "-" + fields[9]
                        if key in a_dict:
                            continue
                        a_dict[key] = fields
                        insert_list.append(fields)
                    if line_count >= 3000:
                        print("To insert " + str(len(insert_list)) + " records")
                        insert_batch(sql, insert_list)
                        insert_list, line_count = [], 0

            insert_batch(sql, insert_list)
            print("Last ", len(insert_list), " inserted...", flush=True)
Example #17
def main(fpath, pattern=None):
    for filename in list_files(fpath, pattern):
        sql = (
            "INSERT INTO cheyipai_b2b(auc_Id, car_Id, brand, manufacturer, model, final_offer,"
            " winner_id, winner_id_nonlocal, mileage, reg_dt, reg_area,reg_area_total, "
            " car_source_id, root_id, root_name, capture_dtm)"
            " values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
        with open(filename, "r", encoding="utf8") as a_file:
            print(filename)
            next(a_file)
            insert_list = []
            for line in a_file:
                fields = line.strip().split(",")
                insert_list.append(fields)
            insert_batch(sql, insert_list)
Example #18
def main(fpath, capture_dt, pattern=None):
    sql = "INSERT INTO iwjw(url, city, NAME, pics, video, capture_dt) " " VALUES(%s, %s, %s, %s, %s, %s)"

    for filename in list_files(fpath, pattern):
        with open(filename, "r", encoding="utf8") as a_file:
            # skip the file header
            next(a_file)
            insert_list, key_set = [], set()
            for line in a_file:
                fields = line.split(",")[:5]
                if fields[0] not in key_set:
                    fields[1] = CITY_DICT.get(fields[1])
                    fields.append(capture_dt)
                    insert_list.append(fields)
                    key_set.add(fields[0])
            insert_batch(sql, insert_list)
Example #19
def main(fpath, pattern=None):
    final_set = set()
    car_set = set()
    for file_name in list_files(fpath, pattern):
        print("开始读取: " + file_name)
        with open(file_name, "r", encoding="utf8") as a_file:
            for line in a_file:
                fields = line.strip().split("|-|")
                car_set.add(fields[2])
                final_set.add(line.strip())
    print("总共有: " + str(len(final_set)))

    with open("D:/didi_final.txt", "a", encoding="utf8") as b_file:
        for car in final_set:
            b_file.write(car + "\n")

    for car in car_set:
        print(car)
Example #20
def main(fpath, pattern=None):
    for filename in list_files(fpath, pattern):
        # regex that pulls each value tuple out of a dump line
        regex = re.compile(r"\(([^)]+)\)[,;]+")
        merchants_ids_set = get_merchant_ids()
        remove_set = set()
        result_set = set()

        with open(filename, "r", encoding="utf8") as a_file:
            result_name = re.search(r"(\d{4}-\d{2}-\d{2})", filename).group(0)
            print(filename + " starts to check...", flush=True)
            # first pass
            for line in a_file:
                if line.startswith("INSERT INTO"):
                    match = regex.findall(line)
                    for row in match:
                        fields = row.replace("'", "").split(",")
                        # discard records the regex failed to parse (very few)
                        if len(fields) != 15:
                            print(fields, flush=True)
                            continue
                        if fields[9] != "1":
                            remove_set.add(fields[0])
            print("第一遍循环结束。。。")
            a_file.seek(0)
            # second pass
            for line in a_file:
                if line.startswith("INSERT INTO"):
                    match = regex.findall(line)
                    for row in match:
                        fields = row.replace("'", "").split(",")
                        # discard records the regex failed to parse (very few)
                        if len(fields) != 15:
                            print(fields, flush=True)
                            continue
                        if fields[0] in merchants_ids_set and fields[0] not in remove_set:
                            result_set.add(fields[0])
            print("第二遍循环结束。。。")
            print("结果集大小: " + str(len(result_set)))

        with open("/home/bda/data/58ganji/merchants_ids/" + result_name + ".csv", "a", encoding="utf8") as writer:
            for mid in result_set:
                writer.write(mid + "\n")
        print("今日生成循环结束。。。")
Example #21
def new_version(path, table_name, pattern=None):
    new_sql = (
        "INSERT INTO " + table_name +
        "(driver_id, city, status, driver_type, flag_dt) values(%s, %s, %s, %s, %s)")
    for file_name in list_files(path, pattern):
        # extract the date from the file name
        match = re.search(r"(\d{4}-\d{2}-\d{2})", file_name)
        starting_date = match.group(0) if match else None
        with open(file_name, "r", encoding="utf8") as a_file:
            print(file_name + " starts to insert....", flush=True)
            insert_list = []
            for line in a_file:
                fields = line.strip().split(",")
                if len(fields) == 4:
                    fields[1] = fields[1][:2] + re.sub(r"\d|,|市|南|北|上|下|\r|\n", "", fields[1][2:])
                    fields.append(starting_date)
                    insert_list.append(fields)
            insert_batch(new_sql, insert_list)
Example #22
def main(fpath, capture_dt, pattern=None):
    sql = (
        "INSERT INTO lianjia(id, url, title, pics, is_exclusive, community, house_type, AREA, "
        " orientation, price, down_payment, monthly_payment, deals_done, agent_comments, "
        " customer_visits, route, city, capture_dt) "
        " VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")

    for filename in list_files(fpath, pattern):
        with open(filename, "r", encoding="utf8") as a_file:
            # skip the file header
            next(a_file)
            insert_list = []
            for line in a_file:
                fields = line.strip().split(",")
                if len(fields) == FIELDS_LEN:
                    # derive the city from the URL
                    city = CITYS.get(fields[1][7:9])
                    fields.extend([city, capture_dt])
                    insert_list.append(fields)
            print("总共有:" + str(len(insert_list)) + "条记录")
            insert_batch(sql, insert_list)
Example #23
def insert_ayi(a_path, pattern=None):
    # load the ayis (housekeepers) already in the database
    existing_ayi = query(AYI_SELECT_SQL)

    ayi_dict = {}
    for row in existing_ayi:
        ayi_dict[row["uid"]] = row["entrytime"]

    for file_name in list_files(a_path, pattern):
        logging.debug("读取文件" + file_name)
        with open(file_name, "r", encoding="utf8") as a_file:
            # skip the header
            next(a_file)
            new_ayis, new_details = [], []
            for line in a_file:
                fields = line.strip().split(",")
                if len(fields) == FIELDS_LEN:
                    address, age, comment, commentcountofthismonth, distance, distancewithunit, \
                        entrytime, goodrate, idcard, mobile, name, pic, province, servicecount, \
                        servicecountofthismonth, sex, star, uid, valid, workage, worktime, city, \
                        capturedate, appointmentdate = fields
                    # String -> Date
                    capturedate = datetime.strptime(capturedate, "%Y-%m-%d").date()
                    # for a new ayi, back-calculate entrytime from the service count
                    if uid not in ayi_dict:
                        entrytime = capturedate - timedelta(days=int(servicecount))
                        new_ayis.append([
                            uid, province, sex, city, name, mobile, str(entrytime), idcard, age, pic])
                        ayi_dict[uid] = entrytime
                    # use the latest known entrytime
                    entrytime = ayi_dict[uid]
                    new_details.append([
                        address, age, comment, commentcountofthismonth, distance,
                        distancewithunit, entrytime, goodrate, idcard, mobile, name, pic,
                        province, servicecount, servicecountofthismonth, sex, star, uid, valid,
                        workage, worktime, city, capturedate, appointmentdate])

            logging.debug(" 今日新阿姨数量: " + str(len(new_ayis)))
            insert_batch(AYI_INSERT_SQL, new_ayis)
            insert_batch(DETAILS_INSERT_SQL, new_details)
Example #24
def yxp_update(fpath):
    # load the whole database table into memory
    sql = ("SELECT url, is_sellable, price, update_dt, sold_dt FROM youxinpai2")
    data = query(sql)

    db_dict = {}
    for row in data:
        url, is_sellable, price, update_dt, sold_dt = row["url"], row["is_sellable"], row["price"], \
            row["update_dt"], row["sold_dt"]
        db_dict[url] = [is_sellable, price, update_dt, sold_dt]

    # walk the list files and update the in-memory data
    for file_name in list_files(fpath):
        print(file_name)
        with open(file_name, "r", encoding="utf8") as a_file:
            for line in a_file:
                fields = line.split("\t")
                if len(fields) == 11:
                    url, price, is_sellable, capture_dt = fields[0], fields[8], fields[9], \
                        fields[10].split(" ")[0]
                    if url in db_dict:
                        # "未出售" --> "已出售"
                        if db_dict[url][0] != "已出售" and is_sellable == "已出售":
                            db_dict[url] = ["已出售", price, capture_dt, capture_dt]
                        # None or "未出售" (unsold) --> still unsold: refresh price and update_dt
                        elif is_sellable != "已出售":
                            db_dict[url][1] = price
                            db_dict[url][2] = capture_dt
    # update the database
    update_sql = (
        "UPDATE youxinpai2 SET is_sellable = %s, price = %s, update_dt=%s, sold_dt=%s"
        " WHERE url = %s")
    update_list = []
    for key, value in db_dict.items():
        value.append(key)
        update_list.append(value)
    print(len(update_list))
    update_batch(update_sql, update_list)
    print(u"本次出售状态更新完毕")
Example #25
def read_sql_file(path):
    for filename in list_files(path, "*.sql"):
        sql = ("INSERT INTO {table}(cate1, cate2, cate3, cate4, cate5, cate6, cate7, time, plat,"
            " name, sales, amount, deals) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
            "ON DUPLICATE KEY UPDATE sales=VALUES(sales), amount=VALUES(amount), deals=VALUES(deals)")
        regex = re.compile("\(((\'[^\']+\',){10}\d+,\d+,\d+)\)")
        # 正则匹配
        with open(filename, "r", encoding="utf8") as a_file:
            for line in a_file:
                insert_list = []
                if line.startswith("LOCK TABLES"):
                    table = re.search(r"`(.+)`", line).group(0).strip("`")
                    sql = sql.format(table=table)
                    print(sql)
                if line.startswith("INSERT INTO"):
                    match = regex.findall(line)
                    for row in match:
                        fields = row[0].replace("'", "").split(",")
                        if len(fields) == 13:
                            insert_list.append(fields)
                    insert_batch(sql, insert_list)
            print(filename + " 更新完毕。。。")
Example #26
def main(fpath, pattern=None):
    for filename in list_files(fpath, pattern):
        sql = (
            "INSERT INTO {table} VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,"
            " %s, %s, %s, %s)")
        # regex that pulls each value tuple out of a dump line
        regex = re.compile(r"\(([^)]+)\)[,;]+")
        with open(filename, "r", encoding="utf8") as a_file:
            print(filename + " starts to insert...", flush=True)
            # parse data rows
            insert_list, line_count = [], 0
            for line in a_file:
                # extract the table name
                if line.startswith("LOCK TABLES"):
                    table = re.search(r"`(.+)`", line).group(0).strip("`")
                    sql = sql.format(table=table)
                    print(sql)
                if line.startswith("INSERT INTO"):
                    # increment the line counter
                    line_count += 1
                    match = regex.findall(line)
                    for row in match:
                        fields = row.replace("'", "").split(",")
                        # discard records the regex failed to parse (very few)
                        if len(fields) != 15:
                            print(fields, flush=True)
                            continue
                        if fields[9] == "1":
                            continue
                        insert_list.append(fields)
                    if line_count >= 2000:
                        print("To insert " + str(len(insert_list)) + " records")
                        insert_batch(sql, insert_list)
                        insert_list, line_count = [], 0

            insert_batch(sql, insert_list)
            print("Last ", len(insert_list), " inserted...", flush=True)
Example #27
def wuba_ayi(path):
    sql = ("INSERT INTO 58_ayi(address, age, comment, commentcountofthismonth, distance, "
        " distancewithunit, entrytime, goodrate, idcard, mobile, NAME, pic, province, "
        " servicecount, servicecountofthismonth, sex, star, uid, valid, workage, worktime, "
        " city, capturedate, appointmentdate) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, "
        " %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
    for file_name in list_files(path):
        with open(file_name, "r", encoding="utf8") as a_file:
            print(file_name + " starts to insert...")
            insert_list, row_count = [], 0
            # skip the header
            next(a_file)
            for line in a_file:
                fields = line.strip().split(",")
                if len(fields) == 24:
                    # increment the row counter
                    row_count += 1
                    # the 24 columns map 1:1 onto the insert placeholders
                    insert_list.append(fields)
                    # flush to the database every 100,000 rows and reset the counter
                    if row_count >= 100000:
                        print("100000 inserted...")
                        insert_batch(sql, insert_list)
                        insert_list, row_count = [], 0
            # insert the remaining records (fewer than 100,000)
            print("Last ", len(insert_list), "inserted...")
            insert_batch(sql, insert_list)
Example #28
def didi(path):
    sql = ("INSERT INTO didi(driver_id, driver_name, license, longitude, latitude, "
        " order_num, company, capture_dtm, city, flag_dt, createddate)"
        " values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")

    for file_name in list_files(path):
        print(file_name)
        with open(file_name, "r", encoding="utf8") as a_file:
            insert_list, row_count = [], 0
            for line in a_file:
                fields = line.split("\t")
                length = len(fields)
                if length == 10:
                    # increment the row counter
                    row_count += 1
                    # extract the field values
                    driver_id, driver_name, license, longitude, latitude, order_num, \
                    capture_dtm, city = fields[0], fields[1], fields[2], fields[5], fields[6], \
                        fields[7], fields[8], fields[9]
                    insert_list.append([driver_id, driver_name, license, longitude, latitude, \
                        order_num, None, capture_dtm, city, capture_dtm.split(" ")[0], TODAY])
                    # flush to the database every 100,000 rows and reset the counter
                    if row_count > 100000:
                        print("100000 inserted...")
                        insert_batch(sql, insert_list)
                        insert_list, row_count = [], 0
                elif length == 11:
                    # increment the row counter
                    row_count += 1
                    # extract the field values
                    driver_id, driver_name, license, longitude, latitude, order_num, company,\
                    capture_dtm, city = fields[0], fields[1], fields[2], fields[5], fields[6], \
                        fields[7], fields[8], fields[9], fields[10]
                    insert_list.append([driver_id, driver_name, license, longitude, latitude, \
                        order_num, company, capture_dtm, city, capture_dtm.split(" ")[0], TODAY])
                    # flush to the database every 100,000 rows and reset the counter
                    if row_count > 100000:
                        print("100000 inserted...")
                        insert_batch(sql, insert_list)
                        insert_list, row_count = [], 0
                elif length == 12:
                    # increment the row counter
                    row_count += 1
                    # extract the field values
                    driver_id, driver_name, license, longitude, latitude, order_num, company,\
                    capture_dtm, city = fields[0], fields[1], fields[2], fields[5], fields[6], \
                        fields[7], fields[8], fields[10], fields[11]
                    insert_list.append([driver_id, driver_name, license, longitude, latitude, \
                        order_num, company, capture_dtm, city, capture_dtm.split(" ")[0], TODAY])
                    # flush to the database every 100,000 rows and reset the counter
                    if row_count > 100000:
                        print("100000 inserted...")
                        insert_batch(sql, insert_list)
                        insert_list, row_count = [], 0
                elif length == 19:
                    # increment the row counter
                    row_count += 1
                    # extract the field values
                    driver_id, driver_name, license, longitude, latitude, order_num, \
                    capture_dtm, city = fields[0], fields[1], fields[2], fields[5], fields[6], \
                        fields[7], fields[8], fields[9]
                    insert_list.append([driver_id, driver_name, license, longitude, latitude, \
                        order_num, None, capture_dtm, city, capture_dtm.split(" ")[0], TODAY])

                    driver_id, driver_name, license, longitude, latitude, order_num, \
                    capture_dtm, city = re.findall(r"\d+", fields[9])[0], fields[10], fields[11], \
                    fields[14], fields[15], fields[16], fields[17], fields[18]
                    insert_list.append([driver_id, driver_name, license, longitude, latitude, \
                        order_num, None, capture_dtm, city, capture_dtm.split(" ")[0], TODAY])
                    # flush to the database every 100,000 rows and reset the counter
                    if row_count > 100000:
                        print("100000 inserted...")
                        insert_batch(sql, insert_list)
                        insert_list, row_count = [], 0
                elif length == 20:
                    # increment the row counter
                    row_count += 1
                    # extract the field values
                    driver_id, driver_name, license, longitude, latitude, order_num, \
                    capture_dtm, city = fields[0], fields[1], fields[2], fields[5], fields[6], \
                        fields[7], fields[8], fields[9]
                    insert_list.append([driver_id, driver_name, license, longitude, latitude, \
                        order_num, None, capture_dtm, city, capture_dtm.split(" ")[0], TODAY])

                    driver_id, driver_name, license, longitude, latitude, order_num, company, \
                    capture_dtm, city = re.findall(r"\d+", fields[9])[0], fields[10], fields[11], \
                    fields[14], fields[15], fields[16], fields[17], fields[18], fields[19]
                    insert_list.append([driver_id, driver_name, license, longitude, latitude, \
                        order_num, company, capture_dtm, city, capture_dtm.split(" ")[0], TODAY])
                    # flush to the database every 100,000 rows and reset the counter
                    if row_count > 100000:
                        print("100000 inserted...")
                        insert_batch(sql, insert_list)
                        insert_list, row_count = [], 0
                elif length == 21:
                    # increment the row counter
                    row_count += 1
                    # extract the field values
                    driver_id, driver_name, license, longitude, latitude, order_num, company,\
                    capture_dtm, city = fields[0], fields[1], fields[2], fields[5], fields[6], \
                        fields[7], fields[8], fields[9], fields[10]
                    insert_list.append([driver_id, driver_name, license, longitude, latitude, \
                        order_num, company, capture_dtm, city, capture_dtm.split(" ")[0], TODAY])

                    driver_id, driver_name, license, longitude, latitude, order_num, company, \
                    capture_dtm, city = re.findall(r"\d+", fields[10])[0], fields[11], fields[12], \
                    fields[15], fields[16], fields[17], fields[18], fields[19], fields[20]
                    insert_list.append([driver_id, driver_name, license, longitude, latitude, \
                        order_num, company, capture_dtm, city, capture_dtm.split(" ")[0], TODAY])
                    # flush to the database every 100,000 rows and reset the counter
                    if row_count > 100000:
                        print("100000 inserted...")
                        insert_batch(sql, insert_list)
                        insert_list, row_count = [], 0
            # insert the remaining records (fewer than 100,000)
            print("Last ", len(insert_list), "inserted...")
            insert_batch(sql, insert_list)
Example #29
def main(path, pattern=None):
    for filename in list_files(path, pattern):
        # derive starting_date from the month in the file name
        match = re.search(r"(\d{4}-\d{2})", filename)
        starting_date = match.group(0) + "-01" if match else None

        all_lines = []
        with open(filename, encoding="utf8") as a_csv:
            spam_reader = csv.reader(a_csv, delimiter=",")
            # skip the header row
            next(spam_reader)
            for row in spam_reader:
                all_lines.append([
                    row[8],          # app_id
                    row[9],          # app_name
                    row[13],         # app_url
                    row[14],         # app_iap
                    row[5],          # region
                    "iOS",           # os
                    row[3],          # device
                    "Appannie",      # source
                    "Apple",         # store
                    row[1],          # sector
                    row[15],         # sub_sector
                    row[4],          # type
                    row[0],          # rank
                    starting_date,   # starting_date
                    "30",            # date_type
                    row[10],         # download_or_revenue
                    row[11],         # unit
                    row[19],         # app_average_price_usd
                    row[27],         # avg_rating_all
                    row[7],          # app_version
                    row[20],         # app_release_date
                    row[21],         # publisher_id
                    row[22],         # publisher_name
                    row[23],         # company_name
                    row[24],         # parent_company_name
                ])

        insert_sql = (
            "INSERT INTO app_download(app_id, app_name, app_url, app_iap, region, os,"
            " device, source, store, sector, sub_sector, type, rank, starting_date, date_type,"
            " download_or_revenue, unit, app_average_price_usd, avg_rating_all, app_version, "
            " app_release_date, publisher_id, publisher_name, company_name, parent_company_name)"
            " values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,"
            " %s, %s, %s, %s, %s, %s)")
        insert_batch(insert_sql, all_lines)
        print(filename + "*************************Done.")