Exemplo n.º 1
0
def split_test_set(user_pay_path: str, user_pay_train_path: str,
                   test_path: str, split_data: str):
    """Split payment rows into a training file and a per-shop test file.

    Rows dated strictly before ``split_data`` are copied unchanged to the
    training file. Rows dated on or after it are counted per
    (column 1, calendar day); the counts are written to the test file as
    one row per shop: ``[shop, count_day_0, count_day_1, ...]`` where day 0
    is the split date itself.

    Args:
        user_pay_path: Input file. Column 1 is converted with ``int`` and
            used as the shop key; column 2 is a timestamp formatted as
            ``'%Y-%m-%d %H:%M:%S'``.
        user_pay_train_path: Output file for rows before the split date.
        test_path: Output file for the per-shop daily counts.
        split_data: Split date formatted as ``'%Y-%m-%d'``.

    Note:
        ``configure.predict_weeks`` is used here as the number of *day*
        slots per shop; counts for days at or beyond that offset are
        dropped. NOTE(review): the name suggests weeks -- confirm the unit.
    """
    date_count = {}
    day_pays = {}
    user_pay_reader = tool.Reader(user_pay_path)
    user_pay_train_writer = tool.Writer(user_pay_train_path)
    test_writer = tool.Writer(test_path)
    split_time = time.strptime(split_data, '%Y-%m-%d')
    split_ymd = (split_time.tm_year, split_time.tm_mon, split_time.tm_mday)
    split_datetime = datetime.datetime(*split_ymd)
    for row in user_pay_reader:
        row_time = time.strptime(row[2], '%Y-%m-%d %H:%M:%S')
        row_ymd = (row_time.tm_year, row_time.tm_mon, row_time.tm_mday)
        # Compare the dates as tuples (lexicographically). Comparing year,
        # month and day independently would misfile e.g. 2016-11-01 as
        # "before" a 2016-10-18 split because 1 < 18.
        if row_ymd >= split_ymd:
            key = (row[1],) + row_ymd
            date_count[key] = date_count.get(key, 0) + 1
        else:
            user_pay_train_writer.write_row(row)
    slots = configure.predict_weeks
    for key, count in date_count.items():
        shop = int(key[0])
        pays = day_pays.setdefault(shop, {d: 0 for d in range(slots)})
        day_difference = (datetime.datetime(*key[1:]) - split_datetime).days
        # Only rows on/after the split reach here, so the offset is >= 0.
        if day_difference < slots:
            pays[day_difference] = count
    for shop, pays in sorted(day_pays.items()):
        row = [shop] + [p for _, p in sorted(pays.items())]
        test_writer.write_row(row)
Exemplo n.º 2
0
def shop_pay_list(shop_pay_count_path: str, shop_pay_list_path: str,
                  remind_days: int):
    """Turn per-day shop counts into one dense daily series per shop.

    Each input row supplies a shop (column 0), a date (columns 1-3 as
    year, month, day) and a count (column 5); column 4 is not read. The
    output has one row per shop, in ascending shop order:
    ``[shop, count_first_day, ..., count_last_day]``. Days without an
    entry for a shop are written as 0.

    Args:
        shop_pay_count_path: Input file of per-day counts.
        shop_pay_list_path: Output file for the per-shop series.
        remind_days: When non-zero, only the last ``remind_days`` days
            ending at the latest date in the input are kept. When zero,
            the series spans the earliest to the latest date in the input.
    """
    shop_pay_dict = {}
    start_date = None
    end_date = None
    shop_pay_count_reader = tool.Reader(shop_pay_count_path)
    for row in shop_pay_count_reader:
        shop = int(row[0])
        date = datetime.datetime(int(row[1]), int(row[2]), int(row[3]))
        shop_pay_dict.setdefault(shop, {})[date] = int(row[5])
        # Track both bounds independently: a date can be the new minimum
        # and the new maximum at once (e.g. the very first row), so an
        # if/elif chain would leave the maximum unset.
        if start_date is None or date < start_date:
            start_date = date
        if end_date is None or date > end_date:
            end_date = date
    if end_date is None:
        # Empty input: no shops to write and no date arithmetic to do.
        days = 0
    elif remind_days != 0:
        start_date = end_date - datetime.timedelta(days=remind_days - 1)
        days = remind_days
    else:
        days = (end_date - start_date).days + 1
    shop_pay_list_train_writer = tool.Writer(shop_pay_list_path)
    for shop, pay_dict in sorted(shop_pay_dict.items()):
        pay_list = [
            pay_dict.get(start_date + datetime.timedelta(days=d), 0)
            for d in range(days)
        ]
        shop_pay_list_train_writer.write_row([shop] + pay_list)
Exemplo n.º 3
0
def predict_median(user_pay_list_path: str,
                   result_path: str,
                   train_weeks: int,
                   offset: int = 0):
    """Predict a repeating 7-value pattern per shop from recent medians.

    Each input row is ``[shop, v_0, ..., v_n]``. The values are reversed
    so index 0 is the last value of the row. For each of the 7 positions
    ``d`` the function takes one value per week over ``train_weeks``
    consecutive 7-value weeks, drops zeros, and takes
    ``compute.median`` of what is left. If every sampled value is zero,
    the window is moved 7 values further back and the search repeats.
    The 7 results are written twice in a row (14 values) after the shop.

    Args:
        user_pay_list_path: Input file with one series per shop.
        result_path: Output file for ``[shop] + 14 predictions``.
        train_weeks: Number of weeks sampled for each median.
        offset: Number of trailing values to skip before the first window.

    Note:
        When no window yields a non-zero value, ``compute.median`` is
        called with an empty list. NOTE(review): confirm that helper
        accepts empty input.
    """
    shop_pay_list_reader = tool.Reader(user_pay_list_path)
    result_writer = tool.Writer(result_path)
    for row in shop_pay_list_reader:
        pay_list = row[1:]
        pay_list.reverse()
        # Largest start offset (exclusive) that keeps every index in range.
        limit = len(pay_list) - train_weeks * 7
        result = []
        for d in range(7):
            _offset = offset
            new_list = []
            # The bound must follow the moving _offset; testing the fixed
            # `offset` never ends the search and runs past the list end.
            while _offset < limit:
                week_pays = [
                    int(pay_list[-d + (w + 1) * 7 - 1 + _offset])
                    for w in range(train_weeks)
                ]
                new_list = [n for n in week_pays if n != 0]
                if new_list:
                    break
                _offset += 7
            m = int(compute.median(new_list))
            result.append(m)
        result *= 2
        result_writer.write_row([row[0]] + result)
Exemplo n.º 4
0
def feature_join(*features_path):
    """Concatenate per-shop feature columns from several files into one.

    The last path is the output file; every path before it is an input.
    For each input row, column 0 is the shop key and the remaining columns
    are appended to that shop's feature list, in the order the inputs are
    given. The output has one row per shop, sorted by shop key:
    ``[shop] + features``.

    When only one path is given, that file is read and rewritten in
    place (rows merged per shop and sorted), so a single-path call does
    not overwrite the file with an empty one.

    Args:
        *features_path: One or more file paths; inputs first, output last.

    Raises:
        IndexError: If no path is given.
    """
    shop_feature = {}
    # Read every path except the last (the output). Slicing with [-1:]
    # would read only the output file and ignore all inputs.
    input_paths = features_path[:-1] or features_path[-1:]
    for feature_path in input_paths:
        feature_reader = tool.Reader(feature_path)
        for row in feature_reader:
            shop_feature.setdefault(row[0], [])
            shop_feature[row[0]] += row[1:]
    # Open the writer only after all inputs have been read.
    feature_writer = tool.Writer(features_path[-1])
    for shop, feature in sorted(shop_feature.items()):
        feature_writer.write_row([shop] + feature)
Exemplo n.º 5
0
def _one_hot_table(names):
    """Map each distinct name to a one-hot list, indexed in sorted order."""
    # Sorting fixes the column position of every name. Iterating the set
    # directly would make the positions depend on hash order, which for
    # strings can change from one interpreter run to the next.
    ordered = sorted(names)
    width = len(ordered)
    table = {}
    for index, name in enumerate(ordered):
        vector = [0] * width
        vector[index] = 1
        table[name] = vector
    return table


def shop_info_to_vector(input_path: str, output_path: str, with_city: bool,
                        with_cate: bool):
    """Write shop rows with optional one-hot city and category columns.

    Every output row starts with input column 0 followed by input columns
    3-6. If ``with_city`` is true, a one-hot encoding of column 1 is
    appended. If ``with_cate`` is true, one-hot encodings of columns 7, 8
    and 9 are appended, in that order. An empty column 9 is replaced by
    the value of column 8 before encoding.

    One-hot positions follow the sorted order of the distinct values, so
    the same input always yields the same column layout.

    Args:
        input_path: Input file with at least 10 columns per row.
            NOTE(review): values are assumed to be mutually comparable
            (e.g. all strings) so they can be sorted -- confirm.
        output_path: Output file for the vectors.
        with_city: Whether to append the one-hot encoding of column 1.
        with_cate: Whether to append the encodings of columns 7-9.
    """
    data = []
    city_set = set()
    cate1_set = set()
    cate2_set = set()
    cate3_set = set()
    shop_info_reader = tool.Reader(input_path)
    for row in shop_info_reader:
        # Fall back to the level-2 value when level 3 is blank.
        if row[9] == '':
            row[9] = row[8]
        if with_city:
            city_set.add(row[1])
        if with_cate:
            cate1_set.add(row[7])
            cate2_set.add(row[8])
            cate3_set.add(row[9])
        data.append(row)
    # The sets stay empty for disabled options, giving empty tables that
    # are never consulted below.
    city = _one_hot_table(city_set)
    cate1 = _one_hot_table(cate1_set)
    cate2 = _one_hot_table(cate2_set)
    cate3 = _one_hot_table(cate3_set)
    shop_info_vector_writer = tool.Writer(output_path)
    for row in data:
        vector = [row[0]] + row[3:7]
        if with_city:
            vector += city[row[1]]
        if with_cate:
            vector += cate1[row[7]] + cate2[row[8]] + cate3[row[9]]
        shop_info_vector_writer.write_row(vector)
def generate_recurrence_data(user_pay_list_path: str,
                             recurrence_data_path: str, week_window: int):
    """Cut each series into overlapping windows that advance by 7 values.

    Each input row is ``[key, v_0, ..., v_n]``. A window holds
    ``(week_window + configure.predict_weeks) * 7`` consecutive values,
    and successive windows start 7 values apart. Every window is written
    as ``[key] + window``. Rows too short for a single window produce no
    output.

    Args:
        user_pay_list_path: Input file, loaded with ``read(numeric=True)``.
            NOTE(review): presumably this yields numeric lists -- confirm
            against ``tool.Reader``.
        recurrence_data_path: Output file for the windows.
        week_window: Number of leading weeks in each window.
    """
    source = tool.Reader(user_pay_list_path)
    sink = tool.Writer(recurrence_data_path)
    for record in source.read(numeric=True):
        series = record[1:]
        whole_weeks = len(series) // 7
        window_count = (whole_weeks - week_window
                        - configure.predict_weeks + 1)
        for index in range(window_count):
            begin = 7 * index
            end = begin + 7 * (week_window + configure.predict_weeks)
            sink.write_row([record[0]] + series[begin:end])
Exemplo n.º 7
0
def shop_pay_count(input_path, output_path):
    """Count input rows per (column 1, calendar day) and write the counts.

    Column 2 of each input row is parsed as ``'%Y-%m-%d %H:%M:%S'``. Each
    output row is ``[column_1, year, month, day, weekday, count]``, where
    weekday is ``tm_wday`` from the parsed time (Monday is 0). Rows are
    written in the order their keys were first seen.

    Args:
        input_path: Input file of timestamped rows. NOTE(review): column 1
            is presumably a shop id -- confirm against the data schema.
        output_path: Output file for the counts.
    """
    tally = {}
    for record in tool.Reader(input_path):
        stamp = time.strptime(record[2], '%Y-%m-%d %H:%M:%S')
        bucket = (record[1], stamp.tm_year, stamp.tm_mon, stamp.tm_mday,
                  stamp.tm_wday)
        tally[bucket] = tally.get(bucket, 0) + 1
    sink = tool.Writer(output_path)
    for bucket in tally:
        sink.write_row([*bucket, tally[bucket]])
Exemplo n.º 8
0
def k_means_cluster_shop(shop_pay_list_path: str, cluster_path: str,
                         train_weeks: int, k: int):
    """Cluster shops with k-means on ratios between consecutive values.

    For each input row, the last ``train_weeks * 3`` values are taken and
    turned into ratios ``value[i] / value[i - 1]``. The first ratio
    divides the first value by itself, and any ratio whose denominator is
    0 is set to 0. ``KMeans`` is fitted on these ratio rows, and one
    output row is written per cluster, listing the shop ids (column 0 as
    ``int``) assigned to it.

    Args:
        shop_pay_list_path: Input file with one series per shop.
        cluster_path: Output file, one row of shop ids per cluster.
        train_weeks: Scales how many trailing values are used.
            NOTE(review): the factor is 3, not 7 -- confirm this is
            intended and not meant to be days per week.
        k: Number of clusters.
    """
    source = tool.Reader(shop_pay_list_path)
    sink = tool.Writer(cluster_path)
    shop_ids = []
    ratios = []
    for record in source:
        shop_ids.append(int(record[0]))
        recent = record[-train_weeks * 3:]
        # Shift the series by one so each value is paired with the value
        # before it; the first value is paired with itself.
        previous = [recent[0]] + recent
        ratio_row = []
        for before, current in zip(previous, recent):
            denominator = int(before)
            if denominator != 0:
                ratio_row.append(int(current) / denominator)
            else:
                ratio_row.append(0)
        ratios.append(ratio_row)
    clusters = [[] for _ in range(k)]
    model = KMeans(n_clusters=k).fit(ratios)
    for shop, label in zip(shop_ids, model.labels_):
        clusters[label].append(shop)
    for members in clusters:
        sink.write_row(members)