def split_test_set(user_pay_path: str, user_pay_train_path: str,
                   test_path: str, split_data: str):
    """Split payment records into a training file and a per-shop test file.

    Rows dated strictly before ``split_data`` are copied unchanged to
    ``user_pay_train_path``.  Rows dated on or after it are counted per
    (shop, calendar day); for every shop seen in that period one row
    ``[shop, count_day_0, count_day_1, ...]`` is written to ``test_path``
    with ``configure.predict_weeks`` count columns, where day 0 is the split
    date and days without payments are 0.

    Args:
        user_pay_path: Input file. Column 1 is used as the shop id and
            column 2 is parsed as a ``'%Y-%m-%d %H:%M:%S'`` timestamp.
        user_pay_train_path: Output file for rows before the split date.
        test_path: Output file for the per-shop daily counts.
        split_data: Split date formatted as ``'%Y-%m-%d'``.
    """
    date_count = {}
    day_pays = {}
    user_pay_reader = tool.Reader(user_pay_path)
    user_pay_train_writer = tool.Writer(user_pay_train_path)
    test_writer = tool.Writer(test_path)

    split_time = time.strptime(split_data, '%Y-%m-%d')
    split_date = (split_time.tm_year, split_time.tm_mon, split_time.tm_mday)
    split_datetime = datetime.datetime(*split_date)

    for row in user_pay_reader:
        row_time = time.strptime(row[2], '%Y-%m-%d %H:%M:%S')
        row_date = (row_time.tm_year, row_time.tm_mon, row_time.tm_mday)
        # Compare (year, month, day) lexicographically.  Comparing the three
        # fields independently misclassifies e.g. 2016-11-01 against a split
        # of 2016-10-25, because 1 < 25 even though the date is later.
        if row_date >= split_date:
            key = (row[1],) + row_date
            date_count[key] = date_count.get(key, 0) + 1
        else:
            user_pay_train_writer.write_row(row)

    # NOTE(review): configure.predict_weeks is used here as a number of
    # *days* (it bounds the day offset from the split date) - confirm that
    # this matches its intended meaning.
    horizon = configure.predict_weeks
    for (shop_id, year, month, day), count in date_count.items():
        shop = int(shop_id)
        pays = day_pays.setdefault(shop, {d: 0 for d in range(horizon)})
        day_difference = (datetime.datetime(year, month, day)
                          - split_datetime).days
        # Days beyond the horizon are counted above but not written out.
        if day_difference < horizon:
            pays[day_difference] = count

    for shop, pays in sorted(day_pays.items()):
        row = [shop] + [p for _, p in sorted(pays.items())]
        test_writer.write_row(row)
def shop_pay_list(shop_pay_count_path: str, shop_pay_list_path: str,
                  remind_days: int):
    """Turn per-(shop, day) payment counts into one daily series per shop.

    Each input row is read as ``shop, year, month, day, <unused>, count``
    (columns 0-3 and 5).  For every shop, in ascending shop-id order, one row
    ``[shop, count_first_day, ..., count_last_day]`` is written, with 0 for
    days that have no record for that shop.

    Args:
        shop_pay_count_path: Input file of per-day counts.
        shop_pay_list_path: Output file of per-shop daily series.
        remind_days: If non-zero, only the last ``remind_days`` days ending
            at the latest date in the input are kept; if 0, the series spans
            the earliest to the latest date found in the input.
    """
    shop_pay_dict = {}
    start_date = datetime.datetime(3000, 1, 1)
    end_date = datetime.datetime(1, 1, 1)

    shop_pay_count_reader = tool.Reader(shop_pay_count_path)
    for row in shop_pay_count_reader:
        shop = int(row[0])
        date = datetime.datetime(int(row[1]), int(row[2]), int(row[3]))
        shop_pay_dict.setdefault(shop, {})[date] = int(row[5])
        # The two bounds must be updated independently: with an if/elif
        # chain a row that lowers start_date never raises end_date, so a
        # single-date or descending input would leave end_date at year 1.
        if date < start_date:
            start_date = date
        if date > end_date:
            end_date = date

    if remind_days != 0:
        start_date = end_date - datetime.timedelta(days=remind_days - 1)
        days = remind_days
    else:
        days = (end_date - start_date).days + 1

    shop_pay_list_train_writer = tool.Writer(shop_pay_list_path)
    for shop, pay_dict in sorted(shop_pay_dict.items()):
        pay_list = [
            pay_dict.get(start_date + datetime.timedelta(days=d), 0)
            for d in range(days)
        ]
        shop_pay_list_train_writer.write_row([shop] + pay_list)
def predict_median(user_pay_list_path: str, result_path: str,
                   train_weeks: int, offset: int = 0):
    """Predict 14 daily values per shop from medians of recent same-slot days.

    For each input row ``[shop, v_0, ..., v_last]`` and each of the 7 slots
    ``d``, the values at distance ``7 - d, 14 - d, ...`` from the end of the
    series (``train_weeks`` of them, shifted back by ``offset`` days) are
    collected, zeros are discarded, and the median of the remainder is taken
    via ``compute.median``.  If all sampled values are zero the sampling
    window is moved back one week at a time until a non-zero value is found
    or the series is exhausted.  The 7 results are repeated twice and written
    as ``[shop, r_0, ..., r_6, r_0, ..., r_6]``.

    Args:
        user_pay_list_path: Input file of per-shop daily series.
        result_path: Output file for the predictions.
        train_weeks: Number of weekly samples taken per slot.
        offset: Initial number of most-recent days to skip.
    """
    shop_pay_list_reader = tool.Reader(user_pay_list_path)
    result_writer = tool.Writer(result_path)
    for row in shop_pay_list_reader:
        pay_list = row[1:]
        pay_list.reverse()  # index 0 is now the most recent day
        limit = len(pay_list) - train_weeks * 7
        result = []
        for d in range(7):
            _offset = offset
            samples = []
            # The bound must be checked against the advancing _offset; using
            # the constant `offset` lets the loop run past the end of
            # pay_list (IndexError) when a shop has no non-zero samples.
            while _offset < limit:
                week_pays = (
                    int(pay_list[-d + (w + 1) * 7 - 1 + _offset])
                    for w in range(train_weeks)
                )
                samples = [n for n in week_pays if n != 0]
                if samples:
                    break
                _offset += 7
            # No usable history (too short or all zeros): predict 0 rather
            # than asking for the median of an empty list.
            m = int(compute.median(samples)) if samples else 0
            result.append(m)
        result *= 2
        result_writer.write_row([row[0]] + result)
def feature_join(*features_path):
    """Concatenate per-shop feature rows from several files into one file.

    All paths except the last are inputs; the last path is the output.  Each
    input row is read as ``[shop, f_1, f_2, ...]``; features of the same shop
    are appended in the order the input files are given.  The output holds
    one row ``[shop] + joined_features`` per shop, sorted by shop key.

    Args:
        *features_path: Input file paths followed by the output file path.
    """
    shop_feature = {}
    # Read every path except the final one; the final path is the output.
    # (Iterating features_path[-1:] would read only the output file.)
    for feature_path in features_path[:-1]:
        feature_reader = tool.Reader(feature_path)
        for row in feature_reader:
            shop_feature.setdefault(row[0], [])
            shop_feature[row[0]] += row[1:]

    # Open the writer only after all inputs have been consumed.
    feature_writer = tool.Writer(features_path[-1])
    for shop, feature in sorted(shop_feature.items()):
        feature_writer.write_row([shop] + feature)
def _one_hot_table(values):
    """Map each distinct value to a one-hot list, indexed in sorted order."""
    ordered = sorted(values)
    size = len(ordered)
    table = {}
    for index, name in enumerate(ordered):
        vector = [0] * size
        vector[index] = 1
        table[name] = vector
    return table


def shop_info_to_vector(input_path: str, output_path: str, with_city: bool,
                        with_cate: bool):
    """Write shop info rows as feature vectors with optional one-hot columns.

    Each output row is ``[row[0]] + row[3:7]`` of the input row, followed by
    a one-hot encoding of column 1 when ``with_city`` is true and one-hot
    encodings of columns 7, 8 and 9 when ``with_cate`` is true.  An empty
    column 9 is replaced by the value of column 8 before encoding.

    One-hot positions are assigned in sorted order of the distinct values so
    the column layout is reproducible between runs (set iteration order for
    strings varies with hash randomization).

    Args:
        input_path: Shop info input file.
        output_path: Output file for the feature vectors.
        with_city: Append the one-hot encoding of column 1.
        with_cate: Append the one-hot encodings of columns 7, 8 and 9.
    """
    data = []
    city_set = set()
    cate1_set = set()
    cate2_set = set()
    cate3_set = set()

    shop_info_reader = tool.Reader(input_path)
    for row in shop_info_reader:
        # Fall back to the column-8 value when column 9 is missing.
        if row[9] == '':
            row[9] = row[8]
        if with_city:
            city_set.add(row[1])
        if with_cate:
            cate1_set.add(row[7])
            cate2_set.add(row[8])
            cate3_set.add(row[9])
        data.append(row)

    city = _one_hot_table(city_set) if with_city else {}
    if with_cate:
        cate1 = _one_hot_table(cate1_set)
        cate2 = _one_hot_table(cate2_set)
        cate3 = _one_hot_table(cate3_set)

    shop_info_vector_writer = tool.Writer(output_path)
    for row in data:
        vector = [row[0]] + row[3:7]
        if with_city:
            vector += city[row[1]]
        if with_cate:
            vector += cate1[row[7]] + cate2[row[8]] + cate3[row[9]]
        shop_info_vector_writer.write_row(vector)
def generate_recurrence_data(user_pay_list_path: str,
                             recurrence_data_path: str, week_window: int):
    """Emit sliding multi-week windows over each shop's daily series.

    For every input row ``[shop, v_0, v_1, ...]`` a window of
    ``(week_window + configure.predict_weeks) * 7`` values is written as
    ``[shop] + window`` for each start position that is a multiple of 7 and
    leaves room for a full window within the whole weeks of the series.

    Args:
        user_pay_list_path: Input file, read with ``read(numeric=True)``
            (presumably yielding numeric rows - confirm against ``tool``).
        recurrence_data_path: Output file for the windows.
        week_window: Number of weeks in the window besides
            ``configure.predict_weeks``.
    """
    source = tool.Reader(user_pay_list_path)
    sink = tool.Writer(recurrence_data_path)
    for record in source.read(numeric=True):
        shop, series = record[0], record[1:]
        total_weeks = week_window + configure.predict_weeks
        window_count = len(series) // 7 - total_weeks + 1
        span = total_weeks * 7
        # Slide forward one week (7 values) at a time.
        for start in range(0, window_count * 7, 7):
            sink.write_row([shop] + series[start:start + span])
def shop_pay_count(input_path, output_path):
    """Count payment records per shop and calendar day.

    Column 1 of each input row is used as the shop id and column 2 is parsed
    as a ``'%Y-%m-%d %H:%M:%S'`` timestamp.  One output row
    ``[shop, year, month, day, weekday, count]`` is written per distinct
    (shop, day), in order of first appearance in the input.

    Args:
        input_path: Payment record input file.
        output_path: Output file for the per-day counts.
    """
    tally = {}
    for record in tool.Reader(input_path):
        stamp = time.strptime(record[2], '%Y-%m-%d %H:%M:%S')
        bucket = (
            record[1],
            stamp.tm_year,
            stamp.tm_mon,
            stamp.tm_mday,
            stamp.tm_wday,
        )
        tally[bucket] = tally.get(bucket, 0) + 1

    sink = tool.Writer(output_path)
    for bucket, total in tally.items():
        sink.write_row([*bucket, total])
def k_means_cluster_shop(shop_pay_list_path: str, cluster_path: str,
                         train_weeks: int, k: int):
    """Cluster shops with KMeans on day-over-day payment ratios.

    For each input row the last ``train_weeks * 3`` columns are taken and
    turned into ratios ``value[i] / value[i - 1]`` (the first value is
    compared with itself; a zero denominator yields 0).  The ratio vectors
    are clustered into ``k`` groups and one output row is written per
    cluster, listing the shop ids (column 0 as int) assigned to it.

    Args:
        shop_pay_list_path: Input file of per-shop daily series.
        cluster_path: Output file, one row of shop ids per cluster.
        train_weeks: Controls how many trailing columns are used
            (``train_weeks * 3``).
        k: Number of clusters.
    """
    source = tool.Reader(shop_pay_list_path)
    sink = tool.Writer(cluster_path)

    shop_ids = []
    ratios = []
    for record in source:
        shop_ids.append(int(record[0]))
        tail = record[-train_weeks * 3:]
        # Prepend the first value so every entry has a predecessor.
        previous = [tail[0]] + tail
        shop_ratios = []
        for before, current in zip(previous, tail):
            if int(before) != 0:
                shop_ratios.append(int(current) / int(before))
            else:
                shop_ratios.append(0)
        ratios.append(shop_ratios)

    clusters = [[] for _ in range(k)]
    model = KMeans(n_clusters=k).fit(ratios)
    for position, label in enumerate(model.labels_):
        clusters[label].append(shop_ids[position])

    for members in clusters:
        sink.write_row(members)