def get_predict_post_body(model_stats, day_list, day_list_cut, page_ix, pf_age,
                          pf_si, pf_network, pf_gender, full_record, hits,
                          pf_price_cat, predict_day_list, forward_offset):
    """Build the JSON-serializable instance posted to the prediction service.

    Args:
        model_stats: dict with 'model' (train_window, predict_window, duration)
            and 'stats' (normalization mean/std pairs) sections.
        day_list: list of day keys; extended in place with any days from
            predict_day_list it does not already contain.
        day_list_cut: the train+predict window of days; its length must equal
            train_window + predict_window.
        page_ix: page identifier forwarded to make_pred_input.
        pf_age, pf_si, pf_network, pf_gender, pf_price_cat: pre-normalized
            ucdoc feature vectors.
        full_record, hits: raw count series; log1p-transformed here.
        predict_day_list: days to forecast; merged into day_list.
        forward_offset: offset forwarded to make_pred_input.

    Returns:
        dict instance with x/y tensors, normalization stats and page features.

    Raises:
        Exception: if len(day_list_cut) != train_window + predict_window.
    """
    train_window = model_stats['model']['train_window']  # comes from cfg
    predict_window = model_stats['model']['predict_window']  # comes from cfg
    # ln(x + 1) transform, matching training-time preprocessing
    x_hits = np.log(np.add(hits, 1)).tolist()
    full_record_exp = np.log(np.add(full_record, 1)).tolist()
    if len(day_list_cut) != train_window + predict_window:
        raise Exception(
            'day_list_cut and train window + predicti_window do not match. {} {} {}'
            .format(len(day_list_cut), train_window, predict_window))
    dow = get_dow(day_list_cut)
    # pair up the two day-of-week encoding rows per timestep
    dow = [[dow[0][i], dow[1][i]]
           for i in range(train_window + predict_window)]
    # BUG FIX: the original extended day_list with the WHOLE predict_day_list
    # once per missing day, injecting duplicates (including days already
    # present). Append only the individual missing days instead.
    for x in predict_day_list:
        if x not in day_list:
            day_list.append(x)
    lagged_indx = np.stack(lag_indexes(day_list), axis=-1)
    # not used in the model (but we should keep it)
    page_popularity = np.median(full_record_exp)
    # z-score with the training-time mean/std
    page_popularity = (
        page_popularity - model_stats['stats']['page_popularity'][0]) / \
        model_stats['stats']['page_popularity'][1]
    quarter_autocorr = 1
    duration = model_stats['model']['duration']
    # x_hits, x_features, norm_x_hits, x_lagged, y_features, mean, std, flat_ucdoc_features, page_ix
    truex, timex, normx, laggedx, timey, normmean, normstd, pgfeatures, pageix = make_pred_input(
        duration, train_window, predict_window, full_record_exp, x_hits, dow,
        lagged_indx, pf_age, pf_si, pf_network, pf_gender, page_ix,
        pf_price_cat, page_popularity, quarter_autocorr, forward_offset)
    # ys are not important
    truey = [1 for _ in range(predict_window)]
    normy = [1 for _ in range(predict_window)]
    instance = {
        "truex": truex,
        "timex": timex.tolist(),
        "normx": normx,
        "laggedx": laggedx.tolist(),
        "truey": truey,
        "timey": timey.tolist(),
        "normy": normy,
        "normmean": normmean,
        "normstd": normstd,
        "page_features": pgfeatures.tolist(),
        "pageix": pageix
    }
    # print(instance)
    return instance  # , stat
def run(cfg):
    """Read TFRecord shards, assemble per-page feature tensors, and persist
    them via VarFeeder for the downstream model.

    Args:
        cfg: dict with keys 'tf_statistics_path', 'tfrecords_local_path',
            'batch_size', 'duration', 'holidays', 'add_days', 'data_dir'.

    Side effects: reads a pickle and TFRecord files from disk, runs a TF1
    session, and writes the feeder data under cfg['data_dir'].
    """
    # statistics pickle produced upstream; provides the day list and lag info
    with open(cfg['tf_statistics_path'], 'rb') as f:
        tf_stat = pickle.load(f)
    names = []
    tfrecord_location = cfg['tfrecords_local_path']
    # only Spark-style shard files ("part*") are data files
    for file in os.listdir(tfrecord_location):
        if file.startswith("part"):
            names.append(file)
    file_paths = [os.path.join(tfrecord_location, name) for name in names]
    # read and make the dataset from tfrecord
    dataset = tf.data.TFRecordDataset(file_paths)
    dataset = dataset.map(__data_parser)
    batch_size = cfg['batch_size']
    duration = cfg['duration']  # read but not used below
    # NOTE(review): shuffle() AFTER batch() shuffles whole batches, not
    # individual examples — confirm this ordering is intended.
    dataset = dataset.batch(batch_size).shuffle(SHUFFLE_BUFFER)
    iterator = dataset.make_one_shot_iterator()  # TF1.x-style iteration
    next_el = iterator.get_next()
    # lagged_ix = numpy.ones((duration, 4), dtype=float)
    # lagged_ix = np.where(lagged_ix == 1, -1, lagged_ix)
    lagged_ix = np.stack(lag_indexes(tf_stat), axis=-1)
    # quarter_autocorr = numpy.ones((batch_size,), dtype=float)
    date_list = tf_stat['days']
    dow = get_dow(date_list)
    holiday_list = cfg['holidays']
    # 1/0 indicator per day: is this day a configured holiday?
    holidays = [1 if _ in holiday_list else 0 for _ in date_list]
    a_list = []
    b_list = []
    # holiday_norm maps each indicator to a pair of encoded components
    for _ in holidays:
        a, b = holiday_norm(_)
        a_list.append(a)
        b_list.append(b)
    holiday = (a_list, b_list)
    with tf.Session() as sess:
        # pull one (shuffled) batch of parsed records as numpy arrays
        x = sess.run(next_el)
        quarter_autocorr = numpy.ones((x[0].size, ), dtype=float)
        page_indx = list(x[0])
        # in-place repair of isolated zero gaps in the hit counts
        fill_isolated_zeros(x[21])
        # assumes the parsed tuple layout is: x[0]=page key, x[1:4]=price-cat
        # one-hots, x[4:8]=gender, x[8:15]=age, x[15:20]=network, x[20]=si,
        # x[21]=daily hits, x[22]=page popularity — TODO confirm against
        # __data_parser's feature spec.
        tensors = dict(
            hits=pd.DataFrame(x[21], index=page_indx, columns=date_list),
            lagged_ix=lagged_ix,
            page_ix=page_indx,
            pf_age=pd.DataFrame(x[8:15], columns=page_indx,
                                index=(1, 2, 3, 4, 5, 6, 7)).T,
            pf_si=pd.DataFrame(x[20], index=page_indx),
            pf_network=pd.DataFrame(x[15:20], columns=page_indx,
                                    index=('2G', '3G', '4G', 'UNKNOWN',
                                           'WIFI')).T,
            pf_price_cat=pd.DataFrame(x[1:4], columns=page_indx,
                                      index=('pc1', 'pc2', 'pc3')).T,
            pf_gender=pd.DataFrame(x[4:8], columns=page_indx,
                                   index=('none', 'f', 'm', 'x')).T,
            page_popularity=x[22],
            # page_popularity = quarter_autocorr,
            quarter_autocorr=quarter_autocorr,
            dow=pd.DataFrame(dow).T,
            holiday=pd.DataFrame(holiday).T)
        data_len = tensors['hits'].shape[1]
        # scalar metadata persisted alongside the tensors
        plain = dict(
            data_days=data_len - cfg['add_days'],
            features_days=data_len,
            data_start=date_list[0],
            data_end=date_list[-1],
            features_end=date_list[-1],
            n_pages=batch_size)
        # writes everything under cfg['data_dir']
        VarFeeder(cfg['data_dir'], tensors, plain)
def get_predict_post_body(model_stats, day_list, day_list_cut, uckey, age, si,
                          network, gender, media, ip_location, full_record,
                          hits, hour, price_cat):
    """Assemble the prediction-service request body for one ucdoc.

    One-hot encodes age/si/network/gender/price-cat against the training
    mean/std stats in model_stats['stats'], log1p-transforms the hit series,
    and feeds everything through make_pred_input.

    NOTE(review): an earlier function with this same name (different
    signature) appears above in this file; this later definition shadows it
    at import time — confirm which one callers expect.

    Raises:
        Exception: if len(day_list_cut) != train_window + predict_window.
    """
    price_cat = str(price_cat)
    hour = str(hour)
    train_window = model_stats['model']['train_window']  # comes from cfg
    predict_window = model_stats['model']['predict_window']  # comes from cfg
    window_total = train_window + predict_window
    # ln(x + 1), matching training-time preprocessing
    x_hits = np.log(np.add(hits, 1)).tolist()
    full_record_exp = np.log(np.add(full_record, 1)).tolist()
    if len(day_list_cut) != window_total:
        raise Exception(
            'day_list_cut and train window + predicti_window do not match. {} {} {}'
            .format(len(day_list_cut), train_window, predict_window))
    dow_rows = get_dow(day_list_cut)
    dow = [[dow_rows[0][i], dow_rows[1][i]] for i in range(window_total)]
    lagged_indx = np.stack(lag_indexes(day_list), axis=-1)
    # lagged_hits = [0 for i in range(2)]
    # lagged_hits = [lagged_hits for _ in range(train_window+predict_window)]
    m = model_stats['stats']

    def z(flag, key):
        # z-score a boolean one-hot flag with the stored (mean, std) pair
        return (int(flag) - m[key][0]) / m[key][1]

    pf_age = [z(age == '1', 'a_1'), z(age == '2', 'a_2'),
              z(age == '3', 'a_3'), z(age == '4', 'a_4')]
    pf_si = [z(si == '1', 'si_1'), z(si == '2', 'si_2'),
             z(si == '3', 'si_3')]
    pf_network = [z(network == '3G', 't_3G'), z(network == '4G', 't_4G'),
                  z(network == '5G', 't_5G')]
    pf_gender = [z(gender == 'g_f', 'g_g_f'), z(gender == 'g_m', 'g_g_m'),
                 z(gender == 'g_x', 'g_g_x')]
    pf_price_cat = [z(price_cat == '0', 'price_cat_0'),
                    z(price_cat == '1', 'price_cat_1'),
                    z(price_cat == '2', 'price_cat_2'),
                    z(price_cat == '3', 'price_cat_3')]
    page_ix = ','.join([uckey, price_cat, hour])
    # not used
    page_popularity = np.median(full_record_exp)
    page_popularity = ((page_popularity - m['page_popularity'][0])
                       / m['page_popularity'][1])
    quarter_autocorr = 1
    duration = model_stats['model']['duration']
    # x_hits, x_features, norm_x_hits, x_lagged, y_features, mean, std, flat_ucdoc_features, page_ix
    (truex, timex, normx, laggedx, timey, normmean, normstd, pgfeatures,
     pageix) = make_pred_input(
         duration, train_window, predict_window, full_record_exp, x_hits, dow,
         lagged_indx, pf_age, pf_si, pf_network, pf_gender, page_ix,
         pf_price_cat, page_popularity, quarter_autocorr)
    # ys are not important
    truey = [1] * predict_window
    normy = [1] * predict_window
    instance = {
        "truex": truex,
        "timex": timex.tolist(),
        "normx": normx,
        "laggedx": laggedx.tolist(),
        "truey": truey,
        "timey": timey.tolist(),
        "normy": normy,
        "normmean": normmean,
        "normstd": normstd,
        "page_features": pgfeatures.tolist(),
        "pageix": pageix
    }
    # print(instance)
    return instance  # , stat