def test_long_dense_vector():
    """Smoke test: the model must accept a DenseFeat whose dimension is > 1.

    Builds two sparse id features plus a 5-dimensional dense vector
    ("pic_vec"), then compiles and fits DeepFM on three toy samples.
    """
    feature_columns = [
        SparseFeat('user_id', 4,),
        SparseFeat('item_id', 5,),
        DenseFeat("pic_vec", 5),
    ]
    fixlen_feature_names = get_fixlen_feature_names(feature_columns)

    # Three toy samples; every pic_vec row is the same 5-dim vector.
    input_dict = {
        'user_id': np.array([[1], [0], [1]]),
        'item_id': np.array([[3], [2], [1]]),
        'pic_vec': np.array([[0.1, 0.5, 0.4, 0.3, 0.2]] * 3),
    }
    label = np.array([1, 0, 1])
    model_input = [input_dict[feature] for feature in fixlen_feature_names]

    # Second argument drops the last column (the dense vector) — presumably
    # the DNN-side column list; confirm against the DeepFM signature.
    model = DeepFM(feature_columns, feature_columns[:-1])
    model.compile('adagrad', 'binary_crossentropy')
    model.fit(model_input, label)
def load_stats(self):
    """Rebuild the fixed-length feature columns from stored metadata.

    Reads ``self.sparse_features`` / ``self.dense_features`` /
    ``self.cat_meta`` and populates ``dnn_feature_columns``,
    ``linear_feature_columns`` and ``fixlen_feature_names`` on the instance.
    """
    sparse_columns = [
        SparseFeat(name, self.cat_meta[name]) for name in self.sparse_features
    ]
    dense_columns = [DenseFeat(name, 1,) for name in self.dense_features]
    fixlen_feature_columns = sparse_columns + dense_columns

    # Linear and DNN parts share the same column set.
    self.dnn_feature_columns = fixlen_feature_columns
    self.linear_feature_columns = fixlen_feature_columns
    self.fixlen_feature_names = get_fixlen_feature_names(
        self.linear_feature_columns + self.dnn_feature_columns)
def get_xy_fd(hash_flag=False):
    """Build a tiny toy dataset for DIN-style attention models.

    Args:
        hash_flag: accepted for API compatibility; not used in this variant.

    Returns:
        x: list of input arrays, fixed-length features first (in
           ``get_fixlen_feature_names`` order) then variable-length ones.
        y: binary labels for the three samples.
        feature_columns: the column definitions used to build ``x``.
        behavior_feature_list: names of the behavior (history) features.
    """
    feature_columns = [
        SparseFeat('user', 3),
        SparseFeat('gender', 2),
        SparseFeat('item', 3 + 1),
        SparseFeat('item_gender', 2 + 1),
        # FIX: dimension was 0; 'score' is one float per sample so the dense
        # dimension must be 1 (matches the sibling get_xy_fd helpers).
        DenseFeat('score', 1)
    ]
    feature_columns += [
        VarLenSparseFeat('hist_item', 3 + 1, maxlen=4, embedding_name='item'),
        VarLenSparseFeat('hist_item_gender', 3 + 1, maxlen=4,
                         embedding_name='item_gender')
    ]
    behavior_feature_list = ["item", "item_gender"]

    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    igender = np.array([1, 2, 1])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])
    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item': iid,
        'item_gender': igender,
        'hist_item': hist_iid,
        'hist_item_gender': hist_igender,
        'score': score
    }

    feature_names = get_fixlen_feature_names(feature_columns)
    varlen_feature_names = get_varlen_feature_names(feature_columns)
    x = [feature_dict[name] for name in feature_names] + \
        [feature_dict[name] for name in varlen_feature_names]
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
def get_xy_fd(use_neg=False, hash_flag=False):
    """Toy DIEN-style inputs with optional negative-sampled histories.

    Returns (x, y, feature_columns, behavior_feature_list); ``x`` ends with
    the per-user behavior length array.
    """
    feature_columns = [
        SparseFeat('user', 3, hash_flag),
        SparseFeat('gender', 2, hash_flag),
        SparseFeat('item', 3 + 1, hash_flag),
        SparseFeat('item_gender', 2 + 1, hash_flag),
        DenseFeat('score', 1),
        VarLenSparseFeat('hist_item', 3 + 1, maxlen=4, embedding_name='item'),
        VarLenSparseFeat('hist_item_gender', 3 + 1, maxlen=4,
                         embedding_name='item_gender'),
    ]
    behavior_feature_list = ["item", "item_gender"]

    behavior_length = np.array([3, 3, 2])
    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item': np.array([1, 2, 3]),         # 0 is mask value
        'item_gender': np.array([1, 2, 1]),  # 0 is mask value
        'score': np.array([0.1, 0.2, 0.3]),
        'hist_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]]),
        'hist_item_gender': np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]]),
    }

    if use_neg:
        # Negative histories reuse the positive embeddings via embedding_name.
        feature_dict['neg_hist_item'] = np.array(
            [[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
        feature_dict['neg_hist_item_gender'] = np.array(
            [[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])
        feature_columns += [
            VarLenSparseFeat('neg_hist_item', 3 + 1, maxlen=4,
                             embedding_name='item'),
            VarLenSparseFeat('neg_hist_item_gender', 3 + 1, maxlen=4,
                             embedding_name='item_gender'),
        ]

    feature_names = get_fixlen_feature_names(feature_columns)
    varlen_feature_names = get_varlen_feature_names(feature_columns)
    print(varlen_feature_names)

    x = [feature_dict[name] for name in feature_names]
    x += [feature_dict[name] for name in varlen_feature_names]
    x += [behavior_length]
    y = [1, 0, 1]
    print(len(x))
    return x, y, feature_columns, behavior_feature_list
def main():
    """Encode sparse features, build DeepFM columns, and evaluate per target.

    Reads the optional CLI flag 'SF' to switch the feature source, fits a
    1-based vocabulary per sparse feature on the training split (0 = unseen),
    and prints test LogLoss/AUC for each target.
    """
    # FIX: sys.argv[0] is the script path; the first real CLI argument is
    # argv[1], so the original `sys.argv[0] == 'SF'` check could never match.
    use_sf = len(sys.argv) > 1 and sys.argv[1] == 'SF'
    train, vali, test = GetFeatures(use_sf)

    feature_count = []
    for feat in sparse_features:
        print("Fitting {}".format(feat))
        # Build a 1-based vocabulary from the training split; 0 maps unknowns.
        labels = {}
        for x in train[feat]:
            if x not in labels:
                labels[x] = len(labels) + 1
        print("Transforming {}".format(feat))
        for df in [train, vali, test]:
            df[feat] = df[feat].map(lambda x: labels.get(x, 0))
        # +1 accounts for the reserved 0 = unknown id.
        feature_count.append(len(labels) + 1)

    sparse_feature_columns = [
        SparseFeat(f, f_c) for f, f_c in zip(sparse_features, feature_count)
    ]
    dense_feature_columns = [DenseFeat(f, 1) for f in dense_features]
    fixlen_feature_columns = sparse_feature_columns + dense_feature_columns
    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns
    fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns +
                                                    dnn_feature_columns)
    train_model_input = [train[name] for name in fixlen_feature_names]
    vali_model_input = [vali[name] for name in fixlen_feature_names]
    test_model_input = [test[name] for name in fixlen_feature_names]

    def evaluate(target):
        # Renamed from `eval` to avoid shadowing the builtin.
        model, history = model_generate(train_model_input, train[[target]],
                                        vali_model_input, vali[[target]],
                                        linear_feature_columns,
                                        dnn_feature_columns)
        pred_ans = model.predict(test_model_input, batch_size=256)
        print(target + " test LogLoss",
              round(log_loss(test[target].values, pred_ans), 4))
        print(target + " test AUC",
              round(roc_auc_score(test[target].values, pred_ans), 4))

    for target in targets:
        evaluate(target)
def read(data, lbe_store):
    """Encode sparse columns with pre-fitted encoders and build model input.

    Args:
        data: DataFrame holding the raw behavior-log columns.
        lbe_store: dict mapping feature name -> fitted sklearn LabelEncoder
            (produced at training time).

    Returns:
        Tuple of the encoded DataFrame and the ordered list of input arrays.
    """
    # data['time'] = data['time'].apply(lambda x: timestamp(x), convert_dtype='int32')
    sparse_features = ["user_id", "item_id", "item_category", "time"]

    # 1. Label Encoding for sparse features with the training-time encoders.
    for feat in sparse_features:
        data[feat] = lbe_store[feat].transform(data[feat])

    # 2. FIX: vocabulary sizes must come from the fitted encoders, not from
    #    this (possibly smaller) data slice — `data[feat].nunique()` would
    #    understate the dimension and disagree with the trained model.
    fixlen_feature_columns = [
        SparseFeat(feat, len(lbe_store[feat].classes_))
        for feat in sparse_features
    ]
    linear_feature_columns = fixlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns
    fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns +
                                                    dnn_feature_columns)

    data_model_input = [data[name].values for name in fixlen_feature_names]
    return data, data_model_input
def get_xy_fd():
    """Toy dataset for DIN: three users with 4-step, zero-padded histories.

    Returns (x, y, feature_columns, behavior_feature_list); ``x`` is ordered
    by fixed-length names followed by variable-length names.
    """
    feature_columns = [
        SparseFeat('user', 3),
        SparseFeat('gender', 2),
        SparseFeat('item', 3 + 1),
        SparseFeat('item_gender', 2 + 1),
        DenseFeat('score', 1),
        VarLenSparseFeat('hist_item', 3 + 1, maxlen=4, embedding_name='item'),
        VarLenSparseFeat('hist_item_gender', 3 + 1, maxlen=4,
                         embedding_name='item_gender'),
    ]
    behavior_feature_list = ["item", "item_gender"]

    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item': np.array([1, 2, 3]),         # 0 is mask value
        'item_gender': np.array([1, 2, 1]),  # 0 is mask value
        'score': np.array([0.1, 0.2, 0.3]),
        'hist_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]]),
        'hist_item_gender': np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]]),
    }

    ordered_names = (get_fixlen_feature_names(feature_columns)
                     + get_varlen_feature_names(feature_columns))
    x = [feature_dict[name] for name in ordered_names]
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
def get_train_instances(self, train):
    """Build (user, candidate item, session) training triples with labels.

    For every user's check-in sequence, each visited venue (from the second
    one on) becomes a positive candidate and one random venue not in ``train``
    becomes the paired negative.

    Args:
        train: collection supporting ``(u, j) in train`` membership tests —
            presumably the set of observed (user, venue) pairs; verify.

    Returns:
        Tuple (x, labels) where x is the model input list (fixed-length
        features, then variable-length features, then the session count).
    """
    users, checkins, cand_venues, labels = [], [], [], []
    for u in self.trainSeq:
        visited = self.trainSeq[u]
        checkin_ = []
        # Growing prefix of the user's history, padded to maxVenue.
        for v in visited[:-1]:
            checkin_.append(v)
            checkins.extend(sequence.pad_sequences([checkin_[:]],
                                                   maxlen=self.maxVenue))
        # start from the second venue in user's checkin sequence.
        visited = visited[1:]
        for i in range(len(visited)):
            # Positive example: the venue actually visited at step i.
            cand_venues.append(visited[i])
            users.append(u)
            labels.append(1)
            j = np.random.randint(self.uNum)
            # check if j is in training dataset or in user's sequence at state i or not
            while (u, j) in train or j in visited[:i]:
                j = np.random.randint(self.uNum)
            # Negative example: a random venue the user has not visited.
            cand_venues.append(j)
            users.append(u)
            labels.append(0)
    # NOTE(review): checkins grows once per history prefix while users/labels
    # grow twice per step (pos + neg), so the arrays may have different
    # lengths — confirm against the model's expected input shapes.
    sess_number = np.ones(len(labels))
    users = np.array(users)
    items = np.array(cand_venues)
    sess_item = np.array(checkins)
    labels = np.array(labels)
    feature_dict = {'user': users, 'item': items, 'score': labels,
                    'sess_0_item': sess_item}
    fixlen_feature_names = get_fixlen_feature_names(self.feature_columns)
    varlen_feature_names = get_varlen_feature_names(self.feature_columns)
    x = [feature_dict[name] for name in fixlen_feature_names] + \
        [feature_dict[name] for name in varlen_feature_names]
    x += [sess_number]
    return x, labels
def get_xy_fd(hash_flag=False):
    """Toy dataset for session-based models: two sessions per user.

    Returns (x, y, feature_columns, behavior_feature_list); ``x`` ends with
    the per-user number of non-empty sessions.
    """
    feature_columns = [
        SparseFeat('user', 3, hash_flag),
        SparseFeat('gender', 2, hash_flag),
        SparseFeat('item', 3 + 1, hash_flag),
        SparseFeat('item_gender', 2 + 1, hash_flag),
        DenseFeat('score', 1),
    ]
    # One item/item_gender history per session slot, sharing embeddings
    # with the base 'item' / 'item_gender' features.
    for sess in ('sess_0', 'sess_1'):
        feature_columns += [
            VarLenSparseFeat(sess + '_item', 3 + 1, 4, use_hash=hash_flag,
                             embedding_name='item'),
            VarLenSparseFeat(sess + '_item_gender', 2 + 1, 4,
                             use_hash=hash_flag, embedding_name='item_gender'),
        ]
    behavior_feature_list = ["item", "item_gender"]

    feature_dict = {
        'user': np.array([0, 1, 2]),
        'gender': np.array([0, 1, 0]),
        'item': np.array([1, 2, 3]),         # 0 is mask value
        'item_gender': np.array([1, 2, 1]),  # 0 is mask value
        'score': np.array([0.1, 0.2, 0.3]),
        'sess_0_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [0, 0, 0, 0]]),
        'sess_0_item_gender': np.array([[1, 1, 2, 0], [2, 1, 1, 0], [0, 0, 0, 0]]),
        'sess_1_item': np.array([[1, 2, 3, 0], [0, 0, 0, 0], [0, 0, 0, 0]]),
        'sess_1_item_gender': np.array([[1, 1, 2, 0], [0, 0, 0, 0], [0, 0, 0, 0]]),
    }
    sess_number = np.array([2, 1, 0])  # non-empty sessions per user

    x = [feature_dict[name] for name in get_fixlen_feature_names(feature_columns)]
    x += [feature_dict[name] for name in get_varlen_feature_names(feature_columns)]
    x += [sess_number]
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
def main(args):
    """Train an xDeepFM CTR model on the article-read dataset (NSML).

    Pipeline: load TSV data + labels, engineer user/article features,
    label-encode sparse columns, attach image features, negative-sample,
    then compile and fit the model via a generator.
    """
    if args.arch == 'xDeepFM':
        s = time.time()
        # --- load raw data and labels -----------------------------------
        csv_file = os.path.join(DATASET_PATH, 'train', 'train_data',
                                'train_data')
        item = pd.read_csv(csv_file,
                           dtype={
                               'article_id': str,
                               'hh': int,
                               'gender': str,
                               'age_range': str,
                               'read_article_ids': str
                           },
                           sep='\t')
        label_data_path = os.path.join(
            DATASET_PATH, 'train',
            os.path.basename(os.path.normpath(csv_file)).split('_')[0]
            + '_label')
        label = pd.read_csv(label_data_path, dtype={'label': int}, sep='\t')
        item['label'] = label
        s = time.time()
        print(f'before test article preprocess : {len(item)}')
        sparse_features = [
            'article_id', 'hh', 'gender', 'age_range', 'len_bin'
        ]
        dense_features = ['image_feature', 'read_cnt_prob']
        target = ['label']
        ############################ make more feature !!!!!!! #################
        ############## 1. read_article_ids len cnt -- user feature #############
        # Number of previously-read articles per user; NaN (float) means none.
        len_lis = []
        read_article_ids_all = item['read_article_ids'].tolist()
        for i in range(len(item)):
            li = read_article_ids_all[i]
            if type(li) == float:
                len_lis.append(0)
                continue
            len_li = len(li.split(','))
            len_lis.append(len_li)
        item['len'] = len_lis
        item['len_bin'] = pd.qcut(item['len'], 6, duplicates='drop')
        id_to_artic = dict()
        artics = item['article_id'].tolist()
        ################ 2. read_cnt, total_cnt, prob_read_cnt --- article feature ####
        # Per-article read count (label == 1) and total impression count.
        read_cnt = item[item['label'] == 1].groupby('article_id').agg(
            {'hh': 'count'})
        read_cnt = read_cnt.reset_index()
        read_cnt = read_cnt.rename(columns={'hh': 'read_cnt'})
        read_cnt_list = read_cnt['read_cnt'].tolist()
        read_cnt_artic_list = read_cnt['article_id'].tolist()
        print(f'len read_cnt : {len(read_cnt)}')
        print(read_cnt.head(3))
        total_cnt = item.groupby('article_id').agg({'hh': 'count'})
        total_cnt = total_cnt.reset_index()
        total_cnt = total_cnt.rename(columns={'hh': 'read_cnt'})
        total_cnt_list = total_cnt['read_cnt'].tolist()
        total_cnt_artic_list = total_cnt['article_id'].tolist()
        print(f'len read_cnt : {len(total_cnt)}')
        print(total_cnt.head(3))
        # lit # test_article_ids list
        lit_cnt = []
        lit_total_cnt = []
        lit_cnt_prob = []
        lit = list(set(artics))
        lit.sort()
        print(lit[:10])
        print(f'len(lit):{len(lit)}')
        # For every distinct article: align read count, total count and the
        # read probability (read_cnt / total_cnt, 0 when never shown).
        # NOTE(review): linear scans inside this loop are O(n^2); fine only
        # for small vocabularies.
        for i in range(len(lit)):
            # lit_cnt
            cur_artic = lit[i]
            if cur_artic not in read_cnt_artic_list:
                lit_cnt.append(0)
            else:
                for j in range(len(read_cnt_artic_list)):
                    if cur_artic == read_cnt_artic_list[j]:
                        lit_cnt.append(read_cnt_list[j])
                        break
            # lit_total_cnt
            if cur_artic not in total_cnt_artic_list:
                lit_total_cnt.append(0)
            else:
                for j in range(len(total_cnt_artic_list)):
                    if cur_artic == total_cnt_artic_list[j]:
                        lit_total_cnt.append(total_cnt_list[j])
                        break
            # lit_cnt_prob
            if lit_total_cnt[i] == 0:
                lit_cnt_prob.append(0)
            else:
                lit_cnt_prob.append(lit_cnt[i] / lit_total_cnt[i])
        print('--- read_cnt article feature completed ---')
        print(f'lit_cnt {len(lit_cnt)}')
        print(f'lit_total_cnt {len(lit_total_cnt)}')
        print(f'lit_cnt_prob {len(lit_cnt_prob)}')
        #### fea
        print('feature dict generate')
        file_list1 = os.listdir(DATASET_PATH)
        file_list2 = os.listdir(DATASET_PATH + '/train')
        file_list3 = os.listdir(DATASET_PATH + '/train/train_data')
        print(file_list1)
        print(file_list2)
        print(file_list3)
        resnet_feature_extractor(args.mode)
        print(file_list1)
        print(file_list2)
        print(file_list3)
        # One hot Encoding
        # NOTE(review): relative path — assumes the pickle was written to the
        # working directory by resnet_feature_extractor; confirm.
        with open(os.path.join('train_image_features_50.pkl'), 'rb') as handle:
            image_feature_dict = pickle.load(handle)
        print('check artic feature')
        print(f"757518f4a3da : {image_feature_dict['757518f4a3da']}")
        lbe = LabelEncoder()
        lbe.fit(lit)
        item['article_id' + '_onehot'] = lbe.transform(item['article_id'])
        print(lbe.classes_)
        for feat in sparse_features[1:]:
            lbe = LabelEncoder()
            item[feat + '_onehot'] = lbe.fit_transform(
                item[feat])  # must also verify the re-encoded labels stay consistent
        print(item.head(10))
        print('columns name : ', item.columns)
        # --- feature column definitions ---------------------------------
        fixlen_feature_columns = [SparseFeat('article_id', len(lit))]
        fixlen_feature_columns += [
            SparseFeat(feat, item[feat + '_onehot'].nunique())
            for feat in sparse_features[1:]
        ]
        fixlen_feature_columns += [
            DenseFeat('image_feature', len(image_feature_dict[artics[0]]))
        ]
        fixlen_feature_columns += [DenseFeat('read_cnt_prob', 1)]
        print(f'fixlen_feature_columns : {fixlen_feature_columns}')
        # Map encoded article index back to the raw article id.
        idx_artics_all = item['article_id' + '_onehot'].tolist()
        for i in range(len(artics)):
            idx_artic = idx_artics_all[i]
            if idx_artic not in id_to_artic.keys():
                id_to_artic[idx_artic] = artics[i]
        linear_feature_columns = fixlen_feature_columns
        dnn_feature_columns = fixlen_feature_columns
        fixlen_feature_names = get_fixlen_feature_names(
            linear_feature_columns + dnn_feature_columns)
        print(fixlen_feature_names)
        global fixlen_feature_names_global
        fixlen_feature_names_global = fixlen_feature_names
        model = xDeepFM(linear_feature_columns, dnn_feature_columns,
                        task='binary')
        print('---model defined---')
        print(time.time() - s, 'seconds')
        ##### print need
        for artic in lit:
            print(artic, end=',')
        print()
        print('new')
        print()
        print(len(lit_cnt_prob))
        for prob in lit_cnt_prob:
            prob = round(prob, 4)
            print(prob, end=',')
        print()
        print('end')
        print('--------------')
        optimizer = tf.keras.optimizers.Adam(args.lr)
        s = time.time()
        # negative sampling
        item_pos = item[item['label'] == 1]
        item_neg = item[item['label'] == 0]
        # Five 3:1 negative samples with different seeds; only data_1 is
        # actually used for training below.
        dn_1 = item_neg.sample(n=3 * len(item_pos), random_state=42)
        dn_2 = item_neg.sample(n=3 * len(item_pos), random_state=20)
        dn_3 = item_neg.sample(n=3 * len(item_pos), random_state=7)
        dn_4 = item_neg.sample(n=3 * len(item_pos), random_state=33)
        dn_5 = item_neg.sample(n=3 * len(item_pos), random_state=41)
        # NOTE(review): reset_index() is not in-place — this line has no effect.
        dn_1.reset_index()
        data_1 = pd.concat([dn_1, item_pos]).sample(
            frac=1, random_state=42).reset_index()
        data_1_article_idxs = data_1['article_id_onehot'].tolist()
        data_1_article = data_1['article_id'].tolist()
        print(f'len data_1 : {len(data_1)}')
        print(data_1.head(5))
        li1 = []
        li2 = []
        li3 = []
        # Attach the read-probability feature per row (linear scan over lit).
        for i in range(len(data_1_article)):
            for j in range(len(lit_cnt_prob)):
                if data_1_article[i] == lit[j]:
                    li3.append(lit_cnt_prob[j])
                    break
        data_1['read_cnt_prob'] = li3
        print('---read_cnt_prob end---')
        ## preprocess append
        # NOTE(review): data_2..data_5 are built but never used below.
        data_2 = pd.concat([dn_2, item_pos]).sample(
            frac=1, random_state=42).reset_index()
        data_3 = pd.concat([dn_3, item_pos]).sample(
            frac=1, random_state=42).reset_index()
        data_4 = pd.concat([dn_4, item_pos]).sample(
            frac=1, random_state=42).reset_index()
        data_5 = pd.concat([dn_5, item_pos]).sample(
            frac=1, random_state=42).reset_index()
        # Attach the image-feature vector per row.
        li = []
        for i in range(len(data_1_article_idxs)):
            image_feature = image_feature_dict[id_to_artic[
                data_1_article_idxs[i]]]
            li.append(image_feature)
        print(f'article_id : {data_1_article[0]}')
        print(f'article_image_feature : {image_feature_dict[data_1_article[0]]}')
        data_1['image_feature'] = li
        li = []
        print(f'finished data_1_image_feature : {time.time() - s} sec')
        if use_nsml:
            bind_nsml(model, optimizer, args.task)
        if args.pause:
            nsml.paused(scope=locals())
        if (args.mode == 'train') or args.dry_run:
            best_loss = 1000
            if args.dry_run:
                print('start dry-running...!')
                args.num_epochs = 1
            else:
                print('start training...!')
            model.compile(
                tf.keras.optimizers.Adam(args.lr),
                'mse',
                metrics=['accuracy'],
            )
            train_generator = data_generator(data_1)
            lr_scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler)
            # drop the checkpoint callback when doing k-fold
            save_cbk = CustomModelCheckpoint()
            history = model.fit_generator(train_generator,
                                          epochs=100,
                                          verbose=2,
                                          workers=8,
                                          steps_per_epoch=np.ceil(
                                              len(data_1) / 2048),
                                          callbacks=[lr_scheduler, save_cbk])
            print('again')
# NOTE(review): this chunk starts mid-script (`data` and `row` are defined by
# earlier, unseen code) and is truncated at the end, inside model.compile().
data.loc[data.shape[0] + 1] = row
sparse_features = ["user_id", "item_id", "item_category", "time"]
target = ['behavior_type']

# 1.Label Encoding for sparse features,and do simple Transformation for dense features
# Encoders are kept in lbe_store so inference-time code can reuse the mapping.
lbe_store = {}
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])
    lbe_store[feat] = lbe

# 2.count #unique features for each sparse field
fixlen_feature_columns = [
    SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
]
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns +
                                                dnn_feature_columns)

# 3.generate input data for model
data = data.sample(frac=0.001)  # work on a 0.1% subsample
train, test = train_test_split(data, test_size=0.2)
# train = train[:1000]
# test = test[:200]
train_model_input = [train[name].values for name in fixlen_feature_names]
test_model_input = [test[name].values for name in fixlen_feature_names]

# 4.Define Model,train,predict and evaluate
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
# NOTE(review): source truncated here — the compile() call is cut off
# mid-arguments in the original chunk.
model.compile(
    "adam", "mse",
def main(args, local):
    """NSML entry point: build/train xDeepFM in 'train' mode, restore a saved
    session in 'test' mode, then hand control back to NSML."""
    if args.arch == 'xDeepFM' and args.mode == 'train':
        s = time.time()
        # --- load raw data and labels -----------------------------------
        csv_file = os.path.join(DATASET_PATH, 'train', 'train_data',
                                'train_data')
        item = pd.read_csv(csv_file,
                           dtype={
                               'article_id': str,
                               'hh': int,
                               'gender': str,
                               'age_range': str,
                               'read_article_ids': str
                           },
                           sep='\t')
        label_data_path = os.path.join(
            DATASET_PATH, 'train',
            os.path.basename(os.path.normpath(csv_file)).split('_')[0]
            + '_label')
        label = pd.read_csv(label_data_path, dtype={'label': int}, sep='\t')
        item['label'] = label
        sparse_features = ['article_id', 'hh', 'gender', 'age_range',
                           'len_bin']
        dense_features = ['image_feature']
        target = ['label']
        # Number of previously-read articles per user; NaN (float) means none.
        len_lis = []
        read_article_ids_all = item['read_article_ids'].tolist()
        for i in range(len(item)):
            li = read_article_ids_all[i]
            if type(li) == float:
                len_lis.append(0)
                continue
            len_li = len(li.split(','))
            len_lis.append(len_li)
        item['len'] = len_lis
        item['len_bin'] = pd.qcut(item['len'], 6, duplicates='drop')
        id_to_artic = dict()
        artics = item['article_id'].tolist()
        with open(os.path.join(DATASET_PATH, 'train', 'train_data',
                               'train_image_features.pkl'), 'rb') as handle:
            image_feature_dict = pickle.load(handle)
        # Label-encode every sparse column in place.
        for feat in sparse_features:
            lbe = LabelEncoder()
            item[feat] = lbe.fit_transform(item[feat])
        fixlen_feature_columns = [
            SparseFeat(feat, item[feat].nunique())
            for feat in sparse_features
        ]
        fixlen_feature_columns += [
            DenseFeat(feat, len(image_feature_dict[artics[0]]))
            for feat in dense_features
        ]
        # Map encoded article index back to the raw article id.
        idx_artics_all = item['article_id'].tolist()
        for i in range(len(artics)):
            idx_artic = idx_artics_all[i]
            if idx_artic not in id_to_artic.keys():
                id_to_artic[idx_artic] = artics[i]
        # can be fetched via image_feature_dict[article_id], so skip for now
        linear_feature_columns = fixlen_feature_columns
        dnn_feature_columns = fixlen_feature_columns
        fixlen_feature_names = get_fixlen_feature_names(
            linear_feature_columns + dnn_feature_columns)
        print(fixlen_feature_names)
        global fixlen_feature_names_global
        fixlen_feature_names_global = fixlen_feature_names
        model = xDeepFM(linear_feature_columns, dnn_feature_columns,
                        task='binary')
        print('---model defined---')
        # TODO: also write code to save the generated files — can't rerun
        # this every time
        print(time.time() - s, 'seconds')
    if use_nsml and args.mode == 'train':
        bind_nsml(model, [], args.task)
    if args.mode == 'test':
        print('_infer root - : ', DATASET_PATH)
        print('test')
        model, fixlen_feature_names_global, item, image_feature_dict, \
            id_to_artic = get_item(DATASET_PATH)
        bind_nsml(model, [], args.task)
        # [checkpoint name, session id] of the model to restore.
        checkpoint_session = ['401', 'team_62/airush2/176']
        nsml.load(checkpoint=str(checkpoint_session[0]),
                  session=str(checkpoint_session[1]))
        print('successfully loaded')
    if (args.mode == 'train'):
        if args.dry_run:
            print('start dry-running...!')
            args.num_epochs = 1
        else:
            print('start training...!')
        # pre-build the whole input up front; a generator isn't really needed
        nsml.save('infer')
        print('end')
    print('end_main')
    if args.pause:
        nsml.paused(scope=local)
def get_item(root):
    """Rebuild the xDeepFM model and test-time inputs for inference.

    Args:
        root: dataset root directory containing ``test/test_data``.

    Returns:
        Tuple (model, fixlen_feature_names, item DataFrame,
        image_feature_dict, id_to_artic mapping).
    """
    print('load')
    csv_file = os.path.join(root, 'test', 'test_data', 'test_data')
    item = pd.read_csv(csv_file,
                       dtype={
                           'article_id': str,
                           'hh': int,
                           'gender': str,
                           'age_range': str,
                           'read_article_ids': str
                       },
                       sep='\t')
    print('loaded!!')
    sparse_features = ['article_id', 'hh', 'gender', 'age_range', 'len_bin']
    dense_features = ['image_feature']
    target = ['label']
    # Number of previously-read articles per user; NaN (float) means none.
    len_lis = []
    read_article_ids_all = item['read_article_ids'].tolist()
    for i in range(len(item)):
        li = read_article_ids_all[i]
        if type(li) == float:
            len_lis.append(0)
            continue
        len_li = len(li.split(','))
        len_lis.append(len_li)
    item['len'] = len_lis
    item['len_bin'] = pd.qcut(item['len'], 6, duplicates='drop')
    id_to_artic = dict()
    artics = item['article_id'].tolist()
    with open(os.path.join(DATASET_PATH, 'test', 'test_data',
                           'test_image_features.pkl'), 'rb') as handle:
        image_feature_dict = pickle.load(handle)
    print('image_feaeture_dict loaded..')
    # NOTE(review): encoders are re-fitted on the *test* split — the mapping
    # may differ from training; verify this matches how the model was trained.
    for feat in sparse_features:
        lbe = LabelEncoder()
        item[feat] = lbe.fit_transform(item[feat])
    # could also be built from the test set / item ... (original note truncated)
    fixlen_feature_columns = []
    for feat in sparse_features:
        if feat == 'article_id':
            # Hard-coded training-time article vocabulary size so the
            # embedding matches the trained weights — TODO confirm 1896.
            fixlen_feature_columns.append(SparseFeat(feat, 1896))
        else:
            fixlen_feature_columns.append(
                SparseFeat(feat, item[feat].nunique()))
    # fixlen_feature_columns = [SparseFeat(feat, item[feat].nunique()) for feat in sparse_features]
    fixlen_feature_columns += [
        DenseFeat(feat, len(image_feature_dict[artics[0]]))
        for feat in dense_features
    ]
    print(fixlen_feature_columns)
    # Map encoded article index back to the raw article id.
    idx_artics_all = item['article_id'].tolist()
    for i in range(len(artics)):
        idx_artic = idx_artics_all[i]
        if idx_artic not in id_to_artic.keys():
            id_to_artic[idx_artic] = artics[i]
    linear_feature_columns = fixlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns
    fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns +
                                                    dnn_feature_columns)
    fixlen_feature_names_global = fixlen_feature_names
    model = xDeepFM(linear_feature_columns, dnn_feature_columns,
                    task='binary')
    # bind_nsml(model, list(), args.task)
    return model, fixlen_feature_names_global, item, image_feature_dict, \
        id_to_artic
import numpy as np

from deepctr.models import DIN
from deepctr.inputs import SparseFeat, VarLenSparseFeat, DenseFeat, get_fixlen_feature_names, get_varlen_feature_names

# --- feature definitions ------------------------------------------------
feature_columns = [
    SparseFeat('user', 3),
    SparseFeat('gender', 2),
    SparseFeat('item', 3 + 1),
    SparseFeat('item_gender', 2 + 1),
    DenseFeat('score', 1),
    VarLenSparseFeat('hist_item', 3 + 1, maxlen=4, embedding_name='item'),
    VarLenSparseFeat('hist_item_gender', 3 + 1, maxlen=4,
                     embedding_name='item_gender'),
]
behavior_feature_list = ["item", "item_gender"]

# --- toy inputs (0 is the mask value in the histories) ------------------
feature_dict = {
    'user': np.array([0, 1, 2]),
    'gender': np.array([0, 1, 0]),
    'item': np.array([1, 2, 3]),
    'item_gender': np.array([1, 2, 1]),
    'score': np.array([0.1, 0.2, 0.3]),
    'hist_item': np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]]),
    'hist_item_gender': np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]]),
}
x = [feature_dict[name] for name in get_fixlen_feature_names(feature_columns)]
x += [feature_dict[name] for name in get_varlen_feature_names(feature_columns)]
y = [1, 0, 1]

# --- build, compile and fit DIN -----------------------------------------
model = DIN(feature_columns, behavior_feature_list, hist_len_max=4, )
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
history = model.fit(x, y, verbose=1, epochs=10, validation_split=0.5)
def main(args):
    """Dispatch on model architecture; for xDeepFM, build features, negative-
    sample 2:1, and train via a generator (NSML workflow)."""
    if args.arch == 'MLP':
        model = get_mlp(num_classes=args.num_classes)
    elif args.arch == 'Resnet':
        model = get_resnet18(num_classes=args.num_classes)
    elif args.arch == 'xDeepFM':
        s = time.time()
        # --- load raw data and labels -----------------------------------
        csv_file = os.path.join(DATASET_PATH, 'train', 'train_data',
                                'train_data')
        item = pd.read_csv(csv_file,
                           dtype={
                               'article_id': str,
                               'hh': int,
                               'gender': str,
                               'age_range': str,
                               'read_article_ids': str
                           },
                           sep='\t')
        label_data_path = os.path.join(
            DATASET_PATH, 'train',
            os.path.basename(os.path.normpath(csv_file)).split('_')[0]
            + '_label')
        label = pd.read_csv(label_data_path, dtype={'label': int}, sep='\t')
        item['label'] = label
        print(len(item))
        sparse_features = [
            'article_id', 'hh', 'gender', 'age_range', 'len_bin'
        ]
        dense_features = ['image_feature']
        target = ['label']
        print(time.time() - s, 'seconds')
        s = time.time()
        # Number of previously-read articles per user; NaN (float) means none.
        len_lis = []
        read_article_ids_all = item['read_article_ids'].tolist()
        for i in range(len(item)):
            li = read_article_ids_all[i]
            if type(li) == float:
                len_lis.append(0)
                continue
            len_li = len(li.split(','))
            len_lis.append(len_li)
        print(f'read_article_ids_all len : {len(read_article_ids_all)}')
        """
        def extract_len_read_article(read_article_ids):
            if type(read_article_ids) == float:
                return 0
            else :
                return len(read_article_ids.split(','))
        read_article_ids_all = item['read_article_ids'].tolist()
        with Pool(processes=6) as p:
            len_lis = list(tqdm(p.imap(extract_len_read_article, read_article_ids_all), total=len(read_article_ids_all)))
        """
        item['len'] = len_lis
        item['len_bin'] = pd.qcut(item['len'], 6, duplicates='drop')
        print('len_bin finished ', time.time() - s, 'seconds')
        id_to_artic = dict()
        artics = item['article_id'].tolist()
        with open(
                os.path.join(DATASET_PATH, 'train', 'train_data',
                             'train_image_features.pkl'), 'rb') as handle:
            image_feature_dict = pickle.load(handle)
        # Label-encode every sparse column in place.
        for feat in sparse_features:
            lbe = LabelEncoder()
            item[feat] = lbe.fit_transform(item[feat])
        fixlen_feature_columns = [
            SparseFeat(feat, item[feat].nunique())
            for feat in sparse_features
        ]
        fixlen_feature_columns += [
            DenseFeat(feat, len(image_feature_dict[artics[0]]))
            for feat in dense_features
        ]
        print(artics[0])
        print(fixlen_feature_columns)
        """
        [SparseFeat(name='article_id', dimension=1896, use_hash=False, dtype='int32', embedding_name='article_id', embedding=True),
        SparseFeat(name='hh', dimension=24, use_hash=False, dtype='int32', embedding_name='hh', embedding=True),
        SparseFeat(name='gender', dimension=2, use_hash=False, dtype='int32', embedding_name='gender', embedding=True),
        SparseFeat(name='age_range', dimension=9, use_hash=False, dtype='int32', embedding_name='age_range', embedding=True),
        SparseFeat(name='len_bin', dimension=5, use_hash=False, dtype='int32', embedding_name='len_bin', embedding=True),
        DenseFeat(name='image_feature', dimension=2048, dtype='float32')]
        """
        print('---fixlen_feature_columns finished---')
        s = time.time()
        # Map encoded article index back to the raw article id.
        idx_artics_all = item['article_id'].tolist()
        print(f'idx_artics_all len : {len(idx_artics_all)}')
        print(f'artics len : {len(artics)}')
        for i in range(len(artics)):
            idx_artic = idx_artics_all[i]
            if idx_artic not in id_to_artic.keys():
                id_to_artic[idx_artic] = artics[i]
        print(f'id_to_artic len : {len(id_to_artic)}')
        print(time.time() - s, 'seconds')
        # can be fetched via image_feature_dict[article_id], so skip for now
        linear_feature_columns = fixlen_feature_columns
        dnn_feature_columns = fixlen_feature_columns
        fixlen_feature_names = get_fixlen_feature_names(
            linear_feature_columns + dnn_feature_columns)
        print(fixlen_feature_names)
        global fixlen_feature_names_global
        fixlen_feature_names_global = fixlen_feature_names
        model = xDeepFM(linear_feature_columns, dnn_feature_columns,
                        task='binary')
        print('---model defined---')
        # TODO: also write code to save the generated files — can't rerun
        # this every time
        """
        if args.use_gpu:
            model = model.cuda()
        else:
            model = model.cpu()
        """
        optimizer = tf.keras.optimizers.Adam(args.lr)
        # negative sampling
        item_pos = item[item['label'] == 1]
        item_neg = item[item['label'] == 0]
        print(f'len item_pos : {len(item_pos)}')
        print(f'len item_neg : {len(item_neg)}')
        dn_1 = item_neg.sample(n=2 * len(item_pos), random_state=42)
        # NOTE(review): reset_index() is not in-place — this line has no effect.
        dn_1.reset_index()
        print(f'len dn_1 : {len(dn_1)}')
        data_1 = pd.concat([dn_1, item_pos]).sample(
            frac=1, random_state=42).reset_index()
        print(f'len data_1 : {len(data_1)}')
        print('--- negative sampling completed ---')
        s = time.time()
        # Attach the image-feature vector per row.
        data_1_article_idxs = data_1['article_id'].tolist()
        li = []
        for i in range(len(data_1_article_idxs)):
            image_feature = image_feature_dict[id_to_artic[
                data_1_article_idxs[i]]]
            li.append(image_feature)
        print(f'len image_feature : {len(li)}')
        data_1['image_feature'] = li
        li = []
        print(f'finished data_1_image_feature : {time.time() - s} sec')
        print(f'generate all x_train')
        if use_nsml:
            bind_nsml(model, optimizer, args.task)
        if args.pause:
            nsml.paused(scope=locals())
        if (args.mode == 'train') or args.dry_run:
            best_loss = 1000
            if args.dry_run:
                print('start dry-running...!')
                args.num_epochs = 1
            else:
                print('start training...!')
            # pre-build the whole input up front; a generator isn't needed
            model.compile(
                tf.keras.optimizers.Adam(args.lr),
                'mse',
                metrics=['accuracy'],
            )
            train_generator = data_generator(data_1)
            lr_scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler)
            save_cbk = CustomModelCheckpoint()
            history = model.fit_generator(train_generator,
                                          epochs=200,
                                          verbose=2,
                                          workers=8,
                                          steps_per_epoch=np.ceil(
                                              len(data_1) / 2048),
                                          callbacks=[lr_scheduler, save_cbk])
            print('again')
            # NOTE(review): the original source ended with an unterminated
            # triple-quote (a truncated commented-out block); closed here to
            # keep the module importable.
            """(truncated in source: a commented-out block followed)"""
def main(args, local):
    """Entry point for the xDeepFM train/test dispatcher.

    In train mode: builds feature columns from the train CSV, defines the
    model, and binds it to NSML, then saves an 'infer' checkpoint.
    In test mode: rebuilds the model via get_item() and restores weights
    from a fixed NSML checkpoint/session pair.

    `local` is the caller's locals(), forwarded to nsml.paused() when the
    session is paused.
    """
    if args.arch == 'xDeepFM' and args.mode == 'train':
        s = time.time()
        csv_file = os.path.join(DATASET_PATH, 'train', 'train_data', 'train_data')
        # Raw interaction log; id-like columns are kept as strings.
        item = pd.read_csv(csv_file,
                           dtype={
                               'article_id': str,
                               'hh': int,
                               'gender': str,
                               'age_range': str,
                               'read_article_ids': str
                           },
                           sep='\t')
        # Label file lives next to the data file as '<prefix>_label'.
        label_data_path = os.path.join(
            DATASET_PATH, 'train',
            os.path.basename(os.path.normpath(csv_file)).split('_')[0] + '_label')
        label = pd.read_csv(label_data_path, dtype={'label': int}, sep='\t')
        item['label'] = label
        s = time.time()
        #print(f'before test article preprocess : {len(item)}')
        #print(f'after test article preprocess : {len(item)}')
        #print(f'time : {time.time() - s}')
        sparse_features = ['article_id', 'hh', 'gender', 'age_range', 'len_bin']
        dense_features = ['image_feature', 'read_cnt_prob']
        target = ['label']
        ############################ make more feature !!!!!!! #################################
        ############## 1. read_article_ids len cnt -- user feature #################################################
        # Read-history length feature: count of comma-separated article ids.
        # A float value here is NaN (missing history) -> length 0.
        len_lis = []
        read_article_ids_all = item['read_article_ids'].tolist()
        for i in range(len(item)):
            li = read_article_ids_all[i]
            if type(li) == float:
                len_lis.append(0)
                continue
            len_li = len(li.split(','))
            len_lis.append(len_li)
        item['len'] = len_lis
        # Bucket the read count into (at most) 6 quantile bins.
        item['len_bin'] = pd.qcut(item['len'], 6, duplicates='drop')
        id_to_artic = dict()
        artics = item['article_id'].tolist()
        #print(item.head(3))
        #print('columns name : ', item.columns)
        # NOTE(review): the two assignments below duplicate the identical
        # ones above — looks like dead code; confirm before removing.
        sparse_features = ['article_id', 'hh', 'gender', 'age_range', 'len_bin']
        dense_features = ['image_feature', 'read_cnt_prob']
        fixlen_feature_columns = [SparseFeat(feat, item[feat].nunique())
                                  for feat in sparse_features]
        # Image feature dimension is hard-coded to 2048 here — presumably the
        # extractor's output size; TODO confirm it matches the pickle contents.
        fixlen_feature_columns += [DenseFeat('image_feature', 2048)]
        fixlen_feature_columns += [DenseFeat('read_cnt_prob', 1)]
        #print(f'fixlen_feature_columns : {fixlen_feature_columns}')
        linear_feature_columns = fixlen_feature_columns
        dnn_feature_columns = fixlen_feature_columns
        fixlen_feature_names = get_fixlen_feature_names(
            linear_feature_columns + dnn_feature_columns)
        print(fixlen_feature_names)
        # Publish the input-name order so other code (e.g. inference) can
        # assemble model inputs in the same order.
        global fixlen_feature_names_global
        fixlen_feature_names_global = fixlen_feature_names
        model = xDeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
        print('---model defined---')
        #print(time.time() - s ,'seconds')
        if use_nsml and args.mode == 'train':
            bind_nsml(model, [], args.task)
    if args.mode == 'test':
        #print('_infer root - : ', DATASET_PATH)
        #print('test')
        #print('DATASET_PATH: ', DATASET_PATH)
        file_list = glob.glob(f'{DATASET_PATH}/test/test_data/*')
        #print('file_list: ', file_list)
        model, fixlen_feature_names_global, item, image_feature_dict, lit, lit_cnt_prob = get_item(DATASET_PATH, args.mode)
        bind_nsml(model, [], args.task)
        # Restore weights from a fixed NSML checkpoint/session pair
        # (checkpoint id, session path).
        checkpoint_session = ['3', 'team_62/airush2/361']
        nsml.load(checkpoint=str(checkpoint_session[0]),
                  session=str(checkpoint_session[1]))
        #print('successfully loaded')
    if (args.mode == 'train'):
        #print('DATASET_PATH: ', DATASET_PATH)
        #file_list= glob.glob(f'{DATASET_PATH}/train/train_data/*')
        #print('file_list :', file_list)
        if args.dry_run:
            print('start dry-running...!')
            args.num_epochs = 1
        else:
            print('start training...!')
        # Build everything up front; no real need to use a generator.
        nsml.save('infer')
        print('end')
    #print('end_main')
    if args.pause:
        nsml.paused(scope=local)
def get_item(root, phase):
    """Load the test split and rebuild model + feature metadata for inference.

    Reads the test CSV under `root`, recreates the read-history length
    feature, label-encodes the sparse columns, loads the pre-extracted
    image features from a pickle, and builds an (untrained) xDeepFM whose
    feature columns mirror the train-time setup.

    Returns:
        (model, fixlen_feature_names, item DataFrame, image_feature_dict,
         sorted unique article ids, read-count-probability string list)

    NOTE(review): `phase` is accepted but never used here — confirm callers.
    """
    #print('load')
    csv_file = os.path.join(root, 'test', 'test_data', 'test_data')
    item = pd.read_csv(csv_file,
                       dtype={
                           'article_id': str,
                           'hh': int,
                           'gender': str,
                           'age_range': str,
                           'read_article_ids': str
                       },
                       sep='\t')
    #print('loaded!!')
    sparse_features = ['article_id', 'hh', 'gender', 'age_range', 'len_bin']
    dense_features = ['image_feature', 'read_cnt_prob']
    # lit_cnt_prob_list is a module-level comma-separated string; strip all
    # whitespace/newlines, then split it into a list of string values.
    global lit_cnt_prob_list
    lit_cnt_prob_list = lit_cnt_prob_list.replace(' ', '')
    lit_cnt_prob_list = lit_cnt_prob_list.replace('\n', '')
    lit_cnt_prob = lit_cnt_prob_list.split(',')
    # Same read-history length feature as at train time (NaN/float -> 0).
    len_lis = []
    read_article_ids_all = item['read_article_ids'].tolist()
    for i in range(len(item)):
        li = read_article_ids_all[i]
        if type(li) == float:
            len_lis.append(0)
            continue
        len_li = len(li.split(','))
        len_lis.append(len_li)
    item['len'] = len_lis
    item['len_bin'] = pd.qcut(item['len'], 6, duplicates='drop')
    artics = item['article_id'].tolist()
    # Sorted, de-duplicated article ids: the LabelEncoder vocabulary.
    lit = list(set(artics))
    lit.sort()
    print(f'len lit : {len(lit)}')
    #### fea
    #print('feature dict generate')
    #resnet_feature_extractor('test')
    # Pre-extracted image features keyed by article id.
    # NOTE(review): absolute path — breaks outside the NSML test environment.
    with open(os.path.join('/data/airush2/test/test_data/test_image_features.pkl'), 'rb') as handle:
        image_feature_dict = pickle.load(handle)
    print('image_feaeture_dict loaded..')
    print('check artic feature')
    print(f"757518f4a3da : {image_feature_dict['757518f4a3da']}")
    # Encode article_id against the explicit `lit` vocabulary; the remaining
    # sparse columns are fit-transformed on the test data directly.
    lbe = LabelEncoder()
    lbe.fit(lit)
    item['article_id' + '_onehot'] = lbe.transform(item['article_id'])
    for feat in sparse_features[1:]:
        lbe = LabelEncoder()
        item[feat + '_onehot'] = lbe.fit_transform(item[feat])
    #print('----- after onehot encoding -----')
    #print(item.head(10))
    # could also build this from the test set, or from item..
    # NOTE(review): article_id vocabulary size is hard-coded to 1896 so it
    # matches the train-time embedding table — confirm against training.
    fixlen_feature_columns = [SparseFeat('article_id', 1896)]
    fixlen_feature_columns += [SparseFeat(feat, item[feat + '_onehot'].nunique())
                               for feat in sparse_features[1:]]
    fixlen_feature_columns += [DenseFeat('image_feature', len(image_feature_dict[artics[0]]))]
    fixlen_feature_columns += [DenseFeat('read_cnt_prob', 1)]
    #print(fixlen_feature_columns)
    idx_artics_all = item['article_id'].tolist()
    linear_feature_columns = fixlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns
    fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)
    fixlen_feature_names_global = fixlen_feature_names
    model = xDeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
    #bind_nsml(model, list(), args.task)
    return model, fixlen_feature_names_global, item, image_feature_dict, lit, lit_cnt_prob
def main(args):
    """Train xDeepFM on the click log with per-article image features.

    Pipeline: read train CSV + labels, build the read-history length
    feature, label-encode sparse columns, attach the pre-extracted image
    feature for each row, negative-sample clicks 3:1, then fit the model
    through a data generator with an LR scheduler and a checkpoint callback.
    """
    if args.arch == 'xDeepFM':
        s = time.time()
        csv_file = os.path.join(DATASET_PATH, 'train', 'train_data', 'train_data')
        item = pd.read_csv(csv_file,
                           dtype={
                               'article_id': str,
                               'hh': int,
                               'gender': str,
                               'age_range': str,
                               'read_article_ids': str
                           },
                           sep='\t')
        label_data_path = os.path.join(
            DATASET_PATH, 'train',
            os.path.basename(os.path.normpath(csv_file)).split('_')[0] + '_label')
        label = pd.read_csv(label_data_path, dtype={'label': int}, sep='\t')
        item['label'] = label
        sparse_features = [
            'article_id', 'hh', 'gender', 'age_range', 'len_bin'
        ]
        dense_features = ['image_feature']
        target = ['label']
        # Read-history length feature; a float value is NaN -> length 0.
        len_lis = []
        read_article_ids_all = item['read_article_ids'].tolist()
        for i in range(len(item)):
            li = read_article_ids_all[i]
            if type(li) == float:
                len_lis.append(0)
                continue
            len_li = len(li.split(','))
            len_lis.append(len_li)
        item['len'] = len_lis
        item['len_bin'] = pd.qcut(item['len'], 6, duplicates='drop')
        id_to_artic = dict()
        # Original (string) article ids, captured BEFORE label encoding below.
        artics = item['article_id'].tolist()
        # Pre-extracted image features keyed by original article id.
        with open(
                os.path.join(DATASET_PATH, 'train', 'train_data',
                             'train_image_features.pkl'), 'rb') as handle:
            image_feature_dict = pickle.load(handle)
        # Label-encode sparse columns in place (including article_id).
        for feat in sparse_features:
            lbe = LabelEncoder()
            item[feat] = lbe.fit_transform(item[feat])
        fixlen_feature_columns = [
            SparseFeat(feat, item[feat].nunique()) for feat in sparse_features
        ]
        # Dense column dimension taken from one sample image feature vector.
        fixlen_feature_columns += [
            DenseFeat(feat, len(image_feature_dict[artics[0]]))
            for feat in dense_features
        ]
        idx_artics_all = item['article_id'].tolist()
        # Map encoded article index -> original article id string, so image
        # features can be looked up after encoding.
        for i in range(len(artics)):
            idx_artic = idx_artics_all[i]
            if idx_artic not in id_to_artic.keys():
                id_to_artic[idx_artic] = artics[i]
        # Can fetch via image_feature_dict[article_id] directly, so skip for now.
        linear_feature_columns = fixlen_feature_columns
        dnn_feature_columns = fixlen_feature_columns
        fixlen_feature_names = get_fixlen_feature_names(
            linear_feature_columns + dnn_feature_columns)
        print(fixlen_feature_names)
        # Publish the input-name order for other code paths (e.g. inference).
        global fixlen_feature_names_global
        fixlen_feature_names_global = fixlen_feature_names
        model = xDeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
        print('---model defined---')
        # TODO: also save the generated artifacts; we can't rebuild them every run.
        print(time.time() - s, 'seconds')
        optimizer = tf.keras.optimizers.Adam(args.lr)
        s = time.time()
        # Negative sampling: all positives plus 3x as many negatives, shuffled.
        item_pos = item[item['label'] == 1]
        item_neg = item[item['label'] == 0]
        dn_1 = item_neg.sample(n=3 * len(item_pos), random_state=42)
        # NOTE(review): reset_index() returns a new frame and the result is
        # discarded here — this line is a no-op as written; confirm intent.
        dn_1.reset_index()
        data_1 = pd.concat([dn_1, item_pos]).sample(frac=1, random_state=42).reset_index()
        # Attach the image feature vector row-by-row via id_to_artic.
        data_1_article_idxs = data_1['article_id'].tolist()
        li = []
        for i in range(len(data_1_article_idxs)):
            image_feature = image_feature_dict[id_to_artic[data_1_article_idxs[i]]]
            li.append(image_feature)
        data_1['image_feature'] = li
        li = []
        print(f'finished data_1_image_feature : {time.time() - s} sec')
        if use_nsml:
            bind_nsml(model, optimizer, args.task)
        if args.pause:
            nsml.paused(scope=locals())
        if (args.mode == 'train') or args.dry_run:
            best_loss = 1000
            if args.dry_run:
                print('start dry-running...!')
                args.num_epochs = 1
            else:
                print('start training...!')
            # Build the whole training set up front; no real need for a generator.
            model.compile(
                tf.keras.optimizers.Adam(args.lr),
                'mse',
                metrics=['accuracy'],
            )
            train_generator = data_generator(data_1)
            lr_scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler)
            save_cbk = CustomModelCheckpoint()
            history = model.fit_generator(train_generator,
                                          epochs=100,
                                          verbose=2,
                                          workers=8,
                                          steps_per_epoch=np.ceil(
                                              len(data_1) / 2048),
                                          callbacks=[lr_scheduler, save_cbk])
            print('again')