Example No. 1
def get_item(root):
    print('load')
    csv_file = os.path.join(root, 'test', 'test_data', 'test_data')
    item = pd.read_csv(csv_file,
                dtype={
                    'article_id': str,
                    'hh': int, 'gender': str,
                    'age_range': str,
                    'read_article_ids': str
                }, sep='\t')
    print('loaded!!')
    sparse_features = ['article_id', 'hh','gender','age_range','len_bin']
    dense_features = ['image_feature']
    target = ['label']

    len_lis = []

    read_article_ids_all = item['read_article_ids'].tolist()
    for i in range(len(item)):
        li = read_article_ids_all[i]
        if type(li) == float:  # NaN (missing reading history) is parsed as a float
            len_lis.append(0)
            continue
        len_li = len(li.split(','))
        len_lis.append(len_li)
    
    
    item['len']  = len_lis
    item['len_bin']  = pd.qcut(item['len'],6,duplicates='drop')

    id_to_artic = dict()
    artics = item['article_id'].tolist()
    
    with open(os.path.join(DATASET_PATH, 'test', 'test_data', 'test_image_features.pkl'), 'rb') as handle:
        image_feature_dict = pickle.load(handle)

    print('image_feature_dict loaded..')
    for feat in sparse_features:
        lbe = LabelEncoder()
        item[feat] = lbe.fit_transform(item[feat])

    # this could be built from the test set, or from item ...
    fixlen_feature_columns = []
    for feat in sparse_features:
        if feat == 'article_id':
            fixlen_feature_columns.append(SparseFeat(feat, 1896))  # fixed article_id vocabulary size used at training time
        else:
            fixlen_feature_columns.append(SparseFeat(feat,item[feat].nunique()))
    #fixlen_feature_columns = [SparseFeat(feat, item[feat].nunique()) for feat in sparse_features]
    fixlen_feature_columns += [DenseFeat(feat,len(image_feature_dict[artics[0]])) for feat in dense_features]
    
    print(fixlen_feature_columns)
    
    
    idx_artics_all = item['article_id'].tolist()
    
    for i in range(len(artics)):
        idx_artic = idx_artics_all[i]
        if idx_artic not in id_to_artic.keys():
            id_to_artic[idx_artic] = artics[i]
    
    
       
    linear_feature_columns = fixlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns  
    fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)
    
    fixlen_feature_names_global = fixlen_feature_names

    model = xDeepFM(linear_feature_columns, dnn_feature_columns, task= 'binary')
    #bind_nsml(model, list(), args.task)

    return model, fixlen_feature_names_global, item,image_feature_dict, id_to_artic
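A minimal usage sketch for the function above (not part of the original listing): under DeepCTR's older list-input API, to which get_fixlen_feature_names belongs, model.predict expects one array per feature name in exactly that order. The helper name build_model_input and the batch size are illustrative assumptions.

import numpy as np

def build_model_input(df, feature_names, image_feature_dict, id_to_artic):
    # Hypothetical helper: attach each row's dense image vector, then order the
    # arrays to match the feature-name order DeepCTR expects.
    df = df.copy()
    df['image_feature'] = [image_feature_dict[id_to_artic[idx]]
                           for idx in df['article_id'].tolist()]
    inputs = []
    for name in feature_names:
        if name == 'image_feature':
            inputs.append(np.stack(df[name].values))  # 2-D dense block
        else:
            inputs.append(df[name].values)            # 1-D encoded column
    return inputs

# model, names, item, image_feature_dict, id_to_artic = get_item(DATASET_PATH)
# preds = model.predict(build_model_input(item, names, image_feature_dict, id_to_artic), batch_size=2048)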
Example No. 2
def main(args, local):
    
    if args.arch == 'xDeepFM' and args.mode == 'train':
        s = time.time()
        csv_file = os.path.join(DATASET_PATH, 'train', 'train_data', 'train_data')
        item = pd.read_csv(csv_file,
                    dtype={
                        'article_id': str,
                        'hh': int, 'gender': str,
                        'age_range': str,
                        'read_article_ids': str
                    }, sep='\t')
        label_data_path = os.path.join(DATASET_PATH, 'train',
                                os.path.basename(os.path.normpath(csv_file)).split('_')[0] + '_label')
        label = pd.read_csv(label_data_path,
                    dtype={'label': int},
                    sep='\t')
        item['label']  = label
        
        sparse_features = ['article_id', 'hh','gender','age_range','len_bin']
        dense_features = ['image_feature']
        target = ['label']
        
        
        len_lis = []

        read_article_ids_all = item['read_article_ids'].tolist()
        for i in range(len(item)):
            li = read_article_ids_all[i]
            if type(li) == float:
                len_lis.append(0)
                continue
            len_li = len(li.split(','))
            len_lis.append(len_li)
        
        
        item['len']  = len_lis
        item['len_bin']  = pd.qcut(item['len'],6,duplicates='drop')
    
        id_to_artic = dict()
        artics = item['article_id'].tolist()
        
        with open(os.path.join(DATASET_PATH, 'train', 'train_data', 'train_image_features.pkl'), 'rb') as handle:
            image_feature_dict = pickle.load(handle)
        for feat in sparse_features:
            lbe = LabelEncoder()
            item[feat] = lbe.fit_transform(item[feat])
        fixlen_feature_columns = [SparseFeat(feat, item[feat].nunique()) for feat in sparse_features]
        fixlen_feature_columns += [DenseFeat(feat,len(image_feature_dict[artics[0]])) for feat in dense_features]
        
        
        
        idx_artics_all = item['article_id'].tolist()
        
        for i in range(len(artics)):
            idx_artic = idx_artics_all[i]
            if idx_artic not in id_to_artic.keys():
                id_to_artic[idx_artic] = artics[i]
        
       
            # can be fetched via image_feature_dict[article_id], so skip it for now
        linear_feature_columns = fixlen_feature_columns
        dnn_feature_columns = fixlen_feature_columns  
        fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)
        print(fixlen_feature_names)
        global fixlen_feature_names_global
        fixlen_feature_names_global = fixlen_feature_names
        model = xDeepFM(linear_feature_columns, dnn_feature_columns, task= 'binary')
        print('---model defined---')
        # also write code to save the generated files, since this cannot be rerun every time
        print(time.time() - s ,'seconds')


    if use_nsml and args.mode == 'train':

        bind_nsml(model,[], args.task)
    
    
    if args.mode == 'test':
        print('_infer root - : ', DATASET_PATH)
        print('test')
        model, fixlen_feature_names_global, item, image_feature_dict, id_to_artic = get_item(DATASET_PATH)
        bind_nsml(model, [], args.task)
        checkpoint_session = ['401','team_62/airush2/176']
        nsml.load(checkpoint = str(checkpoint_session[0]), session = str(checkpoint_session[1])) 
        print('successfully loaded')

    if (args.mode == 'train'):
        if args.dry_run:
            print('start dry-running...!')
            args.num_epochs = 1
        else:
            print('start training...!')
        # build everything up front; no real need for a generator

        nsml.save('infer')
        print('end')
    print('end_main')

    if args.pause:
        nsml.paused(scope=local)
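The per-row loop over read_article_ids used throughout these examples treats a NaN history (read in as a float) as zero articles. A minimal equivalent sketch with pandas string handling, assuming the same comma-separated format:

import pandas as pd

# Missing histories arrive as NaN; map them to 0, otherwise count the comma-separated ids.
item['len'] = item['read_article_ids'].fillna('').map(lambda s: len(s.split(',')) if s else 0)
item['len_bin'] = pd.qcut(item['len'], 6, duplicates='drop')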
Example No. 3
def main(args):
    if args.arch == 'MLP':
        model = get_mlp(num_classes=args.num_classes)
    elif args.arch == 'Resnet':
        model = get_resnet18(num_classes=args.num_classes)
    elif args.arch == 'xDeepFM':
        s = time.time()
        csv_file = os.path.join(DATASET_PATH, 'train', 'train_data',
                                'train_data')
        item = pd.read_csv(csv_file,
                           dtype={
                               'article_id': str,
                               'hh': int,
                               'gender': str,
                               'age_range': str,
                               'read_article_ids': str
                           },
                           sep='\t')
        label_data_path = os.path.join(
            DATASET_PATH, 'train',
            os.path.basename(os.path.normpath(csv_file)).split('_')[0] +
            '_label')
        label = pd.read_csv(label_data_path, dtype={'label': int}, sep='\t')
        item['label'] = label
        print(len(item))
        sparse_features = [
            'article_id', 'hh', 'gender', 'age_range', 'len_bin'
        ]
        dense_features = ['image_feature']
        target = ['label']
        print(time.time() - s, 'seconds')
        s = time.time()
        len_lis = []

        read_article_ids_all = item['read_article_ids'].tolist()
        for i in range(len(item)):
            li = read_article_ids_all[i]
            if type(li) == float:
                len_lis.append(0)
                continue
            len_li = len(li.split(','))
            len_lis.append(len_li)
        print(f'read_article_ids_all len : {len(read_article_ids_all)}')
        """
        def extract_len_read_article(read_article_ids):
            if type(read_article_ids) == float:
                return 0
            else :
                return len(read_article_ids.split(','))
        read_article_ids_all = item['read_article_ids'].tolist()
        with Pool(processes=6) as p:
            len_lis = list(tqdm(p.imap(extract_len_read_article, read_article_ids_all), total=len(read_article_ids_all)))
        """
        item['len'] = len_lis
        item['len_bin'] = pd.qcut(item['len'], 6, duplicates='drop')
        print('len_bin finished ', time.time() - s, 'seconds')
        id_to_artic = dict()
        artics = item['article_id'].tolist()

        with open(
                os.path.join(DATASET_PATH, 'train', 'train_data',
                             'train_image_features.pkl'), 'rb') as handle:
            image_feature_dict = pickle.load(handle)
        for feat in sparse_features:
            lbe = LabelEncoder()
            item[feat] = lbe.fit_transform(item[feat])
        fixlen_feature_columns = [
            SparseFeat(feat, item[feat].nunique()) for feat in sparse_features
        ]
        fixlen_feature_columns += [
            DenseFeat(feat, len(image_feature_dict[artics[0]]))
            for feat in dense_features
        ]
        print(artics[0])
        print(fixlen_feature_columns)
        """
        [SparseFeat(name='article_id', dimension=1896, use_hash=False, dtype='int32', embedding_name='article_id', embedding=True), SparseFeat(name='hh', dimension=24, use_hash=False, dtype='int32', embedding_name='hh', embedding=True), SparseFeat(name='gender', dimension=2, use_hash=False, dtype='int32', embedding_name='gender', embedding=True), SparseFeat(name='age_range', dimension=9, use_hash=False, dtype='int32', embedding_name='age_range', embedding=True), SparseFeat(name='len_bin', dimension=5, use_hash=False, dtype='int32', embedding_name='len_bin', embedding=True), DenseFeat(name='image_feature', dimension=2048, dtype='float32')]
        
        """
        print('---fixlen_feature_columns finished---')
        s = time.time()
        idx_artics_all = item['article_id'].tolist()
        print(f'idx_artics_all len : {len(idx_artics_all)}')
        print(f'artics len : {len(artics)}')
        for i in range(len(artics)):
            idx_artic = idx_artics_all[i]
            if idx_artic not in id_to_artic.keys():
                id_to_artic[idx_artic] = artics[i]
        print(f'id_to_artic len : {len(id_to_artic)}')
        print(time.time() - s, 'seconds')
        # can be fetched via image_feature_dict[article_id], so skip it for now
        linear_feature_columns = fixlen_feature_columns
        dnn_feature_columns = fixlen_feature_columns
        fixlen_feature_names = get_fixlen_feature_names(
            linear_feature_columns + dnn_feature_columns)
        print(fixlen_feature_names)
        global fixlen_feature_names_global
        fixlen_feature_names_global = fixlen_feature_names
        model = xDeepFM(linear_feature_columns,
                        dnn_feature_columns,
                        task='binary')
        print('---model defined---')
        # also write code to save the generated files, since this cannot be rerun every time
    """
    if args.use_gpu:
        model = model.cuda()
    else:
        model = model.cpu()

    """
    optimizer = tf.keras.optimizers.Adam(args.lr)

    # negative sampling
    item_pos = item[item['label'] == 1]
    item_neg = item[item['label'] == 0]
    print(f'len item_pos : {len(item_pos)}')
    print(f'len item_neg : {len(item_neg)}')

    dn_1 = item_neg.sample(n=2 * len(item_pos), random_state=42)
    dn_1 = dn_1.reset_index(drop=True)  # reset_index returns a new frame; keep the result
    print(f'len dn_1 : {len(dn_1)}')

    data_1 = pd.concat([dn_1, item_pos]).sample(frac=1,
                                                random_state=42).reset_index()
    print(f'len data_1 : {len(data_1)}')
    print('--- negative sampling completed ---')

    s = time.time()
    data_1_article_idxs = data_1['article_id'].tolist()
    li = []
    for i in range(len(data_1_article_idxs)):
        image_feature = image_feature_dict[id_to_artic[data_1_article_idxs[i]]]
        li.append(image_feature)

    print(f'len image_feature : {len(li)}')
    data_1['image_feature'] = li
    li = []
    print(f'finished data_1_image_feature : {time.time() - s} sec')

    print(f'generate all x_train')

    if use_nsml:
        bind_nsml(model, optimizer, args.task)
    if args.pause:
        nsml.paused(scope=locals())

    if (args.mode == 'train') or args.dry_run:

        best_loss = 1000
        if args.dry_run:
            print('start dry-running...!')
            args.num_epochs = 1
        else:
            print('start training...!')
        # build everything up front; no real need for a generator
        model.compile(
            tf.keras.optimizers.Adam(args.lr),
            'mse',
            metrics=['accuracy'],
        )
        train_generator = data_generator(data_1)
        lr_scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler)

        save_cbk = CustomModelCheckpoint()

        history = model.fit_generator(train_generator,
                                      epochs=200,
                                      verbose=2,
                                      workers=8,
                                      steps_per_epoch=np.ceil(
                                          len(data_1) / 2048),
                                      callbacks=[lr_scheduler, save_cbk])
        print('again')
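The loop above that attaches image features to the negatively sampled frame can be condensed with Series.map; a sketch under the same assumption that id_to_artic maps each label-encoded article index back to the raw article_id key of image_feature_dict:

# Equivalent to the data_1_article_idxs loop: look up each row's image vector by article id.
data_1['image_feature'] = data_1['article_id'].map(lambda idx: image_feature_dict[id_to_artic[idx]])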
        """
Example No. 4
def main(args, local):
    
    if args.arch == 'xDeepFM' and args.mode == 'train':


        s = time.time()
        csv_file = os.path.join(DATASET_PATH, 'train', 'train_data', 'train_data')
        item = pd.read_csv(csv_file,
                    dtype={
                        'article_id': str,
                        'hh': int, 'gender': str,
                        'age_range': str,
                        'read_article_ids': str
                    }, sep='\t')
        label_data_path = os.path.join(DATASET_PATH, 'train',
                                os.path.basename(os.path.normpath(csv_file)).split('_')[0] + '_label')
        label = pd.read_csv(label_data_path,
                    dtype={'label': int},
                    sep='\t')
        item['label']  = label
        s = time.time()
        #print(f'before test article preprocess : {len(item)}')
        
        #print(f'after test  article preprocess : {len(item)}')
        #print(f'time : {time.time() - s}')

        sparse_features = ['article_id', 'hh','gender','age_range','len_bin']
        dense_features = ['image_feature', 'read_cnt_prob']
        target = ['label']
        
        # --- additional feature engineering ---
        # 1. read_article_ids length count -- user feature
        len_lis = []

        read_article_ids_all = item['read_article_ids'].tolist()
        for i in range(len(item)):
            li = read_article_ids_all[i]
            if type(li) == float:
                len_lis.append(0)
                continue
            len_li = len(li.split(','))
            len_lis.append(len_li)
        
        
        item['len']  = len_lis
        item['len_bin']  = pd.qcut(item['len'],6,duplicates='drop')
    
        id_to_artic = dict()
        artics = item['article_id'].tolist()
        

        #print(item.head(3))
        #print('columns name : ',item.columns)
        sparse_features = ['article_id', 'hh','gender','age_range','len_bin']
        dense_features = ['image_feature', 'read_cnt_prob']
        
        fixlen_feature_columns = [SparseFeat(feat, item[feat].nunique()) for feat in sparse_features]
        fixlen_feature_columns += [DenseFeat('image_feature',2048)]
        fixlen_feature_columns += [DenseFeat('read_cnt_prob',1)]
        
        #print(f'fixlen_feature_columns : {fixlen_feature_columns}')
 
        
        linear_feature_columns = fixlen_feature_columns
        dnn_feature_columns = fixlen_feature_columns  
        fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)
        print(fixlen_feature_names)
        global fixlen_feature_names_global
        fixlen_feature_names_global = fixlen_feature_names
        model = xDeepFM(linear_feature_columns, dnn_feature_columns, task= 'regression')
        print('---model defined---')
        #print(time.time() - s ,'seconds')


    if use_nsml and args.mode == 'train':

        bind_nsml(model,[], args.task)
    
    
    if args.mode == 'test':
        #print('_infer root - : ', DATASET_PATH)
        #print('test')
        #print('DATASET_PATH: ', DATASET_PATH)
        file_list= glob.glob(f'{DATASET_PATH}/test/test_data/*')
        #print('file_list: ',file_list)
        model, fixlen_feature_names_global, item, image_feature_dict,lit,lit_cnt_prob = get_item(DATASET_PATH,args.mode)
        bind_nsml(model, [], args.task)
        checkpoint_session = ['3','team_62/airush2/361']
        nsml.load(checkpoint = str(checkpoint_session[0]), session = str(checkpoint_session[1])) 
        #print('successfully loaded')

    if (args.mode == 'train'):
        #print('DATASET_PATH: ', DATASET_PATH)
        #file_list= glob.glob(f'{DATASET_PATH}/train/train_data/*')
        #print('file_list :',file_list)
        if args.dry_run:
            print('start dry-running...!')
            args.num_epochs = 1
        else:
            print('start training...!')
        # build everything up front; no real need for a generator

        nsml.save('infer')
        print('end')
    #print('end_main')

    if args.pause:
        nsml.paused(scope=local)
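The test branch above receives lit (the sorted article ids) and lit_cnt_prob (the matching probabilities parsed from a hard-coded string) from get_item in the next example. A sketch of how the read_cnt_prob column could then be attached, assuming the two lists are index-aligned:

# Map each article id to its training-time read probability; unseen ids fall back to 0.
prob_by_artic = dict(zip(lit, (float(p) for p in lit_cnt_prob)))
item['read_cnt_prob'] = item['article_id'].map(lambda a: prob_by_artic.get(a, 0.0))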
Example No. 5
def get_item(root, phase):
    #print('load')
    csv_file = os.path.join(root, 'test', 'test_data', 'test_data')
    item = pd.read_csv(csv_file,
                dtype={
                    'article_id': str,
                    'hh': int, 'gender': str,
                    'age_range': str,
                    'read_article_ids': str
                }, sep='\t')
    #print('loaded!!')
    sparse_features = ['article_id', 'hh','gender','age_range','len_bin']
    dense_features = ['image_feature', 'read_cnt_prob']
        

  
    global lit_cnt_prob_list
    lit_cnt_prob_list = lit_cnt_prob_list.replace(' ','')
    lit_cnt_prob_list = lit_cnt_prob_list.replace('\n','')
    lit_cnt_prob = lit_cnt_prob_list.split(',')


    len_lis = []

    read_article_ids_all = item['read_article_ids'].tolist()
    for i in range(len(item)):
        li = read_article_ids_all[i]
        if type(li) == float:
            len_lis.append(0)
            continue
        len_li = len(li.split(','))
        len_lis.append(len_li)
    
    
    item['len']  = len_lis
    item['len_bin']  = pd.qcut(item['len'],6,duplicates='drop')


    artics = item['article_id'].tolist()
    lit = list(set(artics))
    lit.sort()
    print(f'len lit : {len(lit)}')
    #### fea
    #print('feature dict generate')
    #resnet_feature_extractor('test')

    with open(os.path.join('/data/airush2/test/test_data/test_image_features.pkl'), 'rb') as handle:
        image_feature_dict = pickle.load(handle)
    print('image_feature_dict loaded..')
    print('check artic feature')
    print(f"757518f4a3da : {image_feature_dict['757518f4a3da']}")
    
    
    lbe = LabelEncoder()
    lbe.fit(lit)
    item['article_id' + '_onehot'] = lbe.transform(item['article_id'])

    for feat in sparse_features[1:]:
        lbe = LabelEncoder()
        item[feat + '_onehot'] = lbe.fit_transform(item[feat])

    
    #print('----- after onehot encoding -----')
    #print(item.head(10))
    # this could be built from the test set, or from item ...

    fixlen_feature_columns = [SparseFeat('article_id',1896)]
    fixlen_feature_columns += [SparseFeat(feat, item[feat +'_onehot'].nunique()) for feat in sparse_features[1:]]
    fixlen_feature_columns += [DenseFeat('image_feature',len(image_feature_dict[artics[0]]))]
    fixlen_feature_columns += [DenseFeat('read_cnt_prob',1)]
    
    #print(fixlen_feature_columns)
    
    
    idx_artics_all = item['article_id'].tolist()
    
       
    linear_feature_columns = fixlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns  
    fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)
    
    fixlen_feature_names_global = fixlen_feature_names

    model = xDeepFM(linear_feature_columns, dnn_feature_columns, task= 'binary')
    #bind_nsml(model, list(), args.task)

    return model, fixlen_feature_names_global, item,image_feature_dict, lit, lit_cnt_prob
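The function above re-fits LabelEncoder on the sorted unique article list so the mapping is deterministic at test time. A minimal sketch of one way to guarantee the train and inference encodings match, assuming the training side persists its fitted classes (the file name and train_article_ids are illustrative):

import numpy as np
from sklearn.preprocessing import LabelEncoder

# Training side: fit once on a sorted, deduplicated id list and persist the classes.
lbe = LabelEncoder()
lbe.fit(sorted(set(train_article_ids)))          # train_article_ids is assumed to exist
np.save('article_id_classes.npy', lbe.classes_)

# Inference side: restore the classes and transform with the exact same mapping.
lbe_test = LabelEncoder()
lbe_test.classes_ = np.load('article_id_classes.npy', allow_pickle=True)
item['article_id_onehot'] = lbe_test.transform(item['article_id'])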
Example No. 6
def main(args):

    if args.arch == 'xDeepFM':
        s = time.time()
        csv_file = os.path.join(DATASET_PATH, 'train', 'train_data',
                                'train_data')
        item = pd.read_csv(csv_file,
                           dtype={
                               'article_id': str,
                               'hh': int,
                               'gender': str,
                               'age_range': str,
                               'read_article_ids': str
                           },
                           sep='\t')
        label_data_path = os.path.join(
            DATASET_PATH, 'train',
            os.path.basename(os.path.normpath(csv_file)).split('_')[0] +
            '_label')
        label = pd.read_csv(label_data_path, dtype={'label': int}, sep='\t')
        item['label'] = label

        sparse_features = [
            'article_id', 'hh', 'gender', 'age_range', 'len_bin'
        ]
        dense_features = ['image_feature']
        target = ['label']

        len_lis = []

        read_article_ids_all = item['read_article_ids'].tolist()
        for i in range(len(item)):
            li = read_article_ids_all[i]
            if type(li) == float:
                len_lis.append(0)
                continue
            len_li = len(li.split(','))
            len_lis.append(len_li)

        item['len'] = len_lis
        item['len_bin'] = pd.qcut(item['len'], 6, duplicates='drop')

        id_to_artic = dict()
        artics = item['article_id'].tolist()

        with open(
                os.path.join(DATASET_PATH, 'train', 'train_data',
                             'train_image_features.pkl'), 'rb') as handle:
            image_feature_dict = pickle.load(handle)
        for feat in sparse_features:
            lbe = LabelEncoder()
            item[feat] = lbe.fit_transform(item[feat])
        fixlen_feature_columns = [
            SparseFeat(feat, item[feat].nunique()) for feat in sparse_features
        ]
        fixlen_feature_columns += [
            DenseFeat(feat, len(image_feature_dict[artics[0]]))
            for feat in dense_features
        ]

        idx_artics_all = item['article_id'].tolist()

        for i in range(len(artics)):
            idx_artic = idx_artics_all[i]
            if idx_artic not in id_to_artic.keys():
                id_to_artic[idx_artic] = artics[i]

            # can be fetched via image_feature_dict[article_id], so skip it for now
        linear_feature_columns = fixlen_feature_columns
        dnn_feature_columns = fixlen_feature_columns
        fixlen_feature_names = get_fixlen_feature_names(
            linear_feature_columns + dnn_feature_columns)
        print(fixlen_feature_names)
        global fixlen_feature_names_global
        fixlen_feature_names_global = fixlen_feature_names
        model = xDeepFM(linear_feature_columns,
                        dnn_feature_columns,
                        task='regression')
        print('---model defined---')
        # also write code to save the generated files, since this cannot be rerun every time
        print(time.time() - s, 'seconds')

    optimizer = tf.keras.optimizers.Adam(args.lr)
    s = time.time()

    # negative sampling
    item_pos = item[item['label'] == 1]
    item_neg = item[item['label'] == 0]

    dn_1 = item_neg.sample(n=3 * len(item_pos), random_state=42)
    dn_1 = dn_1.reset_index(drop=True)  # reset_index returns a new frame; keep the result

    data_1 = pd.concat([dn_1, item_pos]).sample(frac=1,
                                                random_state=42).reset_index()

    data_1_article_idxs = data_1['article_id'].tolist()
    li = []
    for i in range(len(data_1_article_idxs)):
        image_feature = image_feature_dict[id_to_artic[data_1_article_idxs[i]]]
        li.append(image_feature)

    data_1['image_feature'] = li
    li = []
    print(f'finished data_1_image_feature : {time.time() - s} sec')

    if use_nsml:
        bind_nsml(model, optimizer, args.task)
    if args.pause:
        nsml.paused(scope=locals())

    if (args.mode == 'train') or args.dry_run:
        best_loss = 1000
        if args.dry_run:
            print('start dry-running...!')
            args.num_epochs = 1
        else:
            print('start training...!')
        # build everything up front; no real need for a generator
        model.compile(
            tf.keras.optimizers.Adam(args.lr),
            'mse',
            metrics=['accuracy'],
        )
        train_generator = data_generator(data_1)
        lr_scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler)

        save_cbk = CustomModelCheckpoint()

        history = model.fit_generator(train_generator,
                                      epochs=100,
                                      verbose=2,
                                      workers=8,
                                      steps_per_epoch=np.ceil(
                                          len(data_1) / 2048),
                                      callbacks=[lr_scheduler, save_cbk])
        print('again')
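data_generator is called in the training blocks but never shown in these excerpts. A minimal sketch of what such a generator might look like, assuming it yields DeepCTR-style list inputs ordered by fixlen_feature_names_global plus the label column (the batch size mirrors the steps_per_epoch computation above):

import numpy as np

def data_generator(df, batch_size=2048):
    # Hypothetical infinite batch generator for model.fit_generator.
    n = len(df)
    while True:
        for start in range(0, n, batch_size):
            batch = df.iloc[start:start + batch_size]
            inputs = []
            for name in fixlen_feature_names_global:
                if name == 'image_feature':
                    inputs.append(np.stack(batch[name].values))
                else:
                    inputs.append(batch[name].values)
            yield inputs, batch['label'].values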
Example No. 7
def main(args):

    if args.arch == 'xDeepFM':
        s = time.time()
        csv_file = os.path.join(DATASET_PATH, 'train', 'train_data',
                                'train_data')
        item = pd.read_csv(csv_file,
                           dtype={
                               'article_id': str,
                               'hh': int,
                               'gender': str,
                               'age_range': str,
                               'read_article_ids': str
                           },
                           sep='\t')
        label_data_path = os.path.join(
            DATASET_PATH, 'train',
            os.path.basename(os.path.normpath(csv_file)).split('_')[0] +
            '_label')
        label = pd.read_csv(label_data_path, dtype={'label': int}, sep='\t')
        item['label'] = label
        s = time.time()
        print(f'before test article preprocess : {len(item)}')

        sparse_features = [
            'article_id', 'hh', 'gender', 'age_range', 'len_bin'
        ]
        dense_features = ['image_feature', 'read_cnt_prob']
        target = ['label']

        # --- additional feature engineering ---
        # 1. read_article_ids length count -- user feature
        len_lis = []

        read_article_ids_all = item['read_article_ids'].tolist()
        for i in range(len(item)):
            li = read_article_ids_all[i]
            if type(li) == float:
                len_lis.append(0)
                continue
            len_li = len(li.split(','))
            len_lis.append(len_li)

        item['len'] = len_lis
        item['len_bin'] = pd.qcut(item['len'], 6, duplicates='drop')

        id_to_artic = dict()
        artics = item['article_id'].tolist()

        # 2. read_cnt, total_cnt, read_cnt_prob -- article feature
        read_cnt = item[item['label'] == 1].groupby('article_id').agg(
            {'hh': 'count'})
        read_cnt = read_cnt.reset_index()
        read_cnt = read_cnt.rename(columns={'hh': 'read_cnt'})

        read_cnt_list = read_cnt['read_cnt'].tolist()
        read_cnt_artic_list = read_cnt['article_id'].tolist()
        print(f'len read_cnt : {len(read_cnt)}')
        print(read_cnt.head(3))

        total_cnt = item.groupby('article_id').agg({'hh': 'count'})
        total_cnt = total_cnt.reset_index()
        total_cnt = total_cnt.rename(columns={'hh': 'total_cnt'})
        total_cnt_list = total_cnt['total_cnt'].tolist()
        total_cnt_artic_list = total_cnt['article_id'].tolist()
        print(f'len total_cnt : {len(total_cnt)}')
        print(total_cnt.head(3))

        # lit # test_article_ids list
        lit_cnt = []
        lit_total_cnt = []
        lit_cnt_prob = []
        lit = list(set(artics))
        lit.sort()
        print(lit[:10])
        print(f'len(lit):{len(lit)}')
        for i in range(len(lit)):
            # lit_cnt
            cur_artic = lit[i]
            if cur_artic not in read_cnt_artic_list:
                lit_cnt.append(0)
            else:
                for j in range(len(read_cnt_artic_list)):
                    if cur_artic == read_cnt_artic_list[j]:
                        lit_cnt.append(read_cnt_list[j])
                        break
            # lit_total_cnt
            if cur_artic not in total_cnt_artic_list:
                lit_total_cnt.append(0)
            else:
                for j in range(len(total_cnt_artic_list)):
                    if cur_artic == total_cnt_artic_list[j]:
                        lit_total_cnt.append(total_cnt_list[j])
                        break
            # lit_cnt_prob
            if lit_total_cnt[i] == 0:
                lit_cnt_prob.append(0)
            else:
                lit_cnt_prob.append(lit_cnt[i] / lit_total_cnt[i])
        print('--- read_cnt article feature completed ---')
        print(f'lit_cnt {len(lit_cnt)}')
        print(f'lit_total_cnt {len(lit_total_cnt)}')
        print(f'lit_cnt_prob {len(lit_cnt_prob)}')

        #### fea
        print('feature dict generate')
        file_list1 = os.listdir(DATASET_PATH)
        file_list2 = os.listdir(DATASET_PATH + '/train')
        file_list3 = os.listdir(DATASET_PATH + '/train/train_data')

        print(file_list1)
        print(file_list2)
        print(file_list3)
        resnet_feature_extractor(args.mode)

        print(file_list1)
        print(file_list2)
        print(file_list3)

        # One hot Encoding
        with open(os.path.join('train_image_features_50.pkl'), 'rb') as handle:
            image_feature_dict = pickle.load(handle)

        print('check artic feature')
        print(f"757518f4a3da : {image_feature_dict['757518f4a3da']}")

        lbe = LabelEncoder()
        lbe.fit(lit)
        item['article_id' + '_onehot'] = lbe.transform(item['article_id'])
        print(lbe.classes_)

        for feat in sparse_features[1:]:
            lbe = LabelEncoder()
            item[feat + '_onehot'] = lbe.fit_transform(
                item[feat])  # must also check that the re-encoded labels stay consistent across splits

        print(item.head(10))
        print('columns name : ', item.columns)
        fixlen_feature_columns = [SparseFeat('article_id', len(lit))]
        fixlen_feature_columns += [
            SparseFeat(feat, item[feat + '_onehot'].nunique())
            for feat in sparse_features[1:]
        ]
        fixlen_feature_columns += [
            DenseFeat('image_feature', len(image_feature_dict[artics[0]]))
        ]
        fixlen_feature_columns += [DenseFeat('read_cnt_prob', 1)]

        print(f'fixlen_feature_columns : {fixlen_feature_columns}')
        idx_artics_all = item['article_id' + '_onehot'].tolist()

        for i in range(len(artics)):
            idx_artic = idx_artics_all[i]
            if idx_artic not in id_to_artic.keys():
                id_to_artic[idx_artic] = artics[i]

        linear_feature_columns = fixlen_feature_columns
        dnn_feature_columns = fixlen_feature_columns
        fixlen_feature_names = get_fixlen_feature_names(
            linear_feature_columns + dnn_feature_columns)
        print(fixlen_feature_names)
        global fixlen_feature_names_global
        fixlen_feature_names_global = fixlen_feature_names
        model = xDeepFM(linear_feature_columns,
                        dnn_feature_columns,
                        task='binary')
        print('---model defined---')
        print(time.time() - s, 'seconds')

        # print the article ids and read probabilities below so they can be copied into the inference code

        for artic in lit:
            print(artic, end=',')
        print()
        print('new')
        print()

        print(len(lit_cnt_prob))
        for prob in lit_cnt_prob:
            prob = round(prob, 4)
            print(prob, end=',')
        print()
        print('end')
        print('--------------')

    optimizer = tf.keras.optimizers.Adam(args.lr)
    s = time.time()

    # negative sampling
    item_pos = item[item['label'] == 1]
    item_neg = item[item['label'] == 0]

    dn_1 = item_neg.sample(n=3 * len(item_pos), random_state=42)
    dn_2 = item_neg.sample(n=3 * len(item_pos), random_state=20)
    dn_3 = item_neg.sample(n=3 * len(item_pos), random_state=7)
    dn_4 = item_neg.sample(n=3 * len(item_pos), random_state=33)
    dn_5 = item_neg.sample(n=3 * len(item_pos), random_state=41)

    dn_1 = dn_1.reset_index(drop=True)  # reset_index returns a new frame; keep the result

    data_1 = pd.concat([dn_1, item_pos]).sample(frac=1,
                                                random_state=42).reset_index()
    data_1_article_idxs = data_1['article_id_onehot'].tolist()
    data_1_article = data_1['article_id'].tolist()
    print(f'len data_1 : {len(data_1)}')
    print(data_1.head(5))
    li1 = []
    li2 = []
    li3 = []
    for i in range(len(data_1_article)):
        for j in range(len(lit_cnt_prob)):
            if data_1_article[i] == lit[j]:
                li3.append(lit_cnt_prob[j])
                break
    data_1['read_cnt_prob'] = li3
    print('---read_cnt_prob end---')
    ## preprocess append

    data_2 = pd.concat([dn_2, item_pos]).sample(frac=1,
                                                random_state=42).reset_index()
    data_3 = pd.concat([dn_3, item_pos]).sample(frac=1,
                                                random_state=42).reset_index()
    data_4 = pd.concat([dn_4, item_pos]).sample(frac=1,
                                                random_state=42).reset_index()
    data_5 = pd.concat([dn_5, item_pos]).sample(frac=1,
                                                random_state=42).reset_index()

    li = []
    for i in range(len(data_1_article_idxs)):
        image_feature = image_feature_dict[id_to_artic[data_1_article_idxs[i]]]
        li.append(image_feature)
    print(f'article_id : {data_1_article[0]}')
    print(f'article_image_feature : {image_feature_dict[data_1_article[0]]}')

    data_1['image_feature'] = li
    li = []
    print(f'finished data_1_image_feature : {time.time() - s} sec')

    if use_nsml:
        bind_nsml(model, optimizer, args.task)
    if args.pause:
        nsml.paused(scope=locals())

    if (args.mode == 'train') or args.dry_run:
        best_loss = 1000
        if args.dry_run:
            print('start dry-running...!')
            args.num_epochs = 1
        else:
            print('start training...!')

        model.compile(
            tf.keras.optimizers.Adam(args.lr),
            'mse',
            metrics=['accuracy'],
        )
        train_generator = data_generator(data_1)
        lr_scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler)

        # drop the checkpoint callback when doing k-fold
        save_cbk = CustomModelCheckpoint()

        history = model.fit_generator(train_generator,
                                      epochs=100,
                                      verbose=2,
                                      workers=8,
                                      steps_per_epoch=np.ceil(
                                          len(data_1) / 2048),
                                      callbacks=[lr_scheduler, save_cbk])
        print('again')
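The per-article read probability built above with nested loops over lit, read_cnt_artic_list, and total_cnt_artic_list can also be computed with two groupby passes; a sketch assuming the same item frame with 'article_id' and 'label' columns:

# Positive impressions per article divided by total impressions per article.
read_cnt = item[item['label'] == 1].groupby('article_id').size()
total_cnt = item.groupby('article_id').size()
read_cnt_prob = (read_cnt / total_cnt).fillna(0)   # articles never read get probability 0
item['read_cnt_prob'] = item['article_id'].map(read_cnt_prob)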