def main():
    args = parse_args()
    torch.cuda.set_device(0)
    gpu_device = torch.device('cuda')

    output_directory = os.path.join(args.output, args.dataset, str(args.dim),
                                    '_'.join([args.model_name, str(args.batch_size)]))
    print(output_directory)
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    out_log = os.path.join(output_directory, "train.log")
    sys.stdout = SimpleLogger(out_log, sys.stdout)

    # Select model
    model_factory = getattr(featurizer, args.model_name)
    model = model_factory(args.dim)

    # Setup train and eval transformations
    train_transform = transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.RandomCrop(max(model.input_size)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        ToSpaceBGR(model.input_space == 'BGR'),
        ToRange255(max(model.input_range) == 255),
        transforms.Normalize(mean=model.mean, std=model.std)
    ])
    eval_transform = transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.CenterCrop(max(model.input_size)),
        transforms.ToTensor(),
        ToSpaceBGR(model.input_space == 'BGR'),
        ToRange255(max(model.input_range) == 255),
        transforms.Normalize(mean=model.mean, std=model.std)
    ])

    # Setup dataset
    if args.dataset == 'StanfordOnlineProducts':
        train_dataset = StanfordOnlineProducts('/home/g1007540910/DATA/MetricLearning/stanford_products/Stanford_Online_Products',
                                               transform=train_transform)
        eval_dataset = StanfordOnlineProducts('/home/g1007540910/DATA/MetricLearning/stanford_products/Stanford_Online_Products',
                                              train=False, transform=eval_transform)
    elif args.dataset == 'Cars196':
        train_dataset = Cars196('/home/g1007540910/DATA/MetricLearning/cars196', transform=train_transform)
        eval_dataset = Cars196('/home/g1007540910/DATA/MetricLearning/cars196', train=False, transform=eval_transform)
    elif args.dataset == 'Cub200':
        train_dataset = Cub200('/home/g1007540910/DATA/MetricLearning/cub200/CUB_200_2011', transform=train_transform)
        eval_dataset = Cub200('/home/g1007540910/DATA/MetricLearning/cub200/CUB_200_2011', train=False, transform=eval_transform)
    elif args.dataset == "InShop":
        train_dataset = InShop('/home/g1007540910/DATA/MetricLearning/inshop', transform=train_transform)
        query_dataset = InShop('/home/g1007540910/DATA/MetricLearning/inshop', train=False, query=True, transform=eval_transform)
        index_dataset = InShop('/home/g1007540910/DATA/MetricLearning/inshop', train=False, query=False, transform=eval_transform)
    else:
        print("Dataset {} is not supported yet... Abort".format(args.dataset))
        return

    # Setup dataset loader
    if args.class_balancing:
        print("Class Balancing")
        sampler = ClassBalancedBatchSampler(train_dataset.instance_labels, args.batch_size, args.images_per_class)
        train_loader = DataLoader(train_dataset, batch_sampler=sampler, num_workers=4,
                                  pin_memory=True, drop_last=False, collate_fn=default_collate)
    else:
        print("No class balancing")
        train_loader = DataLoader(train_dataset, batch_size=args.batch_size, drop_last=False,
                                  shuffle=True, pin_memory=True, num_workers=4)

    if args.dataset != "InShop":
        eval_loader = DataLoader(eval_dataset, batch_size=args.batch_size, drop_last=False,
                                 shuffle=False, pin_memory=True, num_workers=4)
    else:
        query_loader = DataLoader(query_dataset, batch_size=args.batch_size, drop_last=False,
                                  shuffle=False, pin_memory=True, num_workers=4)
        index_loader = DataLoader(index_dataset, batch_size=args.batch_size, drop_last=False,
                                  shuffle=False, pin_memory=True, num_workers=4)

    # Setup loss function
    loss_fn = losses.NormSoftmaxLoss(args.dim, train_dataset.num_instance)

    model.to(device=gpu_device)
    loss_fn.to(device=gpu_device)

    # Training mode
    model.train()

    # Start with pretraining where we finetune only new parameters to warm up
    opt = torch.optim.SGD(list(loss_fn.parameters()) +
                          list(set(model.parameters()) - set(model.feature.parameters())),
                          lr=args.lr * args.lr_mult, momentum=0.9, weight_decay=1e-4)

    log_every_n_step = 10
    for epoch in range(args.pretrain_epochs):
        for i, (im, _, instance_label, index) in enumerate(train_loader):
            data = time.time()
            opt.zero_grad()

            im = im.to(device=gpu_device, non_blocking=True)
            instance_label = instance_label.to(device=gpu_device, non_blocking=True)

            forward = time.time()
            embedding = model(im)
            loss = loss_fn(embedding, instance_label)

            back = time.time()
            loss.backward()
            opt.step()
            end = time.time()

            if (i + 1) % log_every_n_step == 0:
                print('Epoch {}, LR {}, Iteration {} / {}:\t{}'.format(
                    args.pretrain_epochs - epoch, opt.param_groups[0]['lr'], i, len(train_loader), loss.item()))
                print('Data: {}\tForward: {}\tBackward: {}\tBatch: {}'.format(
                    forward - data, back - forward, end - back, end - data))

        eval_file = os.path.join(output_directory, 'epoch_{}'.format(args.pretrain_epochs - epoch))
        if args.dataset != "InShop":
            embeddings, labels = extract_feature(model, eval_loader, gpu_device)
            evaluate_float_binary_embedding_faiss(embeddings, embeddings, labels, labels, eval_file, k=1000, gpu_id=0)
        else:
            query_embeddings, query_labels = extract_feature(model, query_loader, gpu_device)
            index_embeddings, index_labels = extract_feature(model, index_loader, gpu_device)
            evaluate_float_binary_embedding_faiss(query_embeddings, index_embeddings,
                                                  query_labels, index_labels, eval_file, k=1000, gpu_id=0)

    # Full end-to-end finetune of all parameters
    opt = torch.optim.SGD(chain(model.parameters(), loss_fn.parameters()),
                          lr=args.lr, momentum=0.9, weight_decay=1e-4)

    for epoch in range(args.epochs_per_step * args.num_steps):
        print('Output Directory: {}'.format(output_directory))
        adjust_learning_rate(opt, epoch, args.epochs_per_step, gamma=args.gamma)

        for i, (im, _, instance_label, index) in enumerate(train_loader):
            data = time.time()
            opt.zero_grad()

            im = im.to(device=gpu_device, non_blocking=True)
            instance_label = instance_label.to(device=gpu_device, non_blocking=True)

            forward = time.time()
            embedding = model(im)
            loss = loss_fn(embedding, instance_label)

            back = time.time()
            loss.backward()
            opt.step()
            end = time.time()

            if (i + 1) % log_every_n_step == 0:
                print('Epoch {}, LR {}, Iteration {} / {}:\t{}'.format(
                    epoch, opt.param_groups[0]['lr'], i, len(train_loader), loss.item()))
                print('Data: {}\tForward: {}\tBackward: {}\tBatch: {}'.format(
                    forward - data, back - forward, end - back, end - data))

        snapshot_path = os.path.join(output_directory, 'epoch_{}.pth'.format(epoch + 1))
        torch.save(model.state_dict(), snapshot_path)

        if (epoch + 1) % args.test_every_n_epochs == 0:
            eval_file = os.path.join(output_directory, 'epoch_{}'.format(epoch + 1))
            if args.dataset != "InShop":
                embeddings, labels = extract_feature(model, eval_loader, gpu_device)
                evaluate_float_binary_embedding_faiss(embeddings, embeddings, labels, labels, eval_file, k=1000, gpu_id=0)
            else:
                query_embeddings, query_labels = extract_feature(model, query_loader, gpu_device)
                index_embeddings, index_labels = extract_feature(model, index_loader, gpu_device)
                evaluate_float_binary_embedding_faiss(query_embeddings, index_embeddings,
                                                      query_labels, index_labels, eval_file, k=1000, gpu_id=0)
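# The finetuning loop above calls `adjust_learning_rate(opt, epoch, args.epochs_per_step, gamma=args.gamma)`,
# which is defined elsewhere in this repository. A minimal sketch of what such a step-decay schedule could
# look like, assuming the intent is to multiply the learning rate by `gamma` once every `epochs_per_step`
# epochs (the helper name and exact behavior here are hypothetical, not taken from the original source):
def adjust_learning_rate_step_decay(optimizer, epoch, epochs_per_step, gamma=0.1):
    """Multiply every param group's lr by `gamma` at the start of each lr step (called once per epoch)."""
    if epoch > 0 and epoch % epochs_per_step == 0:
        for param_group in optimizer.param_groups:
            param_group['lr'] *= gamma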
def train_xgb_module(store_features=False, store_result=False, feature_select=False, num_round=300):
    if store_features is True:
        '''feature'''
        train_feature = extract_feature(train_agg, train_log)
        test_feature = extract_feature(test_agg, test_log)
        print('extract features successfully!')

        '''word2vec feature'''
        train_feature = train_feature.merge(extract_evt_lbl_features(train_log), on='USRID', how='left')
        test_feature = test_feature.merge(extract_evt_lbl_features(test_log), on='USRID', how='left')
        print('extract word2vec features successfully!')

        '''EVT_LBL one hot feature'''
        train_feature = train_feature.merge(extract_one_hot_feature(train_log), on='USRID', how='left')
        test_feature = test_feature.merge(extract_one_hot_feature(test_log), on='USRID', how='left')
        print('extract one hot features successfully!')

        '''EVT_LBL static feature'''
        train_feature = train_feature.merge(extract_evt_lbl_cnt_features(train_log), on='USRID', how='left')
        test_feature = test_feature.merge(extract_evt_lbl_cnt_features(test_log), on='USRID', how='left')
        print('extract EVT_LBL static features successfully!')

        '''EVT_LBL continue_cnt feature'''
        train_feature = train_feature.merge(calc_continue_evt_cnt(train_log), on='USRID', how='left')
        test_feature = test_feature.merge(calc_continue_evt_cnt(test_log), on='USRID', how='left')
        print('extract EVT_LBL continue_cnt features successfully!')

        '''duplicate_time feature'''
        train_feature = train_feature.merge(duplicate_time_different_max_cnt(train_log), on='USRID', how='left')
        test_feature = test_feature.merge(duplicate_time_different_max_cnt(test_log), on='USRID', how='left')
        print('extract duplicate_time features successfully!')

        '''store'''
        train_feature = train_feature.merge(train_flg, on='USRID', how='left')
        train_feature.to_csv(path + 'train_feature.csv', encoding='utf-8', index=None)
        test_feature.to_csv(path + 'test_feature.csv', encoding='utf-8', index=None)
        print('store features successfully!')

        # '''add cluster features'''
        # train_feature = pd.read_csv(path + 'train_feature.csv', encoding='utf-8', low_memory=False)
        # test_feature = pd.read_csv(path + 'test_feature.csv', encoding='utf-8', low_memory=False)
        # train_cluster = pd.read_csv(path + 'train_cluster.csv', encoding='utf-8', low_memory=False)
        # test_cluster = pd.read_csv(path + 'test_cluster.csv', encoding='utf-8', low_memory=False)
        # train_feature = train_feature.merge(train_cluster, on='USRID', how='left')
        # test_feature = test_feature.merge(test_cluster, on='USRID', how='left')
    else:
        train_feature = pd.read_csv(path + 'train_feature.csv', encoding='utf-8', low_memory=False)
        test_feature = pd.read_csv(path + 'test_feature.csv', encoding='utf-8', low_memory=False)

        # '''cluster relative'''
        # train_feature = pd.read_csv(path + 'train_feature_filled.csv', encoding='utf-8', low_memory=False)
        # test_feature = pd.read_csv(path + 'test_feature_filled.csv', encoding='utf-8', low_memory=False)
        # train_feature = train_feature.drop(['cluster_label', 'center_distance'], axis=1)
        # test_feature = test_feature.drop(['cluster_label', 'center_distance'], axis=1)
        print('read features successfully!')

    '''no log table'''
    # train_feature = train_feature[train_feature['evt_lbl_cnt'].isnull()]
    # # pos_feature = train_feature[train_feature['FLAG'] == 1]
    # # neg_feature = train_feature[train_feature['FLAG'] == 0]
    # # '''instance sample'''
    # # neg_feature = neg_feature.sample(frac=0.098, replace=True, random_state=88)
    # # train_feature = pos_feature.append(neg_feature)
    # # '''shuffle rows'''
    # # index = [i for i in range(train_feature.shape[0])]
    # # random.shuffle(index)
    # # train_feature = train_feature.set_index([index]).sort_index()
    # # test_feature = test_feature[test_feature['evt_lbl_cnt'].isnull()]
    # names = ['V' + str(index) for index in range(1, 31, 1)] + ['USRID']
    # train_feature = train_feature[names + ['FLAG']]
    # test_feature = test_feature[names]

    '''have log table'''
    # train_feature = train_feature[train_feature['evt_lbl_cnt'].notnull()]
    # test_feature = test_feature[test_feature['evt_lbl_cnt'].notnull()]
    # train_feature = train_feature.drop(['first_len_rank', 'second_len_rank', 'three_len_rank', 'evt_lbl_cnt_len_rank', 'evt_lbl_cnt_len_reverse', 'first_len_rank_reverse', 'second_len_rank_reverse', 'three_len_rank_reverse'], axis=1)
    # test_feature = test_feature.drop(['first_len_rank', 'second_len_rank', 'three_len_rank', 'evt_lbl_cnt_len_rank', 'evt_lbl_cnt_len_reverse', 'first_len_rank_reverse', 'second_len_rank_reverse', 'three_len_rank_reverse'], axis=1)

    train_feature['word_distance'] = train_feature['word_distance'].map(lambda x: 1 if x >= 1 else 0)
    test_feature['word_distance'] = test_feature['word_distance'].map(lambda x: 1 if x >= 1 else 0)
    # train_feature.pop('word_distance')
    # test_feature.pop('word_distance')

    params = {
        'booster': 'gbtree',
        'max_depth': 5,
        'colsample': 0.8,  # note: the standard XGBoost key is 'colsample_bytree'
        'subsample': 0.8,
        'eta': 0.03,
        'silent': 1,
        'objective': 'binary:logistic',  ## binary:logistic ##
        'eval_metric': 'auc',
        'min_child_weight': 5,
        'scale_pos_weight': 1,
        'nthread': 6,
        # 'seed': 4396,
    }

    x_train, x_test, y_train, y_test = train_test_split(train_feature.drop(['USRID', 'FLAG'], axis=1),
                                                        train_feature[['FLAG']],
                                                        test_size=.2, random_state=88)

    if feature_select is True:
        # features_name = ['V1', 'V3', 'V6', 'V7', 'V9', 'V10', 'V11', 'V13', 'V15', 'V16', 'V19', 'V22', 'V23', 'V25', 'V27', 'V28', 'V29', 'V30', 'day_set_len', 'tch_typ_set_len', 'tch_typ0', 'tch_typ2', 'tch_typ0_rate', 'tch_typ2_rate', '1', '3', '6', '8', '9', '10', '13', '14', '18', '19', '21', '22', '23', '25', '26', '30', 'days_mean', 'days_min', 'days_max', 'days_var', 'days_median', 'days_day_var', 'days_day_median', 'days_day_skew', 'days_hour_mean', 'days_hour_min', 'days_hour_max', 'days_hour_skew', 'hour_max', 'hour_var', 'hour_skew', 'evt_lbl_cnt_max', 'first_of_max', 'first_of_min', 'first_of_median', 'second_of_max', 'second_of_min', 'second_of_median', 'three_of_median', 'first_max', 'first_min', 'second_min', 'three_max', 'three_median']  # features len:68 300:0.87087545758 400:0.87081954925 500:0.870075481655
        # features_name = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V9', 'V12', 'V13', 'V16', 'V19', 'V21', 'V23', 'V24', 'V26', 'V28', 'V29', 'evt_lbl_cnt', 'evt_lbl_cnt_every_day', 'every_evt_lbl_cnt', 'tch_typ_set_len', 'tch_typ0', 'tch_typ0_rate', '2', '6', '7', '9', '10', '11', '12', '18', '20', '22', '24', '25', '28', 'days_mean', 'days_min', 'days_var', 'days_skew', 'continue_days', 'days_day_min', 'days_day_median', 'days_day_kurtosis', 'days_hour_max', 'days_hour_var', 'days_hour_kurtosis', 'hour_min', 'hour_max', 'hour_var', 'hour_median', 'evt_lbl_cnt_max', 'second_of_max', 'second_of_min', 'second_of_median', 'three_of_max', 'first_max', 'second_max', 'second_min', 'second_median', 'three_max', 'three_min']
        features_name = [
            'V1', 'V3', 'V7', 'V9', 'V10', 'V12', 'V14', 'V16', 'V19', 'V22', 'V23', 'V24', 'V26', 'V28', 'V29', 'V30',
            'evt_lbl_cnt', 'evt_lbl_cnt_every_day', 'evt_lbl_set_len_every_day', 'tch_typ_set_len', 'tch_typ0', 'tch_typ2', 'tch_typ_02',
            'u1', 'u5', 'u6', 'u8', 'u9', 'u10', 'u11', 'u12', 'u13', 'u14', 'u15', 'u16', 'u17', 'u19', 'u20', 'u21', 'u23', 'u24', 'u26', 'u29', 'u30',
            'days_mean', 'days_min', 'days_var', 'days_kurtosis', 'days_day_min', 'days_hour_mean', 'days_hour_max', 'days_hour_median', 'days_hour_skew',
            'hour_max', 'hour_var', 'evt_lbl_cnt_median', 'first_of_min', 'first_of_median', 'second_of_max', 'second_of_median',
            'three_of_max', 'three_of_min', 'three_of_median', 'evt_lbl_cnt_two_max', 'evt_lbl_cnt_two_var', 'evt_lbl_cnt_two_mode',
            'evt0', 'evt2', 'evt3', 'evt4', 'evt5', 'evt8', 'evt10', 'evt12', 'evt14', 'evt17', 'evt18', 'evt19', 'evt21', 'evt23', 'evt24', 'evt25', 'evt27', 'evt29', 'evt30', 'evt33',
            'evt36', 'evt37', 'evt38', 'evt41', 'evt45', 'evt47', 'evt48', 'evt49', 'evt51', 'evt52', 'evt54', 'evt55', 'evt59', 'evt62', 'evt63', 'evt64', 'evt68', 'evt69', 'evt70', 'evt71',
            'evt72', 'evt73', 'evt76', 'evt77', 'evt78', 'evt80', 'evt81', 'evt83', 'evt88', 'evt90', 'evt91', 'evt92', 'evt96', 'evt98', 'evt100', 'evt101', 'evt102', 'evt103', 'evt108', 'evt109',
            'evt110', 'evt111', 'evt112', 'evt116', 'evt117', 'evt119', 'evt120', 'evt121', 'evt125', 'evt128', 'evt130', 'evt132', 'evt135', 'evt137', 'evt138', 'evt139', 'evt142', 'evt143', 'evt145', 'evt150',
            'evt151', 'evt152', 'evt154', 'evt155', 'evt156', 'evt159', 'evt160', 'evt162', 'evt163', 'evt166', 'evt168', 'evt169', 'evt171', 'evt172', 'evt173', 'evt174', 'evt175', 'evt176', 'evt177', 'evt179',
            'evt182', 'evt183', 'evt185', 'evt186', 'evt190', 'evt191', 'evt192', 'evt198', 'evt199', 'evt200', 'evt201', 'evt202', 'evt203', 'evt204', 'evt206', 'evt208', 'evt209', 'evt211', 'evt212', 'evt213',
            'evt215', 'evt216', 'evt217', 'evt220', 'evt223', 'evt224', 'evt227', 'evt229', 'evt232', 'evt235', 'evt236', 'evt237', 'evt238', 'evt239', 'evt240', 'evt241', 'evt243', 'evt244', 'evt248', 'evt249',
            'evt251', 'evt254', 'evt257', 'evt258', 'evt261', 'evt267', 'evt268', 'evt269', 'evt271', 'evt273', 'evt277', 'evt279', 'evt282', 'evt284', 'evt288', 'evt291', 'evt296', 'evt300', 'evt302', 'evt309',
            'evt311', 'evt312', 'evt313', 'evt315', 'evt316', 'evt319', 'evt320', 'evt323', 'evt325', 'evt327', 'evt331', 'evt332', 'evt333', 'evt334', 'evt335', 'evt341', 'evt342', 'evt345', 'evt346', 'evt347',
            'evt348', 'evt349', 'evt351', 'evt354', 'evt355', 'evt356', 'evt357', 'evt358', 'evt360', 'evt363', 'evt364', 'evt367', 'evt368', 'evt369', 'evt370', 'evt374', 'evt375', 'evt376', 'evt377', 'evt378',
            'evt380', 'evt381', 'evt382', 'evt385', 'evt390', 'evt394', 'evt395', 'evt396', 'evt397', 'evt400', 'evt401', 'evt402', 'evt403', 'evt404', 'evt405', 'evt408', 'evt411', 'evt413', 'evt414', 'evt416',
            'evt421', 'evt422', 'evt425', 'evt427', 'evt428', 'evt433', 'evt436', 'evt440', 'evt441', 'evt442', 'evt443', 'evt444', 'evt445', 'evt447', 'evt449', 'evt450', 'evt451', 'evt453', 'evt454', 'evt455',
            'evt457', 'evt459', 'evt462', 'evt463', 'evt464', 'evt465', 'evt466', 'evt469', 'evt471', 'evt472', 'evt473', 'evt475', 'evt478', 'evt482', 'evt484', 'evt485', 'evt489', 'evt492', 'evt500', 'evt501',
            'evt503', 'evt504', 'evt507', 'evt508', 'evt511', 'evt512', 'evt513', 'evt515', 'evt520', 'evt524', 'evt525', 'evt526', 'evt528', 'evt529', 'evt531', 'evt540', 'evt541', 'evt544', 'evt546', 'evt548',
            'evt549', 'evt550', 'evt552', 'evt553', 'evt554', 'evt561', 'evt562', 'evt564', 'evt566', 'evt567', 'evt568', 'evt569', 'evt572', 'evt574', 'evt575', 'evt578', 'evt580', 'evt583', 'evt584', 'evt585',
            'evt588', 'evt592', 'evt593', 'evt594'
        ]
        x_train = x_train[features_name]
        x_test = x_test[features_name]

    val_train = xgb.DMatrix(x_train, label=y_train)
    x_test = xgb.DMatrix(x_test)
    print('I\'m training validate module.')
    clf = xgb.train(params, val_train, num_round)
    score = roc_auc_score(y_test, clf.predict(x_test))
    print('validate auc:', score)

    if store_result is True:
        '''label set'''
        train_label = train_feature[['FLAG']]
        '''pure feature'''
        train_feature = train_feature.drop(['USRID', 'FLAG'], axis=1)
        test_user = test_feature[['USRID']]
        test_feature = test_feature.drop(['USRID'], axis=1)
        if feature_select is True:
            train_feature = train_feature[features_name]
            test_feature = test_feature[features_name]

        train = xgb.DMatrix(train_feature, label=train_label)
        test_feature = xgb.DMatrix(test_feature)
        print('I\'m training final module.')
        module_two = xgb.train(params, train, num_round)
        result = module_two.predict(test_feature)

        pd.set_option('chained', None)  # remove warning
        test_user['RST'] = [index for index in result]
        print(test_user)

        '''store result'''
        time_string = time.strftime('_%Y%m%d%H%M%S', time.localtime(time.time()))
        file_name = 'result_b' + time_string + '.csv'
        test_user.to_csv(path + file_name, index=None, encoding='utf-8', sep='\t')
        print('result stored successfully!')

    print('program is over!')
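# The validation pass above trains for a fixed `num_round`. A hedged alternative sketch, meant to sit inside
# train_xgb_module right after the train/validation split (it assumes x_train, x_test, y_train, y_test, params
# and num_round are in scope; the variable names dtrain/dvalid/booster are illustrative, not from the original
# script): pass a watchlist to xgb.train and let early stopping choose the boosting round count.
dtrain = xgb.DMatrix(x_train, label=y_train)
dvalid = xgb.DMatrix(x_test, label=y_test)
booster = xgb.train(params, dtrain, num_boost_round=num_round,
                    evals=[(dtrain, 'train'), (dvalid, 'valid')],
                    early_stopping_rounds=50, verbose_eval=50)
print('best iteration:', booster.best_iteration)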
def auto_predict(wav_file):
    result = ['female', 'male']
    X_pred = extract_feature(wav_file)
    X_pred = X_pred.reshape(-1, 13)
    y_pred = knn_model().predict(X_pred)
    return result[int(y_pred)]
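# `extract_feature` and `knn_model` are defined elsewhere in this project. A minimal sketch of the kind of
# 13-dimensional feature vector `auto_predict` expects (matching the reshape(-1, 13) above), using librosa
# MFCCs averaged over time; the project's real feature extractor may compute something different, so treat
# this purely as an assumption-labeled illustration.
import librosa
import numpy as np

def extract_feature_sketch(wav_file):
    y, sr = librosa.load(wav_file, sr=None)                 # load audio at its native sample rate
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)      # shape: (13, n_frames)
    return np.mean(mfcc, axis=1)                            # shape: (13,), one value per coefficient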
def train_xgb_module(store_features=False, store_result=False, feature_select=False, num_round=300):
    if store_features is True:
        '''feature'''
        train_feature = extract_feature(train_agg, train_log)
        test_feature = extract_feature(test_agg, test_log)
        print('extract features successfully!')

        # '''word2vec feature'''
        # train_feature = train_feature.merge(extract_evt_lbl_features(train_log), on='USRID', how='left')
        # test_feature = test_feature.merge(extract_evt_lbl_features(test_log), on='USRID', how='left')
        # print('extract word2vec features successfully!')

        '''EVT_LBL one hot feature'''
        train_feature = train_feature.merge(extract_one_hot_feature(train_log), on='USRID', how='left')
        test_feature = test_feature.merge(extract_one_hot_feature(test_log), on='USRID', how='left')
        print('extract one hot features successfully!')

        '''store'''
        train_feature = train_feature.merge(train_flg, on='USRID', how='left')
        train_feature.to_csv(path + 'train_feature.csv', encoding='utf-8', index=None)
        test_feature.to_csv(path + 'test_feature.csv', encoding='utf-8', index=None)
        print('store features successfully!')

        # '''add cluster features'''
        # train_feature = pd.read_csv(path + 'train_feature.csv', encoding='utf-8', low_memory=False)
        # test_feature = pd.read_csv(path + 'test_feature.csv', encoding='utf-8', low_memory=False)
        # train_cluster = pd.read_csv(path + 'train_cluster.csv', encoding='utf-8', low_memory=False)
        # test_cluster = pd.read_csv(path + 'test_cluster.csv', encoding='utf-8', low_memory=False)
        # train_feature = train_feature.merge(train_cluster, on='USRID', how='left')
        # test_feature = test_feature.merge(test_cluster, on='USRID', how='left')
    else:
        train_feature = pd.read_csv(path + 'train_feature.csv', encoding='utf-8', low_memory=False)
        test_feature = pd.read_csv(path + 'test_feature.csv', encoding='utf-8', low_memory=False)

        # '''cluster relative'''
        # train_feature = pd.read_csv(path + 'train_feature_filled.csv', encoding='utf-8', low_memory=False)
        # test_feature = pd.read_csv(path + 'test_feature_filled.csv', encoding='utf-8', low_memory=False)
        # train_feature = train_feature.drop(['cluster_label', 'center_distance'], axis=1)
        # test_feature = test_feature.drop(['cluster_label', 'center_distance'], axis=1)
        print('read features successfully!')

    '''no log table'''
    # train_feature = train_feature[train_feature['evt_lbl_cnt'].isnull()]
    # # pos_feature = train_feature[train_feature['FLAG'] == 1]
    # # neg_feature = train_feature[train_feature['FLAG'] == 0]
    # # '''instance sample'''
    # # neg_feature = neg_feature.sample(frac=0.098, replace=True, random_state=88)
    # # train_feature = pos_feature.append(neg_feature)
    # # '''shuffle rows'''
    # # index = [i for i in range(train_feature.shape[0])]
    # # random.shuffle(index)
    # # train_feature = train_feature.set_index([index]).sort_index()
    # # test_feature = test_feature[test_feature['evt_lbl_cnt'].isnull()]
    # names = ['V' + str(index) for index in range(1, 31, 1)] + ['USRID']
    # train_feature = train_feature[names + ['FLAG']]
    # test_feature = test_feature[names]

    '''have log table'''
    # train_feature = train_feature[train_feature['evt_lbl_cnt'].notnull()]
    # test_feature = test_feature[test_feature['evt_lbl_cnt'].notnull()]
    # train_feature = train_feature.drop(['first_len_rank', 'second_len_rank', 'three_len_rank', 'evt_lbl_cnt_len_rank', 'evt_lbl_cnt_len_reverse', 'first_len_rank_reverse', 'second_len_rank_reverse', 'three_len_rank_reverse'], axis=1)
    # test_feature = test_feature.drop(['first_len_rank', 'second_len_rank', 'three_len_rank', 'evt_lbl_cnt_len_rank', 'evt_lbl_cnt_len_reverse', 'first_len_rank_reverse', 'second_len_rank_reverse', 'three_len_rank_reverse'], axis=1)

    '''fill nan with 0'''
    train_feature = train_feature.fillna(0)
    test_feature = test_feature.fillna(0)

    x_train, x_test, y_train, y_test = train_test_split(train_feature.drop(['USRID', 'FLAG'], axis=1),
                                                        train_feature[['FLAG']],
                                                        test_size=.2, random_state=88)

    if feature_select is True:
        # features_name = ['V1', 'V3', 'V6', 'V7', 'V9', 'V10', 'V11', 'V13', 'V15', 'V16', 'V19', 'V22', 'V23', 'V25', 'V27', 'V28', 'V29', 'V30', 'day_set_len', 'tch_typ_set_len', 'tch_typ0', 'tch_typ2', 'tch_typ0_rate', 'tch_typ2_rate', '1', '3', '6', '8', '9', '10', '13', '14', '18', '19', '21', '22', '23', '25', '26', '30', 'days_mean', 'days_min', 'days_max', 'days_var', 'days_median', 'days_day_var', 'days_day_median', 'days_day_skew', 'days_hour_mean', 'days_hour_min', 'days_hour_max', 'days_hour_skew', 'hour_max', 'hour_var', 'hour_skew', 'evt_lbl_cnt_max', 'first_of_max', 'first_of_min', 'first_of_median', 'second_of_max', 'second_of_min', 'second_of_median', 'three_of_median', 'first_max', 'first_min', 'second_min', 'three_max', 'three_median']  # features len:68 300:0.87087545758 400:0.87081954925 500:0.870075481655
        features_name = [
            'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V9', 'V12', 'V13', 'V16', 'V19', 'V21', 'V23', 'V24', 'V26', 'V28', 'V29',
            'evt_lbl_cnt', 'evt_lbl_cnt_every_day', 'every_evt_lbl_cnt', 'tch_typ_set_len', 'tch_typ0', 'tch_typ0_rate',
            '2', '6', '7', '9', '10', '11', '12', '18', '20', '22', '24', '25', '28',
            'days_mean', 'days_min', 'days_var', 'days_skew', 'continue_days', 'days_day_min', 'days_day_median', 'days_day_kurtosis',
            'days_hour_max', 'days_hour_var', 'days_hour_kurtosis', 'hour_min', 'hour_max', 'hour_var', 'hour_median',
            'evt_lbl_cnt_max', 'second_of_max', 'second_of_min', 'second_of_median', 'three_of_max',
            'first_max', 'second_max', 'second_min', 'second_median', 'three_max', 'three_min'
        ]
        x_train = x_train[features_name]
        x_test = x_test[features_name]

    print('I\'m training validate module.')
    module = GradientBoostingClassifier(
        n_estimators=num_round,
        learning_rate=0.05,
        random_state=2018,
        max_depth=5,
        subsample=0.7,
    )
    module.fit(x_train, y_train['FLAG'].ravel())
    result = module.predict_proba(x_test)[:, 1]
    score = roc_auc_score(y_test, result)
    print('validate auc:', score)

    if store_result is True:
        '''label set'''
        train_label = train_feature[['FLAG']]
        '''pure feature'''
        train_feature = train_feature.drop(['USRID', 'FLAG'], axis=1)
        test_user = test_feature[['USRID']]
        test_feature = test_feature.drop(['USRID'], axis=1)
        if feature_select is True:
            train_feature = train_feature[features_name]
            test_feature = test_feature[features_name]

        print('I\'m training final module.')
        module_two = GradientBoostingClassifier(
            n_estimators=num_round,
            learning_rate=0.05,
            random_state=2018,
            max_depth=5,
            subsample=0.7,
        )
        module_two.fit(train_feature, train_label['FLAG'].ravel())
        result = module_two.predict_proba(test_feature)[:, 1]

        pd.set_option('chained', None)  # remove warning
        test_user['RST'] = [index for index in result]
        print(test_user)

        '''store result'''
        time_string = time.strftime('_%Y%m%d%H%M%S', time.localtime(time.time()))
        file_name = 'result_b' + time_string + '.csv'
        test_user.to_csv(path + file_name, index=None, encoding='utf-8', sep='\t')
        print('result stored successfully!')

    print('program is over!')
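# The feature_select branch above depends on a hand-curated features_name list. A hedged sketch of one way
# such a list could be produced (assuming the fitted `module` and the DataFrame `x_train` from the validation
# step are in scope): rank the sklearn model's feature_importances_ and keep the top entries.
import pandas as pd

importance = pd.Series(module.feature_importances_, index=x_train.columns)
print(importance.sort_values(ascending=False).head(50))  # inspect the most informative columns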
def feature():
    print("extracting features from draw image...")
    extract_feature()
    print("done")
    print()
    answer = get_answer(question)
    print("Question is :" + str(question))
    print("Answer is :" + str(answer))
    print("done")
    print()
    return jsonify(answer)


if __name__ == '__main__':
    print("== Test functions")
    print()

    print("=== Test caption...")
    print("caption:" + str(get_caption()))
    print("done")
    print()

    print("=== Test feature_extracting...")
    extract_feature()
    print("done")
    print()

    print("=== Test vqa...")
    print("vqa:" + str(get_answer(question="How many cars are there?")))
    print("done")
    print()

    app.debug = False
    # app.run(host='124.70.139.138', port=5000)
    app.run(host='222.19.197.230', port=5000)
    # The GPU code actually used differs from this model code; the application still needs further changes.
def train_xgb_module(store_features=False, store_result=False, feature_select=False, num_round=300):
    if store_features is True:
        '''feature'''
        train_feature = extract_feature(train_agg, train_log)
        test_feature = extract_feature(test_agg, test_log)
        print('extract features successfully!')

        # '''word2vec feature'''
        # train_feature = train_feature.merge(extract_evt_lbl_features(train_log), on='USRID', how='left')
        # test_feature = test_feature.merge(extract_evt_lbl_features(test_log), on='USRID', how='left')
        print('extract word2vec features successfully!')

        '''EVT_LBL one hot feature'''
        train_feature = train_feature.merge(extract_one_hot_feature(train_log), on='USRID', how='left')
        test_feature = test_feature.merge(extract_one_hot_feature(test_log), on='USRID', how='left')
        print('extract one hot features successfully!')

        '''EVT_LBL static feature'''
        train_feature = train_feature.merge(extract_evt_lbl_cnt_features(train_log), on='USRID', how='left')
        test_feature = test_feature.merge(extract_evt_lbl_cnt_features(test_log), on='USRID', how='left')
        print('extract EVT_LBL static features successfully!')

        # '''EVT_LBL continue_cnt feature'''
        # train_feature = train_feature.merge(calc_continue_evt_cnt(train_log), on='USRID', how='left')
        # test_feature = test_feature.merge(calc_continue_evt_cnt(test_log), on='USRID', how='left')
        # print('extract EVT_LBL continue_cnt features successfully!')

        '''store'''
        train_feature = train_feature.merge(train_flg, on='USRID', how='left')
        train_feature.to_csv(path + 'train_feature.csv', encoding='utf-8', index=None)
        test_feature.to_csv(path + 'test_feature.csv', encoding='utf-8', index=None)
        print('store features successfully!')

        # '''add cluster features'''
        # train_feature = pd.read_csv(path + 'train_feature.csv', encoding='utf-8', low_memory=False)
        # test_feature = pd.read_csv(path + 'test_feature.csv', encoding='utf-8', low_memory=False)
        # train_cluster = pd.read_csv(path + 'train_cluster.csv', encoding='utf-8', low_memory=False)
        # test_cluster = pd.read_csv(path + 'test_cluster.csv', encoding='utf-8', low_memory=False)
        # train_feature = train_feature.merge(train_cluster, on='USRID', how='left')
        # test_feature = test_feature.merge(test_cluster, on='USRID', how='left')
    else:
        train_feature = pd.read_csv(path + 'train_feature.csv', encoding='utf-8', low_memory=False)
        test_feature = pd.read_csv(path + 'test_feature.csv', encoding='utf-8', low_memory=False)

        # '''cluster relative'''
        # train_feature = pd.read_csv(path + 'train_feature_filled.csv', encoding='utf-8', low_memory=False)
        # test_feature = pd.read_csv(path + 'test_feature_filled.csv', encoding='utf-8', low_memory=False)
        # train_feature = train_feature.drop(['cluster_label', 'center_distance'], axis=1)
        # test_feature = test_feature.drop(['cluster_label', 'center_distance'], axis=1)
        print('read features successfully!')

    '''no log table'''
    # train_feature = train_feature[train_feature['evt_lbl_cnt'].isnull()]
    # # pos_feature = train_feature[train_feature['FLAG'] == 1]
    # # neg_feature = train_feature[train_feature['FLAG'] == 0]
    # # '''instance sample'''
    # # neg_feature = neg_feature.sample(frac=0.098, replace=True, random_state=88)
    # # train_feature = pos_feature.append(neg_feature)
    # # '''shuffle rows'''
    # # index = [i for i in range(train_feature.shape[0])]
    # # random.shuffle(index)
    # # train_feature = train_feature.set_index([index]).sort_index()
    # # test_feature = test_feature[test_feature['evt_lbl_cnt'].isnull()]
    # names = ['V' + str(index) for index in range(1, 31, 1)] + ['USRID']
    # train_feature = train_feature[names + ['FLAG']]
    # test_feature = test_feature[names]

    '''have log table'''
    # train_feature = train_feature[train_feature['evt_lbl_cnt'].notnull()]
    # test_feature = test_feature[test_feature['evt_lbl_cnt'].notnull()]
    # train_feature = train_feature.drop(['first_len_rank', 'second_len_rank', 'three_len_rank', 'evt_lbl_cnt_len_rank', 'evt_lbl_cnt_len_reverse', 'first_len_rank_reverse', 'second_len_rank_reverse', 'three_len_rank_reverse'], axis=1)
    # test_feature = test_feature.drop(['first_len_rank', 'second_len_rank', 'three_len_rank', 'evt_lbl_cnt_len_rank', 'evt_lbl_cnt_len_reverse', 'first_len_rank_reverse', 'second_len_rank_reverse', 'three_len_rank_reverse'], axis=1)

    train_feature['word_distance'] = train_feature['word_distance'].map(lambda x: 1 if x >= 1 else 0)
    test_feature['word_distance'] = test_feature['word_distance'].map(lambda x: 1 if x >= 1 else 0)
    # train_feature.pop('word_distance')
    # test_feature.pop('word_distance')

    params = {
        'boosting': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'train_metric': True,
        'subsample': 0.8,
        'learning_rate': 0.03,
        'num_leaves': 96,
        'num_threads': 6,
        'max_depth': 5,
        'colsample_bytree': 0.8,
        'lambda_l2': 0.01,
        'verbose': -1,
        # 'feature_fraction': 0.9,
        # 'bagging_fraction': 0.95,
    }

    x_train, x_test, y_train, y_test = train_test_split(train_feature.drop(['USRID', 'FLAG'], axis=1),
                                                        train_feature[['FLAG']],
                                                        test_size=.2, random_state=88)

    if feature_select is True:
        # features_name = ['V1', 'V3', 'V6', 'V7', 'V9', 'V10', 'V11', 'V13', 'V15', 'V16', 'V19', 'V22', 'V23', 'V25', 'V27', 'V28', 'V29', 'V30', 'day_set_len', 'tch_typ_set_len', 'tch_typ0', 'tch_typ2', 'tch_typ0_rate', 'tch_typ2_rate', '1', '3', '6', '8', '9', '10', '13', '14', '18', '19', '21', '22', '23', '25', '26', '30', 'days_mean', 'days_min', 'days_max', 'days_var', 'days_median', 'days_day_var', 'days_day_median', 'days_day_skew', 'days_hour_mean', 'days_hour_min', 'days_hour_max', 'days_hour_skew', 'hour_max', 'hour_var', 'hour_skew', 'evt_lbl_cnt_max', 'first_of_max', 'first_of_min', 'first_of_median', 'second_of_max', 'second_of_min', 'second_of_median', 'three_of_median', 'first_max', 'first_min', 'second_min', 'three_max', 'three_median']  # features len:68 300:0.87087545758 400:0.87081954925 500:0.870075481655
        features_name = [
            'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V9', 'V12', 'V13', 'V16', 'V19', 'V21', 'V23', 'V24', 'V26', 'V28', 'V29',
            'evt_lbl_cnt', 'evt_lbl_cnt_every_day', 'every_evt_lbl_cnt', 'tch_typ_set_len', 'tch_typ0', 'tch_typ0_rate',
            '2', '6', '7', '9', '10', '11', '12', '18', '20', '22', '24', '25', '28',
            'days_mean', 'days_min', 'days_var', 'days_skew', 'continue_days', 'days_day_min', 'days_day_median', 'days_day_kurtosis',
            'days_hour_max', 'days_hour_var', 'days_hour_kurtosis', 'hour_min', 'hour_max', 'hour_var', 'hour_median',
            'evt_lbl_cnt_max', 'second_of_max', 'second_of_min', 'second_of_median', 'three_of_max',
            'first_max', 'second_max', 'second_min', 'second_median', 'three_max', 'three_min'
        ]
        x_train = x_train[features_name]
        x_test = x_test[features_name]

    validate_label = np.array(y_train['FLAG'], dtype=np.int8)
    val_train = lgb.Dataset(x_train, label=validate_label)
    validate = lgb.train(params=params, train_set=val_train, num_boost_round=num_round)
    score = roc_auc_score(y_test, validate.predict(x_test))
    print('validate auc:', score)

    if store_result is True:
        '''label set'''
        train_label = train_feature[['FLAG']]
        '''pure feature'''
        train_feature = train_feature.drop(['USRID', 'FLAG'], axis=1)
        test_user = test_feature[['USRID']]
        test_feature = test_feature.drop(['USRID'], axis=1)
        if feature_select is True:
            train_feature = train_feature[features_name]
            test_feature = test_feature[features_name]

        train_label = np.array(train_label['FLAG'], dtype=np.int8)
        train = lgb.Dataset(train_feature, label=train_label)
        model_two = lgb.train(params=params, train_set=train, num_boost_round=num_round)
        result = model_two.predict(test_feature)

        pd.set_option('chained', None)  # remove warning
        test_user['RST'] = [index for index in result]
        print(test_user)

        '''store result'''
        time_string = time.strftime('_%Y%m%d%H%M%S', time.localtime(time.time()))
        file_name = 'result_b' + time_string + '.csv'
        test_user.to_csv(path + file_name, index=None, encoding='utf-8', sep='\t')
        print('result stored successfully!')

    print('program is over!')
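# The LightGBM variant above also trains for a fixed num_round. A hedged sketch of using lgb.cv to estimate a
# reasonable num_boost_round before the final fit (assumes params, val_train and num_round from the validation
# step are in scope; the cv_hist variable name is illustrative, and the metric key inside the returned dict
# varies across LightGBM versions):
cv_hist = lgb.cv(params, val_train, num_boost_round=num_round, nfold=5,
                 stratified=True, seed=88)
print('cv rounds evaluated:', len(next(iter(cv_hist.values()))))  # rounds kept before CV AUC stops improving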