Example #1
# Assumed imports for this snippet; best_solution() and load_data() are
# project-local helpers. Note that Pipeline must come from imblearn, not
# sklearn, for the sampler step to be accepted.
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import ADASYN
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier


def find_expert(tag):
    """
    Output the 20 users under topic tag [TAG] that the model predicts are
    most likely to be potential experts.
    """
    params = best_solution(tag)
    data, target, ratio = load_data(tag)
    # shuffle=True is required for random_state to have any effect on the folds
    fold = StratifiedKFold(n_splits=4, shuffle=True,
                           random_state=int(params['seed']))
    samp = ADASYN(n_neighbors=2,
                  sampling_strategy=float(params['sampling_strategy']) * ratio,
                  random_state=int(params['seed']))
    clf = XGBClassifier(n_estimators=int(params['n_estimators']),
                        gamma=float(params['gamma']),
                        eta=float(params['eta']),
                        reg_lambda=int(params['reg_lambda']),
                        verbosity=0,
                        n_jobs=-1,
                        random_state=int(params['seed']))
    pipeline = Pipeline([(type(samp).__name__, samp),
                         (type(clf).__name__, clf)])
    experts = pd.DataFrame(columns=['id', 'probability'])
    for train, test in tqdm(fold.split(data, target), total=4):
        pipeline.fit(data.iloc[train], target.iloc[train])
        pred_proba = pd.Series(pipeline.predict_proba(data.iloc[test])[:, 1],
                               index=target.iloc[test].index,
                               name='probability')
        # DataFrame.append was removed in pandas 2.0; use pd.concat instead
        experts = pd.concat([experts, pred_proba.to_frame().reset_index()])
    experts = experts.sort_values(by=['probability'],
                                  ascending=False).iloc[:20]
    experts['probability'] = experts['probability'].astype(float).map(
        "{:.1%}".format)
    print(experts.to_string(index=False))
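
For readers without the project-specific `best_solution` and `load_data` helpers, here is a minimal self-contained sketch of the same pattern: an imblearn `Pipeline` chaining an oversampler with a classifier, scored by out-of-fold predicted probabilities. The synthetic dataset, the `GradientBoostingClassifier` stand-in, and all hyperparameters are illustrative assumptions, not part of the original example.

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import ADASYN
from imblearn.pipeline import Pipeline

# Illustrative imbalanced data: ~10% positives stand in for "experts"
X, y = make_classification(n_samples=1000, weights=[0.9], random_state=0)
X, y = pd.DataFrame(X), pd.Series(y)

pipe = Pipeline([('adasyn', ADASYN(random_state=0)),
                 ('gbc', GradientBoostingClassifier(random_state=0))])

# Out-of-fold probabilities: each row is scored by a model that never saw it
oof = pd.Series(np.nan, index=y.index, name='probability')
for train, test in StratifiedKFold(n_splits=4, shuffle=True,
                                   random_state=0).split(X, y):
    pipe.fit(X.iloc[train], y.iloc[train])
    oof.iloc[test] = pipe.predict_proba(X.iloc[test])[:, 1]

print(oof.sort_values(ascending=False).head(20).map("{:.1%}".format))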
Example #2
# Method excerpt from a cross-validation splitter class; assumes
# `import warnings`, `import numpy as np`, and
# `from sklearn.model_selection import KFold, StratifiedKFold` at module level.
def _split_weighted_sample(self, X, y, sample_weight, is_stratified=False):
    if is_stratified:
        kfold_model = StratifiedKFold(n_splits=self.n_splits,
                                      shuffle=self.shuffle,
                                      random_state=self.random_state)
    else:
        kfold_model = KFold(n_splits=self.n_splits,
                            shuffle=self.shuffle,
                            random_state=self.random_state)
    if sample_weight is None:
        return kfold_model.split(X, y)
    weights_sum = np.sum(sample_weight)
    max_deviations = []
    all_splits = []
    for _ in range(self.n_trials + 1):
        splits = [test for _, test in kfold_model.split(X, y)]
        weight_fracs = np.array(
            [np.sum(sample_weight[split]) / weights_sum for split in splits])
        if np.all(weight_fracs > .95 / self.n_splits):
            # Found a good split, return.
            return self._get_folds_from_splits(splits, X.shape[0])
        # Record all splits in case the stratification by weight yields a worse partition
        all_splits.append(splits)
        max_deviation = np.max(np.abs(weight_fracs - 1 / self.n_splits))
        max_deviations.append(max_deviation)
        # Reseed random generator and try again
        kfold_model.shuffle = True
        kfold_model.random_state = None

    # If KFold fails after n_trials, we try the next best thing: stratifying by weight groups
    warnings.warn(
        "The KFold algorithm failed to find a weight-balanced partition "
        "after {n_trials} trials. Falling back on a weight stratification "
        "algorithm.".format(n_trials=self.n_trials), UserWarning)
    if is_stratified:
        stratified_weight_splits = [[] for _ in range(self.n_splits)]  # independent lists, not n aliases
        for y_unique in np.unique(y.flatten()):
            class_inds = np.argwhere(y == y_unique).flatten()
            class_splits = self._get_splits_from_weight_stratification(
                sample_weight[class_inds])
            stratified_weight_splits = [
                split + list(class_inds[class_split]) for split, class_split in
                zip(stratified_weight_splits, class_splits)
            ]
    else:
        stratified_weight_splits = self._get_splits_from_weight_stratification(
            sample_weight)
    weight_fracs = np.array([
        np.sum(sample_weight[split]) / weights_sum
        for split in stratified_weight_splits
    ])
    if np.all(weight_fracs > .95 / self.n_splits):
        # Found a good split, return.
        return self._get_folds_from_splits(stratified_weight_splits,
                                           X.shape[0])
    else:
        # Did not find a good split
        # Record the deviation for the weight-stratified split to compare with KFold splits
        all_splits.append(stratified_weight_splits)
        max_deviation = np.max(np.abs(weight_fracs - 1 / self.n_splits))
        max_deviations.append(max_deviation)
    # Return most weight-balanced partition
    min_deviation_index = np.argmin(max_deviations)
    return self._get_folds_from_splits(all_splits[min_deviation_index],
                                       X.shape[0])
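
The heart of this method is the retry loop: reshuffle KFold until every fold carries at least 95% of its ideal share (1/n_splits) of the total sample weight, otherwise fall back to the least-bad attempt. Below is a standalone sketch of just that loop; the function name and the synthetic Pareto-distributed weights are illustrative, not from the original class.

import numpy as np
from sklearn.model_selection import KFold

def weight_balanced_kfold(X, y, sample_weight, n_splits=5, n_trials=10):
    # Retry shuffled KFold until each fold's weight share is close to 1/n_splits
    rng = np.random.RandomState(0)
    total = sample_weight.sum()
    best_splits, best_dev = None, np.inf
    for _ in range(n_trials):
        kf = KFold(n_splits=n_splits, shuffle=True,
                   random_state=rng.randint(2**31))
        splits = [test for _, test in kf.split(X, y)]
        fracs = np.array([sample_weight[s].sum() / total for s in splits])
        if np.all(fracs > 0.95 / n_splits):
            return splits                      # balanced enough: accept
        dev = np.abs(fracs - 1 / n_splits).max()
        if dev < best_dev:                     # otherwise remember the best try
            best_splits, best_dev = splits, dev
    return best_splits

X = np.arange(100).reshape(-1, 1)
y = np.zeros(100)
w = np.random.RandomState(0).pareto(1.0, size=100)  # heavily skewed weights
for fold in weight_balanced_kfold(X, y, w):
    print(round(w[fold].sum() / w.sum(), 3))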
Example #3
# Assumed imports for this snippet; load_feature(), save_feature(), get_w2v()
# and save_path are project-local helpers/globals defined elsewhere.
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold


def train_func(train_path):
    # Fill in the test code here
    test = pd.read_csv('../data/test_1.csv')
    # Contestants must not change the format; if the test code fails to run,
    # the score counts as zero.

    # ##### Contestants: fill in the test-set processing logic and generate a
    # submittable CSV file in the specified folder
    def f1_score(y, pred):
        # Competition metric: a precision-weighted F-score
        # (algebraically equivalent to F_beta with beta^2 = 1/3)
        P = precision_score(y, pred)
        R = recall_score(y, pred)
        return 4 * P * R / (P + 3 * R)
    
    def find_threshold(oof_pred, y, left=0, right=1, display=False, verbose=True):
        # Scan candidate thresholds and keep the one with the best F-score
        oof_temp = oof_pred.copy()
        plt_ = pd.DataFrame()
        best_threshold = 0
        best_f1 = 0
        best_num = 0
        for n, i in enumerate(np.linspace(left, right, 66)):
            oof_temp[oof_pred >= i] = 1
            oof_temp[oof_pred < i] = 0
            f1_ = f1_score(y, oof_temp)
            plt_.loc[n, "num"] = i
            plt_.loc[n, "f1"] = f1_

            if best_f1 < f1_:
                best_f1 = f1_
                best_threshold = i
                best_num = len(oof_temp[oof_pred >= i])
                if verbose:
                    print(f"threshold == {i}, f1 score: {f1_}")
        if display:
            plt.plot(plt_['num'], plt_['f1'])
            plt.title('f1_score_with_threshold')
        return best_threshold, best_f1, best_num
    

    train = pd.read_csv(train_path)

    train['is_train'] = 1
    test['is_train'] = 0
    # DataFrame.append was removed in pandas 2.0; use pd.concat instead
    data = pd.concat([train, test]).reset_index(drop=True)
    data['tlsIssuerDn_null'] = data['tlsIssuerDn'].apply(lambda x:0 if str(x)=='nan' else 1)

    split_col = []
    data['tlsSubject'] = data['tlsSubject'].astype(str).apply(lambda x:x.replace('/',','))

    for string in ['C','ST','L','O','OU','CN']:
        data['tlsSubject_'+string] = data['tlsSubject'].apply(lambda x:''.join([i for i in x.split(',') if string+'=' in i]))
        data['tlsSubject_'+string] = data['tlsSubject_'+string].apply(lambda x:x.split('=')[1] if len(x.split('='))>1 else 'unk')
        split_col.append('tlsSubject_'+string)

    feat_cols = split_col + ['tlsSubject', 'tlsIssuerDn', 'tlsSni', 'srcAddress',
                             'destAddress', 'tlsVersion', 'destPort', 'bytesOut',
                             'bytesIn', 'pktsIn', 'pktsOut']
    if os.path.exists('cnt_code_dict.pkl'):
        print('loading cached count encodings')
        cnt_dic = load_feature('cnt_code_dict.pkl')  # load once, not per column
        for i in feat_cols:
            data[i + '_cnt'] = data[i].map(cnt_dic[i])
    else:
        cnt_dic = {}
        for i in feat_cols:
            # split_col features only exist on `data`; the rest are counted on train
            if i in split_col:
                cnt_dic[i] = data[data['is_train'] == 1][i].value_counts().to_dict()
            else:
                cnt_dic[i] = train[i].value_counts().to_dict()
            data[i + '_cnt'] = data[i].map(cnt_dic[i])
        save_feature(cnt_dic, 'cnt_code_dict.pkl')


    data['bytesOut_pktsIn'] = data['bytesOut'] / data['pktsIn']
    data['bytesIn_pktsOut'] = data['bytesIn'] / data['pktsOut']
    data['bytesIn_bytesOut'] = data['bytesIn'] / data['bytesOut']
    data['pktsIn_pktsOut'] = data['pktsIn'] / data['pktsOut']
    

    def _tls_version_num(x):
        # Extract the single numeric token from the version string, if any
        nums = re.findall(r"\d+\.?\d*", str(x))
        return nums[0] if len(nums) == 1 else np.nan

    data['tlsVersion_num'] = data['tlsVersion'].apply(_tls_version_num).astype(float)

    for col in ['tlsSubject_C_cnt', 'tlsSubject_ST_cnt',
     'tlsSubject_L_cnt', 'tlsSubject_O_cnt', 'tlsSubject_OU_cnt',
     'tlsSubject_CN_cnt', 
     'tlsSubject_cnt', 'tlsIssuerDn_cnt', 'tlsSni_cnt',
     'srcAddress_cnt', 'destAddress_cnt', 'tlsVersion_cnt',
     'destPort_cnt', 'bytesOut_cnt', 'bytesIn_cnt',
     'pktsIn_cnt', 'pktsOut_cnt']:
        data[col] = data[col].apply(lambda x:np.nan if x<3 else x)
        
    #w2v
    data['add'] = (data['srcAddress'] + '.' + data['destAddress']).apply(lambda x:x.replace('.',' '))
    tf_df = get_w2v(data, 'add', 8,'vec')
    data = data.merge(tf_df,on='eventId',how='left')
    del data['add']
    
    # string-derived features (the target-encoding experiment is commented out below)
    for i in ['tlsSubject', 'tlsIssuerDn']:
        data[i+'_num'] = data[i].fillna('').apply(lambda x:len(str(x).split(',')))
#     for i in ['tlsSubject', 'tlsIssuerDn', 'tlsSni','srcAddress','destAddress']:
#         data[i+'_num'] = data[i].fillna('').apply(lambda x:len(x))
    for i in ['srcAddress','destAddress']:
        data[i+'_mean'] = data[i].apply(lambda x:np.mean([int(i) for i in x.split('.')]))
        data[i+'_std'] = data[i].apply(lambda x:np.std([int(i) for i in x.split('.')]))
        data[i+'_max'] = data[i].apply(lambda x:np.max([int(i) for i in x.split('.')]))
        data[i+'_min'] = data[i].apply(lambda x:np.min([int(i) for i in x.split('.')]))
    
    #training
    del_col=['tlsSubject','tlsIssuerDn','tlsSni','srcAddress','destAddress','appProtocol', 'tlsVersion',
             'tlsSubject_C', 'tlsSubject_ST', 'tlsSubject_L', 'tlsSubject_OU','tlsSubject_O',
     'tlsSubject_CN']

    train = data[data['is_train'] == 1].reset_index(drop=True)
    test = data[data['is_train'] == 0].reset_index(drop=True)
    
#     target_col = ['destPort', 'appProtocol', 'tlsIssuerDn', 'tlsVersion', 'pktsIn', 'pktsOut',
#                   'tlsSubject_ST', 'tlsSubject_L', 'tlsSubject_OU']
#     train,test = kfold_stats_feature(train,test,target_col,5)
    col=[i for i in train.columns if i not in ['eventId', 'label', 'is_train']+del_col]
    
    X_train = train[col].copy()
    y_train = train['label'].copy().astype(int)
    X_test = test[col].copy()
    print(X_train.shape, X_test.shape)

    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        # 'metric': 'auc',
        'num_leaves': 31,
        'subsample': 0.8,
        'max_depth': -1,
        'colsample_bytree': 0.8,
        'learning_rate': 0.05,
        # 'bagging_freq': 3,
        'lambda_l2': 2,
        'seed': 1126,
        'nthread': 8,
    }

    K = 5
    seed = 2021
    skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=seed)
    lgb_models = []
    oof = np.zeros(len(X_train))
    predictions = np.zeros(len(X_test))
    auc_score = []
    seeds = [2019]  # , 1111, 1234
    for j, seed in enumerate(seeds):
        # change the seed for each run
        skf.random_state = seed
        lgb_params["seed"] = seed
        print(j, skf.random_state, lgb_params["seed"])
        for i, (train_index, val_index) in enumerate(skf.split(X_train,y_train)):
            print("fold {}".format(i))
            X_tr, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
            y_tr, y_val = y_train.iloc[train_index], y_train.iloc[val_index]

            lgb_train = lgb.Dataset(X_tr,y_tr)
            lgb_val = lgb.Dataset(X_val,y_val)
            num_round = 30000
            if os.path.exists('lgb_{}_{}.txt'.format(seed, i)):
                clf = lgb.Booster(model_file='lgb_{}_{}.txt'.format(seed, i))
                print(i)
            else:
                # verbose_eval / early_stopping_rounds became callbacks in recent LightGBM
                clf = lgb.train(lgb_params, lgb_train, num_round,
                                valid_sets=[lgb_train, lgb_val],
                                callbacks=[lgb.log_evaluation(100),
                                           lgb.early_stopping(60)])
                clf.save_model('lgb_{}_{}.txt'.format(seed, i))
            # lgb_models.append(clf)
            pred = clf.predict(X_val, num_iteration=clf.best_iteration)
            oof[val_index] += pred / len(seeds)
            auc_ss = roc_auc_score(y_val, pred)
            auc_score.append(auc_ss)
            print('auc = ', auc_ss)
            predictions += clf.predict(X_test, num_iteration=clf.best_iteration) / (skf.n_splits * len(seeds))
    print('auc score : ', np.mean(auc_score), np.std(auc_score))

    best_threshold, best_f1, best_num = find_threshold(oof, y_train, 0.1, 0.9, display=True, verbose=True)

    sub = test[['eventId']].copy()  # .copy() avoids SettingWithCopyWarning
    sub['label'] = [1 if x >= best_threshold else 0 for x in predictions]
    # sub['label'] = predictions
    # sub = sub.sort_values('label', ascending=False).reset_index()
    # sub.loc[:9000, 'label'] = 1
    # sub.loc[9000:, 'label'] = 0
    # sub['label'] = sub['label'].astype(int)

    # demo:
    # submission = test[['eventId']]
    # submission['label'] = 0
    sub.to_csv(save_path + 'FastCloud_finalA.csv', index=False, encoding='utf-8')
    print(best_threshold)
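
Stripped of the feature engineering and model caching, the training loop in this example reduces to a seed-averaged out-of-fold scheme: OOF predictions are averaged over seeds, and test predictions over folds times seeds. The sketch below reproduces just that skeleton; the synthetic data, the two-seed list, and all parameters are illustrative stand-ins, not the competition setup.

import numpy as np
import lightgbm as lgb
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

X, y = make_classification(n_samples=2000, random_state=0)
X_test, _ = make_classification(n_samples=500, random_state=1)

params = {'objective': 'binary', 'learning_rate': 0.05, 'verbosity': -1}
seeds = [2019, 1111]
oof = np.zeros(len(X))
pred_test = np.zeros(len(X_test))

for seed in seeds:
    params['seed'] = seed
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    for tr, va in skf.split(X, y):
        booster = lgb.train(params, lgb.Dataset(X[tr], y[tr]),
                            num_boost_round=200)
        oof[va] += booster.predict(X[va]) / len(seeds)   # average over seeds
        pred_test += booster.predict(X_test) / (skf.n_splits * len(seeds))

print('OOF AUC:', roc_auc_score(y, oof))
# pred_test can now be thresholded exactly as find_threshold does above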