from set_seed import random_ctl
seed = random_ctl(460304)  # best seed from 20 seed search without mixup

from fastai.text import *
from fastai.callbacks import SaveModelCallback
from fastai.layers import LabelSmoothingCrossEntropy
import sentencepiece as spm  # https://github.com/google/sentencepiece
import fire

from sp_tok import *
from nlp_mixup import *
from sklearn.model_selection import KFold


def split_data_by_idx(all_texts_df: DataFrame, train_idx, valid_idx):
    df_train = all_texts_df.iloc[train_idx, :]
    df_valid = all_texts_df.iloc[valid_idx, :]
    return df_train, df_valid


def fit_regr(model_path: str, sp_model: str, wd: float = 0., mixup: bool = True,
             qrnn: bool = True, n_hid: int = 2304, load_enc: str = None,
             split_seed: int = None):
    PATH = Path(model_path)
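# A minimal sketch of what the set_seed.random_ctl helper used above might
# look like. The real module is not shown here, so this is an assumption,
# not the author's implementation; it illustrates the idea the call implies:
# seed Python, NumPy, and PyTorch from one value, drawing a fresh value when
# none is given (which is what makes a "20 seed search" possible), and
# return the seed so it can be recorded.
import random
import numpy as np
import torch

def random_ctl(seed: int = None) -> int:
    if seed is None:
        seed = random.randint(0, 2**31 - 1)  # fresh seed for a seed search
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    return seed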
from set_seed import random_ctl
seed = random_ctl()

from fastai.text import *
from fastai.callbacks import SaveModelCallback
from fastai.layers import LabelSmoothingCrossEntropy
import sentencepiece as spm  # https://github.com/google/sentencepiece
import fire

from sp_tok import *
from sklearn.model_selection import KFold


def split_data_by_idx(all_texts_df: DataFrame, train_idx, valid_idx):
    df_train = all_texts_df.iloc[train_idx, :]
    df_valid = all_texts_df.iloc[valid_idx, :]
    return df_train, df_valid


def fit_clas(model_path: str, sp_model: str, wd: float = 0., qrnn: bool = True,
             n_hid: int = 2304, load_enc: str = None, split_seed: int = None):
    PATH = Path(model_path)
    # torch.backends.cudnn.enabled = False

    defaults.text_spec_tok.append(NL)  # add a New Line special char
    sp_vocab = Vocab(get_itos(sp_model))
    mycust_tok = CustomTokenizer(SPTokenizer, sp_model, pre_rules=default_rules)

    all_texts_df = pd.read_csv('../data/haha_2019_train.csv')
    all_texts_df.funniness_average.fillna(0, inplace=True)
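# Each script imports python-fire, so each presumably ends with a Fire entry
# point like the sketch below (an assumption; the tails of the files are not
# shown). Fire maps fit_clas's keyword arguments onto command-line flags, so
# the script can be driven from the shell, e.g.:
#   python fit_clas.py --model_path=data/rnn --sp_model=sp_model --qrnn=True
if __name__ == "__main__":
    fire.Fire(fit_clas)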
from set_seed import random_ctl
seed = random_ctl(432286)  # best seed from 20 seed search without mixup

from fastai.text import *
from fastai.callbacks import SaveModelCallback
from fastai.layers import LabelSmoothingCrossEntropy
import sentencepiece as spm  # https://github.com/google/sentencepiece
import fire

from sp_tok import *
from nlp_mixup import *
from bin_metrics import Fbeta_binary
from sklearn.model_selection import KFold


def split_rebal_data_by_idx(all_texts_df: DataFrame, train_idx, valid_idx,
                            clas_col: str = 'is_humor'):
    ## rebalance cases
    print('Number of positive samples:', (all_texts_df.loc[:, clas_col] == 1).sum())
    print('Number of negative samples:', (all_texts_df.loc[:, clas_col] == 0).sum())
    print('Total samples:', len(all_texts_df))

    df_train_all = all_texts_df.iloc[train_idx, :]
    df_valid = all_texts_df.iloc[valid_idx, :]
    print('Valid prevalence(n = %d):' % len(df_valid),
          df_valid.loc[:, clas_col].mean())  # assumed completion; the source cuts off mid-call
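# A minimal sketch (an assumption, not part of the original files) of how the
# KFold import in these scripts is presumably wired to the split helpers:
# each fold yields integer row indices, which the split_*_by_idx functions
# turn into train/valid frames, with random_state keeping the folds
# reproducible across runs. Uses split_data_by_idx as defined above and a
# hypothetical toy DataFrame standing in for the HAHA 2019 training CSV.
import pandas as pd
from sklearn.model_selection import KFold

toy_df = pd.DataFrame({'text': ['ja', 'meh', 'lol', 'que risa', 'no'] * 4,
                       'is_humor': [1, 0, 1, 1, 0] * 4})
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_idx, valid_idx) in enumerate(kf.split(toy_df)):
    df_train, df_valid = split_data_by_idx(toy_df, train_idx, valid_idx)
    print(f'fold {fold}: {len(df_train)} train rows, {len(df_valid)} valid rows')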