def setup(self, stage=None):
    """
    Downloads the data, parses it, and splits it into train, validation, and test sets

    :param stage: Stage - training or testing
    """
    # reading the input
    td.AG_NEWS(root="data", split=("train", "test"))
    extracted_files = os.listdir("data")

    train_csv_path = None
    for fname in extracted_files:
        if fname.endswith("train.csv"):
            train_csv_path = os.path.join(os.getcwd(), "data", fname)

    df = pd.read_csv(train_csv_path)
    df.columns = ["label", "title", "description"]
    # sample() is not in-place, so reassign to actually shuffle the rows
    df = df.sample(frac=1)
    df = df.iloc[: self.NUM_SAMPLES_COUNT]

    df["label"] = df.label.apply(self.process_label)

    # Download the vocab file once and cache it on disk
    if not os.path.isfile(self.VOCAB_FILE):
        filePointer = requests.get(self.VOCAB_FILE_URL, allow_redirects=True)
        if filePointer.ok:
            with open(self.VOCAB_FILE, "wb") as f:
                f.write(filePointer.content)
        else:
            raise RuntimeError("Error in fetching the vocab file")

    self.tokenizer = BertTokenizer(self.VOCAB_FILE)

    RANDOM_SEED = 42
    seed_everything(RANDOM_SEED)

    # Stratified 60/20/20 split: 20% test, then 25% of the remainder as validation
    df_train, df_test = train_test_split(
        df,
        test_size=0.2,
        random_state=RANDOM_SEED,
        stratify=df["label"],
    )
    df_train, df_val = train_test_split(
        df_train,
        test_size=0.25,
        random_state=RANDOM_SEED,
        stratify=df_train["label"],
    )

    self.df_train = df_train
    self.df_test = df_test
    self.df_val = df_val
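# The process_label helper called above is not shown in this snippet. A
# minimal sketch, assuming it only shifts the 1-based AG_NEWS labels (1-4)
# to zero-based class indices, as the standalone get_ag_news version below
# does with `label - 1`:
def process_label(self, label):
    # AG_NEWS ships labels 1-4; the model expects class indices 0-3
    return label - 1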
def get_ag_news(num_samples):
    # reading the input
    td.AG_NEWS(root="data", split=("train", "test"))
    train_csv_path = "data/AG_NEWS/train.csv"
    return (
        pd.read_csv(train_csv_path, usecols=[0, 2], names=["label", "description"])
        .assign(label=lambda df: df["label"] - 1)  # make labels zero-based
        .sample(n=num_samples)
    )
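# A quick, hypothetical usage check for get_ag_news (the sample size is
# arbitrary):
df = get_ag_news(num_samples=1000)
print(df.head())
print(df["label"].value_counts())  # labels should be 0-3 after the shift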
def prepare_data(self):
    """
    Creates train, valid and test dataloaders from the csv data
    """
    td.AG_NEWS(root="data", split=("train", "test"))
    extracted_files = os.listdir("data/AG_NEWS")

    train_csv_path = None
    for fname in extracted_files:
        if fname.endswith("train.csv"):
            train_csv_path = os.path.join(os.getcwd(), "data/AG_NEWS", fname)

    self.df = pd.read_csv(train_csv_path)
    self.df.columns = ["label", "title", "description"]
    # sample() is not in-place, so reassign to actually shuffle the rows
    self.df = self.df.sample(frac=1)
    self.df = self.df.iloc[: self.NUM_SAMPLES_COUNT]

    self.df["label"] = self.df.label.apply(self.process_label)

    # Download the vocab file once and cache it on disk
    if not os.path.isfile(self.VOCAB_FILE):
        filePointer = requests.get(self.VOCAB_FILE_URL, allow_redirects=True)
        if filePointer.ok:
            with open(self.VOCAB_FILE, "wb") as f:
                f.write(filePointer.content)
        else:
            raise RuntimeError("Error in fetching the vocab file")

    self.tokenizer = BertTokenizer(self.VOCAB_FILE)

    RANDOM_SEED = 42
    np.random.seed(RANDOM_SEED)
    torch.manual_seed(RANDOM_SEED)

    # Stratified 90/5/5 split: 10% held out, then split evenly into val/test
    self.df_train, self.df_test = train_test_split(
        self.df,
        test_size=0.1,
        random_state=RANDOM_SEED,
        stratify=self.df["label"],
    )
    self.df_val, self.df_test = train_test_split(
        self.df_test,
        test_size=0.5,
        random_state=RANDOM_SEED,
        stratify=self.df_test["label"],
    )

    self.train_data_loader = self.create_data_loader(
        self.df_train, self.tokenizer, self.MAX_LEN, self.BATCH_SIZE
    )
    self.val_data_loader = self.create_data_loader(
        self.df_val, self.tokenizer, self.MAX_LEN, self.BATCH_SIZE
    )
    self.test_data_loader = self.create_data_loader(
        self.df_test, self.tokenizer, self.MAX_LEN, self.BATCH_SIZE
    )
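# create_data_loader is referenced above but not defined in this snippet.
# A minimal sketch of one plausible shape, assuming a HuggingFace-style
# BertTokenizer; the AGNewsDataset class below is hypothetical, so adapt
# the field names to the real implementation:
import torch
from torch.utils.data import DataLoader, Dataset


class AGNewsDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        # encode_plus pads/truncates to max_len and returns PyTorch tensors
        encoding = self.tokenizer.encode_plus(
            row["description"],
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(row["label"], dtype=torch.long),
        }


def create_data_loader(self, df, tokenizer, max_len, batch_size):
    return DataLoader(AGNewsDataset(df, tokenizer, max_len), batch_size=batch_size)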
import json
from collections import Counter

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torchtext import data, datasets
from torchtext.vocab import Vocab

from sentiment_classification.model import TextClassificationModel

SEED = 0
PATH = './model_path/sentiment_model.pth'
model_metadata = './model_path/metadata.json'
vocab_data = './model_path/vocab.pk'

torch.manual_seed(SEED)

tokenizer = data.utils.get_tokenizer('basic_english')
train_iter, test_iter = datasets.AG_NEWS(split=('train', 'test'))

# Build the vocabulary and collect the label set in a single pass
counter = Counter()
label_set = set()
for (label_, line) in train_iter:
    counter.update(tokenizer(line))
    label_set.add(label_)
vocab = Vocab(counter, min_freq=1)

vocab_size = len(vocab)
EMBEDDING_SIZE = 64
label_size = len(label_set)

# Persist the model metadata so inference can rebuild the same architecture
with open(model_metadata, 'w') as fp:
    json.dump(
        {
            'embedding_size': EMBEDDING_SIZE,
            'label_size': label_size,