def __init__(self, root_dir='data', batch_size=64, use_vector=True):
    self.TEXT = Field(sequential=True, use_vocab=True, tokenize='spacy',
                      lower=True, batch_first=True)
    self.LABEL = LabelField(dtype=torch.float)
    vectors = Vectors(name='mr_vocab.txt', cache='./')

    dataset_path = os.path.join(root_dir, '{}.tsv')
    self.dataset = {}
    self.dataloader = {}
    for target in ['train', 'dev', 'test']:
        self.dataset[target] = TabularDataset(
            path=dataset_path.format(target),
            format='tsv',
            fields=[('text', self.TEXT), ('label', self.LABEL)]
        )
        if use_vector:
            self.TEXT.build_vocab(self.dataset[target], max_size=25000, vectors=vectors)
        else:
            self.TEXT.build_vocab(self.dataset[target], max_size=25000)
        self.LABEL.build_vocab(self.dataset[target])
        self.dataloader[target] = Iterator(self.dataset[target],
                                           batch_size=batch_size,
                                           device=None,
                                           repeat=False,
                                           sort_key=lambda x: len(x.text),
                                           shuffle=True)
def build_dataset(fpath, mode='train'):
    # For more info about torchtext.data, see https://pytorch.org/text/data.html
    tokenize = lambda x: x.split()
    ID = Field(sequential=False, use_vocab=False)
    # NOTE: CATEGORY_CODE could be ignored
    CATEGORY_CODE = LabelField(sequential=False, use_vocab=False)
    CATEGORY = LabelField(sequential=False, use_vocab=False)
    NEWS = Field(
        sequential=True,
        use_vocab=False,
        tokenize=tokenize,
        include_lengths=True,
    )
    # Format of dataset:
    # 6552431613437805063_!_102_!_news_entertainment_!_谢娜为李浩菲澄清网络谣言,之后她的两个行为给自己加分_!_佟丽娅,网络谣言,快乐大本营,李浩菲,谢娜,观众们
    fields = [
        ('id', ID),
        ('category_code', CATEGORY_CODE),
        ('category', CATEGORY),
        ('news', NEWS),
        (None, None),
    ]
    # The dataset is delimited by `_!_`.
    # Note: Python's csv module only accepts single-character delimiters, so a
    # multi-character separator like `_!_` may need to be pre-split upstream.
    dataset = TabularDataset(
        fpath,
        format='csv',
        fields=fields,
        csv_reader_params={'delimiter': '_!_'},
    )
    return (ID, CATEGORY, NEWS), dataset
def load(self,
         text_label_col: str = "text",
         targets=('label', ),
         delimiter: str = ",",
         quotechar: str = '"'):
    field_headers = list(
        pd.read_csv(self.path_to_datadir + "train.csv",
                    quotechar=quotechar,
                    sep=delimiter))
    dset_row = []
    for header in field_headers:
        if header == text_label_col:
            dset_row.append((text_label_col, self.text_field))
        elif header in targets:
            dset_row.append((header, LabelField(dtype=torch.long)))
        else:
            dset_row.append((header, None))
    train, test = TabularDataset.splits(path=self.path_to_datadir,
                                        train='train.csv',
                                        test="test.csv",
                                        format='csv',
                                        skip_header=True,
                                        fields=dset_row)
    if self.stratified_sampling:
        train, test = stratified_sampler(train,
                                         test,
                                         targets,
                                         text_field=self.text_field,
                                         label_field=LabelField(dtype=torch.long))
    return train, test
def load_dataset_from_csv(params, device):
    """
    tokenizer : Breaks sentences into a list of words. If sequential=False,
        no tokenization is applied.
    Field : A class that stores information about how a column should be
        preprocessed.
    fix_length : TorchText normally accepts variable-length input and pads each
        sequence to the longest one in its batch. Setting fix_length instead
        pads (or truncates) every sequence to a fixed length; here fix_length=128.
    build_vocab : Builds a vocabulary mapping every unique word in the training
        data to an index, then (optionally) attaches the corresponding GloVe
        word embedding to each index.
    vocab.vectors : A torch tensor of shape (vocab_size x embedding_dim)
        containing the pre-trained word embeddings.
    BucketIterator : Defines an iterator that batches examples of similar
        lengths together to minimize the amount of padding needed.
    """
    # define tokenizer
    en = English()

    def tokenize(sentence):
        return [tok.text for tok in en.tokenizer(sentence)]

    TEXT = Field(sequential=True, tokenize=tokenize, lower=True,
                 eos_token='<eos>', batch_first=True, fix_length=128)
    LABEL = LabelField()
    fields_list = [('Unnamed: 0', None),
                   ('text', TEXT),
                   ('conf', None),
                   ('label', LABEL)]

    base_path = params.DATA_PATH
    train_path = os.path.join(base_path, "filtered_train.csv")
    test_path = os.path.join(base_path, "filtered_test.csv")

    train_data = TabularDataset(path=train_path,  # path to the csv file
                                format='csv',
                                skip_header=True,
                                fields=fields_list)
    test_data = TabularDataset(path=test_path,
                               format='csv',
                               skip_header=True,
                               fields=fields_list)

    if params.VOCAB_USE_GLOVE:
        TEXT.build_vocab(train_data, test_data, min_freq=params.VOCAB_MIN_FREQ,
                         vectors=GloVe(name='6B', dim=300))
        logging.info("Loaded GloVe embedding, vector size of Text Vocabulary: "
                     + str(TEXT.vocab.vectors.size()))
    else:
        TEXT.build_vocab(train_data, test_data, min_freq=params.VOCAB_MIN_FREQ)
    LABEL.build_vocab(train_data)

    word_embeddings = TEXT.vocab.vectors
    logging.info("Length of Text Vocabulary: " + str(len(TEXT.vocab)))

    train_iter, test_iter = data.BucketIterator.splits(
        (train_data, test_data),
        batch_sizes=(params.TRAIN_BATCH_SIZE, params.TRAIN_BATCH_SIZE),
        sort_key=lambda x: len(x.text),
        repeat=False,
        shuffle=True,
        device=device)
    # Disable shuffle for the test iterator
    test_iter.shuffle = False

    return TEXT, word_embeddings, train_iter, test_iter
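# A minimal usage sketch (not part of the original snippet) showing one way the
# objects returned by load_dataset_from_csv() might be consumed. `params`,
# `device`, and the downstream model are assumed to be defined elsewhere; only
# the torchtext objects come from the function above.
TEXT, word_embeddings, train_iter, test_iter = load_dataset_from_csv(params, device)

embedding = torch.nn.Embedding.from_pretrained(word_embeddings, freeze=False)
for batch in train_iter:
    tokens = batch.text           # LongTensor of shape (batch_size, fix_length)
    labels = batch.label          # LongTensor of shape (batch_size,)
    embedded = embedding(tokens)  # (batch_size, fix_length, embedding_dim)
    # ... feed `embedded` and `labels` to a model / loss here
    break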
def clean_quora(path='../data/train.csv', output='list',
                tokenizer=nltk.word_tokenize, device=DEVICE, batch_size=32):
    data = pd.read_csv(path)
    questions1 = data['question1'].astype('str').tolist()
    questions2 = data['question2'].astype('str').tolist()
    is_duplicates = data['is_duplicate'].tolist()

    if output == 'list':
        return questions1, questions2, is_duplicates
    elif output == 'tokenized_list':
        return ([tokenizer(q) for q in questions1],
                [tokenizer(q) for q in questions2],
                is_duplicates)
    elif output == 'iterator' or output == 'iterator_from_file':
        TEXT = Field(
            sequential=True,
            tokenize=tokenizer,
            pad_first=False,
            dtype=torch.long,
            lower=True,
            batch_first=True
        )
        TARGET = LabelField(use_vocab=False)
        if output == 'iterator':
            examples = [Example.fromlist(
                            (questions1[i], questions2[i], is_duplicates[i]),
                            [('question1', TEXT),
                             ('question2', TEXT),
                             ('is_duplicate', TARGET)])
                        for i in range(len(questions1))]
            dataset = Dataset(examples, {'question1': TEXT,
                                         'question2': TEXT,
                                         'is_duplicate': TARGET})
        if output == 'iterator_from_file':
            dataset = TabularDataset(path, 'csv',
                                     [('question1', TEXT),
                                      ('question2', TEXT),
                                      ('is_duplicate', TARGET)],
                                     skip_header=True)
        iterator = BucketIterator(
            dataset,
            batch_size=batch_size,
            sort_key=lambda x: len(x.question1) + len(x.question2),
            sort_within_batch=False,
            repeat=False,  # we pass repeat=False because we want to wrap this Iterator layer
            device=device
        )
        TEXT.build_vocab(dataset)
        TARGET.build_vocab(dataset)
        return iterator
        # dataset = TabularDataset(path, 'csv', [('review', TEXT), ('sentiment', TARGET)])
    else:
        raise ValueError('Processing type not understood')
def load_data(preprocessing=None):
    # Fields for the dataset
    # The actual review message
    # TEXT = Field(tokenize='spacy')  # -- Old way, unclear exactly which language model is used
    TEXT = Field(sequential=True, tokenize=tokenizer, lower=True, preprocessing=preprocessing)
    LABEL = LabelField(dtype=torch.float)

    # Get the entire dataset that we will then split
    data = TabularDataset(path=path, format='tsv',
                          fields=[('text', TEXT), ('label', LABEL)])

    # We should probably look at the proportion of fake to non-fake in each of these
    # sets to make sure it is fairly even. Though probabilistically it should be, I suppose.
    train_data, valid_data, test_data = data.split(
        split_ratio=TRAIN_VAL_TEST_SPLIT, random_state=random.seed(SEED))
    # valid_data, test_data = test_data.split(split_ratio=VAL_TEST_SPLIT, random_state=random.seed(SEED))

    print('Size of train set: ' + str(len(train_data.examples)))
    print('Size of val / test: ' + str(len(valid_data.examples)))

    '''
    # Try loading in the IMDB dataset to label pos or negative
    train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
    # Get train/valid split!!
    train_data, valid_data = train_data.split(random_state=random.seed(SEED))
    '''

    # Now we need to build the vocab for our actual data.
    # Here we will use the pre-trained word vectors from "glove.6B.100d".
    TEXT.build_vocab(train_data, max_size=25000, vectors="glove.6B.100d")
    LABEL.build_vocab(train_data)

    # Print stuff for sanity checks
    print('Size of the vocab: ' + str(len(TEXT.vocab)))
    print("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size())
    print("Label Length: " + str(len(LABEL.vocab)))

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    train_itr, valid_itr, test_itr = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=BATCH_SIZE,
        device=device,
        sort_key=lambda x: len(x.text))

    return TEXT, train_itr, valid_itr, test_itr
def data_load_without_cv(fname, args, seed=1234, split_ratio=0.9):
    TEXT = Field(sequential=True, tokenize=str.split, batch_first=True,
                 fix_length=56, lower=True)
    LABEL = LabelField(sequential=False, dtype=torch.float)
    FIELDS = [('label', LABEL), ('text', TEXT)]

    dataset = TabularDataset(fname, fields=FIELDS, format='csv', skip_header=True)
    train_dataset, valid_dataset = dataset.split(random_state=random.seed(seed),
                                                 split_ratio=split_ratio)

    TEXT.build_vocab(train_dataset)
    LABEL.build_vocab(train_dataset)

    train_iterator, valid_iterator = BucketIterator.splits(
        (train_dataset, valid_dataset),
        batch_size=args.batch_size,
        device=args.device,
        sort=False,
        shuffle=True)

    return TEXT, train_iterator, valid_iterator
def make_dataset(train_csv, val_csv, test_csv):
    '''
    Generates the training, validation and testing datasets as torchtext
    objects for easy incorporation with PyTorch (cleaning them in the process).

    Inputs:
        train_csv (str): name of training data csv
        val_csv (str): name of validation data csv
        test_csv (str): name of testing data csv

    Outputs:
        train: tabular dataset obj representing the training data
        test: tabular dataset obj representing the testing data
        val: tabular dataset obj representing the validation data
        text: torchtext Field obj representing how text should be processed and stored
        label: torchtext LabelField obj representing how labels should be processed and stored
    '''
    text = Field(sequential=True, tokenize=word_tokenize, preprocessing=normalize_tokens)
    label = LabelField(dtype=torch.float)

    data_fields = [('dab_id', None),
                   ('alj_id', None),
                   ('alj_text', text),
                   ('decision_binary', label),
                   ('dab_year', None)]

    train, val, test = TabularDataset.splits(path='',
                                             train=train_csv,
                                             validation=val_csv,
                                             test=test_csv,
                                             format='csv',
                                             fields=data_fields,
                                             skip_header=True)
    return train, test, val, text, label
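# A minimal follow-up sketch (not part of the original snippet): make_dataset()
# returns the datasets and fields but does not build vocabularies or iterators,
# so a typical next step might look like this. The csv file names and batch
# size are placeholders.
train, test, val, text, label = make_dataset('train.csv', 'val.csv', 'test.csv')
text.build_vocab(train, max_size=25000)
label.build_vocab(train)
train_itr, val_itr, test_itr = BucketIterator.splits(
    (train, val, test),
    batch_size=64,
    sort_key=lambda x: len(x.alj_text),
    sort_within_batch=True)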
def build_and_cache_dataset(args, mode='train'):
    # TorchText loads data declaratively: you declare Field objects that specify
    # how each column should be processed.
    # `sequential` marks a column as a token sequence; `use_vocab` maps tokens to integer ids.
    ID = Field(sequential=False, use_vocab=False)  # not tokenized, not numericalized
    CATEGORY = LabelField(sequential=False, use_vocab=True, is_target=True)  # not tokenized, numericalized, marks the target
    # `tokenize` takes a function that splits a text string into words or characters.
    NEWS_TEXT = Field(
        sequential=True,      # treated as a token sequence
        tokenize=jieba.lcut,  # tokenize with jieba
        include_lengths=True, # return (minibatch, list of lengths) tuples
    )

    fields = [
        ('id', ID),
        (None, None),
        ('category', CATEGORY),
        ('news_text', NEWS_TEXT),
    ]
    logger.info("Creating features from dataset file at %s", args.data_dir)

    # The dataset is tab-separated: each row of train.csv is split on `\t`
    # and turned into one Example object.
    dataset = TabularDataset(
        os.path.join(args.data_dir, f'{mode}.csv'),  # dataset path
        format='csv',                                # dataset format
        fields=fields,                               # how to process each column; each row becomes
                                                     # {'id': ..., 'category': ..., 'news_text': ...}
        csv_reader_params={'delimiter': '\t'},       # rows are split on `\t`, then processed per the fields above
    )
    features = ((ID, CATEGORY, NEWS_TEXT), dataset)
    return features
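# A minimal follow-up sketch (not part of the original snippet):
# build_and_cache_dataset() stops at the TabularDataset, so vocabularies and an
# iterator still have to be created. `args` is assumed to carry `data_dir`;
# min_freq and the batch size are placeholders.
(ID, CATEGORY, NEWS_TEXT), train_dataset = build_and_cache_dataset(args, mode='train')
NEWS_TEXT.build_vocab(train_dataset, min_freq=2)
CATEGORY.build_vocab(train_dataset)
train_iter = BucketIterator(
    train_dataset,
    batch_size=64,
    sort_key=lambda x: len(x.news_text),
    sort_within_batch=True,  # useful because include_lengths=True is typically paired with packed sequences
    shuffle=True)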
def load(self,
         text_label_col: str = "text",
         targets=('label', ),
         delimiter: str = ",",
         quotechar: str = '"'):
    field_headers = list(pd.read_csv(self.path_to_datadir, quotechar=quotechar))
    dset_row = []
    for header in field_headers:
        if header == text_label_col:
            dset_row.append((text_label_col, self.text_field))
        elif header in targets:
            dset_row.append((header, LabelField(dtype=torch.long)))
        else:
            dset_row.append((header, None))
    train = TabularDataset(path=self.path_to_datadir,
                           format="csv",
                           fields=dset_row,
                           skip_header=True,
                           csv_reader_params={
                               "delimiter": delimiter,
                               "quotechar": quotechar
                           })
    return train
def build_and_cache_dataset(args, mode='train'):
    ID = Field(sequential=False, use_vocab=False)
    CATEGORY = LabelField(sequential=False, use_vocab=True, is_target=True)
    NEWS = Field(
        sequential=True,
        tokenize=jieba.lcut,  # any custom function (i.e. another tokenizer) works here
        include_lengths=True,
    )

    fields = [
        ('id', ID),
        (None, None),
        ('category', CATEGORY),
        ('news', NEWS),
    ]
    logger.info("Creating features from dataset file at %s", args.data_dir)

    # The dataset is split by `\t`.
    dataset = TabularDataset(
        os.path.join(args.data_dir, f'{mode}.csv'),
        format='csv',
        fields=fields,
        csv_reader_params={'delimiter': '\t'},
    )
    features = ((ID, CATEGORY, NEWS), dataset)
    return features
def build_and_cache_dataset(config: Config, mode='train'):
    """
    Returns the Field for each attribute together with the loaded dataset:
    (id, category, news), dataset
    (Field, Field, Field), TabularDataset
    """
    # id is already numeric
    ID = Field(sequential=False, use_vocab=False)
    CATEGORY = LabelField(sequential=False, use_vocab=True, is_target=True)
    NEWS = Field(
        sequential=True,
        tokenize=jieba.lcut,
        include_lengths=True,
    )

    fields = [
        ('id', ID),
        (None, None),
        ('category', CATEGORY),
        ('news', NEWS),
    ]
    logger.info("Creating features from %s", config.dataset_dir)

    # split on `\t`
    dataset = TabularDataset(
        os.path.join(config.dataset_dir, f'{mode}.csv'),
        format='csv',
        fields=fields,
        csv_reader_params={'delimiter': '\t'},
    )
    # TabularDataset.split()
    features = ((ID, CATEGORY, NEWS), dataset)
    return features
def __init__(self, spacy_model_name, question_vocab_dir, answer_vocab_dir,
             right_answer_col, nn_weights_path, batch_size=128,
             device=tt.device('cpu')):
    self.parser = spacy.load(spacy_model_name)

    vocab = load_vocab(question_vocab_dir)
    preprocess = lambda x: [i if i in vocab.stoi else '<unk>' for i in x]

    self.TOKENS = Field(lower=True, preprocessing=preprocess)
    self.TOKENS.vocab = load_vocab(question_vocab_dir)

    self.ANSWER = LabelField(dtype=tt.int64, use_vocab=True, unk_token='<unk>')
    self.ANSWER.vocab = load_vocab(answer_vocab_dir)
    # self.ANSWER.vocab.vectors = Vectors(gensim_vectors_path)

    self.device = device
    self.nn_weights_path = nn_weights_path
    self.batch_size = batch_size
    self.model = None
    self.right_answer_col = right_answer_col
def __build_field(self):
    self.TEXT = Field(sequential=True, use_vocab=True, lower=True,
                      tokenize=tokenizer, include_lengths=True,
                      batch_first=self._config.data.batch_first,
                      pad_token='[PAD]', unk_token='[UNK]')
    # self.TAG = Field(sequential=True, use_vocab=True, tokenize=tokenizer, is_target=True,
    #                  batch_first=self._config.data.batch_first)
    # Note: LabelField always forces sequential=False, so the kwargs below are
    # partially overridden; the commented-out Field above is the sequential
    # alternative for per-token tags.
    self.TAG = LabelField(
        sequential=True,
        use_vocab=True,
        tokenize=tokenizer,
        is_target=True,
    )
    self._fields = [('text', self.TEXT), ('tag', self.TAG)]
class ReviewsDataset():
    def __init__(self, data_path, train_path):
        ## write the tokenizer
        tokenize = lambda review: review.split()

        ## define your fields; for the ID fields you can use the RawField class
        self.TEXT = Field(sequential=True, use_vocab=True, tokenize=tokenize, lower=True)
        self.LABEL = LabelField()
        self.fields = [
            ("PhraseId", None),  # we won't be needing the id, so we pass in None as the field
            ("SentenceId", None),
            ("Phrase", self.TEXT),
            ("Sentiment", self.LABEL)
        ]
        # { 'Phrase': ('r', self.review), 'Sentiment': ('s', self.sentiment) }

        ## set paths
        self.data_path = data_path
        self.train_path = train_path

    def load_data(self):
        self.train_data = TabularDataset.splits(
            path='{}'.format(self.data_path),
            train='{}'.format(self.train_path),
            format='tsv',
            fields=self.fields)[0]

        self.TEXT.build_vocab(self.train_data, max_size=10000, min_freq=1)
        self.LABEL.build_vocab(self.train_data)

        # only the training iterator is needed here
        self.train_iterator = BucketIterator(
            self.train_data,
            batch_size=64,
            sort_within_batch=True,
            sort_key=lambda x: len(x.Phrase))

    def __str__(self):
        return 'review: {} \n sentiment: {}'.format(
            self.train_data[0].Phrase, self.train_data[0].Sentiment)
def _build_fields(self) -> Dict[str, Field]:
    fields = {
        'syllable_contents': Field(sequential=True, use_vocab=True, batch_first=True),
        'label': LabelField(sequential=False, use_vocab=False, dtype=torch.float32, batch_first=True)
    }
    return fields
def construct_field(
    field_type,
    batch_first=True,
    input_lower=True,
    lemmatized=False,
    input_include_lengths=True,
    input_fix_length=None,
):
    """
    Construct a TorchText field.

    Note: the `input_<x>` parameters apply specifically to the `input_text`
    field type.
    """
    if field_type == 'input_text':
        if lemmatized:
            tokenizer = tokenize_fct_lemmatize
        else:
            tokenizer = tokenize_fct
        return SplitReversibleField(sequential=True,
                                    use_vocab=True,
                                    init_token=Constants.START_TOKEN,
                                    eos_token=Constants.END_TOKEN,
                                    lower=input_lower,
                                    tokenize=tokenizer,
                                    batch_first=batch_first,
                                    pad_token=Constants.PAD_TOKEN,
                                    unk_token=Constants.UNK_TOKEN,
                                    include_lengths=input_include_lengths,
                                    fix_length=input_fix_length,
                                    preprocessing=gen_text_preprocessor())
    elif field_type == 'numeric_label':
        return LabelField(
            use_vocab=False,
            batch_first=batch_first,
        )
    elif field_type == 'bool_label':
        return LabelField(use_vocab=False,
                          batch_first=batch_first,
                          preprocessing=lambda x: (x == 'True'))
    else:
        raise Exception('Invalid Field Type')
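# A minimal usage sketch (not part of the original snippet) showing how the
# three field types might be wired into a TabularDataset fields list. The
# column names here are placeholders.
text_field = construct_field('input_text', input_fix_length=256)
score_field = construct_field('numeric_label')
flag_field = construct_field('bool_label')
fields = [('text', text_field), ('score', score_field), ('is_positive', flag_field)]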
def __init__(self, data_path, train_path):
    ## write the tokenizer
    tokenize = lambda review: review.split()

    ## define your fields; for the ID fields you can use the RawField class
    self.TEXT = Field(sequential=True, use_vocab=True, tokenize=tokenize, lower=True)
    self.LABEL = LabelField()
    self.fields = [
        ("PhraseId", None),  # we won't be needing the id, so we pass in None as the field
        ("SentenceId", None),
        ("Phrase", self.TEXT),
        ("Sentiment", self.LABEL)
    ]
    # { 'Phrase': ('r', self.review), 'Sentiment': ('s', self.sentiment) }

    ## set paths
    self.data_path = data_path
    self.train_path = train_path
def pre_process_text():
    ID = Field(sequential=False, use_vocab=False)
    # CATEGORY: the label is non-sequential; use_vocab=True builds a vocabulary
    # for it and is_target=True marks it as the target variable.
    CATEGORY = LabelField(sequential=False, use_vocab=True, is_target=True)
    # NEWS: the text is sequential and tokenized with jieba.lcut;
    # include_lengths=True also returns the original sentence lengths, which is
    # convenient for RNNs.
    NEWS = Field(sequential=True, tokenize=jieba.lcut, include_lengths=True)
    fields = [
        ('id', ID),
        (None, None),
        ('category', CATEGORY),
        ('news', NEWS),
    ]

    # Load the data
    train_data = TabularDataset(
        os.path.join('data', 'train.csv'),
        format='csv',
        fields=fields,
        csv_reader_params={'delimiter': '\t'}
    )
    valid_data = TabularDataset(
        os.path.join('data', 'dev.csv'),
        format='csv',
        fields=fields,
        csv_reader_params={'delimiter': '\t'}
    )
    test_data = TabularDataset(
        os.path.join('data', 'test.csv'),
        format='csv',
        fields=fields,
        csv_reader_params={'delimiter': '\t'}
    )

    # Build the vocabularies
    NEWS.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))
    CATEGORY.build_vocab(train_data)

    return CATEGORY, NEWS, train_data, valid_data, test_data
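# A minimal follow-up sketch (not part of the original snippet):
# pre_process_text() builds vocabularies but no iterators, so batching might be
# added like this. The batch size is a placeholder.
CATEGORY, NEWS, train_data, valid_data, test_data = pre_process_text()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iter, valid_iter, test_iter = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=64,
    sort_key=lambda x: len(x.news),
    sort_within_batch=True,  # keeps the returned lengths usable with packed sequences
    device=device)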
def create_field(field_type, tokenizer=None):
    """Return a Field-like object using the specified config."""
    if field_type == "label":
        return LabelField(dtype=torch.long)
    elif field_type == "text":
        return Field(tokenize='spacy', lower=True)
    elif field_type == "contextual":
        return Field(batch_first=True,
                     use_vocab=False,
                     tokenize=bert_tokenize_and_cut(tokenizer),
                     preprocessing=tokenizer.convert_tokens_to_ids,
                     init_token=tokenizer.cls_token_id,
                     eos_token=tokenizer.sep_token_id,
                     pad_token=tokenizer.pad_token_id,
                     unk_token=tokenizer.unk_token_id)
    else:
        raise ValueError(f'{field_type} was not recognized')
def __init__(self, data_path, test=False, stop_words_path=None, bert_model_path=None,
             batch_first=False, include_lengths=False, tokenizer_language='cn'):
    """
    :param data_path:
    :param test: if this is the test set, labels are not loaded
    :param stop_words_path:
    :param batch_first:
    :param include_lengths:
    """
    self.data = pd.read_csv(data_path)
    print('read data from {}'.format(data_path))
    self.text_field = "review"
    self.label_field = "label"
    self.test = test

    if stop_words_path:
        stop_words = read_stop_words(stop_words_path)
    else:
        stop_words = None

    self.LABEL = LabelField(sequential=False, use_vocab=False, dtype=torch.float)
    # lambda x: [y for y in x]
    # bert_tokenizer = BertTokenizer.from_pretrained(bert_model_path)
    # pad_index = bert_tokenizer.convert_tokens_to_ids(bert_tokenizer.pad_token)
    # unk_index = bert_tokenizer.convert_tokens_to_ids(bert_tokenizer.unk_token)
    self.TEXT = Field(
        use_vocab=True,
        sequential=True,
        stop_words=stop_words,
        tokenize=lambda x: [y for y in x],  # character-level tokenization
        batch_first=batch_first,
        tokenizer_language=tokenizer_language,
        include_lengths=include_lengths)  # include_lengths=True for LSTM
    self.fields = [("text", self.TEXT), ("label", self.LABEL)]
    self.examples = self.build_examples()
def __init__(
        self,
        data,
        text_field,
        label_field,
        test=False,
        stop_words_path=None,
        batch_first=False,
        include_lengths=False,
        tokenizer_language='cn',
):
    if stop_words_path:
        stop_words = read_stop_words(stop_words_path)
    else:
        stop_words = None

    self.LABEL = LabelField(sequential=False, use_vocab=False, dtype=torch.float)
    # lambda x: [y for y in x]
    self.TEXT = Field(
        sequential=True,
        stop_words=stop_words,
        tokenize=lambda x: [y for y in x],
        batch_first=batch_first,
        tokenizer_language=tokenizer_language,
        include_lengths=include_lengths)  # include_lengths=True for LSTM

    fields = [("text", self.TEXT), ("label", self.LABEL)]
    examples = []
    if test:
        # For the test set, do not load labels.
        for text in tqdm(data[text_field]):
            examples.append(Example.fromlist([text, None], fields))
    else:
        for text, label in tqdm(zip(data[text_field], data[label_field])):
            # Example: defines a single training or test example and stores
            # each column of the example as an attribute.
            examples.append(Example.fromlist([text, label], fields))
    # The steps above are preprocessing; delegate to the parent constructor to
    # produce a standard Dataset.
    super(NLPDataset, self).__init__(examples, fields)
def build_and_cache_dataset(data_path=r"E:\Workspaces\Python\KG\QA_healty39\data"):
    """
    Returns the Field for each attribute together with the loaded dataset:
    (question, intention), dataset
    (Field, Field), TabularDataset
    """
    QUESTION = Field(sequential=True, tokenize=jieba.lcut, include_lengths=True)
    INTENTION = LabelField(sequential=False, use_vocab=True, is_target=True)

    fields = [
        ('question', QUESTION),
        ('intention', INTENTION),
    ]

    # split on `\t`
    dataset = TabularDataset(
        os.path.join(data_path, 'qa.csv'),
        format='csv',
        fields=fields,
        csv_reader_params={'delimiter': '\t'},
    )
    features = ((QUESTION, INTENTION), dataset)
    return features
def _build_loader(self):
    print("Loading data...")
    TEXT = Field(batch_first=True, fix_length=self.args.max_words)
    LABEL = LabelField(sequential=False, batch_first=True, use_vocab=False)
    field = [('text', TEXT), ('label', LABEL)]

    train = get_dataset("train", field)
    test = get_dataset("test", field)
    evl = get_dataset("eval", field)

    TEXT.build_vocab(train, test, evl, min_freq=3)
    self.vocab = TEXT

    self.train_iter, self.test_iter, self.evl_iter = BucketIterator.splits(
        (train, test, evl),
        batch_sizes=(self.args.batch_size, self.args.batch_size, self.args.batch_size),
        device=self.device,
        shuffle=True,
        sort=False,
        repeat=False,
    )
def load(self,
         delimiter: str = ",",
         quotechar: str = '"',
         text_col_name: str = 'text',
         label_col_name: str = 'label') -> TabularDataset:
    """
    Loads the data from the csv file and converts it into a torchtext
    TabularDataset, automatically selecting only the columns specified by the
    'text_col_name' and 'label_col_name' parameters.

    :param delimiter: string specifying the delimiter used when reading in the csv file
    :param quotechar: string specifying the quotechar used when reading in the csv file
    :param text_col_name: string specifying the name of the column in the csv file
        containing the text of the data point
    :param label_col_name: string specifying the name of the column in the csv file
        containing the label of the data point
    :return: torchtext.data.TabularDataset
    """
    file_headers = list(
        pd.read_csv(self.file_name, sep=delimiter, quotechar=quotechar))
    dset_row = []
    for header in file_headers:
        if header == text_col_name:
            dset_row.append((text_col_name, self.text_field))
        elif header == label_col_name:
            dset_row.append((label_col_name, LabelField(dtype=torch.long)))
        else:
            dset_row.append((header, None))
    dataset = TabularDataset(path=self.file_name,
                             format="csv",
                             fields=dset_row,
                             skip_header=True,
                             csv_reader_params={
                                 "delimiter": delimiter,
                                 "quotechar": quotechar
                             })
    return dataset
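# A minimal usage sketch (not part of the original snippet): load() only returns
# a TabularDataset, so vocabularies and an iterator are built afterwards.
# `loader` stands in for an instance of the surrounding class, whose
# `text_field` and `file_name` are assumed to be set elsewhere.
dataset = loader.load(text_col_name='text', label_col_name='label')
loader.text_field.build_vocab(dataset, max_size=25000)
dataset.fields['label'].build_vocab(dataset)
data_iter = BucketIterator(dataset,
                           batch_size=32,
                           sort_key=lambda x: len(x.text),
                           sort_within_batch=True)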
def prepare_fields_word_char(columns, word_ids_col, char_ids_col, len_col,
                             mask_col, inf_mask_col):
    long_field = Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.long)
    float_field = Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.float)
    label_field = LabelField(sequential=False, use_vocab=True, batch_first=True)

    fields = list()
    for column in columns:
        if column in (word_ids_col, char_ids_col, len_col, mask_col):
            fields.append((column, long_field))
        elif column in (inf_mask_col, ):
            fields.append((column, float_field))
        else:
            fields.append((column, label_field))
    return long_field, float_field, label_field, fields
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--model',
        type=str,
        default='rnn',
        help="Available models are: 'rnn', 'cnn', 'bilstm', 'fasttext', and 'distilbert'\nDefault is 'rnn'")
    parser.add_argument('--train_data_path', type=str, default="./data/train_clean.csv",
                        help="Path to the training data")
    parser.add_argument('--test_data_path', type=str, default="./data/dev_clean.csv",
                        help="Path to the test data")
    parser.add_argument('--seed', type=int, default=1234)
    parser.add_argument('--vectors', type=str, default='fasttext.simple.300d',
                        help="""
                        Pretrained vectors:
                        Visit https://github.com/pytorch/text/blob/9ce7986ddeb5b47d9767a5299954195a1a5f9043/torchtext/vocab.py#L146
                        for more
                        """)
    parser.add_argument('--max_vocab_size', type=int, default=750)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--bidirectional', type=bool, default=True)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--hidden_dim', type=int, default=64)
    parser.add_argument('--output_dim', type=int, default=1)
    parser.add_argument('--n_layers', type=int, default=2)
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--n_epochs', type=int, default=5)
    parser.add_argument('--n_filters', type=int, default=100)
    parser.add_argument('--filter_sizes', type=list, default=[3, 4, 5])

    args = parser.parse_args()

    torch.manual_seed(args.seed)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    ########## BILSTM ##########
    if args.model == "bilstm":
        print('\nBiLSTM')
        TEXT = Field(tokenize='spacy')
        LABEL = LabelField(dtype=torch.float)
        data_fields = [("text", TEXT), ("label", LABEL)]

        train_data = TabularDataset(args.train_data_path,
                                    format='csv',
                                    fields=data_fields,
                                    skip_header=True,
                                    csv_reader_params={'delimiter': ","})
        test_data = TabularDataset(args.test_data_path,
                                   format='csv',
                                   fields=data_fields,
                                   skip_header=True,
                                   csv_reader_params={'delimiter': ","})
        train_data, val_data = train_data.split(split_ratio=0.8,
                                                random_state=random.seed(args.seed))

        TEXT.build_vocab(train_data,
                         max_size=args.max_vocab_size,
                         vectors=args.vectors,
                         unk_init=torch.Tensor.normal_)
        LABEL.build_vocab(train_data)

        train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
            (train_data, val_data, test_data),
            batch_size=args.batch_size,
            sort_key=lambda x: len(x.text),
            device=device)

        input_dim = len(TEXT.vocab)
        embedding_dim = get_embedding_dim(args.vectors)
        pad_idx = TEXT.vocab.stoi[TEXT.pad_token]
        unk_idx = TEXT.vocab.stoi[TEXT.unk_token]

        model = BiLSTM(input_dim, embedding_dim, args.hidden_dim, args.output_dim,
                       args.n_layers, args.bidirectional, args.dropout, pad_idx)

        pretrained_embeddings = TEXT.vocab.vectors
        model.embedding.weight.data.copy_(pretrained_embeddings)
        model.embedding.weight.data[unk_idx] = torch.zeros(embedding_dim)
        model.embedding.weight.data[pad_idx] = torch.zeros(embedding_dim)

        optimizer = optim.Adam(model.parameters(), lr=args.lr)
        criterion = nn.BCEWithLogitsLoss()

        model.to(device)
        criterion.to(device)

        best_valid_loss = float('inf')

        print("\nTraining...")
        print("===========")
        for epoch in range(1, args.n_epochs + 1):
            start_time = time.time()
            train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
            valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
            end_time = time.time()
            epoch_mins, epoch_secs = epoch_time(start_time, end_time)

            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                torch.save(model.state_dict(),
                           './checkpoints/{}-model.pt'.format(args.model))

            print(f'[Epoch: {epoch:02}] | Epoch Time: {epoch_mins}m {epoch_secs}s')
            print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
            print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')

        model.load_state_dict(
            torch.load('./checkpoints/{}-model.pt'.format(args.model)))
        test_loss, test_acc = evaluate(model, test_iterator, criterion)

        print('\nEvaluating...')
        print("=============")
        print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')  # Test Loss: 0.139, Test Acc: 95.27%

    ########## VANILLA RNN ##########
    else:
        print('\nVanilla RNN')
        TEXT = Field(tokenize='spacy')
        LABEL = LabelField(dtype=torch.float)
        data_fields = [("text", TEXT), ("label", LABEL)]

        train_data = TabularDataset(args.train_data_path,
                                    format='csv',
                                    fields=data_fields,
                                    skip_header=True,
                                    csv_reader_params={'delimiter': ","})
        test_data = TabularDataset(args.test_data_path,
                                   format='csv',
                                   fields=data_fields,
                                   skip_header=True,
                                   csv_reader_params={'delimiter': ","})
        train_data, val_data = train_data.split(split_ratio=0.8,
                                                random_state=random.seed(args.seed))

        TEXT.build_vocab(train_data, max_size=args.max_vocab_size, vectors=args.vectors)
        LABEL.build_vocab(train_data)

        train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
            (train_data, val_data, test_data),
            batch_size=args.batch_size,
            sort_key=lambda x: len(x.text),
            device=device)

        input_dim = len(TEXT.vocab)
        embedding_dim = get_embedding_dim(args.vectors)

        model = RNN(input_dim, embedding_dim, args.hidden_dim, args.output_dim)

        pretrained_embeddings = TEXT.vocab.vectors
        model.embedding.weight.data.copy_(pretrained_embeddings)

        optimizer = optim.Adam(model.parameters(), lr=args.lr)
        criterion = nn.BCEWithLogitsLoss()

        model.to(device)
        criterion.to(device)

        best_valid_loss = float('inf')

        print("\nTraining...")
        print("===========")
        for epoch in range(1, args.n_epochs + 1):
            start_time = time.time()
            train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
            valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
            end_time = time.time()
            epoch_mins, epoch_secs = epoch_time(start_time, end_time)

            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                torch.save(model.state_dict(),
                           './checkpoints/{}-model.pt'.format(args.model))

            print(f'[Epoch: {epoch:02}] | Epoch Time: {epoch_mins}m {epoch_secs}s')
            print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
            print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')

        model.load_state_dict(
            torch.load('./checkpoints/{}-model.pt'.format(args.model)))
        test_loss, test_acc = evaluate(model, test_iterator, criterion)

        print('\nEvaluating...')
        print("=============")
        print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')  # Test Loss: 0.138, Test Acc: 95.05%
def load_dataset(batch_size, cache_data=True, test_sen=None):
    if cache_data:
        print("Caching Data")
        office_actions = pd.read_csv(
            '../data/office_actions.csv',
            index_col='app_id',
            usecols=['app_id', 'rejection_102', 'rejection_103'],
            dtype={'app_id': int, 'rejection_102': int, 'rejection_103': int},
            nrows=200000)

        abstractList = []
        idList = []
        rejectionColumn = []
        obviousCount = 0
        notCount = 0

        path = "/scratch/dm4350/json_files/"
        count = 0
        for filename in os.listdir(path):
            if count % 1000 == 0:
                print(count)

            filepath = path + filename
            try:
                jfile = open(filepath, 'r')
            except FileNotFoundError:
                print("File Not Found")
                continue

            try:
                parsed_json = json.load(jfile)
                jfile.close()
            except UnicodeDecodeError:
                print("WARNING: UnicodeDecodeError")
                continue
            except json.decoder.JSONDecodeError:
                print("WARNING: JSONDecodeError")
                continue

            app_id = int(
                filename.replace("oa_", "").replace(".json", "").replace("(1)", ""))
            try:
                row = office_actions.loc[app_id]
            except KeyError:
                print("WARNING: KeyError")
                continue

            try:
                n = int(row.rejection_102)
                o = int(row.rejection_103)
            except TypeError:
                n = int(row.rejection_102.iloc[0])
                o = int(row.rejection_103.iloc[0])

            if n == 0 and o == 0:
                rejType = 0  # neither
            elif n == 0 and o == 1:
                rejType = 1  # obvious
            elif n == 1 and o == 0:
                rejType = 0  # novelty
            elif n == 1 and o == 1:
                rejType = 1  # both
            else:
                print("Office actions dataframe error:", sys.exc_info()[0])
                raise

            if obviousCount >= notCount and rejType == 1:
                continue
            obviousCount += o
            notCount += not (o)

            # Skip any files not in the appropriate IPC class
            try:
                found_A61 = False
                for s in parsed_json[0]['ipc_classes']:
                    if (s.find("A61") != -1):
                        found_A61 = True
                if not found_A61:
                    continue
            except:
                print("WARNING: file " + filepath + " is empty!\n")
                continue

            # Read in data from json file if it exists
            try:
                a = parsed_json[0]['abstract_full']
                i = parsed_json[0]['application_number']
            except IndexError:
                print("WARNING: file " + filepath + " is empty!\n")
                continue
            except KeyError:
                print("WARNING: file " + filepath + " is empty!\n")
                continue

            abstractList.append(a)
            idList.append(i)
            rejectionColumn.append(rejType)
            count += 1
            # if count > 2000: break

        df = pd.DataFrame({'text': abstractList, 'label': rejectionColumn}, index=idList)
        print("{} files loaded".format(count))
        df.to_pickle('./data_cache/abstracts_df_A61.pkl')

        # with open("data_cache/TEXT.Field", "wb") as f:
        #     dill.dump(TEXT, f)
        # with open("data_cache/LABEL.Field", "wb") as f:
        #     dill.dump(LABEL, f)

    else:
        print('Loading Dataset from Cache')
        df = pd.read_pickle('./data_cache/abstracts_df_A61.pkl')
        # with open("data_cache/TEXT.Field", "rb") as f:
        #     TEXT = dill.load(f)
        # with open("data_cache/LABEL.Field", "rb") as f:
        #     LABEL = dill.load(f)

    tokenize = lambda x: x.split()
    TEXT = Field(sequential=True, tokenize=tokenize, lower=True,
                 include_lengths=True, batch_first=True, fix_length=200)
    LABEL = LabelField(sequential=False)

    fields = {'text': TEXT, 'label': LABEL}
    ds = DataFrameDataset(df, fields)

    TEXT.build_vocab(ds, vectors=GloVe(name='6B', dim=300))
    LABEL.build_vocab(ds)

    train_data, test_data = ds.split()
    # Further split the training data to create new training and validation sets
    train_data, valid_data = train_data.split()

    word_embeddings = TEXT.vocab.vectors
    print("Length of Text Vocabulary: " + str(len(TEXT.vocab)))
    print("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size())
    print("Label Length: " + str(len(LABEL.vocab)))

    train_iter, valid_iter, test_iter = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=batch_size,
        sort_key=lambda x: len(x.text),
        repeat=False,
        shuffle=True)

    vocab_size = len(TEXT.vocab)

    return TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter
def test_single_gpu_batch_parse():
    trainer = Trainer(gpus=1)

    # non-transferrable types
    primitive_objects = [None, {}, [], 1.0, "x", [None, 2], {"x": (1, 2), "y": None}]
    for batch in primitive_objects:
        data = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0'))
        assert data == batch

    # batch is just a tensor
    batch = torch.rand(2, 3)
    batch = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0'))
    assert batch.device.index == 0 and batch.type() == 'torch.cuda.FloatTensor'

    # tensor list
    batch = [torch.rand(2, 3), torch.rand(2, 3)]
    batch = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0'))
    assert batch[0].device.index == 0 and batch[0].type() == 'torch.cuda.FloatTensor'
    assert batch[1].device.index == 0 and batch[1].type() == 'torch.cuda.FloatTensor'

    # tensor list of lists
    batch = [[torch.rand(2, 3), torch.rand(2, 3)]]
    batch = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0'))
    assert batch[0][0].device.index == 0 and batch[0][0].type() == 'torch.cuda.FloatTensor'
    assert batch[0][1].device.index == 0 and batch[0][1].type() == 'torch.cuda.FloatTensor'

    # tensor dict
    batch = [{'a': torch.rand(2, 3), 'b': torch.rand(2, 3)}]
    batch = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0'))
    assert batch[0]['a'].device.index == 0 and batch[0]['a'].type() == 'torch.cuda.FloatTensor'
    assert batch[0]['b'].device.index == 0 and batch[0]['b'].type() == 'torch.cuda.FloatTensor'

    # tuple of tensor list and list of tensor dict
    batch = ([torch.rand(2, 3) for _ in range(2)],
             [{'a': torch.rand(2, 3), 'b': torch.rand(2, 3)} for _ in range(2)])
    batch = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0'))
    assert batch[0][0].device.index == 0 and batch[0][0].type() == 'torch.cuda.FloatTensor'

    assert batch[1][0]['a'].device.index == 0
    assert batch[1][0]['a'].type() == 'torch.cuda.FloatTensor'

    assert batch[1][0]['b'].device.index == 0
    assert batch[1][0]['b'].type() == 'torch.cuda.FloatTensor'

    # namedtuple of tensor
    BatchType = namedtuple('BatchType', ['a', 'b'])
    batch = [BatchType(a=torch.rand(2, 3), b=torch.rand(2, 3)) for _ in range(2)]
    batch = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0'))
    assert batch[0].a.device.index == 0
    assert batch[0].a.type() == 'torch.cuda.FloatTensor'

    # non-Tensor that has `.to()` defined
    class CustomBatchType:

        def __init__(self):
            self.a = torch.rand(2, 2)

        def to(self, *args, **kwargs):
            self.a = self.a.to(*args, **kwargs)
            return self

    batch = trainer.accelerator.batch_to_device(CustomBatchType(), torch.device('cuda:0'))
    assert batch.a.type() == 'torch.cuda.FloatTensor'

    # torchtext.data.Batch
    samples = [{
        'text': 'PyTorch Lightning is awesome!',
        'label': 0
    }, {
        'text': 'Please make it work with torchtext',
        'label': 1
    }]

    text_field = Field()
    label_field = LabelField()
    fields = {'text': ('text', text_field), 'label': ('label', label_field)}

    examples = [Example.fromdict(sample, fields) for sample in samples]
    dataset = Dataset(examples=examples, fields=fields.values())

    # Batch runs field.process(), which numericalizes tokens, but that requires
    # the vocabulary to be built first
    text_field.build_vocab(dataset)
    label_field.build_vocab(dataset)

    batch = Batch(data=examples, dataset=dataset)
    batch = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0'))

    assert batch.text.type() == 'torch.cuda.LongTensor'
    assert batch.label.type() == 'torch.cuda.LongTensor'
    tensor = tensor.unsqueeze(0).to(device)
    output = model(tensor)
    output = F.softmax(output, dim=-1)
    print(output)


if __name__ == "__main__":
    text_field = Field(use_vocab=False,
                       tokenize=tokenize_and_trunc,
                       preprocessing=tokenizer.convert_tokens_to_ids,
                       batch_first=True,
                       init_token=init_token_idx,
                       eos_token=eos_token_idx,
                       pad_token=pad_token_idx,
                       unk_token=unk_token_idx)
    label_field = LabelField()

    train_data, test_data = IMDB.splits(text_field, label_field)
    train_data, valid_data = train_data.split()
    label_field.build_vocab(train_data)

    n_epochs = 5
    batch_size = 128
    rnn_hidden_size = 256
    dropout_p = 0.2
    num_classes = len(label_field.vocab)

    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    model = BertGRU(bert.config.to_dict()['dim'],
                    rnn_hidden_size,
                    num_classes=num_classes,
                    dropout_p=dropout_p)