# torchtext < 0.9 imports; newer releases moved these under torchtext.legacy
import gensim
import torch
from collections import Counter
from torchtext.data import Field, NestedField, BucketIterator
from torchtext.datasets import SequenceTaggingDataset
from torchtext.vocab import Vocab


class CharCorpus(object):

    def __init__(self, input_folder, min_word_freq, batch_size, wv_file=None):
        # list all the fields
        self.word_field = Field(lower=True)  # [sent len, batch_size]
        self.tag_field = Field(unk_token=None)  # [sent len, batch_size]
        ### BEGIN MODIFIED SECTION: CHARACTER EMBEDDING ###
        self.char_nesting_field = Field(tokenize=list)
        self.char_field = NestedField(
            self.char_nesting_field)  # [batch_size, sent len, word len]
        # create dataset using built-in parser from torchtext
        self.train_dataset, self.test_dataset = SequenceTaggingDataset.splits(
            path=input_folder,
            train="train.txt",
            test="test.txt",
            fields=((("word", "char"), (self.word_field, self.char_field)),
                    ("tag", self.tag_field)))
        ### END MODIFIED SECTION ###
        # convert fields to vocabulary list
        if wv_file:
            self.wv_model = gensim.models.word2vec.Word2Vec.load(wv_file)
            self.embedding_dim = self.wv_model.vector_size
            word_freq = {
                word: self.wv_model.wv.vocab[word].count
                for word in self.wv_model.wv.vocab
            }
            word_counter = Counter(word_freq)
            self.word_field.vocab = Vocab(word_counter, min_freq=min_word_freq)
            vectors = []
            for word, idx in self.word_field.vocab.stoi.items():
                if word in self.wv_model.wv.vocab.keys():
                    vectors.append(
                        torch.as_tensor(self.wv_model.wv[word].tolist()))
                else:
                    vectors.append(torch.zeros(self.embedding_dim))
            self.word_field.vocab.set_vectors(stoi=self.word_field.vocab.stoi,
                                              vectors=vectors,
                                              dim=self.embedding_dim)
        else:
            self.word_field.build_vocab(self.train_dataset.word,
                                        min_freq=min_word_freq)
        # build vocab for tags and characters
        self.char_field.build_vocab(self.train_dataset.char)  # NEWLY ADDED
        self.tag_field.build_vocab(self.train_dataset.tag)
        # create iterator for batch input
        self.train_iter, self.test_iter = BucketIterator.splits(
            datasets=(self.train_dataset, self.test_dataset),
            batch_size=batch_size)
        # prepare padding indices to be ignored during model training/evaluation
        self.word_pad_idx = self.word_field.vocab.stoi[self.word_field.pad_token]
        self.char_pad_idx = self.char_field.vocab.stoi[self.char_field.pad_token]  # NEWLY ADDED
        self.tag_pad_idx = self.tag_field.vocab.stoi[self.tag_field.pad_token]
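A minimal usage sketch for CharCorpus; the ./data folder (with CoNLL-style train.txt and test.txt) and the hyperparameters are illustrative assumptions, not part of the original code:

corpus = CharCorpus(input_folder="./data", min_word_freq=2, batch_size=64)
for batch in corpus.train_iter:
    # batch.word: [sent len, batch size]; batch.char: [batch size, sent len, word len]
    words, chars, tags = batch.word, batch.char, batch.tag
    break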
import torchtext
from torchtext.data import Field, NestedField

# assumed module-level constants for the special tokens referenced below
PAD_WORD = '<pad>'
SOS_WORD = '<sos>'
EOS_WORD = '<eos>'


def gen_language_model_corpus(dataset_cls: torchtext.datasets.LanguageModelingDataset):
    field_char = NestedField(
        Field(pad_token=PAD_WORD,
              tokenize=list,
              init_token=SOS_WORD,
              eos_token=EOS_WORD,
              batch_first=True),
        pad_token=PAD_WORD,
    )
    field_word = Field(batch_first=True)
    # build parallel char-level and word-level views of the same splits
    dataset_char = dataset_cls.splits(field_char)
    dataset_word = dataset_cls.splits(field_word)
    field_char.build_vocab(dataset_char[0])
    field_word.build_vocab(dataset_word[0])
    return [_ for _ in zip(dataset_word, dataset_char)], field_word, field_char
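A hedged usage sketch: in legacy torchtext, WikiText2 subclasses LanguageModelingDataset, so it can serve as dataset_cls (the corpus is downloaded on first use):

datasets, field_word, field_char = gen_language_model_corpus(torchtext.datasets.WikiText2)
train_word, train_char = datasets[0]  # paired word-level and char-level views of the train split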
import torch
from torchtext.data import Field, NestedField, TabularDataset, Iterator
from torchtext.vocab import GloVe


def load_dataset(config, device):
    label_dict = {"observing": 0, "against": 1, "for": 2}
    LABEL = Field(use_vocab=False, sequential=False, dtype=torch.long,
                  preprocessing=lambda x: label_dict[x.strip()])
    SEQ = Field(dtype=torch.long, lower=True, batch_first=True,
                preprocessing=lambda x: x[:45], include_lengths=True)
    SENT = Field(dtype=torch.long, lower=True, batch_first=True,
                 preprocessing=lambda x: x[:45], include_lengths=False)
    DOC = NestedField(SENT, tokenize=lambda s: s.strip().split(' </s> '),
                      preprocessing=lambda s: [x for x in s[:45] if x],
                      dtype=torch.long, include_lengths=True)
    fields = [('label', LABEL), ('claim', SEQ), ('hline', SEQ),
              ('abst', SEQ), ('body', DOC)]
    train, test = TabularDataset.splits(path="../stance_data/", format="tsv",
                                        fields=fields,
                                        train=config.train_file,
                                        test=config.test_file)
    train, val = train.split(split_ratio=0.80)
    vectors = GloVe(name="6B", dim=config.embed_dim,
                    cache='/users4/jwduan/vectors/')
    DOC.build_vocab(train, val, test, vectors=vectors)
    # share the document-level vocabulary with the sentence-level field
    SEQ.build_vocab()
    SEQ.vocab = DOC.vocab
    config.vocab_size = len(DOC.vocab)
    train_loader, val_loader, test_loader = Iterator.splits(
        (train, val, test),
        batch_sizes=(config.batch_size, 256, 256),
        sort_key=lambda x: len(x.body),
        sort=True, device=device, shuffle=True, repeat=False)
    return (train_loader, val_loader, test_loader), DOC.vocab.vectors
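A hedged usage sketch for load_dataset: it only reads a handful of attributes off config (and assigns vocab_size back to it), so a SimpleNamespace can stand in for the real config object. The file names are illustrative, and embed_dim must be a GloVe 6B size (50, 100, 200, or 300):

from types import SimpleNamespace

config = SimpleNamespace(train_file="train.tsv", test_file="test.tsv",
                         embed_dim=100, batch_size=32)
loaders, pretrained = load_dataset(config, torch.device("cpu"))
train_loader, val_loader, test_loader = loaders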
def __init__(self, glove=True, device=device):
    self.device = device
    nlp = spacy.load("en_core_web_sm")
    char_nesting = Field(batch_first=True, tokenize=list, lower=True)
    char = NestedField(char_nesting, init_token="<sos>", eos_token="<eos>",
                       tokenize="spacy")
    word = Field(init_token="<sos>", eos_token="<eos>", lower=True,
                 tokenize="spacy")
    label = Field(sequential=False, is_target=True, use_vocab=False)
    self.fields = [("question_char", char), ("question_word", word),
                   ("context_char", char), ("context_word", word),
                   ("answer", label)]
    self.dict_fields = {
        "question": [("question_char", char), ("question_word", word)],
        "context": [("context_char", char), ("context_word", word)],
        "answer": ("answer", label)
    }
    self.train_data = self._get_data("../data/train.jsonl")
    self.dev_data = self._get_data("../data/dev.jsonl")
    char.build_vocab(self.train_data)
    if glove:
        word.build_vocab(self.train_data, vectors=GloVe(name="6B", dim=100))
    else:
        word.build_vocab(self.train_data,
                         vectors=FastText(language='en', max_vectors=30000))
    self.char_vocab = char.vocab
    self.word_vocab = word.vocab
    pos = []
    ner = []
    ind2pos = []
    ind2ner = []
    for data in tqdm(self.train_data):
        doc = nlp(' '.join(data.question_word + data.context_word))
        # t - token
        pos.extend([t.pos_ for t in doc])
        ner.extend([t.label_ for t in doc.ents])
        ind2pos.extend([[self.word_vocab.stoi[str(t)], t.pos_] for t in doc])
        ind2ner.extend([[self.word_vocab.stoi[str(t)], t.label_]
                        for t in doc.ents])
    self.pos_voc = {tag: i for i, tag in enumerate(set(pos))}
    self.ner_voc = {tag: i + 1 for i, tag in enumerate(set(ner))}
    self.ner_voc['None'] = 0
    # default values, used in DrQA model
    self.ind2pos = defaultdict(lambda: self.pos_voc['X'])  # returns 14
    self.ind2ner = defaultdict(lambda: self.ner_voc['None'])  # returns 0
    self.ind2pos.update({tag[0]: self.pos_voc[tag[1]] for tag in ind2pos})
    self.ind2ner.update({tag[0]: self.ner_voc[tag[1]] for tag in ind2ner})
import torch
from torchtext.data import Field, NestedField, BucketIterator
from torchtext.datasets import SequenceTaggingDataset, UDPOS, CoNLL2000Chunking


class Corpus(object):

    def __init__(self, data_path, vector_path, glove6b, embedding_dim,
                 min_word_freq, max_vocab_size, batch_size, device, test, prefix):
        '''
        class for interacting with dataset
        data_path: root path for dataset directory
        vector_path: path for vector_cache
        glove6b: switch for using glove.6b pretrained embeddings
        embedding_dim: dimension of embedding (50, 100, 200, or 300 for glove.6b)
        min_word_freq: ignore words that don't meet the frequency threshold in the text field
        max_vocab_size: maximum size of the vocabulary of the text field
        batch_size: batch size for data iterators
        device: torch device
        test: switch for using a built-in torchtext dataset instead of local tsv files
        prefix: prefix to be appended to data path
        '''
        # set all of the attributes
        self.data_path, self.vector_path, self.glove6b = data_path, vector_path, glove6b
        self.embedding_dim, self.min_word_freq, self.max_vocab_size = embedding_dim, min_word_freq, max_vocab_size
        self.batch_size = batch_size
        self.device, self.test, self.prefix = device, test, prefix
        # initialize text and tag fields
        self.initialize_fields()
        # load dataset
        self.load_data()
        # build vocabularies from text and tag data
        self.build_vocabularies()
        # build iterators for batches of train, dev, and test sets
        self.initialize_iterators()
        # initialize indices of padding and unknown tokens
        self.init_idxs()

    def initialize_fields(self):
        ''' initializes fields '''
        # initialize the text field with the spacy tokenizer and no casing
        self.text_field = Field(tokenize='spacy', lower=True, batch_first=True)
        # initialize the tag field without an unknown token
        # (hopefully the train set contains all of the tags)
        self.tag_field = Field(unk_token=None, batch_first=True)
        # initialize the character field
        char_nesting_field = Field(tokenize=list, batch_first=True)
        self.char_field = NestedField(char_nesting_field)
        self.pad_token = self.text_field.pad_token

    def load_data(self):
        ''' load data from file using torchtext '''
        if self.test:
            # built-in datasets
            if self.prefix == 'udpos':
                self.train_set, self.valid_set, self.test_set = UDPOS.splits(
                    fields=((('text', 'char'), (self.text_field, self.char_field)),
                            ('tag', self.tag_field), ('pos', None)),
                    root=self.data_path)
            if self.prefix == 'conll2000':
                self.train_set, self.valid_set, self.test_set = CoNLL2000Chunking.splits(
                    fields=((('text', 'char'), (self.text_field, self.char_field)),
                            ('pos', None), ('tag', self.tag_field)),
                    root=self.data_path)
        else:
            # load datasets from pre-prepared tsv files
            self.train_set, self.valid_set, self.test_set = SequenceTaggingDataset.splits(
                fields=((('text', 'char'), (self.text_field, self.char_field)),
                        ('tag', self.tag_field)),
                path=self.data_path + '/{}'.format(self.prefix),
                train='train.tsv', validation='dev.tsv', test='test.tsv')

    def build_vocabularies(self):
        ''' builds vocabularies for the text and tag data '''
        # if a vector path is provided, make sure the word vectors are handled
        if self.vector_path:
            if self.glove6b:
                # handling glove.6b vectors is built into torchtext
                self.text_field.build_vocab(self.train_set.text,
                                            max_size=self.max_vocab_size,
                                            min_freq=self.min_word_freq,
                                            vectors='glove.6B.{}d'.format(self.embedding_dim),
                                            vectors_cache=self.vector_path,
                                            unk_init=torch.Tensor.normal_)
            else:
                # not sure if this is working
                self.text_field.build_vocab(self.train_set.text,
                                            max_size=self.max_vocab_size,
                                            min_freq=self.min_word_freq,
                                            vectors_cache=self.vector_path)
                ###########################################################################
                # not currently working due to conflict between gensim and python version #
                ###########################################################################
                # self.wv_model = gensim.models.word2vec.Word2Vec.load(wv_file)
                # self.embedding_dim = self.wv_model.vector_size
                # word_freq = {word: self.wv_model.wv.vocab[word].count for word in self.wv_model.wv.vocab}
                # word_counter = Counter(word_freq)
                # self.word_field.vocab = Vocab(word_counter, min_freq=min_word_freq)
                # vectors = []
                # for word, idx in self.word_field.vocab.stoi.items():
                #     if word in self.wv_model.wv.vocab.keys():
                #         vectors.append(torch.as_tensor(self.wv_model.wv[word].tolist()))
                #     else:
                #         vectors.append(torch.zeros(self.embedding_dim))
                # self.word_field.vocab.set_vectors(stoi=self.word_field.vocab.stoi, vectors=vectors, dim=self.embedding_dim)
        else:
            # no vectors required
            self.text_field.build_vocab(self.train_set.text,
                                        max_size=self.max_vocab_size,
                                        min_freq=self.min_word_freq)
        # build vocabularies for the characters and tags (nothing fancy needed)
        self.char_field.build_vocab(self.train_set.char)
        self.tag_field.build_vocab(self.train_set.tag)

    def initialize_iterators(self):
        ''' build iterators for data (by batches) using the bucket iterator '''
        # note: the original call also passed random_state=seed, which the
        # legacy BucketIterator does not accept (it raises a TypeError)
        self.train_iter, self.valid_iter, self.test_iter = BucketIterator.splits(
            datasets=(self.train_set, self.valid_set, self.test_set),
            batch_size=self.batch_size,
            device=self.device)

    def init_idxs(self):
        ''' saves indices for padding and unknown tokens '''
        self.text_pad_idx = self.text_field.vocab.stoi[self.text_field.pad_token]
        self.char_pad_idx = self.char_field.vocab.stoi[self.char_field.pad_token]
        self.tag_pad_idx = self.tag_field.vocab.stoi[self.tag_field.pad_token]
        self.text_unk_idx = self.text_field.vocab.stoi[self.text_field.unk_token]
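A hedged construction example for this Corpus, using the built-in UDPOS split; the paths and sizes are illustrative (the dataset and vectors download on first use):

corpus = Corpus(data_path="./data", vector_path="./.vector_cache",
                glove6b=True, embedding_dim=100, min_word_freq=1,
                max_vocab_size=25000, batch_size=64,
                device=torch.device("cpu"), test=True, prefix="udpos")
print(len(corpus.text_field.vocab), corpus.tag_pad_idx)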
import gensim
import torch
from collections import Counter
from torchtext.data import Field, NestedField, BucketIterator
from torchtext.datasets import SequenceTaggingDataset
from torchtext.vocab import Vocab


class Corpus(object):

    def __init__(self, args):
        # list all the fields
        self.word_field = Field(lower=True)
        self.event_field = Field(unk_token=None)
        self.entity_field = Field(unk_token=None)
        self.argument_field = Field(unk_token=None)
        self.trigger_pos_field = Field(unk_token=None)
        self.char_nesting_field = Field(tokenize=list)
        self.char_field = NestedField(self.char_nesting_field)
        self.wv = args.wv_file
        # create dataset using built-in parser from torchtext
        self.train_dataset, self.val_dataset, self.test_dataset = SequenceTaggingDataset.splits(
            path=args.input_folder,
            train="train.txt",
            validation="dev.txt",
            test="test.txt",
            fields=((("word", "char"), (self.word_field, self.char_field)),
                    ("event", self.event_field),
                    ("entity", self.entity_field),
                    ("argument", self.argument_field),
                    ("trigger_pos", self.trigger_pos_field)),
        )
        # convert fields to vocabulary list
        # self.word_field.build_vocab(self.train_dataset.word, min_freq=min_word_freq)
        self.event_field.build_vocab(self.train_dataset.event)
        if args.wv_file:
            print("start loading embedding")
            self.wv_model = gensim.models.KeyedVectors.load_word2vec_format(
                args.wv_file, binary=True)
            print("done loading embedding")
            self.embedding_dim = self.wv_model.vector_size
            word_freq = {
                word: self.wv_model.wv.vocab[word].count
                for word in self.wv_model.wv.vocab
            }
            word_counter = Counter(word_freq)
            self.word_field.vocab = Vocab(word_counter,
                                          min_freq=args.min_word_freq)
            # map each vector/embedding from the word2vec model to the word_field vocab
            vectors = []
            print("start loading vec", len(self.word_field.vocab.stoi))
            for word, idx in self.word_field.vocab.stoi.items():
                if word in self.wv_model.wv.vocab.keys():
                    vectors.append(
                        torch.as_tensor(self.wv_model.wv[word].tolist()))
                else:
                    vectors.append(torch.zeros(self.embedding_dim))
            print("done loading vec")
            del self.wv_model
            self.word_field.vocab.set_vectors(
                stoi=self.word_field.vocab.stoi,
                # list of vector embeddings, ordered according to word_field.vocab
                vectors=vectors,
                dim=self.embedding_dim)
        else:
            self.word_field.build_vocab(self.train_dataset.word,
                                        min_freq=args.min_word_freq)
        self.char_field.build_vocab(self.train_dataset.char)
        self.entity_field.build_vocab(self.train_dataset.entity)
        self.argument_field.build_vocab(self.train_dataset.argument)
        self.trigger_pos_field.build_vocab(self.train_dataset.trigger_pos)
        # create iterator for batch input
        self.train_iter, self.val_iter, self.test_iter = BucketIterator.splits(
            datasets=(self.train_dataset, self.val_dataset, self.test_dataset),
            batch_size=args.batch_size,
            shuffle=False,
        )
        # prepare padding indices to be ignored during model training/evaluation
        self.word_pad_idx = self.word_field.vocab.stoi[self.word_field.pad_token]
        self.event_pad_idx = self.event_field.vocab.stoi[self.event_field.pad_token]
        self.char_pad_idx = self.char_field.vocab.stoi[self.char_field.pad_token]
        self.entity_pad_idx = self.entity_field.vocab.stoi[self.entity_field.pad_token]
        # fixed: the original read the argument padding index from entity_field
        self.argument_pad_idx = self.argument_field.vocab.stoi[self.argument_field.pad_token]
import json
import os
from nltk.tokenize import sent_tokenize
from torchtext.data import (Dataset, Example, Field, LabelField, NestedField,
                            RawField)
from torchtext.vocab import FastText


def load_data(train_file, test_file, pretrain=None, save_dir=None):
    assert os.path.exists(train_file), f"{train_file} does not exist!"
    assert os.path.exists(test_file), f"{test_file} does not exist!"
    print("=" * 30 + "DATASET LOADER" + "=" * 30)
    sent_field = Field(tokenize=lambda x: x.split(), unk_token='<unk>',
                       pad_token='<pad>', init_token=None, eos_token=None)
    doc_field = NestedField(sent_field, tokenize=sent_tokenize,
                            pad_token='<pad>', init_token=None,
                            eos_token=None, include_lengths=True)
    label_field = LabelField()
    fields = [("raw", RawField()), ("doc", doc_field), ("label", label_field)]

    print(f"Reading {train_file} ...")
    with open(train_file, "r", encoding="utf-8") as reader:
        lines = reader.readlines()
    examples = []
    for line in lines:
        text, label = line.split('\t')
        # strip the trailing newline so labels don't pick up '\n'
        examples.append(Example.fromlist([text, text.lower(), label.strip()], fields))
    train_dataset = Dataset(examples, fields)
    print(f"\tNum of train examples: {len(examples)}")

    print(f"Reading {test_file} ...")
    with open(test_file, "r", encoding="utf-8") as reader:
        lines = reader.readlines()
    examples = []
    for line in lines:
        text, label = line.split('\t')
        examples.append(Example.fromlist([text, text.lower(), label.strip()], fields))
    test_dataset = Dataset(examples, fields)
    print(f"\tNum of test examples: {len(examples)}")

    print("Building vocabulary ...")
    vectors = FastText('vi')
    doc_field.build_vocab(train_dataset, test_dataset, vectors=vectors)
    label_field.build_vocab(train_dataset, test_dataset)
    num_vocab = len(doc_field.vocab)
    num_classes = len(label_field.vocab)
    pad_idx = doc_field.vocab.stoi['<pad>']
    print(f"\tNum of vocabulary: {num_vocab}")
    print(f"\tNum of classes: {num_classes}")

    if save_dir:
        with open(save_dir + "/vocab.json", "w", encoding="utf-8") as fv:
            vocabs = {"word": doc_field.vocab.stoi,
                      "class": label_field.vocab.itos,
                      'pad_idx': pad_idx}
            json.dump(vocabs, fv)
        with open(save_dir + "/fields.json", "w", encoding="utf-8") as ff:
            field_vocabs = {"doc": doc_field.vocab.freqs,
                            "label": label_field.vocab.freqs}
            json.dump(field_vocabs, ff)
    print("=" * 73)
    return train_dataset, test_dataset, num_vocab, num_classes, pad_idx, vectors.vectors
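A minimal usage sketch, assuming hypothetical tab-separated files with one document<TAB>label pair per line (the Vietnamese FastText vectors download on first use):

train_ds, test_ds, num_vocab, num_classes, pad_idx, pretrained = load_data(
    "train.tsv", "test.tsv", save_dir=".")
print(num_vocab, num_classes, pad_idx)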
import logging
import typing
import torch
from torchtext.data import Dataset, Example, Field, NestedField
# GroupedBucketIterator is a project-local helper, not part of torchtext


class Data(object):
    WORDS_NAME = "words"
    LAB_NAME = "lab"
    CHAR_NAME = "char"

    def __init__(
        self,
        train_path: str,
        unlabeled_path: str,
        semi_supervised: bool,
        dev_path: str = None,
        test_path: str = None,
        batch_size: int = 32,
        device: object = None,
        logger: typing.Optional[logging.Logger] = None,
    ) -> None:
        if logger is None:
            logger = logging.getLogger(__name__)
            logger.setLevel(logging.INFO)
            handler = logging.StreamHandler()
            handler.setFormatter(
                logging.Formatter('%(levelname)s - %(name)s - %(message)s'))
            logger.addHandler(handler)
        self.train_path = train_path
        self.dev_path = dev_path
        self.test_path = test_path
        self.unlabeled_path = unlabeled_path
        self.batch_size = batch_size
        self.semi_supervised = semi_supervised
        self.device = device
        self.logger = logger

    def initialize(self):
        ## initialize fields and create datasets ##
        self._init_fields()
        self._read_sentences()
        self.train = self._make_bucket_iterator(self._make_dataset(False),
                                                batch_size=self.batch_size,
                                                device=self.device)
        self.dev = self._make_bucket_iterator(self._make_dataset(False, which="dev"),
                                              batch_size=self.batch_size,
                                              device=self.device)
        self.test = self._make_bucket_iterator(self._make_dataset(False, which="test"),
                                               batch_size=self.batch_size,
                                               device=self.device)
        # self.unlabeled_train = self._make_bucket_iterator(self._make_dataset(True),
        #     batch_size=self.batch_size, device=self.device)
        self.unlabeled_data = self._make_dataset(True)
        self._build_vocabularies()

    def _read_sentences(self):
        self.train_sentences = []
        with open(self.train_path) as f:
            for line in f:
                self.train_sentences.append(line.replace("\n", ""))
        self.logger.info('{} train sentences successfully read'.format(
            len(self.train_sentences)))
        self.dev_sentences = []
        with open(self.dev_path) as f:
            for line in f:
                self.dev_sentences.append(line.replace("\n", ""))
        self.logger.info('{} dev sentences successfully read'.format(
            len(self.dev_sentences)))
        self.unlabeled_sentences = []
        temp = []
        with open(self.unlabeled_path) as f:
            for line in f:
                sen_len = len(line.split())
                if sen_len > 0 and sen_len <= 20:
                    temp.append(line.replace("\n", ""))
        # self.unlabeled_sentences = random.sample(temp, 101420)
        self.unlabeled_sentences = temp
        self.logger.info('{} unlabeled sentences successfully read'.format(
            len(self.unlabeled_sentences)))
        self.test_sentences = []
        with open(self.test_path) as f:
            for line in f:
                self.test_sentences.append(line.replace("\n", ""))
        # fixed: the original logged len(self.train_sentences) here
        self.logger.info('{} test sentences successfully read'.format(
            len(self.test_sentences)))

    def _init_fields(self):
        self.words = Field(batch_first=True, init_token='<s>', eos_token='</s>')
        self.lab = Field(batch_first=True, unk_token=None, pad_token=None)
        # self.char = NestedField(Field(batch_first=True, tokenize=list, unk_token='<cunk>'),
        #                         init_token='<s>', eos_token='</s>')
        self.char = NestedField(Field(batch_first=True,
                                      tokenize=list,
                                      unk_token='<cunk>',
                                      init_token='<w>',
                                      eos_token='</w>'),
                                init_token='<s>',
                                eos_token='</s>')
        self.labeled_fields = [(self.WORDS_NAME, self.words),
                               (self.CHAR_NAME, self.char),
                               (self.LAB_NAME, self.lab)]
        self.unlabeled_fields = [(self.WORDS_NAME, self.words),
                                 (self.CHAR_NAME, self.char)]
        self.logger.info('fields initialized successfully')

    def _make_dataset(self, unlabeled, which=None) -> Dataset:
        if not unlabeled:
            sentences = self.train_sentences
            if which == "dev":
                sentences = self.dev_sentences
            elif which == "test":
                sentences = self.test_sentences
            examples = [self._make_example(s) for s in sentences]
            return Dataset(examples, self.labeled_fields)
        else:
            sentences = self.unlabeled_sentences
            examples = [self._make_example_unlabeled(s) for s in sentences]
            return Dataset(examples, self.unlabeled_fields)

    def _make_example(self, sent) -> Example:
        cols = sent.split("\t")
        words = cols[0].split()
        tags = cols[1].split()
        # words feed both the word-level and char-level fields
        return Example.fromlist([words, words, tags], self.labeled_fields)

    def _make_example_unlabeled(self, sent) -> Example:
        words = sent.split()
        return Example.fromlist([words, words], self.unlabeled_fields)

    def _make_bucket_iterator(self, data, batch_size=32, device=None):
        # return BucketIterator(
        #     dataset=data, batch_size=batch_size,
        #     sort=False, sort_within_batch=True,
        #     sort_key=lambda x: len(x.words),
        #     device=device, repeat=False)
        return GroupedBucketIterator(data, batch_size,
                                     lambda ex: len(ex.words),
                                     device=device)

    def _build_vocabularies(self):
        self.words.build_vocab(self.train.dataset)
        self.lab.build_vocab(self.train.dataset)
        self.char.build_vocab(self.train.dataset)
        self.num_words = len(self.words.vocab)
        self.num_tags = len(self.lab.vocab)
        self.num_char = len(self.char.vocab)
        self.logger.info(
            'Found %d words, %d chars, and %d tags for both the labeled and unlabeled datasets',
            self.num_words, self.num_char, self.num_tags)

    def _get_unlabeled_sentences(self):
        while True:
            for us in self.unlabeled_sentences:
                yield us

    def _get_unlabeled_examples(self):
        lines = []
        for words in self._get_unlabeled_sentences():
            lines.append(words)
            if len(lines) >= 10142:
                yield [self._make_example_unlabeled(line) for line in lines]
                lines = []

    def _endless_unlabeled(self):
        for ex in self._get_unlabeled_examples():
            unlabeled_iterator = self._make_bucket_iterator(
                Dataset(ex, self.unlabeled_fields),
                batch_size=self.batch_size,
                device=self.device)
            yield unlabeled_iterator
            del unlabeled_iterator
            torch.cuda.empty_cache()

    def _endless_minibatch(self, data):
        while True:
            for i, batch in enumerate(data):
                yield batch

    def get_alternating_minibatch(self):
        while True:
            for unlabeled_iter in self._endless_unlabeled():
                for mb in unlabeled_iter:
                    yield next(self._endless_minibatch(self.train)), "labeled"
                    if self.semi_supervised:
                        yield mb, "unlabeled"

    def get_input_sizes(self):
        return self.num_words, self.num_char, self.num_tags

    def get_pad_token_id(self):
        return self.char.vocab.stoi[self.char.pad_token]

    def get_unk_token_id(self):
        return self.char.vocab.stoi[self.char.unk_token]

    def get_train_sentences_length(self):
        return len(self.train_sentences)
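A minimal usage sketch for Data; the file paths are hypothetical (labeled files hold words<TAB>tags lines, the unlabeled file holds raw sentences), and GroupedBucketIterator is assumed importable from the surrounding project:

data = Data(train_path="train.tsv", unlabeled_path="unlabeled.txt",
            semi_supervised=True, dev_path="dev.tsv", test_path="test.tsv",
            batch_size=32)
data.initialize()
num_words, num_char, num_tags = data.get_input_sizes()
for batch, kind in data.get_alternating_minibatch():
    # kind alternates between "labeled" and "unlabeled" minibatches
    break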