def create_dataset(path_to_dataset, batch_size, split_ratio=0.7, min_vocab_freq=10, max_vocab_size=4000):
    text_field = Field(tokenize="spacy", tokenizer_language="en", batch_first=True,
                       init_token="<sos>", eos_token="<eos>", lower=True)

    def transform(caption):
        caption = caption.strip().lower().split()
        return caption

    dataset = CocoCaptions(annFile=os.path.join(path_to_dataset, "captions_train2014.json"),
                           text_field=text_field, transform=transform)
    train, val = dataset.split(split_ratio=split_ratio)
    test = CocoCaptions(annFile=os.path.join(path_to_dataset, "captions_val2014.json"),
                        text_field=text_field, transform=transform)

    print("Dataset loaded")
    print("Train set size:", len(train))

    text_field.build_vocab(dataset.text, min_freq=min_vocab_freq, max_size=max_vocab_size)
    SOS_TOKEN = text_field.vocab.stoi['<sos>']
    EOS_TOKEN = text_field.vocab.stoi['<eos>']
    UNK_TOKEN = text_field.vocab.stoi['<unk>']
    PAD_TOKEN = text_field.vocab.stoi['<pad>']

    print("Vocabulary built")
    print("Vocabulary statistics")
    print("\nMost common words in the vocabulary:\n", text_field.vocab.freqs.most_common(10))
    print("Size of the vocabulary:", len(text_field.vocab))
    print("Max sequence length:", dataset.max_seq_len)

    train_iter, val_iter = BucketIterator.splits((train, val), repeat=False, batch_size=batch_size)
    test_iter = BucketIterator(test, batch_size=batch_size, repeat=False, train=False)

    vocab_dict = text_field.vocab.stoi
    return {"data_iters": (train_iter, val_iter, test_iter),
            "fields": text_field,
            "word_to_num_vocab": vocab_dict,
            "num_to_word_vocab": {y: x for x, y in vocab_dict.items()},
            "num_classes": len(text_field.vocab),
            "tokens": (SOS_TOKEN, EOS_TOKEN, UNK_TOKEN, PAD_TOKEN),
            "max_seq_len": dataset.max_seq_len}
def load_dataset(file_name):
    """Loads contents from a file in the *data* directory into a
    torchtext.data.TabularDataset instance.
    """
    file_path = join(DATA_DIR, file_name)
    text_field = Field(pad_token=None, tokenize=_tokenize_str)

    dataset = TabularDataset(
        path=file_path,
        format='csv',
        fields=[('text', text_field)])

    text_field.build_vocab(dataset)
    return dataset
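# Minimal usage sketch (not part of the original module; the file name and batch
# size are illustrative assumptions): the returned dataset can be wrapped in a
# BucketIterator to yield numericalized batches.
dataset = load_dataset("corpus.csv")
data_iter = BucketIterator(dataset, batch_size=32, sort_key=lambda ex: len(ex.text), train=True)
for batch in data_iter:
    print(batch.text.shape)  # (seq_len, batch_size), since batch_first is not set
    break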
def load_dataset(batch_size):
    spacy_de = spacy.load('de')
    spacy_en = spacy.load('en')
    url = re.compile('(<url>.*</url>)')

    def tokenize_de(text):
        return [tok.text for tok in spacy_de.tokenizer(url.sub('@URL@', text))]

    def tokenize_en(text):
        return [tok.text for tok in spacy_en.tokenizer(url.sub('@URL@', text))]

    DE = Field(tokenize=tokenize_de, include_lengths=True,
               init_token='<sos>', eos_token='<eos>')
    EN = Field(tokenize=tokenize_en, include_lengths=True,
               init_token='<sos>', eos_token='<eos>')
    train, val, test = Multi30k.splits(exts=('.de', '.en'), fields=(DE, EN))
    DE.build_vocab(train.src, min_freq=2)
    EN.build_vocab(train.trg, max_size=10000)
    train_iter, val_iter, test_iter = BucketIterator.splits(
        (train, val, test), batch_size=batch_size, repeat=False)
    return train_iter, val_iter, test_iter, DE, EN
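# Hedged usage sketch (batch size and variable names are illustrative, not from
# the original file): because the fields use include_lengths=True, each batch
# attribute is a (tensor, lengths) pair.
train_iter, val_iter, test_iter, DE, EN = load_dataset(batch_size=32)
for batch in train_iter:
    src, src_lengths = batch.src   # src: (seq_len, batch); src_lengths: (batch,)
    trg, trg_lengths = batch.trg
    break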
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')
url = re.compile('(<url>.*</url>)')

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(url.sub('@URL@', text))]

def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(url.sub('@URL@', text))]

data_path = "/home/martin/Documents/Datasets"

# EN = Field(tokenize=tokenize_en, batch_first=True, init_token="<SOS>", eos_token="<EOS>")
# DE = Field(tokenize=tokenize_de, batch_first=True, init_token="<SOS>", eos_token="<EOS>")
EN = Field(tokenize="spacy", tokenizer_language="en", batch_first=True, init_token="<SOS>", eos_token="<EOS>")
DE = Field(tokenize="spacy", tokenizer_language="de", batch_first=True, init_token="<SOS>", eos_token="<EOS>")

# multi30k dataloader
train, val, test = datasets.Multi30k.splits(exts=(".en", ".de"), fields=(EN, DE), root=data_path)

# wmt14 dataloader (better than using datasets.WMT14.splits since it's slow)
# train, val, test = datasets.TranslationDataset.splits(
#     exts=(".en", ".de"), fields=[("src", EN), ("trg", DE)],
#     path=os.path.join(data_path, "wmt14"),
#     train="train.tok.clean.bpe.32000", validation="newstest2013.tok.bpe.32000",
#     test="newstest2014.tok.bpe.32000")

print("Dataset loaded")

EN.build_vocab(train.src, min_freq=3)
DE.build_vocab(train.trg, max_size=50000)
print("Vocabularies built")
    return epoch_per / len(devLoader)


###############################################################################
# Load data
###############################################################################
print('load dataset')
configfile = open('../config.yaml')
config = AttrDict(yaml.load(configfile, Loader=yaml.FullLoader))
trainSet = TIMIT(config.data.data_root, mode='train')
devSet = TIMIT(config.data.data_root, mode='test')

TEXT = Field(lower=True, include_lengths=True, batch_first=True, unk_token=None)

print('build vocab')
sents = [
    'iy', 'ix', 'eh', 'ae', 'ax', 'uw', 'uh', 'ao', 'ey', 'ay', 'oy', 'aw',
    'ow', 'er', 'l', 'r', 'w', 'y', 'm', 'n', 'ng', 'v', 'f', 'dh', 'th',
    'z', 's', 'zh', 'jh', 'ch', 'b', 'p', 'd', 'dx', 't', 'g', 'k', 'hh', 'h#'
]
sents = [[i] for i in sents]
TEXT.build_vocab(sents, specials=['<blank>'])
assert config.data.vocabSize == len(TEXT.vocab)
assert config.data.pad_idx == TEXT.vocab.stoi['<pad>']
assert config.data.blank_idx == TEXT.vocab.stoi['<blank>']

def my_collate(batch):
def get_fields(src_data_type, n_src_feats, n_tgt_feats, pad='<blank>', bos='<s>',
               eos='</s>', dynamic_dict=False, src_truncate=None, tgt_truncate=None):
    """
    Args:
        src_data_type: type of the source input. Options are [text|img|audio].
        n_src_feats (int): the number of source features (not counting tokens)
            to create a :class:`torchtext.data.Field` for. (If
            ``src_data_type=="text"``, these fields are stored together
            as a ``TextMultiField``).
        n_tgt_feats (int): See above.
        pad (str): Special pad symbol. Used on src and tgt side.
        bos (str): Special beginning of sequence symbol. Only relevant for tgt.
        eos (str): Special end of sequence symbol. Only relevant for tgt.
        dynamic_dict (bool): Whether or not to include source map and
            alignment fields.
        src_truncate: Cut off src sequences beyond this (passed to
            ``src_data_type``'s data reader - see there for more details).
        tgt_truncate: Cut off tgt sequences beyond this (passed to
            :class:`TextDataReader` - see there for more details).

    Returns:
        A dict mapping names to fields. These names need to match
        the dataset example attributes.
    """

    assert src_data_type in ['text', 'img', 'audio', 'vec', 'keyphrase'], \
        "Data type not implemented"
    assert not dynamic_dict or src_data_type == 'text' or src_data_type == 'keyphrase', \
        'it is not possible to use dynamic_dict with non-text input'
    fields = {}

    fields_getters = {"text": text_fields,
                      "img": image_fields,
                      "audio": audio_fields,
                      "vec": vec_fields,
                      "keyphrase": text_fields}

    src_field_kwargs = {"n_feats": n_src_feats,
                        "include_lengths": True,
                        "pad": pad, "bos": None, "eos": None,
                        "truncate": src_truncate,
                        "base_name": "src"}
    fields["src"] = fields_getters[src_data_type](**src_field_kwargs)

    tgt_field_kwargs = {"n_feats": n_tgt_feats,
                        "include_lengths": False,
                        "pad": pad, "bos": bos, "eos": eos,
                        "sep": keyphrase_dataset.SEP_token,
                        "truncate": tgt_truncate,
                        "base_name": "tgt"}
    # added by @memray, it might be smarter to add field_name to __init__ in the future
    if src_data_type == "keyphrase":
        fields['tgt'] = keyphrase_fields(**tgt_field_kwargs)
    else:
        fields['tgt'] = text_fields(**tgt_field_kwargs)

    indices = Field(use_vocab=False, dtype=torch.long, sequential=False)
    fields["indices"] = indices

    if dynamic_dict:
        src_map = Field(
            use_vocab=False, dtype=torch.float,
            postprocessing=make_src, sequential=False)
        fields["src_map"] = src_map

        src_ex_vocab = RawField()
        fields["src_ex_vocab"] = src_ex_vocab

        align = Field(
            use_vocab=False, dtype=torch.long,
            postprocessing=make_tgt, sequential=False)
        fields["alignment"] = align

    # added by @memray, load some other meta information of each data example for keyphrase dataset
    if src_data_type == 'keyphrase':
        id = Field(use_vocab=False, dtype=torch.long, sequential=False)
        fields["id"] = id

        # for Orthogonal Regularization and Semantic Coverage
        sep_indices = Field(use_vocab=False, dtype=torch.long,
                            postprocessing=make_tgt, sequential=False)
        fields["sep_indices"] = sep_indices

    return fields
def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings (tokens) and reverses it
    """
    return [tok.text for tok in spacy_en.tokenizer(text)][::-1]

def tokenize_hi(text):
    """
    Tokenizes Hindi text from a string into a list of strings (tokens)
    """
    return text.split()

SRC = Field(tokenize=tokenize_en,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True)

TRG = Field(tokenize=tokenize_hi,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True)

train_data, valid_data, test_data = TranslationDataset.splits(
    path='IITB_small', validation='dev', exts=('.en', '.hi'), fields=(SRC, TRG))

print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
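# Hedged continuation (a typical next step, not taken from the original file):
# build the vocabularies on the training split and wrap the datasets in
# BucketIterators. min_freq and the batch size are illustrative values.
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size=64,
    sort_key=lambda x: len(x.src), sort_within_batch=True)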
#     (train_data, valid_data, test_data),
#     batch_size=BATCH_SIZE,
#     sort_within_batch=True,
#     sort_key=lambda x: len(x.src),
#     device=device)

#######################################
####### test with invert ##############
#######################################
SRC_TRN_PATH, TRG_TRN_PATH = 'toy-revert/src-train.txt', 'toy-revert/tgt-train.txt'
SRC_VAL_PATH, TRG_VAL_PATH = 'toy-revert/src-val.txt', 'toy-revert/tgt-val.txt'
SRC_TEST_PATH, TRG_TEST_PATH = 'toy-revert/src-test.txt', 'toy-revert/tgt-test.txt'

TEXT = Field(tokenize="spacy",
             init_token='<sos>',
             eos_token='<eos>',
             include_lengths=True,
             lower=True)
TRG_TEXT = Field(tokenize="spacy",
                 init_token='<sos>',
                 eos_token='<eos>',
                 lower=True)

from_txt_to_dataframe_and_csv('toy-revert', 'src-train.txt', 'tgt-train.txt', 'train')
from_txt_to_dataframe_and_csv('toy-revert', 'src-val.txt', 'tgt-val.txt', 'val')
from_txt_to_dataframe_and_csv('toy-revert', 'src-test.txt', 'tgt-test.txt', 'test')
import torchtext
from torchtext.data import Field
import spacy

# for tokenizer
spacy_en = spacy.load('en_core_web_sm')
spacy_de = spacy.load('de_core_news_sm')

def tokenize_en(text):
    return [token.text for token in spacy_en.tokenizer(text)]

def tokenize_de(text):
    return [token.text for token in spacy_de.tokenizer(text)]

# load data
SRC = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>', lower=True, batch_first=True)
TRG = Field(tokenize=tokenize_de, init_token='<sos>', eos_token='<eos>', lower=True, batch_first=True)
train, valid, test = torchtext.datasets.WMT14.splits(exts=('.en', '.de'), fields=(SRC, TRG))

length = len(train.examples)
src_sentences = []
trg_sentences = []
for i in range(length):
class SequenceDataLoader(CommonDataLoader):

    def __init__(self, data_config):
        super(SequenceDataLoader, self).__init__(data_config)
        self.__build_field()
        self._load_data()
        pass

    def __build_field(self):
        self.TEXT = Field(sequential=True, use_vocab=True, tokenize=tokenizer,
                          include_lengths=True)
        self.TAG = Field(sequential=True, use_vocab=True, tokenize=tokenizer,
                         is_target=True)
        self._fields = [
            ('text', self.TEXT), ('tag', self.TAG)
        ]
        self._fields_test = [('text', self.TEXT)]
        pass

    @timeit
    def _load_data(self):
        self.train_data = REDataset(path=self._config.data.chip_relation.train_path,
                                    fields=self._fields)
        self.valid_data = REDataset(path=self._config.data.chip_relation.valid_path,
                                    fields=self._fields)
        self.test_data = REDataset(path=self._config.data.chip_relation.test_path,
                                   fields=self._fields_test)
        self.__build_vocab(self.train_data, self.valid_data, self.test_data)
        self.__build_iterator(self.train_data, self.valid_data, self.test_data)
        pass

    def __build_vocab(self, *dataset):
        """
        :param dataset: train_data, valid_data, test_data
        :return: text_vocab, tag_vocab
        """
        self.TEXT.build_vocab(*dataset)
        self.TAG.build_vocab(*dataset[:-1])
        self.word_vocab = self.TEXT.vocab
        self.tag_vocab = self.TAG.vocab
        pass

    def __build_iterator(self, *dataset):
        self._train_iter = BucketIterator(
            dataset[0], batch_size=self._config.data.train_batch_size,
            shuffle=True, sort_key=lambda x: len(x.text),
            sort_within_batch=True, device=self._config.device)
        self._valid_iter = BucketIterator(
            dataset[1], batch_size=self._config.data.train_batch_size,
            shuffle=False, sort_key=lambda x: len(x.text),
            sort_within_batch=True, device=self._config.device)
        self._test_iter = BucketIterator(
            dataset[2], batch_size=self._config.data.train_batch_size,
            shuffle=False, sort_key=lambda x: len(x.text),
            sort_within_batch=True, device=self._config.device)

    def load_train(self):
        return self._train_iter

    def load_test(self):
        return self._test_iter

    def load_valid(self):
        return self._valid_iter
def main(opt):
    ArgumentParser.validate_train_opts(opt)
    ArgumentParser.update_model_opts(opt)
    ArgumentParser.validate_model_opts(opt)

    # Load checkpoint if we resume from a previous training.
    if opt.train_from:
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)
        logger.info('Loading vocab from checkpoint at %s.' % opt.train_from)
        vocab = checkpoint['vocab']
    else:
        vocab = torch.load(opt.data + '.vocab.pt')

    # check for code where vocab is saved instead of fields
    # (in the future this will be done in a smarter way)
    if old_style_vocab(vocab):
        fields = load_old_vocab(
            vocab, opt.model_type, dynamic_dict=opt.copy_attn)
    else:
        fields = vocab

    # @memray: a temporary workaround, as well as train_single.py line 78
    if opt.model_type == "keyphrase":
        if opt.tgt_type in ["one2one", "multiple"]:
            del fields['sep_indices']
        else:
            if 'sep_indices' not in fields:
                sep_indices = Field(
                    use_vocab=False, dtype=torch.long,
                    postprocessing=make_tgt, sequential=False)
                fields["sep_indices"] = sep_indices
        if 'src_ex_vocab' not in fields:
            src_ex_vocab = RawField()
            fields["src_ex_vocab"] = src_ex_vocab

    if len(opt.data_ids) > 1:
        train_shards = []
        for train_id in opt.data_ids:
            shard_base = "train_" + train_id
            train_shards.append(shard_base)
        train_iter = build_dataset_iter_multiple(train_shards, fields, opt)
    else:
        if opt.data_ids[0] is not None:
            shard_base = "train_" + opt.data_ids[0]
        else:
            shard_base = "train"
        train_iter = build_dataset_iter(shard_base, fields, opt)

    nb_gpu = len(opt.gpu_ranks)
    print(os.environ['PATH'])

    if opt.world_size > 1:
        queues = []
        mp = torch.multiprocessing.get_context('spawn')
        semaphore = mp.Semaphore(opt.world_size * opt.queue_size)
        # Create a thread to listen for errors in the child processes.
        error_queue = mp.SimpleQueue()
        error_handler = ErrorHandler(error_queue)
        # Train with multiprocessing.
        procs = []
        for device_id in range(nb_gpu):
            q = mp.Queue(opt.queue_size)
            queues += [q]
            procs.append(mp.Process(target=run, args=(
                opt, device_id, error_queue, q, semaphore), daemon=True))
            procs[device_id].start()
            logger.info(" Starting process pid: %d " % procs[device_id].pid)
            error_handler.add_child(procs[device_id].pid)
        producer = mp.Process(target=batch_producer,
                              args=(train_iter, queues, semaphore, opt,),
                              daemon=True)
        producer.start()
        error_handler.add_child(producer.pid)

        for p in procs:
            p.join()
        producer.terminate()

    elif nb_gpu == 1:  # case 1 GPU only
        single_main(opt, 0)
    else:  # case only CPU
        single_main(opt, -1)
class ApacheDiffTokenHierarchical(ApacheDiffToken):
    NESTING_FIELD = Field(batch_first=True, tokenize=split_string)
    CODE_FIELD = NestedField(NESTING_FIELD, tokenize=split_json)
class ApacheDiffToken(TabularDataset):
    NAME = 'ApacheDiffToken'
    NUM_CLASSES = 3
    IS_MULTILABEL = False

    REPO_FIELD = Field(sequential=False, use_vocab=False, batch_first=True, preprocessing=remove_field)
    SHA_FIELD = Field(sequential=False, use_vocab=False, batch_first=True, preprocessing=remove_field)
    CODE_FIELD = Field(batch_first=True, tokenize=split_json_string, include_lengths=True)
    LABEL_FIELD = Field(sequential=False, use_vocab=False, batch_first=True, preprocessing=process_labels)

    @staticmethod
    def sort_key(ex):
        return len(ex.code)

    @classmethod
    def splits(cls, path,
               train=os.path.join('apache_diff_token', 'train_bal.tsv'),
               validation=os.path.join('apache_diff_token', 'dev_bal.tsv'),
               test=os.path.join('apache_diff_token', 'test_bal.tsv'),
               **kwargs):
        return super(ApacheDiffToken, cls).splits(
            path, train=train, validation=validation, test=test, format='tsv',
            fields=[('repo', cls.REPO_FIELD), ('sha', cls.SHA_FIELD),
                    ('code', cls.CODE_FIELD), ('label', cls.LABEL_FIELD)])

    @classmethod
    def iters(cls, path, vectors_name, vectors_cache, batch_size=64, shuffle=True,
              device=0, vectors=None, unk_init=torch.Tensor.zero_):
        """
        :param path: directory containing train, test, dev files
        :param vectors_name: name of word vectors file
        :param vectors_cache: path to directory containing word vectors file
        :param batch_size: batch size
        :param device: GPU device
        :param vectors: custom vectors - either predefined torchtext vectors or your own custom Vector classes
        :param unk_init: function used to generate vector for OOV words
        :return:
        """
        if vectors is None:
            vectors = Vectors(name=vectors_name, cache=vectors_cache, unk_init=unk_init)

        train, val, test = cls.splits(path)
        cls.CODE_FIELD.build_vocab(train, val, test, vectors=vectors)

        return BucketIterator.splits((train, val, test), batch_size=batch_size, repeat=False,
                                     shuffle=shuffle, sort_within_batch=True, device=device)
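# Hedged usage sketch (the data path, vector file name, and cache directory are
# illustrative assumptions, not from the original repo): the class method returns
# train/dev/test BucketIterators in one call.
train_iter, dev_iter, test_iter = ApacheDiffToken.iters(
    path='data', vectors_name='glove.840B.300d.txt', vectors_cache='.vector_cache',
    batch_size=32, device='cuda')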
    def __init__(self, config):
        # logger
        self.logger = logging.getLogger(config["name"])
        # data loader params
        self.config = config["data_loader"]["args"]
        data_path = self.config["data_path"]
        self.data_path = data_path
        ensure_dir(data_path)
        self.train_path = os.path.join(data_path, self.config["train_file"])
        self.valid_path = os.path.join(data_path, self.config["valid_file"])
        self.test_path = os.path.join(data_path, self.config["test_file"])
        # limit max text length
        self.context_threshold = self.config["context_threshold"]

        self.logger.info("preprocessing data files...")
        if not os.path.exists(self.train_path) or not os.path.exists(self.valid_path):
            self.preprocess(type="train")
        if not os.path.exists(self.test_path):
            self.preprocess(type="test")

        # define fields
        TEXT = Field(sequential=True,
                     use_vocab=True,
                     tokenize=lambda x: x,
                     lower=True,
                     include_lengths=True,
                     batch_first=True)
        LABLE = LabelField(sequential=False, use_vocab=False)

        # build dataset
        self.logger.info("building dataset......")
        train_dict_fileds = {'text': ('text', TEXT), 'label': ('label', LABLE)}
        self.train, self.valid, self.test = TabularDataset.splits(
            path=data_path,  # data root path
            format="json",
            train=self.config["train_file"],
            validation=self.config["valid_file"],
            test=self.config["test_file"],
            fields=train_dict_fileds)

        # build vocab
        self.logger.info("building vocab....")
        TEXT.build_vocab(self.train, self.valid, self.test)

        # load pretrained embeddings
        self.logger.info("load pretrained embeddings...")
        Vectors = vocab.Vectors(self.config["pretrain_emd_file"])
        TEXT.vocab.load_vectors(Vectors)

        # keep the vocab around for easy access
        self.vocab = TEXT.vocab

        # build iterators
        self.logger.info("building iterators.....")
        self.train_iter, self.valid_iter = BucketIterator.splits(
            (self.train, self.valid),
            batch_sizes=(self.config["train_batch_size"], self.config["valid_batch_size"]),
            device=self.config["device"],
            sort_key=lambda x: len(x.text),
            sort_within_batch=False)
        self.test_iter = BucketIterator(
            self.test,
            batch_size=self.config["test_batch_size"],
            device=self.config["device"],
            sort_key=lambda x: len(x.text),
            sort=False,
            sort_within_batch=False)
        self.logger.info("building iterators done!")
        self.logger.info(
            "Total train data set is: {}, valid data set is: {}, test "
            "data is: {}".format(len(self.train), len(self.valid), len(self.test)))
def getData():
    # filename = '.data/multi30k/train.en'
    # generate_sp_model(filename, vocab_size=8000, model_type='bpe', model_prefix='zaid_sp_model')
    # # s = spm.SentencePieceProcessor(model_file='zaid_sp_model.model')
    # print(vars(s))
    # print(dir(s))
    # print(s.vocab_size())
    # print(s.bos_id())  # exit()
    # print(s.eos_id())
    # print(s.unk_id())
    # print(s.pad_id())
    # exit()

    sp_gec = load_sp_model("BPE/GCEBPE30k.model")
    # sp_gec = load_sp_model("zaid_sp_model.model")
    # sp_gec = s
    # print(dir(sp_gec))
    # print(vars(sp_gec))
    # exit()

    src_pad_idx = sp_gec.pad_id()  # english_vocab.stoi["<pad>"]
    print("pad_index = ", src_pad_idx)
    # print("pad = ", sp_gec.decode(src_pad_idx))
    # exit()
    # print("print(len(sp_gec)) 1", len(sp_gec))
    # print(vars(sp_gec))
    # print(dir(sp_gec))
    # exit()

    bpe_field = Field(use_vocab=False,
                      tokenize=sp_gec.encode,
                      init_token=sp_gec.bos_id(),
                      eos_token=sp_gec.eos_id(),
                      pad_token=sp_gec.pad_id(),
                      unk_token=sp_gec.unk_id(),
                      batch_first=True)

    print("-----------------------------------------------")
    # print(TabularDataset.splits.__doc__)
    # tv_datafields = [("ignore", bpe_field), ("trg", bpe_field), ("src", bpe_field)]
    # train_data, valid_data, test_data = TabularDataset.splits(
    #     path="/data/chaudhryz/ankit", train="test10k.tsv",
    #     validation="test10k.tsv", test="test10k.tsv", format='tsv',
    #     skip_header=False, fields=tv_datafields)

    tv_datafields = [("trg", bpe_field), ("src", bpe_field)]
    train_data, valid_data, test_data = TabularDataset.splits(
        path=".data/multi30k", train="train.tsv",
        validation="val.tsv", test="test2016.tsv", format='tsv',
        skip_header=False, fields=tv_datafields)

    # train_data, valid_data, test_data = Multi30k.splits(
    #     exts=(".ennsw", ".en"), fields=tv_datafields,
    #     train='train',
    #     validation='val',
    #     test='test2016',
    #     path='.data/multi30k'
    # )

    print(train_data)
    return sp_gec, train_data, valid_data, test_data
    train, valid, test = data.SemevalDataset.splits(
        TEXT,
        ASPECT,
        SENTIMENT,
        flat=args.flat_data,
        path=args.filepath,
        train="acsa_train.json.train",
        validation="acsa_train.json.valid",
        test="acsa_test.json",
    )
    data.build_vocab(TEXT, ASPECT, SENTIMENT, train, valid, test)
elif args.data == "sst":
    TEXT, SENTIMENT = (
        Field(tokenize="spacy",
              lower=True,
              include_lengths=True,
              batch_first=True,
              init_token="<bos>",
              eos_token="<eos>"),
        Field(
            lower=True,
            is_target=True,
            unk_token=None,
            pad_token=None,
            batch_first=True,
        ),
    )
    train, valid, test = torchtext.datasets.SST.splits(
        TEXT,
        SENTIMENT,
        fine_grained=args.fine_grained,
        train_subtrees=args.train_subtrees,
    pattern = re.compile(r"[ \n\t]+")
    text = pattern.sub(" ", text)
    text = "".join("".join(s)[:2] for _, s in itertools.groupby(text))
    text = re.sub(r'[^A-Za-z0-9,?.!]+', ' ', text)
    return text.strip()

nlp = spacy.load('en', disable=['parser', 'tagger', 'ner'])

def tokenizer(s):
    return [w.text.lower() for w in nlp(tweet_clean(s))]

TEXT = Field(sequential=True, tokenize=tokenizer, include_lengths=True, use_vocab=True)
TARGET = Field(sequential=False, use_vocab=False, pad_token=None, unk_token=None, is_target=False)
data_fields = [(None, None), ("tweet", TEXT), ("target", TARGET)]

def split_train_test(df, test_size=0.2):
    train, val = train_test_split(df, test_size=test_size, random_state=42)
    return train.reset_index(drop=True), val.reset_index(drop=True)
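# Hedged sketch of a typical next step (the CSV file names and column order are
# assumptions, not from the original file): data_fields above lines up with a CSV
# whose first column is skipped, second is the tweet text, and third is the target.
train_ds, val_ds = TabularDataset.splits(
    path='.', train='train.csv', validation='val.csv',
    format='csv', skip_header=True, fields=data_fields)
TEXT.build_vocab(train_ds, min_freq=2)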
# Tokenizers
def tokenize_eng(text):
    return [tok for tok in eng_tokenizer.encode(text).tokens]

def tokenize_lit(text):
    return [tok for tok in lt_tokenizer.encode(text).tokens]

# Create Fields
english = Field(
    sequential=True,
    use_vocab=True,
    tokenize=tokenize_eng,
    lower=True,
    init_token="<sos>",
    eos_token="<eos>",
)
lithuanian = Field(
    sequential=True,
    use_vocab=True,
    tokenize=tokenize_lit,
    lower=True,
    init_token="<sos>",
    eos_token="<eos>",
)

fields = {"Lithuanian": ("src", lithuanian), "English": ("trg", english)}

# Convert into Tabular Dataset
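# Hedged sketch of the step announced above (the file names and JSON format are
# assumptions, not from the original file): build the TabularDataset from the
# fields dict and create the vocabularies.
train_data, valid_data = TabularDataset.splits(
    path="data", train="train.json", validation="valid.json",
    format="json", fields=fields)
english.build_vocab(train_data, max_size=10000, min_freq=2)
lithuanian.build_vocab(train_data, max_size=10000, min_freq=2)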
def main():
    data_dir = "/home/donchan/Documents/DATA/jigsaw"

    start_t = time()
    vec = vocab.Vectors('glove.6B.100d.txt', '/home/donchan/Documents/DATA/glove_embedding/')

    TEXT = Field(sequential=True, tokenize=tokenizer2, lower=True)
    LABEL = Field(sequential=False, use_vocab=False)

    datafields = [("id", None),  # we won't be needing the id, so we pass in None as the field
                  ("comment_text", TEXT), ("toxic", LABEL),
                  ("severe_toxic", LABEL), ("obscene", LABEL),
                  ("threat", LABEL), ("insult", LABEL),
                  ("identity_hate", LABEL)]

    train, val = TabularDataset.splits(path=data_dir, train='traindf.csv', validation='valdf.csv',
                                       format='csv', skip_header=True, fields=datafields)

    print("train val length", len(train), len(val))
    # print(train[0].comment_text)
    # print(train[0].toxic, train[0].severe_toxic, train[0].threat, train[0].insult, train[0].identity_hate)

    TEXT.build_vocab(train, val, vectors=vec, min_freq=2)
    # LABEL.build_vocab(train, val)
    print("time to build vocab", (time() - start_t))
    print("length of vocabulary", len(TEXT.vocab), TEXT.vocab.vectors.shape)
    print("- " * 20)
    print("* most common words.")
    print(TEXT.vocab.freqs.most_common(20))

    train_iter, val_iter = BucketIterator.splits(
        (train, val),  # we pass in the datasets we want the iterator to draw data from
        batch_sizes=(batch_size, batch_size),
        device=torch.device("cuda"),  # if you want to use the GPU, specify the GPU number here
        sort_key=lambda x: len(x.comment_text),  # the BucketIterator needs to be told what function it should use to group the data.
        sort_within_batch=False,
        repeat=False  # we pass repeat=False because we want to wrap this Iterator layer.
    )

    train_dl = BatchWrapper(train_iter, "comment_text",
                            ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"])
    valid_dl = BatchWrapper(val_iter, "comment_text",
                            ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"])

    x, y = next(iter(train_dl))

    em_sz = 100
    nh = 500
    nl = 3
    model_file = os.path.join(data_dir, "jigsaw_model_7978.pkl")

    model = SimpleBiLSTMBaseline(hidden_dim=nh, emb_dim=em_sz,
                                 len_TEXT_vocab=len(TEXT.vocab), v_vec=TEXT.vocab.vectors)
    if os.path.isfile(model_file):
        print("model file found.")
        model.load_state_dict(torch.load(model_file))
        # model = dill.load(open(model_file, "rb"))
        # model = torch

    model.cuda()

    opt = optim.Adam(model.parameters(), lr=1e-4)
    loss_func = nn.BCEWithLogitsLoss()
    epochs = 10

    for epoch in range(1, epochs + 1):
        running_loss = 0.0
        # running_corrects = 0
        model.train()  # turn on training mode
        for idx, (x, y) in enumerate(tqdm.tqdm(train_dl)):  # thanks to our wrapper, we can intuitively iterate over our data!
            opt.zero_grad()

            preds = model(x)
            loss = loss_func(preds, y)
            loss.backward()
            opt.step()

            # if idx % 100 == 0:
            #     print("- " * 20)
            #     print("step", idx)
            #     print("preds", preds)
            #     print("loss %.5f" % loss.item())

            running_loss += loss.item() * x.size(0)

        epoch_loss = running_loss / len(train)

        # calculate the validation loss for this epoch
        val_loss = 0.0
        accs = []
        model.eval()  # turn on evaluation mode
        for x, y in valid_dl:
            preds = model(x)
            loss = loss_func(preds, y)
            val_loss += loss.item() * x.size(0)

            logits = preds.cpu().data.numpy()
            logits = 1. / (1. + np.exp(-logits))
            z = np.zeros_like(logits)
            z[logits > .5] = 1
            y_num = y.cpu().data.numpy()
            acc = (z == y_num).sum() / (y_num.shape[0] * y_num.shape[1])
            accs.append(acc)

        val_loss /= len(val)
        print("mean accuracy", np.mean(accs))
        print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch, epoch_loss, val_loss))

        # dill.dump(model, open("jigsaw_model.pkl", "wb"))
        torch.save(model.state_dict(), os.path.join(data_dir, "jigsaw_model_%d.pkl" % epoch))
    stopwords = stopwordslist('/root/news/stopword.txt')  # path to the stop-word list
    return [word for word in jieba.cut(text) if word.strip() not in stopwords]  # tokenize Chinese text with jieba and drop stop words

# load the stop-word list
def stopwordslist(filepath):
    stopwords = [
        line.strip()
        for line in open(filepath, 'r', encoding='utf-8').readlines()
    ]
    return stopwords

# process the data with the Field class
TEXT = Field(sequential=True, tokenize=tokenizer, fix_length=200)  # uses the tokenizer defined above
LABEL = Field(sequential=False)
tv_datafields = [("text", TEXT), ("label", LABEL)]

# In[2]:

import re
import os

def CreateDataSet(root):
    # define the regular expressions
    patternUrl = re.compile(r'<url>(.*?)</url>', re.S)
    patternContent = re.compile(r'<content>(.*?)</content>', re.S)
    contents_list = []
def get_fields(src_data_type, n_src_feats, n_tgt_feats, pad='<blank>',
               eos='</s>', bos='<s>', dynamic_dict=False, with_align=False,
               src_truncate=None, tgt_truncate=None):
    """
    Args:
        src_data_type: type of the source input. Options are [text|img|audio].
        n_src_feats (int): the number of source features (not counting tokens)
            to create a :class:`torchtext.data.Field` for. (If
            ``src_data_type=="text"``, these fields are stored together
            as a ``TextMultiField``).
        n_tgt_feats (int): See above.
        pad (str): Special pad symbol. Used on src and tgt side.
        bos (str): Special beginning of sequence symbol. Only relevant for tgt.
        eos (str): Special end of sequence symbol. Only relevant for tgt.
        dynamic_dict (bool): Whether or not to include source map and
            alignment fields.
        with_align (bool): Whether or not to include word align.
        src_truncate: Cut off src sequences beyond this (passed to
            ``src_data_type``'s data reader - see there for more details).
        tgt_truncate: Cut off tgt sequences beyond this (passed to
            :class:`TextDataReader` - see there for more details).

    Returns:
        A dict mapping names to fields. These names need to match
        the dataset example attributes.
    """

    assert src_data_type in ['text', 'img', 'audio', 'vec'], \
        "Data type not implemented"
    assert not dynamic_dict or src_data_type == 'text', \
        'it is not possible to use dynamic_dict with non-text input'
    fields = {}

    fields_getters = {"text": text_fields,
                      "img": image_fields,
                      "audio": audio_fields,
                      "vec": vec_fields}

    src_field_kwargs = {"n_feats": n_src_feats,
                        "include_lengths": True,
                        "pad": pad, "bos": None, "eos": None,
                        "truncate": src_truncate,
                        "base_name": "src"}
    fields["src"] = fields_getters[src_data_type](**src_field_kwargs)

    tgt_field_kwargs = {"n_feats": n_tgt_feats,
                        "include_lengths": True,
                        "pad": pad, "bos": bos, "eos": eos,
                        "truncate": tgt_truncate,
                        "base_name": "tgt"}
    fields["tgt"] = fields_getters["text"](**tgt_field_kwargs)

    indices = Field(use_vocab=False, dtype=torch.long, sequential=False)
    fields["indices"] = indices

    corpus_ids = Field(use_vocab=True, sequential=False)
    fields["corpus_id"] = corpus_ids

    if dynamic_dict:
        src_map = Field(
            use_vocab=False, dtype=torch.float,
            postprocessing=make_src, sequential=False)
        fields["src_map"] = src_map

        src_ex_vocab = RawField()
        fields["src_ex_vocab"] = src_ex_vocab

        align = Field(
            use_vocab=False, dtype=torch.long,
            postprocessing=make_tgt, sequential=False)
        fields["alignment"] = align

    if with_align:
        word_align = AlignField()
        fields["align"] = word_align

    return fields
try:
    from typing import Tuple

    from torch import Tensor
    from torchtext.datasets import Multi30k
    from torchtext.data import Field, BucketIterator

    from utils.config import DEVICE
    from utils.utils import caculate_accuracy
except ImportError as e:
    print(e)
    raise ImportError

# ======================= prepare data ======================= #
SRC = Field(tokenize='spacy',
            tokenizer_language='de_core_news_sm',
            init_token='<sos>',
            eos_token='<eos>',
            lower=True)
TRG = Field(tokenize='spacy',
            tokenizer_language='en_core_web_sm',
            init_token='<sos>',
            eos_token='<eos>',
            lower=True)

train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'), fields=(SRC, TRG))

SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

BATCH_SIZE = 128
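# Hedged continuation (a conventional next step, not taken verbatim from the
# original file): wrap the splits in BucketIterators on the imported DEVICE.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size=BATCH_SIZE, device=DEVICE)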
def build_dataset_and_vocab(sentences: List[str]):
    """
    Define source and target fields, iterate over the list of sentences
    to create list of Examples, and return:
        - training and validation dataset (split 90-10%)
        - source and target fields with Vocab object
    """
    # Minimum and maximum length for sentences to be included in the dataset
    min_length, max_length = 4, 10

    # Define source and target fields
    bos_word = '<s>'
    eos_word = '</s>'
    pad_word = '<pad>'
    src_field = Field(tokenize=tokenize_en, pad_token=pad_word, lower=True)
    tgt_field = Field(tokenize=tokenize_en, init_token=bos_word,
                      eos_token=eos_word, pad_token=pad_word, lower=True)

    # Create list of Examples from the list of sentences
    examples = []
    sent_count = 0
    for sentence in sentences:
        sentence_split = sentence.split(' ')
        sentence_length = len(sentence_split)
        if sentence_length <= min_length or sentence_length >= max_length:
            continue
        sent_count += 1

        # If sent length is less than 8
        if sentence_length <= min_length + 4:
            # Src length is 3
            src_length = min_length - 1
        else:
            # Src length is 5
            src_length = min_length + 1

        for i in range(0, sentence_length - src_length, src_length):
            src = ' '.join(sentence_split[i:i + src_length])
            tgt = ' '.join(sentence_split[i + src_length:])
            example = Example.fromlist(data=[src, tgt],
                                       fields=[('src', src_field), ('tgt', tgt_field)])
            examples.append(example)

    print(f'Total {sent_count} sentences processed into {len(examples)} examples.')

    train_dataset, valid_dataset = Dataset(examples=examples,
                                           fields=[('src', src_field),
                                                   ('tgt', tgt_field)]).split(split_ratio=[0.9, 0.1])

    # Set the minimum frequency needed to include a token in the vocabulary
    min_freq = 2
    src_field.build_vocab(train_dataset, min_freq=min_freq)
    tgt_field.build_vocab(train_dataset, min_freq=min_freq)

    return train_dataset, valid_dataset, src_field, tgt_field
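# Hedged usage sketch (assumes a list of raw sentences named `sentences` and an
# illustrative batch size, neither of which comes from the original file):
train_ds, valid_ds, src_field, tgt_field = build_dataset_and_vocab(sentences)
train_iter, valid_iter = BucketIterator.splits(
    (train_ds, valid_ds), batch_size=32,
    sort_key=lambda ex: len(ex.src), sort_within_batch=True)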
import random

import torch
import torch.optim as opt
from torch.nn import Module, Embedding
from torchtext.data import Field, BucketIterator
from torchtext.datasets import Multi30k
import spacy

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

eng = spacy.load('en')
ger = spacy.load('de_core_news_sm')

def Tokenize_eng(text):
    return [a.text for a in eng.tokenizer(text)]

def Tokenize_german(text):
    return [b.text for b in ger.tokenizer(text)]

german = Field(tokenize=Tokenize_german, lower=True, init_token='<sos>', eos_token='<eos>')
english = Field(tokenize=Tokenize_eng, lower=True, init_token='<sos>', eos_token='<eos>')

Train, Val, Test = Multi30k.splits(exts=('.de', '.en'), fields=(german, english))

german.build_vocab(Train, max_size=10000, min_freq=2)
english.build_vocab(Train, max_size=10000, min_freq=2)

## building the encoder
class Encode(Module):
    def __init__(self, inp_size, emd_size, hidden_size):
        super(Encode, self).__init__()
        self.inp_size = inp_size
        self.emd_size = emd_size
        self.hidden_size = hidden_size
        self.embed = Embedding(self.inp_size, self.emd_size)
spacy_ger = spacy.load("de_core_news_sm")
spacy_eng = spacy.load("en_core_web_sm")

def tokenizer_ger(text):
    return [tok.text for tok in spacy_ger.tokenizer(text)]  # 'hello my name is' -> ['hello', 'my', 'name', 'is']

def tokenizer_eng(text):
    return [tok.text for tok in spacy_eng.tokenizer(text)]  # 'hello my name is' -> ['hello', 'my', 'name', 'is']

german = Field(tokenize=tokenizer_ger, lower=True, init_token="<sos>", eos_token="<eos>")
english = Field(tokenize=tokenizer_eng, lower=True, init_token="<sos>", eos_token="<eos>")

train_data, validation_data, test_data = Multi30k.splits(exts=(".de", ".en"), fields=(german, english))

german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)
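# Hedged continuation (the batch size is an illustrative assumption, not from the
# original file): bucket the Multi30k splits by source length.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, validation_data, test_data), batch_size=64,
    sort_within_batch=True, sort_key=lambda x: len(x.src))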
def tokenize_de(text):
    # tokenizes the german text into a list of strings (tokens) and reverses it
    return [tok.text for tok in spacy_de.tokenizer(text)][::-1]  # list[::-1] used to reverse the list

def tokenize_en(text):
    # tokenizes the english text into a list of strings (tokens)
    return [tok.text for tok in spacy_en.tokenizer(text)]

# torchtext's Field handles how the data should be processed. For more, refer to: https://github.com/pytorch/text
# use tokenize_de, tokenize_en for tokenization of german and english sentences.
# German is the src, English is the trg
# append the <sos> (start of sentence), <eos> (end of sentence) tokens to all sentences.
SRC = Field(tokenize=tokenize_de, init_token='<sos>', eos_token='<eos>', lower=True)
TRG = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>', lower=True)

# we will be using the Multi30k dataset. This is a dataset with ~30K parallel English, German, French sentences.
# exts specifies which languages to use as source and target. source goes first
# fields define which data processing to apply for source and target
train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'), fields=(SRC, TRG))
print('Loaded data...')

# build the vocab
def translate(cfg_file, ckpt: str, output_path: str = None) -> None:
    """
    Interactive translation function.
    Loads model from checkpoint and translates either the stdin input or
    asks for input to translate interactively.
    The input has to be pre-processed according to the data that the model
    was trained on, i.e. tokenized or split into subwords.
    Translations are printed to stdout.

    :param cfg_file: path to configuration file
    :param ckpt: path to checkpoint to load
    :param output_path: path to output file
    """

    def _load_line_as_data(line):
        """ Create a dataset from one line via a temporary file. """
        # write src input to temporary file
        tmp_name = "tmp"
        tmp_suffix = ".src"
        tmp_filename = tmp_name + tmp_suffix
        with open(tmp_filename, "w") as tmp_file:
            tmp_file.write("{}\n".format(line))

        test_data = MonoDataset(path=tmp_name, ext=tmp_suffix, field=src_field)

        # remove temporary file
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)

        return test_data

    cfg = load_config(cfg_file)

    speech_mode = cfg.get("speech", True)
    if speech_mode:
        raise NotImplementedError(
            "Translation mode isn't implemented for speech processing yet.")

    logger = make_logger()

    def _translate_data(test_data):
        """ Translates given dataset, using parameters from outer scope. """
        # pylint: disable=unused-variable
        score, loss, ppl, sources, sources_raw, references, hypotheses, \
            hypotheses_raw, attention_scores = validate_on_data(
                model, data=test_data, batch_size=batch_size,
                batch_type=batch_type, level=level,
                max_output_length=max_output_length, eval_metric="",
                use_cuda=use_cuda, loss_function=None, beam_size=beam_size,
                beam_alpha=beam_alpha, logger=logger)
        return hypotheses

    # when checkpoint is not specified, take oldest from model dir
    if ckpt is None:
        model_dir = cfg["training"]["model_dir"]
        ckpt = get_latest_checkpoint(model_dir)

    batch_size = cfg["training"].get("eval_batch_size",
                                     cfg["training"].get("batch_size", 1))
    batch_type = cfg["training"].get(
        "eval_batch_type", cfg["training"].get("batch_type", "sentence"))
    use_cuda = cfg["training"].get("use_cuda", False)
    level = cfg["data"]["level"]
    max_output_length = cfg["training"].get("max_output_length", None)

    # read vocabs
    src_vocab_file = cfg["data"].get(
        "src_vocab", cfg["training"]["model_dir"] + "/src_vocab.txt")
    trg_vocab_file = cfg["data"].get(
        "trg_vocab", cfg["training"]["model_dir"] + "/trg_vocab.txt")
    src_vocab = Vocabulary(file=src_vocab_file)
    trg_vocab = Vocabulary(file=trg_vocab_file)

    data_cfg = cfg["data"]
    level = data_cfg["level"]
    lowercase = data_cfg["lowercase"]

    def tok_fun(s):
        return list(s) if level == "char" else s.split()

    src_field = Field(init_token=None, eos_token=EOS_TOKEN,
                      pad_token=PAD_TOKEN, tokenize=tok_fun,
                      batch_first=True, lower=lowercase,
                      unk_token=UNK_TOKEN, include_lengths=True)
    src_field.vocab = src_vocab

    # load model state from disk
    model_checkpoint = load_checkpoint(ckpt, use_cuda=use_cuda)

    # build model and load parameters into it
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)
    model.load_state_dict(model_checkpoint["model_state"])

    if use_cuda:
        model.cuda()

    # whether to use beam search for decoding, <2: greedy decoding
    if "testing" in cfg.keys():
        beam_size = cfg["testing"].get("beam_size", 1)
        beam_alpha = cfg["testing"].get("alpha", -1)
    else:
        beam_size = 1
        beam_alpha = -1

    if not sys.stdin.isatty():
        # input file given
        test_data = MonoDataset(path=sys.stdin, ext="", field=src_field)
        hypotheses = _translate_data(test_data)

        if output_path is not None:
            # write to outputfile if given
            output_path_set = "{}".format(output_path)
            with open(output_path_set, mode="w", encoding="utf-8") as out_file:
                for hyp in hypotheses:
                    out_file.write(hyp + "\n")
            logger.info("Translations saved to: %s.", output_path_set)
        else:
            # print to stdout
            for hyp in hypotheses:
                print(hyp)

    else:
        # enter interactive mode
        batch_size = 1
        batch_type = "sentence"
        while True:
            try:
                src_input = input("\nPlease enter a source sentence "
                                  "(pre-processed): \n")
                if not src_input.strip():
                    break

                # every line has to be made into dataset
                test_data = _load_line_as_data(line=src_input)

                hypotheses = _translate_data(test_data)
                print("JoeyNMT: {}".format(hypotheses[0]))

            except (KeyboardInterrupt, EOFError):
                print("\nBye.")
                break
test.to_csv("data/test.csv", index=False)

spacy_eng = spacy.load("en_core_web_sm")
spacy_gem = spacy.load("de_core_news_sm")

def english_tokenizer(text):
    return [tok.text for tok in spacy_eng.tokenizer(text)]

def german_tokenizer(text):
    return [tok.text for tok in spacy_gem.tokenizer(text)]

english = Field(sequential=True, use_vocab=True, tokenize=english_tokenizer, lower=True)
german = Field(sequential=True, use_vocab=True, tokenize=german_tokenizer, lower=True)

fields = {"english": ("eng", english), "german": ("ger", german)}

train_data, test_data = TabularDataset.splits(path="",
                                              train="data/train_lang.json",
                                              test="data/test_lang.json",
                                              format="json",
                                              fields=fields)

english.build_vocab(train_data, max_size=10000, min_freq=2)
german.build_vocab(train_data, max_size=10000, min_freq=2)
DATA_PATH = './data'  # os.environ['DATA_PATH']
tagger = Mecab()

USE_CUDA = torch.cuda.is_available()
DEVICE = 'cuda' if USE_CUDA else 'cpu'

def pad_under_five(toknized):
    """
    Because the model uses 5-gram filters, sentences shorter than five tokens
    are padded with <pad> up to length five.
    """
    if len(toknized) < 5:
        toknized.extend(["<pad>"] * (5 - len(toknized)))
    return toknized

TEXT = Field(tokenize=tagger.morphs, lower=True, include_lengths=False,
             batch_first=True, preprocessing=pad_under_five)
LABEL = Field(sequential=False, use_vocab=True, unk_token=None)

train_data, test_data = TabularDataset.splits(
    path=DATA_PATH + '/nsmc/',
    train='ratings_train.txt',
    test='ratings_test.txt',
    format='tsv',
    skip_header=True,
    fields=[('id', None), ('text', TEXT), ('label', LABEL)],
    filter_pred=lambda x: True if len(x.text) > 1 else False)  # only keep examples longer than one token

TEXT.build_vocab(train_data, min_freq=2)
LABEL.build_vocab(train_data)

# print(TEXT.vocab)
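# Hedged continuation (the batch size is an illustrative assumption): bucket the
# NSMC examples by length on the DEVICE selected above.
train_iter, test_iter = BucketIterator.splits(
    (train_data, test_data), batch_size=64,
    sort_key=lambda x: len(x.text), sort_within_batch=True, device=DEVICE)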
def test_single_gpu_batch_parse():
    trainer = Trainer(gpus=1)
    trainer.accelerator_backend = GPUAccelerator(trainer)

    # non-transferrable types
    primitive_objects = [None, {}, [], 1.0, "x", [None, 2], {"x": (1, 2), "y": None}]
    for batch in primitive_objects:
        data = trainer.accelerator_backend.batch_to_device(batch, torch.device('cuda:0'))
        assert data == batch

    # batch is just a tensor
    batch = torch.rand(2, 3)
    batch = trainer.accelerator_backend.batch_to_device(batch, torch.device('cuda:0'))
    assert batch.device.index == 0 and batch.type() == 'torch.cuda.FloatTensor'

    # tensor list
    batch = [torch.rand(2, 3), torch.rand(2, 3)]
    batch = trainer.accelerator_backend.batch_to_device(batch, torch.device('cuda:0'))
    assert batch[0].device.index == 0 and batch[0].type() == 'torch.cuda.FloatTensor'
    assert batch[1].device.index == 0 and batch[1].type() == 'torch.cuda.FloatTensor'

    # tensor list of lists
    batch = [[torch.rand(2, 3), torch.rand(2, 3)]]
    batch = trainer.accelerator_backend.batch_to_device(batch, torch.device('cuda:0'))
    assert batch[0][0].device.index == 0 and batch[0][0].type() == 'torch.cuda.FloatTensor'
    assert batch[0][1].device.index == 0 and batch[0][1].type() == 'torch.cuda.FloatTensor'

    # tensor dict
    batch = [{'a': torch.rand(2, 3), 'b': torch.rand(2, 3)}]
    batch = trainer.accelerator_backend.batch_to_device(batch, torch.device('cuda:0'))
    assert batch[0]['a'].device.index == 0 and batch[0]['a'].type() == 'torch.cuda.FloatTensor'
    assert batch[0]['b'].device.index == 0 and batch[0]['b'].type() == 'torch.cuda.FloatTensor'

    # tuple of tensor list and list of tensor dict
    batch = ([torch.rand(2, 3) for _ in range(2)],
             [{'a': torch.rand(2, 3), 'b': torch.rand(2, 3)} for _ in range(2)])
    batch = trainer.accelerator_backend.batch_to_device(batch, torch.device('cuda:0'))
    assert batch[0][0].device.index == 0 and batch[0][0].type() == 'torch.cuda.FloatTensor'

    assert batch[1][0]['a'].device.index == 0
    assert batch[1][0]['a'].type() == 'torch.cuda.FloatTensor'

    assert batch[1][0]['b'].device.index == 0
    assert batch[1][0]['b'].type() == 'torch.cuda.FloatTensor'

    # namedtuple of tensor
    BatchType = namedtuple('BatchType', ['a', 'b'])
    batch = [BatchType(a=torch.rand(2, 3), b=torch.rand(2, 3)) for _ in range(2)]
    batch = trainer.accelerator_backend.batch_to_device(batch, torch.device('cuda:0'))
    assert batch[0].a.device.index == 0
    assert batch[0].a.type() == 'torch.cuda.FloatTensor'

    # non-Tensor that has `.to()` defined
    class CustomBatchType:
        def __init__(self):
            self.a = torch.rand(2, 2)

        def to(self, *args, **kwargs):
            self.a = self.a.to(*args, **kwargs)
            return self

    batch = trainer.accelerator_backend.batch_to_device(CustomBatchType(), torch.device('cuda:0'))
    assert batch.a.type() == 'torch.cuda.FloatTensor'

    # torchtext.data.Batch
    samples = [{'text': 'PyTorch Lightning is awesome!', 'label': 0},
               {'text': 'Please make it work with torchtext', 'label': 1}]

    text_field = Field()
    label_field = LabelField()
    fields = {'text': ('text', text_field), 'label': ('label', label_field)}

    examples = [Example.fromdict(sample, fields) for sample in samples]
    dataset = Dataset(examples=examples, fields=fields.values())

    # Batch runs field.process() that numericalizes tokens, but it requires to build dictionary first
    text_field.build_vocab(dataset)
    label_field.build_vocab(dataset)

    batch = Batch(data=examples, dataset=dataset)
    batch = trainer.accelerator_backend.batch_to_device(batch, torch.device('cuda:0'))

    assert batch.text.type() == 'torch.cuda.LongTensor'
    assert batch.label.type() == 'torch.cuda.LongTensor'
    # data path
    data_path = "/projdata3/info_fil/olatunji/NLP/Dataset/ARD Amazon/Processed data/"
    data_path2 = "/projdata3/info_fil/olatunji/NLP/Dataset/ARD Amazon/Human annotation/"
    log_interval = 10  # how many steps to wait before logging training status [default: 1]
    test_interval = 100  # how many steps to wait before testing [default: 100]
else:
    device = "cpu"
    # Data batch size
    batch_size = 64  # batch size for training [default: 64]
    data_path = "C:/Users/hpuser/Documents/Python Sandbox/NLP/New folder/"
    data_path2 = "C:/Users/hpuser/Documents/Python Sandbox/NLP/New folder/"
    log_interval = 1  # how many steps to wait before logging training status [default: 1]
    test_interval = 2  # how many steps to wait before testing [default: 100]

tokenize = lambda x: x.split()
TEXT = Field(sequential=True, tokenize=tokenize, lower=True)
LABEL = Field(sequential=False, use_vocab=False, dtype=torch.float32)

tv_datafields = [("helpful", None), ("xofyHelpfulScore", LABEL),
                 ("overall", None), ("reviewText", TEXT)]

trn, vld = TabularDataset.splits(
    path=data_path,  # the root directory where the data lies
    train='validdatagg.csv', validation="validdatagg.csv",
    format='csv',
    skip_header=True,
    fields=tv_datafields)

# print(trn[0].__dict__.keys())
# print(trn[0].reviewText[:3])
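# Hedged continuation (vocabulary and batching parameters are assumptions, and
# batch_size/device are taken from the configuration branch above):
TEXT.build_vocab(trn, vld, min_freq=2)
train_iter, val_iter = BucketIterator.splits(
    (trn, vld), batch_sizes=(batch_size, batch_size),
    sort_key=lambda x: len(x.reviewText), sort_within_batch=True, device=device)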
class IMDBHierarchical(IMDB_stanford):
    NESTING_FIELD = Field(batch_first=True, tokenize=clean_string)
    TEXT_FIELD = NestedField(NESTING_FIELD, tokenize=Sentence_Tokenize())
def create_dataset(config: Config,
                   device: torch.device) -> Tuple[Vocab, Iterator, Iterator, Iterator]:

    fields = dict()
    fields[SeqType.ArticleID.value] = (SeqType.ArticleID.value, RawField())

    time_field = Field(use_vocab=False, batch_first=True, sequential=False)
    fields['jst_hour'] = (SeqType.Time.value, time_field)

    token_field = \
        Field(use_vocab=True,
              init_token=SpecialToken.BOS.value,
              eos_token=SpecialToken.EOS.value,
              pad_token=SpecialToken.Padding.value,
              unk_token=SpecialToken.Unknown.value) \
        if config.use_init_token_tag \
        else Field(use_vocab=True,
                   eos_token=SpecialToken.EOS.value,
                   pad_token=SpecialToken.Padding.value,
                   unk_token=SpecialToken.Unknown.value)

    fields['processed_tokens'] = (SeqType.Token.value, token_field)

    seqtypes = [SeqType.RawShort, SeqType.RawLong,
                SeqType.MovRefShort, SeqType.MovRefLong,
                SeqType.NormMovRefShort, SeqType.NormMovRefLong,
                SeqType.StdShort, SeqType.StdLong]

    tensor_type = torch.FloatTensor if device.type == 'cpu' else torch.cuda.FloatTensor

    for (ric, seqtype) in itertools.product(config.rics, seqtypes):
        n = N_LONG_TERM \
            if seqtype.value.endswith('long') \
            else N_SHORT_TERM
        price_field = Field(use_vocab=False,
                            fix_length=n,
                            batch_first=True,
                            pad_token=0.0,
                            preprocessing=lambda xs: [float(x) for x in xs],
                            tensor_type=tensor_type)
        key = stringify_ric_seqtype(ric, seqtype)
        fields[key] = (key, price_field)

    train, val, test = \
        TabularDataset.splits(path=str(config.dir_output),
                              format='json',
                              train='alignment-train.json',
                              validation='alignment-valid.json',
                              test='alignment-test.json',
                              fields=fields)

    token_field.build_vocab(train, min_freq=config.token_min_freq)

    batch_size = config.batch_size
    train_iter, val_iter, test_iter = \
        Iterator.splits((train, val, test),
                        batch_sizes=(batch_size, batch_size, batch_size),
                        device=-1 if device.type == 'cpu' else device,
                        repeat=False,
                        sort=False)

    return (token_field.vocab, train_iter, val_iter, test_iter)
class IMDB_stanford(TabularDataset):
    NAME = 'IMDB_stanford'
    NUM_CLASSES = 2
    IS_MULTILABEL = False

    TEXT_FIELD = Field(batch_first=True, tokenize=clean_string)
    LABEL_FIELD = Field(sequential=False, use_vocab=False, batch_first=True, preprocessing=process_labels)

    @staticmethod
    def sort_key(ex):
        return len(ex.text)

    @classmethod
    def splits(cls, path,
               train=os.path.join('IMDB_stanford', 'train.tsv'),
               validation=os.path.join('IMDB_stanford', 'dev.tsv'),
               test=os.path.join('IMDB_stanford', 'test.tsv'),
               **kwargs):
        return super(IMDB_stanford, cls).splits(
            path, train=train, validation=validation, test=test, format='tsv',
            fields=[('label', cls.LABEL_FIELD), ('text', cls.TEXT_FIELD)])

    @classmethod
    def iters(cls, path, vectors_name=None, vectors_cache=None, batch_size=64,
              shuffle=True, device=0, vectors=None, unk_init=torch.Tensor.zero_,
              onehot_Flag=False, max_size=None, sort_within_batch=False, bucket_size=300):
        """
        :param path: directory containing train, test, dev files
        :param vectors_name: name of word vectors file
        :param vectors_cache: path to directory containing word vectors file
        :param batch_size: batch size
        :param device: GPU device
        :param vectors: custom vectors - either predefined torchtext vectors or your own custom Vector classes
        :param unk_init: function used to generate vector for OOV words
        :return:
        """
        if vectors is None and not onehot_Flag:
            vectors = Vectors(name=vectors_name, cache=vectors_cache, unk_init=unk_init)
        if max_size is not None:
            max_size = max_size - 2

        train, val, test = cls.splits(path)
        cls.TEXT_FIELD.build_vocab(train, val, test, vectors=vectors, max_size=max_size)

        return Less_padding_bucket_Iterator.splits(
            (train, val, test), batch_size=batch_size, repeat=False, shuffle=shuffle,
            sort_within_batch=sort_within_batch, device=device, bucket_size=bucket_size)
from .config import DEVICE, DEFAULT_CONFIG

seed = 2019
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

def light_tokenize(sequence: str):
    return [sequence]

def post_process(arr, _):
    return [[int(item) for item in arr_item] for arr_item in arr]

TEXT = Field(sequential=True, tokenize=light_tokenize, include_lengths=True)
POS = Field(sequential=True, tokenize=light_tokenize)
REL = Field(sequential=True, use_vocab=False, unk_token=None, pad_token=0,
            postprocessing=post_process)
TAG = Field(sequential=True, tokenize=light_tokenize, is_target=True, unk_token=None)
Fields = [('text', TEXT), ('pos', POS), ('rel', REL), ('tag', TAG)]

class SRLTool(Tool):
    def get_dataset(self, path: str, fields=Fields, separator='\t'):