def create_dataset(path_to_dataset,batch_size,split_ratio=0.7,min_vocab_freq=10,max_vocab_size=4000):
	text_field = Field(tokenize="spacy",tokenizer_language="en",batch_first=True,init_token="<sos>",eos_token="<eos>",lower=True)

	def transform(caption):
		caption = caption.strip().lower().split()
		return caption

	dataset = CocoCaptions(annFile=os.path.join(path_to_dataset,"captions_train2014.json"),text_field=text_field,transform=transform)
	train,val = dataset.split(split_ratio=split_ratio)
	test = CocoCaptions(annFile=os.path.join(path_to_dataset,"captions_val2014.json"),text_field=text_field,transform=transform)

	print("Dataset loaded")
	print("Train set size:",len(train))

	text_field.build_vocab(dataset.text,min_freq=min_vocab_freq,max_size=max_vocab_size)
	SOS_TOKEN = text_field.vocab.stoi['<sos>']
	EOS_TOKEN = text_field.vocab.stoi['<eos>']
	UNK_TOKEN = text_field.vocab.stoi['<unk>']
	PAD_TOKEN = text_field.vocab.stoi['<pad>']

	print("Vocabuly build")

	print("Vocabuly statistics")

	print("\nMost common words in the vocabulary:\n",text_field.vocab.freqs.most_common(10))
	print("Size of the vocabulary:",len(text_field.vocab))
	print("Max sequence lenght",dataset.max_seq_len)

	train_iter,val_iter = BucketIterator.splits((train,val),repeat=False,batch_size=batch_size)
	test_iter = BucketIterator(test,batch_size=batch_size,repeat=False,train=False)
	vocab_dict = text_field.vocab.stoi
	return {"data_iters":(train_iter,val_iter,test_iter),"fields":text_field,
	"word_to_num_vocab":vocab_dict,"num_to_word_vocab":{y:x for x,y in vocab_dict.items()},
	"num_classes":len(text_field.vocab),"tokens":(SOS_TOKEN,EOS_TOKEN,UNK_TOKEN,PAD_TOKEN),"max_seq_len":dataset.max_seq_len}
Example #2
def load_dataset(file_name):
    """Loads contents from a file in the *data* directory into a
    torchtext.data.TabularDataset instance.
    """
    file_path = join(DATA_DIR, file_name)
    text_field = Field(pad_token=None, tokenize=_tokenize_str)

    dataset = TabularDataset(
        path=file_path,
        format='csv',
        fields=[('text', text_field)])

    text_field.build_vocab(dataset)
    return dataset
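
# Hedged usage sketch: assumes DATA_DIR contains a single-column CSV (the file name here
# is a placeholder) and that _tokenize_str is the module's tokenizer.
dataset = load_dataset("lines.csv")
text_field = dataset.fields["text"]
print(len(dataset.examples), "examples,", len(text_field.vocab), "vocabulary entries")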
Example #3
def load_dataset(batch_size):
    spacy_de = spacy.load('de')
    spacy_en = spacy.load('en')
    url = re.compile('(<url>.*</url>)')

    def tokenize_de(text):
        return [tok.text for tok in spacy_de.tokenizer(url.sub('@URL@', text))]

    def tokenize_en(text):
        return [tok.text for tok in spacy_en.tokenizer(url.sub('@URL@', text))]

    DE = Field(tokenize=tokenize_de, include_lengths=True,
               init_token='<sos>', eos_token='<eos>')
    EN = Field(tokenize=tokenize_en, include_lengths=True,
               init_token='<sos>', eos_token='<eos>')
    train, val, test = Multi30k.splits(exts=('.de', '.en'), fields=(DE, EN))
    DE.build_vocab(train.src, min_freq=2)
    EN.build_vocab(train.trg, max_size=10000)
    train_iter, val_iter, test_iter = BucketIterator.splits(
            (train, val, test), batch_size=batch_size, repeat=False)
    return train_iter, val_iter, test_iter, DE, EN
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

url = re.compile('(<url>.*</url>)')

def tokenize_en(text):
	return [tok.text for tok in spacy_en.tokenizer(url.sub('@URL@',text))]

def tokenize_de(text):
	return [tok.text for tok in spacy_de.tokenizer(url.sub('@URL@',text))]

data_path = "/home/martin/Documents/Datasets"

#EN = Field(tokenize=tokenize_en,batch_first=True,init_token="<SOS>",eos_token="<EOS>")
#DE = Field(tokenize=tokenize_de,batch_first=True,init_token="<SOS>",eos_token="<EOS>")
EN = Field(tokenize="spacy",tokenizer_language="en",batch_first=True,init_token="<SOS>",eos_token="<EOS>")
DE = Field(tokenize="spacy",tokenizer_language="de",batch_first=True,init_token="<SOS>",eos_token="<EOS>")

# multi30k dataloader
train,val,test = datasets.Multi30k.splits(exts=(".en",".de"),fields=(EN,DE),root=data_path)

# wmt14 dataloader (better than using datasets.WMT14.splits since it's slow)
#train,val,test = datasets.TranslationDataset.splits(exts=(".en",".de"),fields=[("src",EN),("trg",DE)],path=os.path.join(data_path,"wmt14"),
#	train="train.tok.clean.bpe.32000",validation="newstest2013.tok.bpe.32000",test="newstest2014.tok.bpe.32000")

print("Dataset loaded")

EN.build_vocab(train.src,min_freq=3)
DE.build_vocab(train.trg,max_size=50000)

print("Vocabularies build")
    return epoch_per / len(devLoader)


###############################################################################
# Load data
###############################################################################
print('load dataset')
configfile = open('../config.yaml')
config = AttrDict(yaml.load(configfile, Loader=yaml.FullLoader))

trainSet = TIMIT(config.data.data_root, mode='train')
devSet = TIMIT(config.data.data_root, mode='test')

TEXT = Field(lower=True,
             include_lengths=True,
             batch_first=True,
             unk_token=None)
print('build vocab')
sents = [
    'iy', 'ix', 'eh', 'ae', 'ax', 'uw', 'uh', 'ao', 'ey', 'ay', 'oy', 'aw',
    'ow', 'er', 'l', 'r', 'w', 'y', 'm', 'n', 'ng', 'v', 'f', 'dh', 'th', 'z',
    's', 'zh', 'jh', 'ch', 'b', 'p', 'd', 'dx', 't', 'g', 'k', 'hh', 'h#'
]
sents = [[i] for i in sents]
TEXT.build_vocab(sents, specials=['<blank>'])
assert config.data.vocabSize == len(TEXT.vocab)
assert config.data.pad_idx == TEXT.vocab.stoi['<pad>']
assert config.data.blank_idx == TEXT.vocab.stoi['<blank>']


def my_collate(batch):
Example #6
def get_fields(src_data_type,
               n_src_feats,
               n_tgt_feats,
               pad='<blank>',
               bos='<s>',
               eos='</s>',
               dynamic_dict=False,
               src_truncate=None,
               tgt_truncate=None):
    """
    Args:
        src_data_type: type of the source input. Options are [text|img|audio].
        n_src_feats (int): the number of source features (not counting tokens)
            to create a :class:`torchtext.data.Field` for. (If
            ``src_data_type=="text"``, these fields are stored together
            as a ``TextMultiField``).
        n_tgt_feats (int): See above.
        pad (str): Special pad symbol. Used on src and tgt side.
        bos (str): Special beginning of sequence symbol. Only relevant
            for tgt.
        eos (str): Special end of sequence symbol. Only relevant
            for tgt.
        dynamic_dict (bool): Whether or not to include source map and
            alignment fields.
        src_truncate: Cut off src sequences beyond this (passed to
            ``src_data_type``'s data reader - see there for more details).
        tgt_truncate: Cut off tgt sequences beyond this (passed to
            :class:`TextDataReader` - see there for more details).

    Returns:
        A dict mapping names to fields. These names need to match
        the dataset example attributes.
    """

    assert src_data_type in ['text', 'img', 'audio', 'vec', 'keyphrase'], \
        "Data type not implemented"
    assert not dynamic_dict or src_data_type == 'text' or src_data_type == 'keyphrase', \
        'it is not possible to use dynamic_dict with non-text input'
    fields = {}

    fields_getters = {
        "text": text_fields,
        "img": image_fields,
        "audio": audio_fields,
        "vec": vec_fields,
        "keyphrase": text_fields
    }

    src_field_kwargs = {
        "n_feats": n_src_feats,
        "include_lengths": True,
        "pad": pad,
        "bos": None,
        "eos": None,
        "truncate": src_truncate,
        "base_name": "src"
    }
    fields["src"] = fields_getters[src_data_type](**src_field_kwargs)

    tgt_field_kwargs = {
        "n_feats": n_tgt_feats,
        "include_lengths": False,
        "pad": pad,
        "bos": bos,
        "eos": eos,
        "sep": keyphrase_dataset.SEP_token,
        "truncate": tgt_truncate,
        "base_name": "tgt"
    }
    # added by @memray, it might be smarter to add field_name to __init__ in the future
    if src_data_type == "keyphrase":
        fields['tgt'] = keyphrase_fields(**tgt_field_kwargs)
    else:
        fields['tgt'] = text_fields(**tgt_field_kwargs)

    indices = Field(use_vocab=False, dtype=torch.long, sequential=False)
    fields["indices"] = indices

    if dynamic_dict:
        src_map = Field(use_vocab=False,
                        dtype=torch.float,
                        postprocessing=make_src,
                        sequential=False)
        fields["src_map"] = src_map

        src_ex_vocab = RawField()
        fields["src_ex_vocab"] = src_ex_vocab

        align = Field(use_vocab=False,
                      dtype=torch.long,
                      postprocessing=make_tgt,
                      sequential=False)
        fields["alignment"] = align

    # added by @memray, load some other meta information of each data example for keyphrase dataset
    if src_data_type == 'keyphrase':
        id = Field(use_vocab=False, dtype=torch.long, sequential=False)
        fields["id"] = id

        # for Orthogonal Regularization and Semantic Coverage
        sep_indices = Field(use_vocab=False,
                            dtype=torch.long,
                            postprocessing=make_tgt,
                            sequential=False)
        fields["sep_indices"] = sep_indices

    return fields
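
# Hedged usage sketch: builds the field dict for plain-text src/tgt with copy attention
# (dynamic_dict=True); the helpers above (text_fields, make_src, make_tgt) must be
# importable from the surrounding OpenNMT-style project.
fields = get_fields("text", n_src_feats=0, n_tgt_feats=0, dynamic_dict=True)
print(sorted(fields.keys()))
# expected keys: ['alignment', 'indices', 'src', 'src_ex_vocab', 'src_map', 'tgt']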
Example #7
def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings (tokens) and reverses it
    """
    return [tok.text for tok in spacy_en.tokenizer(text)][::-1]


def tokenize_hi(text):
    """
    Tokenizes Hindi text from a string into a list of strings (tokens) 
    """
    return text.split()
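
# Quick illustration of the two tokenizers (hedged example; spacy_en must already be loaded):
print(tokenize_en("machine translation is fun"))   # ['fun', 'is', 'translation', 'machine']
print(tokenize_hi("यह एक उदाहरण है"))               # ['यह', 'एक', 'उदाहरण', 'है']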


SRC = Field(tokenize=tokenize_en,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True)

TRG = Field(tokenize=tokenize_hi,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True)

train_data, valid_data, test_data = TranslationDataset.splits(
    path='IITB_small',
    validation='dev',
    exts=('.en', '.hi'),
    fields=(SRC, TRG))

print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
    #     (train_data, valid_data, test_data),
    #     batch_size=BATCH_SIZE,
    #     sort_within_batch=True,
    #     sort_key=lambda x: len(x.src),
    #     device=device)

    #######################################
    ####### test with invert ##############
    #######################################
    SRC_TRN_PATH, TRG_TRN_PATH = 'toy-revert/src-train.txt', 'toy-revert/tgt-train.txt'
    SRC_VAL_PATH, TRG_VAL_PATH = 'toy-revert/src-val.txt', 'toy-revert/tgt-val.txt'
    SRC_TEST_PATH, TRG_TEST_PATH = 'toy-revert/src-test.txt', 'toy-revert/tgt-test.txt'

    TEXT = Field(tokenize="spacy",
                 init_token='<sos>',
                 eos_token='<eos>',
                 include_lengths=True,
                 lower=True)

    TRG_TEXT = Field(tokenize="spacy",
                     init_token='<sos>',
                     eos_token='<eos>',
                     lower=True)

    from_txt_to_dataframe_and_csv('toy-revert', 'src-train.txt',
                                  'tgt-train.txt', 'train')
    from_txt_to_dataframe_and_csv('toy-revert', 'src-val.txt', 'tgt-val.txt',
                                  'val')
    from_txt_to_dataframe_and_csv('toy-revert', 'src-test.txt', 'tgt-test.txt',
                                  'test')
Example #9
from torchtext.data import Field
import spacy # for tokenizer

spacy_en = spacy.load('en_core_web_sm')
spacy_de = spacy.load('de_core_news_sm')

def tokenize_en(text):
    return [token.text for token in spacy_en.tokenizer(text)]

def tokenize_de(text):
    return [token.text for token in spacy_de.tokenizer(text)]

# load data
SRC = Field(tokenize=tokenize_en,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True,
            batch_first=True)

TRG = Field(tokenize=tokenize_de,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True,
            batch_first=True)

train, valid, test = torchtext.datasets.WMT14.splits(exts=('.en', '.de'),
                                                     fields=(SRC, TRG))
length = len(train.examples)
src_sentences = []
trg_sentences = []
for i in range(length):
Example #10
class SequenceDataLoader(CommonDataLoader):

    def __init__(self, data_config):
        super(SequenceDataLoader, self).__init__(data_config)
        self.__build_field()
        self._load_data()

        pass

    def __build_field(self):
        self.TEXT = Field(sequential=True, use_vocab=True, tokenize=tokenizer, include_lengths=True)
        self.TAG = Field(sequential=True, use_vocab=True, tokenize=tokenizer, is_target=True)
        self._fields = [
            ('text', self.TEXT), ('tag', self.TAG)
        ]
        self._fields_test = [('text', self.TEXT)]
        pass

    @timeit
    def _load_data(self):
        self.train_data = REDataset(path=self._config.data.chip_relation.train_path, fields=self._fields)
        self.valid_data = REDataset(path=self._config.data.chip_relation.valid_path, fields=self._fields)
        self.test_data = REDataset(path=self._config.data.chip_relation.test_path, fields=self._fields_test)
        self.__build_vocab(self.train_data, self.valid_data, self.test_data)
        self.__build_iterator(self.train_data, self.valid_data, self.test_data)
        pass

    def __build_vocab(self, *dataset):
        """
        :param dataset: train_data, valid_data, test_data
        :return: text_vocab, tag_vocab
        """
        self.TEXT.build_vocab(*dataset)
        self.TAG.build_vocab(*dataset[:-1])
        self.word_vocab = self.TEXT.vocab
        self.tag_vocab = self.TAG.vocab
        pass

    def __build_iterator(self, *dataset):
        self._train_iter = BucketIterator(
            dataset[0], batch_size=self._config.data.train_batch_size, shuffle=True,
            sort_key=lambda x: len(x.text), sort_within_batch=True, device=self._config.device)

        self._valid_iter = BucketIterator(
            dataset[1], batch_size=self._config.data.train_batch_size, shuffle=False,
            sort_key=lambda x: len(x.text), sort_within_batch=True, device=self._config.device)

        self._test_iter = BucketIterator(
            dataset[2], batch_size=self._config.data.train_batch_size, shuffle=False,
            sort_key=lambda x: len(x.text), sort_within_batch=True, device=self._config.device)

    def load_train(self):
        return self._train_iter
        pass

    def load_test(self):
        return self._test_iter
        pass

    def load_valid(self):
        return self._valid_iter
        pass
Example #11
def main(opt):
    ArgumentParser.validate_train_opts(opt)
    ArgumentParser.update_model_opts(opt)
    ArgumentParser.validate_model_opts(opt)

    # Load checkpoint if we resume from a previous training.
    if opt.train_from:
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)
        logger.info('Loading vocab from checkpoint at %s.' % opt.train_from)
        vocab = checkpoint['vocab']
    else:
        vocab = torch.load(opt.data + '.vocab.pt')

    # check for code where vocab is saved instead of fields
    # (in the future this will be done in a smarter way)
    if old_style_vocab(vocab):
        fields = load_old_vocab(
            vocab, opt.model_type, dynamic_dict=opt.copy_attn)
    else:
        fields = vocab

    # @memray: a temporary workaround, as well as train_single.py line 78
    if opt.model_type == "keyphrase":
        if opt.tgt_type in ["one2one", "multiple"]:
            del fields['sep_indices']
        else:
            if 'sep_indices' not in fields:
                sep_indices = Field(
                    use_vocab=False, dtype=torch.long,
                    postprocessing=make_tgt, sequential=False)
                fields["sep_indices"] = sep_indices
        if 'src_ex_vocab' not in fields:
            src_ex_vocab = RawField()
            fields["src_ex_vocab"] = src_ex_vocab

    if len(opt.data_ids) > 1:
        train_shards = []
        for train_id in opt.data_ids:
            shard_base = "train_" + train_id
            train_shards.append(shard_base)
        train_iter = build_dataset_iter_multiple(train_shards, fields, opt)
    else:
        if opt.data_ids[0] is not None:
            shard_base = "train_" + opt.data_ids[0]
        else:
            shard_base = "train"
        train_iter = build_dataset_iter(shard_base, fields, opt)

    nb_gpu = len(opt.gpu_ranks)
    print(os.environ['PATH'])

    if opt.world_size > 1:
        queues = []
        mp = torch.multiprocessing.get_context('spawn')
        semaphore = mp.Semaphore(opt.world_size * opt.queue_size)
        # Create a thread to listen for errors in the child processes.
        error_queue = mp.SimpleQueue()
        error_handler = ErrorHandler(error_queue)
        # Train with multiprocessing.
        procs = []
        for device_id in range(nb_gpu):
            q = mp.Queue(opt.queue_size)
            queues += [q]
            procs.append(mp.Process(target=run, args=(
                opt, device_id, error_queue, q, semaphore), daemon=True))
            procs[device_id].start()
            logger.info(" Starting process pid: %d  " % procs[device_id].pid)
            error_handler.add_child(procs[device_id].pid)
        producer = mp.Process(target=batch_producer,
                              args=(train_iter, queues, semaphore, opt,),
                              daemon=True)
        producer.start()
        error_handler.add_child(producer.pid)

        for p in procs:
            p.join()
        producer.terminate()

    elif nb_gpu == 1:  # case 1 GPU only
        single_main(opt, 0)
    else:   # case only CPU
        single_main(opt, -1)
Example #12
class ApacheDiffTokenHierarchical(ApacheDiffToken):
    NESTING_FIELD = Field(batch_first=True, tokenize=split_string)
    CODE_FIELD = NestedField(NESTING_FIELD, tokenize=split_json)
Example #13
class ApacheDiffToken(TabularDataset):
    NAME = 'ApacheDiffToken'
    NUM_CLASSES = 3
    IS_MULTILABEL = False

    REPO_FIELD = Field(sequential=False,
                       use_vocab=False,
                       batch_first=True,
                       preprocessing=remove_field)
    SHA_FIELD = Field(sequential=False,
                      use_vocab=False,
                      batch_first=True,
                      preprocessing=remove_field)
    CODE_FIELD = Field(batch_first=True,
                       tokenize=split_json_string,
                       include_lengths=True)
    LABEL_FIELD = Field(sequential=False,
                        use_vocab=False,
                        batch_first=True,
                        preprocessing=process_labels)

    @staticmethod
    def sort_key(ex):
        return len(ex.code)

    @classmethod
    def splits(cls,
               path,
               train=os.path.join('apache_diff_token', 'train_bal.tsv'),
               validation=os.path.join('apache_diff_token', 'dev_bal.tsv'),
               test=os.path.join('apache_diff_token', 'test_bal.tsv'),
               **kwargs):
        return super(ApacheDiffToken,
                     cls).splits(path,
                                 train=train,
                                 validation=validation,
                                 test=test,
                                 format='tsv',
                                 fields=[('repo', cls.REPO_FIELD),
                                         ('sha', cls.SHA_FIELD),
                                         ('code', cls.CODE_FIELD),
                                         ('label', cls.LABEL_FIELD)])

    @classmethod
    def iters(cls,
              path,
              vectors_name,
              vectors_cache,
              batch_size=64,
              shuffle=True,
              device=0,
              vectors=None,
              unk_init=torch.Tensor.zero_):
        """
        :param path: directory containing train, test, dev files
        :param vectors_name: name of word vectors file
        :param vectors_cache: path to directory containing word vectors file
        :param batch_size: batch size
        :param device: GPU device
        :param vectors: custom vectors - either predefined torchtext vectors or your own custom Vector classes
        :param unk_init: function used to generate vector for OOV words
        :return:
        """
        if vectors is None:
            vectors = Vectors(name=vectors_name,
                              cache=vectors_cache,
                              unk_init=unk_init)

        train, val, test = cls.splits(path)
        cls.CODE_FIELD.build_vocab(train, val, test, vectors=vectors)
        return BucketIterator.splits((train, val, test),
                                     batch_size=batch_size,
                                     repeat=False,
                                     shuffle=shuffle,
                                     sort_within_batch=True,
                                     device=device)
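
# Hedged usage sketch: the data directory and vector file names below are placeholders.
train_iter, dev_iter, test_iter = ApacheDiffToken.iters(
    path="data",
    vectors_name="glove.840B.300d.txt",
    vectors_cache=".vector_cache",
    batch_size=32,
    device="cuda:0")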
Example #14
    def __init__(self, config):
        # logger
        self.logger = logging.getLogger(config["name"])

        # data loader params
        self.config = config["data_loader"]["args"]

        data_path = self.config["data_path"]
        self.data_path = data_path
        ensure_dir(data_path)
        self.train_path = os.path.join(data_path, self.config["train_file"])
        self.valid_path = os.path.join(data_path, self.config["valid_file"])
        self.test_path = os.path.join(data_path, self.config["test_file"])

        # limit max text length
        self.context_threshold = self.config["context_threshold"]

        self.logger.info("preprocessing data files...")
        if not os.path.exists(self.train_path) or not os.path.exists(
                self.valid_path):
            self.preprocess(type="train")
        if not os.path.exists(self.test_path):
            self.preprocess(type="test")

        # define field
        TEXT = Field(sequential=True,
                     use_vocab=True,
                     tokenize=lambda x: x,
                     lower=True,
                     include_lengths=True,
                     batch_first=True)
        LABEL = LabelField(sequential=False, use_vocab=False)

        # build dataset
        self.logger.info("building dataset......")

        train_dict_fields = {'text': ('text', TEXT), 'label': ('label', LABEL)}

        self.train, self.valid, self.test = TabularDataset.splits(
            path=data_path,  # data root path
            format="json",
            train=self.config["train_file"],
            validation=self.config["valid_file"],
            test=self.config["test_file"],
            fields=train_dict_fields)

        # build vocab
        self.logger.info("building vocab....")
        TEXT.build_vocab(self.train, self.valid, self.test)

        # load pretrained embeddings
        self.logger.info("load pretrained embeddings...")
        Vectors = vocab.Vectors(self.config["pretrain_emd_file"])
        TEXT.vocab.load_vectors(Vectors)
        # keep a handle on the vocab for convenient access
        self.vocab = TEXT.vocab

        # build iterators
        self.logger.info("building iterators.....")
        self.train_iter, self.valid_iter = BucketIterator.splits(
            (self.train, self.valid),
            batch_sizes=(self.config["train_batch_size"],
                         self.config["valid_batch_size"]),
            device=self.config["device"],
            sort_key=lambda x: len(x.text),
            sort_within_batch=False)
        self.test_iter = BucketIterator(
            self.test,
            batch_size=self.config["test_batch_size"],
            device=self.config["device"],
            sort_key=lambda x: len(x.text),
            sort=False,
            sort_within_batch=False)
        self.logger.info("building iterators done!")
        self.logger.info(
            "Total train data set is: {}, valid data set is: {}, test "
            "data is: {}".format(len(self.train), len(self.valid),
                                 len(self.test)))
Example #15
def getData():
    # filename = '.data/multi30k/train.en'
    # generate_sp_model(filename, vocab_size=8000, model_type='bpe', model_prefix='zaid_sp_model')
    # #s = spm.SentencePieceProcessor(model_file='zaid_sp_model.model')
    # print(vars(s))
    # print(dir(s))
    # print(s.vocab_size())
    # print(s.bos_id())#exit()
    # print(s.eos_id())
    # print(s.unk_id())
    # print(s.pad_id())

    #exit()
    sp_gec = load_sp_model("BPE/GCEBPE30k.model")
    #sp_gec = load_sp_model("zaid_sp_model.model")
    # sp_gec =s
    # print(dir(sp_gec))
    # print(vars(sp_gec))
    #exit()
    src_pad_idx = sp_gec.pad_id()  #english_vocab.stoi["<pad>"]
    print("pad_index = ", src_pad_idx)
    #    print("pad = ", sp_gec.decode(src_pad_idx))
    #exit()

    # print("print(len(sp_gec)) 1", len(sp_gec))
    # print(vars(sp_gec))
    # print(dir(sp_gec))
    #exit()

    bpe_field = Field(use_vocab=False,
                      tokenize=sp_gec.encode,
                      init_token=sp_gec.bos_id(),
                      eos_token=sp_gec.eos_id(),
                      pad_token=sp_gec.pad_id(),
                      unk_token=sp_gec.unk_id(),
                      batch_first=True)

    print("-----------------------------------------------")
    #print(TabularDataset.splits.__doc__)
    #tv_datafields = [("ignore", bpe_field), ("trg", bpe_field), ("src", bpe_field)]
    # train_data, valid_data, test_data = TabularDataset.splits(path = "/data/chaudhryz/ankit", train = "test10k.tsv",
    #                                         validation="test10k.tsv", test = "test10k.tsv", format='tsv', skip_header=False, fields = tv_datafields)

    tv_datafields = [("trg", bpe_field), ("src", bpe_field)]
    train_data, valid_data, test_data = TabularDataset.splits(
        path=".data/multi30k",
        train="train.tsv",
        validation="val.tsv",
        test="test2016.tsv",
        format='tsv',
        skip_header=False,
        fields=tv_datafields)

    # train_data, valid_data, test_data = Multi30k.splits(
    #     exts=(".ennsw", ".en"), fields=tv_datafields,
    #     train='train',
    #     validation='val',
    #     test='test2016',
    #     path = '.data/multi30k'
    # )
    print(train_data)

    return sp_gec, train_data, valid_data, test_data
Example #16
    train, valid, test = data.SemevalDataset.splits(
        TEXT,
        ASPECT,
        SENTIMENT,
        flat=args.flat_data,
        path=args.filepath,
        train="acsa_train.json.train",
        validation="acsa_train.json.valid",
        test="acsa_test.json",
    )
    data.build_vocab(TEXT, ASPECT, SENTIMENT, train, valid, test)
elif args.data == "sst":
    TEXT, SENTIMENT = (
        Field(tokenize="spacy",
              lower=True,
              include_lengths=True,
              batch_first=True,
              init_token="<bos>",
              eos_token="<eos>"),
        Field(
            lower=True,
            is_target=True,
            unk_token=None,
            pad_token=None,
            batch_first=True,
        ),
    )
    train, valid, test = torchtext.datasets.SST.splits(
        TEXT,
        SENTIMENT,
        fine_grained=args.fine_grained,
        train_subtrees=args.train_subtrees,
Example #17
    pattern = re.compile(r"[ \n\t]+")
    text = pattern.sub(" ", text)
    text = "".join("".join(s)[:2] for _, s in itertools.groupby(text))
    text = re.sub(r'[^A-Za-z0-9,?.!]+', ' ', text)
    return text.strip()


nlp = spacy.load('en', disable=['parser', 'tagger', 'ner'])


def tokenizer(s):
    return [w.text.lower() for w in nlp(tweet_clean(s))]


TEXT = Field(sequential=True,
             tokenize=tokenizer,
             include_lengths=True,
             use_vocab=True)
TARGET = Field(sequential=False,
               use_vocab=False,
               pad_token=None,
               unk_token=None,
               is_target=False)

data_fields = [(None, None), ("tweet", TEXT), ("target", TARGET)]


def split_train_test(df, test_size=0.2):
    train, val = train_test_split(df, test_size=test_size, random_state=42)
    return train.reset_index(drop=True), val.reset_index(drop=True)

Example #18

# Tokenizers
def tokenize_eng(text):
    return [tok for tok in eng_tokenizer.encode(text).tokens]


def tokenize_lit(text):
    return [tok for tok in lt_tokenizer.encode(text).tokens]


# Create Fields
english = Field(
    sequential=True,
    use_vocab=True,
    tokenize=tokenize_eng,
    lower=True,
    init_token="<sos>",
    eos_token="<eos>",
)
lithuanian = Field(
    sequential=True,
    use_vocab=True,
    tokenize=tokenize_lit,
    lower=True,
    init_token="<sos>",
    eos_token="<eos>",
)

fields = {"Lithuanian": ("src", lithuanian), "English": ("trg", english)}

# Convert into Tabular Dataset
Example #19
def main():


    data_dir = "/home/donchan/Documents/DATA/jigsaw"

    start_t = time()
    
    vec = vocab.Vectors('glove.6B.100d.txt', '/home/donchan/Documents/DATA/glove_embedding/')

    TEXT = Field(sequential=True, tokenize=tokenizer2, lower=True)
    LABEL = Field(sequential=False, use_vocab=False)

    datafields = [("id", None), # we won't be needing the id, so we pass in None as the field
                 ("comment_text", TEXT), ("toxic", LABEL),
                 ("severe_toxic", LABEL), 
                 ("obscene", LABEL), ("threat", LABEL), ("insult", LABEL),
                 ("identity_hate", LABEL)]

    train, val = TabularDataset.splits(path=data_dir, train='traindf.csv', 
        validation='valdf.csv', format='csv', skip_header=True, fields=datafields)

    print("train val length", len(train), len(val))
    #print( train[0].comment_text )
    #print( train[0].toxic, train[0].severe_toxic, train[0].threat, train[0].insult, train[0].identity_hate  )

    TEXT.build_vocab(train, val, vectors=vec, min_freq=2 )
    #LABEL.build_vocab(train, val)

    print("time to build vocab", (time() - start_t))
    print("length of vocaburary", len(TEXT.vocab), TEXT.vocab.vectors.shape )

    print("- "*20 )
    print("* most common words.")
    print( TEXT.vocab.freqs.most_common(20) )

    train_iter, val_iter = BucketIterator.splits(
        (train, val), # we pass in the datasets we want the iterator to draw data from
        batch_sizes=(batch_size,batch_size),
        device=torch.device("cuda"), # if you want to use the GPU, specify the GPU number here
        sort_key=lambda x: len(x.comment_text), # the BucketIterator needs to be told what function it should use to group the data.
        sort_within_batch=False,
        repeat=False # repeat=False because we wrap this iterator ourselves (see BatchWrapper below)
    )

    train_dl = BatchWrapper(train_iter, "comment_text", ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"])
    valid_dl = BatchWrapper(val_iter, "comment_text", ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"])

    x,y = next( iter( train_dl ) )

    em_sz = 100
    nh = 500
    nl = 3

    model_file = os.path.join(data_dir, "jigsaw_model_7978.pkl")
    model = SimpleBiLSTMBaseline( hidden_dim=nh,emb_dim=em_sz,len_TEXT_vocab=len(TEXT.vocab), v_vec=TEXT.vocab.vectors )

    if os.path.isfile( model_file  ):
        print("model file found.", )
        model.load_state_dict( torch.load( model_file ) )
        #model = dill.load(open(model_file,"rb")) 
        #model = torch
    model.cuda()

    opt = optim.Adam(model.parameters(), lr=1e-4)
    loss_func = nn.BCEWithLogitsLoss()

    epochs = 10
    for epoch in range(1, epochs + 1):
        
        running_loss = 0.0
        #running_corrects = 0
        model.train() # turn on training mode
        
        for idx, (x, y) in enumerate( tqdm.tqdm( train_dl ) ): # thanks to our wrapper, we can intuitively iterate over our data!
            opt.zero_grad()

            preds = model(x)
            loss = loss_func(preds, y)
            loss.backward()
            opt.step()
            
            #if idx % 100 == 0:
            #    print("- "*20)
            #    print("step",idx)
            #    print("preds", preds)
            #    print("loss %.5f" % loss.item())

            running_loss += loss.item() * x.size(0)
            
        epoch_loss = running_loss / len(train)
        
        # calculate the validation loss for this epoch
        val_loss = 0.0
        accs = []
        model.eval() # turn on evaluation mode
        for x, y in valid_dl:
            preds = model(x)
            loss = loss_func(preds, y)
            val_loss += loss.item() * x.size(0)

            logits = preds.cpu().data.numpy()
            logits = 1. / (1. + np.exp( -logits ))
            z = np.zeros_like(logits)
            z[ logits > .5 ] = 1
            y_num = y.cpu().data.numpy()
            acc = (z == y_num).sum() / ( y_num.shape[0] * y_num.shape[1] )
            accs.append( acc )

        val_loss /= len(val)

        print("mean accuracy",   np.mean( accs )  )
        print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch, epoch_loss, val_loss))

        #dill.dump(model, open("jigsaw_model.pkl","wb"))
        torch.save(model.state_dict(), os.path.join(data_dir, "jigsaw_model_%d.pkl" % epoch ) )
Example #20
    stopwords = stopwordslist('/root/news/stopword.txt')  # path to the stopword file
    return [word for word in jieba.cut(text)
            if word.strip() not in stopwords]  # segment Chinese text with jieba and drop stopwords


# load the stopword lexicon
def stopwordslist(filepath):
    stopwords = [
        line.strip()
        for line in open(filepath, 'r', encoding='utf-8').readlines()
    ]
    return stopwords


# the Field class handles data preprocessing
TEXT = Field(sequential=True, tokenize=tokenizer,
             fix_length=200)  # uses the tokenizer function defined above
LABEL = Field(sequential=False)

tv_datafields = [("text", TEXT), ("label", LABEL)]

# In[2]:

import re
import os


def CreateDataSet(root):
    # define the regular expressions
    patternUrl = re.compile(r'<url>(.*?)</url>', re.S)
    patternContent = re.compile(r'<content>(.*?)</content>', re.S)
    contents_list = []
Example #21
def get_fields(src_data_type,
               n_src_feats,
               n_tgt_feats,
               pad='<blank>',
               eos='</s>',
               bos='<s>',
               dynamic_dict=False,
               with_align=False,
               src_truncate=None,
               tgt_truncate=None):
    """
    Args:
        src_data_type: type of the source input. Options are [text|img|audio].
        n_src_feats (int): the number of source features (not counting tokens)
            to create a :class:`torchtext.data.Field` for. (If
            ``src_data_type=="text"``, these fields are stored together
            as a ``TextMultiField``).
        n_tgt_feats (int): See above.
        pad (str): Special pad symbol. Used on src and tgt side.
        bos (str): Special beginning of sequence symbol. Only relevant
            for tgt.
        eos (str): Special end of sequence symbol. Only relevant
            for tgt.
        dynamic_dict (bool): Whether or not to include source map and
            alignment fields.
        with_align (bool): Whether or not to include word align.
        src_truncate: Cut off src sequences beyond this (passed to
            ``src_data_type``'s data reader - see there for more details).
        tgt_truncate: Cut off tgt sequences beyond this (passed to
            :class:`TextDataReader` - see there for more details).
    Returns:
        A dict mapping names to fields. These names need to match
        the dataset example attributes.
    """

    assert src_data_type in ['text', 'img', 'audio', 'vec'], \
        "Data type not implemented"
    assert not dynamic_dict or src_data_type == 'text', \
        'it is not possible to use dynamic_dict with non-text input'
    fields = {}

    fields_getters = {
        "text": text_fields,
        "img": image_fields,
        "audio": audio_fields,
        "vec": vec_fields
    }

    src_field_kwargs = {
        "n_feats": n_src_feats,
        "include_lengths": True,
        "pad": pad,
        "bos": None,
        "eos": None,
        "truncate": src_truncate,
        "base_name": "src"
    }
    fields["src"] = fields_getters[src_data_type](**src_field_kwargs)

    tgt_field_kwargs = {
        "n_feats": n_tgt_feats,
        "include_lengths": True,
        "pad": pad,
        "bos": bos,
        "eos": eos,
        "truncate": tgt_truncate,
        "base_name": "tgt"
    }
    fields["tgt"] = fields_getters["text"](**tgt_field_kwargs)

    indices = Field(use_vocab=False, dtype=torch.long, sequential=False)
    fields["indices"] = indices

    corpus_ids = Field(use_vocab=True, sequential=False)
    fields["corpus_id"] = corpus_ids

    if dynamic_dict:
        src_map = Field(use_vocab=False,
                        dtype=torch.float,
                        postprocessing=make_src,
                        sequential=False)
        fields["src_map"] = src_map

        src_ex_vocab = RawField()
        fields["src_ex_vocab"] = src_ex_vocab

        align = Field(use_vocab=False,
                      dtype=torch.long,
                      postprocessing=make_tgt,
                      sequential=False)
        fields["alignment"] = align

    if with_align:
        word_align = AlignField()
        fields["align"] = word_align

    return fields
Example #22
    from typing import Tuple
    from torch import Tensor
    from torchtext.datasets import Multi30k
    from torchtext.data import Field, BucketIterator

    from utils.config import DEVICE
    from utils.utils import caculate_accuracy

except ImportError as e:
    print(e)
    raise ImportError

# ======================= prepare data ======================= #
SRC = Field(tokenize='spacy',
            tokenizer_language='de_core_news_sm',
            init_token='<sos>',
            eos_token='<eos>',
            lower=True)
TRG = Field(tokenize='spacy',
            tokenizer_language='en_core_web_sm',
            init_token='<sos>',
            eos_token='<eos>',
            lower=True)

train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'),
                                                    fields=(SRC, TRG))

SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

BATCH_SIZE = 128
Example #23
def build_dataset_and_vocab(sentences: List[str]):
    """
    Define source and target fields, iterate over the list of sentences to
    create list of Examples, and return:
        - training and validation dataset (split 90-10%)
        - source and target fields with Vocab object
    """
    # Minimum and maximum length for sentences to be included in the dataset
    min_length, max_length = 4, 10

    # Define source and target fields
    bos_word = '<s>'
    eos_word = '</s>'
    pad_word = '<pad>'
    src_field = Field(tokenize=tokenize_en, pad_token=pad_word, lower=True)
    tgt_field = Field(tokenize=tokenize_en,
                      init_token=bos_word,
                      eos_token=eos_word,
                      pad_token=pad_word,
                      lower=True)

    # Create list of Examples from the list of sentences
    examples = []
    sent_count = 0
    for sentence in sentences:
        sentence_split = sentence.split(' ')
        sentence_length = len(sentence_split)

        if sentence_length <= min_length or sentence_length >= max_length:
            continue
        sent_count += 1

        # If sent length is less than 8
        if sentence_length <= min_length + 4:
            # Src length is 3
            src_length = min_length - 1
        else:
            # Src length is 5
            src_length = min_length + 1

        for i in range(0, sentence_length - src_length, src_length):
            src = ' '.join(sentence_split[i:i + src_length])
            tgt = ' '.join(sentence_split[i + src_length:])

            example = Example.fromlist(data=[src, tgt],
                                       fields=[('src', src_field),
                                               ('tgt', tgt_field)])
            examples.append(example)

    print(
        f'Total {sent_count} sentences processed into {len(examples)} examples.'
    )
    train_dataset, valid_dataset = Dataset(examples=examples,
                                           fields=[
                                               ('src', src_field),
                                               ('tgt', tgt_field)
                                           ]).split(split_ratio=[0.9, 0.1])

    # Set the minimum frequency needed to include a token in the vocabulary
    min_freq = 2
    src_field.build_vocab(train_dataset, min_freq=min_freq)
    tgt_field.build_vocab(train_dataset, min_freq=min_freq)

    return train_dataset, valid_dataset, src_field, tgt_field
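
# Hedged usage sketch: `sentences` is any list of raw strings; tokenize_en must be in scope.
sentences = ["the quick brown fox jumps over the lazy dog"] * 20
train_ds, valid_ds, src_field, tgt_field = build_dataset_and_vocab(sentences)
print(len(train_ds), "train /", len(valid_ds), "valid examples,",
      len(src_field.vocab), "src vocab entries")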
Example #24
from torchtext.data import Field,BucketIterator
import spacy
import random
import torch.optim as opt
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

eng=spacy.load('en')
ger=spacy.load('de_core_news_sm')

def Tokenize_eng(text):
  return [a.text for a in eng.tokenizer(text)]
def Tokenize_german(text):
  return [b.text for b in ger.tokenizer(text)]

german=Field(tokenize=Tokenize_german,lower=True,init_token='<sos>',eos_token='<eos>')
english=Field(tokenize=Tokenize_eng,lower=True,init_token='<sos>',eos_token='<eos>')

Train,Val,Test=Multi30k.splits(exts=('.de','.en'),fields=(german,english))

german.build_vocab(Train,max_size=10000,min_freq=2)
english.build_vocab(Train,max_size=10000,min_freq=2)

##building encoder
class Encode(Module):
  def __init__(self,inp_size,emd_size,hidden_size):
    super(Encode,self).__init__()
    self.inp_size=inp_size
    self.emd_size=emd_size
    self.hidden_size=hidden_size
    self.embed=Embedding(self.inp_size,self.emd_size)
spacy_ger = spacy.load("de_core_news_sm")
spacy_eng = spacy.load("en_core_web_sm")


def tokenizer_ger(text):
    return [tok.text for tok in spacy_ger.tokenizer(text)
            ]  #'hello my name is' -> ['hello', 'my', 'name', 'is']


def tokenizer_eng(text):
    return [tok.text for tok in spacy_eng.tokenizer(text)
            ]  #'hello my name is' -> ['hello', 'my', 'name', 'is']


german = Field(tokenize=tokenizer_ger,
               lower=True,
               init_token="<sos>",
               eos_token="<eos>")

english = Field(tokenize=tokenizer_eng,
                lower=True,
                init_token="<sos>",
                eos_token="<eos>")

train_data, validation_data, test_data = Multi30k.splits(exts=(".de", ".en"),
                                                         fields=(german,
                                                                 english))

german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)

Example #26
    return [tok.text for tok in spacy_de.tokenizer(text)
            ][::-1]  # list[::-1] used to reverse the list


def tokenize_en(text):
    # tokenizes the english text into a list of strings(tokens)
    return [tok.text for tok in spacy_en.tokenizer(text)]


# torchtext's Field handle how the data should be processed. For more refer: https://github.com/pytorch/text

# use the tokenize_de, tokenize_en for tokenization of german and english sentences.
# German is the src, English is the trg
# append the <sos> (start of sentence), <eos> (end of sentence) tokens to all sentences.
SRC = Field(tokenize=tokenize_de,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True)
TRG = Field(tokenize=tokenize_en,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True)

# we will be using Multi30k dataset. This is a dataset with ~30K parallel English, German, French sentences.

# exts specifies which languages to use as source and target. source goes first
# fields define which data processing to apply for source and target
train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'),
                                                    fields=(SRC, TRG))
print('Loaded data...')

# build the vocab
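# The snippet breaks off here; a hedged continuation that follows the same pattern as the
# other Multi30k examples in this collection (BucketIterator import assumed):
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size=128)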
Example #27
def translate(cfg_file, ckpt: str, output_path: str = None) -> None:
    """
    Interactive translation function.
    Loads model from checkpoint and translates either the stdin input or
    asks for input to translate interactively.
    The input has to be pre-processed according to the data that the model
    was trained on, i.e. tokenized or split into subwords.
    Translations are printed to stdout.

    :param cfg_file: path to configuration file
    :param ckpt: path to checkpoint to load
    :param output_path: path to output file
    """
    def _load_line_as_data(line):
        """ Create a dataset from one line via a temporary file. """
        # write src input to temporary file
        tmp_name = "tmp"
        tmp_suffix = ".src"
        tmp_filename = tmp_name + tmp_suffix
        with open(tmp_filename, "w") as tmp_file:
            tmp_file.write("{}\n".format(line))

        test_data = MonoDataset(path=tmp_name, ext=tmp_suffix, field=src_field)

        # remove temporary file
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)

        return test_data

    cfg = load_config(cfg_file)
    speech_mode = cfg.get("speech", True)
    if speech_mode:
        raise NotImplementedError(
            "Translation mode isn't implemented for speech processing yet.")

    logger = make_logger()

    def _translate_data(test_data):
        """ Translates given dataset, using parameters from outer scope. """
        # pylint: disable=unused-variable
        score, loss, ppl, sources, sources_raw, references, hypotheses, \
            hypotheses_raw, attention_scores = validate_on_data(
                model, data=test_data, batch_size=batch_size,
                batch_type=batch_type, level=level,
                max_output_length=max_output_length, eval_metric="",
                use_cuda=use_cuda, loss_function=None, beam_size=beam_size,
                beam_alpha=beam_alpha, logger=logger)
        return hypotheses

    # when checkpoint is not specified, take oldest from model dir
    if ckpt is None:
        model_dir = cfg["training"]["model_dir"]
        ckpt = get_latest_checkpoint(model_dir)

    batch_size = cfg["training"].get("eval_batch_size",
                                     cfg["training"].get("batch_size", 1))
    batch_type = cfg["training"].get(
        "eval_batch_type", cfg["training"].get("batch_type", "sentence"))
    use_cuda = cfg["training"].get("use_cuda", False)
    level = cfg["data"]["level"]
    max_output_length = cfg["training"].get("max_output_length", None)

    # read vocabs
    src_vocab_file = cfg["data"].get(
        "src_vocab", cfg["training"]["model_dir"] + "/src_vocab.txt")
    trg_vocab_file = cfg["data"].get(
        "trg_vocab", cfg["training"]["model_dir"] + "/trg_vocab.txt")
    src_vocab = Vocabulary(file=src_vocab_file)
    trg_vocab = Vocabulary(file=trg_vocab_file)

    data_cfg = cfg["data"]
    level = data_cfg["level"]
    lowercase = data_cfg["lowercase"]

    def tok_fun(s):
        return list(s) if level == "char" else s.split()

    src_field = Field(init_token=None,
                      eos_token=EOS_TOKEN,
                      pad_token=PAD_TOKEN,
                      tokenize=tok_fun,
                      batch_first=True,
                      lower=lowercase,
                      unk_token=UNK_TOKEN,
                      include_lengths=True)
    src_field.vocab = src_vocab

    # load model state from disk
    model_checkpoint = load_checkpoint(ckpt, use_cuda=use_cuda)

    # build model and load parameters into it
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)
    model.load_state_dict(model_checkpoint["model_state"])

    if use_cuda:
        model.cuda()

    # whether to use beam search for decoding, <2: greedy decoding
    if "testing" in cfg.keys():
        beam_size = cfg["testing"].get("beam_size", 1)
        beam_alpha = cfg["testing"].get("alpha", -1)
    else:
        beam_size = 1
        beam_alpha = -1

    if not sys.stdin.isatty():
        # input file given
        test_data = MonoDataset(path=sys.stdin, ext="", field=src_field)
        hypotheses = _translate_data(test_data)

        if output_path is not None:
            # write to outputfile if given
            output_path_set = "{}".format(output_path)
            with open(output_path_set, mode="w", encoding="utf-8") as out_file:
                for hyp in hypotheses:
                    out_file.write(hyp + "\n")
            logger.info("Translations saved to: %s.", output_path_set)
        else:
            # print to stdout
            for hyp in hypotheses:
                print(hyp)

    else:
        # enter interactive mode
        batch_size = 1
        batch_type = "sentence"
        while True:
            try:
                src_input = input("\nPlease enter a source sentence "
                                  "(pre-processed): \n")
                if not src_input.strip():
                    break

                # every line has to be made into dataset
                test_data = _load_line_as_data(line=src_input)

                hypotheses = _translate_data(test_data)
                print("JoeyNMT: {}".format(hypotheses[0]))

            except (KeyboardInterrupt, EOFError):
                print("\nBye.")
                break
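
# Hedged usage sketch: the config path, checkpoint and output file are placeholders.
translate("configs/transformer.yaml", ckpt="models/best.ckpt", output_path="hypotheses.txt")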
Example #28
test.to_csv("data/test.csv", index=False)

spacy_eng = spacy.load("en_core_web_sm")
spacy_gem = spacy.load("de_core_news_sm")


def english_tokenizer(text):
    return [tok.text for tok in spacy_eng.tokenizer(text)]


def german_tokenizer(text):
    return [tok.text for tok in spacy_gem.tokenizer(text)]


english = Field(sequential=True,
                use_vocab=True,
                tokenize=english_tokenizer,
                lower=True)
german = Field(sequential=True,
               use_vocab=True,
               tokenize=german_tokenizer,
               lower=True)

fields = {"english": ("eng", english), "german": ("ger", german)}
train_data, test_data = TabularDataset.splits(path="",
                                              train="data/train_lang.json",
                                              test="data/test_lang.json",
                                              format="json",
                                              fields=fields)

english.build_vocab(train_data, max_size=10000, min_freq=2)
german.build_vocab(train_data, max_size=10000, min_freq=2)
DATA_PATH = './data' #os.environ['DATA_PATH']
tagger = Mecab()

USE_CUDA = torch.cuda.is_available()
DEVICE = 'cuda' if USE_CUDA else 'cpu'

def pad_under_five(toknized):
    """
    모델에서 5-gram 단위 필터를 사용하기 때문에
    5-gram이 안되는 문장에 <pad>로 채워준다
    """
    if len(toknized) < 5:
        toknized.extend(["<pad>"]*(5-len(toknized)))
    return toknized
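
# Quick check of the padding helper (hedged example):
print(pad_under_five(["영화", "재밌다"]))  # ['영화', '재밌다', '<pad>', '<pad>', '<pad>']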

TEXT = Field(tokenize=tagger.morphs,lower=True,include_lengths=False,batch_first=True,preprocessing=pad_under_five)
LABEL = Field(sequential=False,use_vocab=True,unk_token=None)

train_data, test_data = TabularDataset.splits(path=DATA_PATH+'/nsmc/',
 train='ratings_train.txt',
 test='ratings_test.txt',
 format='tsv', 
 skip_header=True, 
 fields=[('id',None),('text',TEXT),('label',LABEL)], 
 filter_pred = lambda x: True if len(x.text) > 1 else False) 
# keep only sentences whose token-level length is greater than 1

TEXT.build_vocab(train_data,min_freq=2)
LABEL.build_vocab(train_data)

# print (TEXT.vocab)
Example #30
def test_single_gpu_batch_parse():
    trainer = Trainer(gpus=1)
    trainer.accelerator_backend = GPUAccelerator(trainer)

    # non-transferrable types
    primitive_objects = [
        None, {}, [], 1.0, "x", [None, 2], {
            "x": (1, 2),
            "y": None
        }
    ]
    for batch in primitive_objects:
        data = trainer.accelerator_backend.batch_to_device(
            batch, torch.device('cuda:0'))
        assert data == batch

    # batch is just a tensor
    batch = torch.rand(2, 3)
    batch = trainer.accelerator_backend.batch_to_device(
        batch, torch.device('cuda:0'))
    assert batch.device.index == 0 and batch.type() == 'torch.cuda.FloatTensor'

    # tensor list
    batch = [torch.rand(2, 3), torch.rand(2, 3)]
    batch = trainer.accelerator_backend.batch_to_device(
        batch, torch.device('cuda:0'))
    assert batch[0].device.index == 0 and batch[0].type(
    ) == 'torch.cuda.FloatTensor'
    assert batch[1].device.index == 0 and batch[1].type(
    ) == 'torch.cuda.FloatTensor'

    # tensor list of lists
    batch = [[torch.rand(2, 3), torch.rand(2, 3)]]
    batch = trainer.accelerator_backend.batch_to_device(
        batch, torch.device('cuda:0'))
    assert batch[0][0].device.index == 0 and batch[0][0].type(
    ) == 'torch.cuda.FloatTensor'
    assert batch[0][1].device.index == 0 and batch[0][1].type(
    ) == 'torch.cuda.FloatTensor'

    # tensor dict
    batch = [{'a': torch.rand(2, 3), 'b': torch.rand(2, 3)}]
    batch = trainer.accelerator_backend.batch_to_device(
        batch, torch.device('cuda:0'))
    assert batch[0]['a'].device.index == 0 and batch[0]['a'].type(
    ) == 'torch.cuda.FloatTensor'
    assert batch[0]['b'].device.index == 0 and batch[0]['b'].type(
    ) == 'torch.cuda.FloatTensor'

    # tuple of tensor list and list of tensor dict
    batch = ([torch.rand(2, 3) for _ in range(2)], [{
        'a': torch.rand(2, 3),
        'b': torch.rand(2, 3)
    } for _ in range(2)])
    batch = trainer.accelerator_backend.batch_to_device(
        batch, torch.device('cuda:0'))
    assert batch[0][0].device.index == 0 and batch[0][0].type(
    ) == 'torch.cuda.FloatTensor'

    assert batch[1][0]['a'].device.index == 0
    assert batch[1][0]['a'].type() == 'torch.cuda.FloatTensor'

    assert batch[1][0]['b'].device.index == 0
    assert batch[1][0]['b'].type() == 'torch.cuda.FloatTensor'

    # namedtuple of tensor
    BatchType = namedtuple('BatchType', ['a', 'b'])
    batch = [
        BatchType(a=torch.rand(2, 3), b=torch.rand(2, 3)) for _ in range(2)
    ]
    batch = trainer.accelerator_backend.batch_to_device(
        batch, torch.device('cuda:0'))
    assert batch[0].a.device.index == 0
    assert batch[0].a.type() == 'torch.cuda.FloatTensor'

    # non-Tensor that has `.to()` defined
    class CustomBatchType:
        def __init__(self):
            self.a = torch.rand(2, 2)

        def to(self, *args, **kwargs):
            self.a = self.a.to(*args, **kwargs)
            return self

    batch = trainer.accelerator_backend.batch_to_device(
        CustomBatchType(), torch.device('cuda:0'))
    assert batch.a.type() == 'torch.cuda.FloatTensor'

    # torchtext.data.Batch
    samples = [{
        'text': 'PyTorch Lightning is awesome!',
        'label': 0
    }, {
        'text': 'Please make it work with torchtext',
        'label': 1
    }]

    text_field = Field()
    label_field = LabelField()
    fields = {'text': ('text', text_field), 'label': ('label', label_field)}

    examples = [Example.fromdict(sample, fields) for sample in samples]
    dataset = Dataset(examples=examples, fields=fields.values())

    # Batch runs field.process(), which numericalizes tokens, but that requires the vocabulary to be built first
    text_field.build_vocab(dataset)
    label_field.build_vocab(dataset)

    batch = Batch(data=examples, dataset=dataset)
    batch = trainer.accelerator_backend.batch_to_device(
        batch, torch.device('cuda:0'))

    assert batch.text.type() == 'torch.cuda.LongTensor'
    assert batch.label.type() == 'torch.cuda.LongTensor'
Example #31
    # data path
    data_path = "/projdata3/info_fil/olatunji/NLP/Dataset/ARD Amazon/Processed data/"
    data_path2 = "/projdata3/info_fil/olatunji/NLP/Dataset/ARD Amazon/Human annotation/"
    log_interval = 10  # 'how many steps to wait before logging training status [default: 1]')
    test_interval = 100  # 'how many steps to wait before testing [default: 100]')
else:
    device = "cpu"
    # Data  batch size
    batch_size = 64  # 'batch size for training [default: 64]')
    data_path = "C:/Users/hpuser/Documents/Python Sandbox/NLP/New folder/"
    data_path2 = "C:/Users/hpuser/Documents/Python Sandbox/NLP/New folder/"
    log_interval = 1  # 'how many steps to wait before logging training status [default: 1]')
    test_interval = 2  # 'how many steps to wait before testing [default: 100]')

tokenize = lambda x: x.split()
TEXT = Field(sequential=True, tokenize=tokenize, lower=True)
LABEL = Field(sequential=False, use_vocab=False, dtype=torch.float32)

tv_datafields = [("helpful", None), ("xofyHelpfulScore", LABEL),
                 ("overall", None), ("reviewText", TEXT)]
trn, vld = TabularDataset.splits(
    path=data_path,  # the root directory where the data lies
    train='validdatagg.csv',
    validation="validdatagg.csv",
    format='csv',
    skip_header=True,
    fields=tv_datafields)

# print(trn[0].__dict__.keys())
# print(trn[0].reviewText[:3])
Example #32
class IMDBHierarchical(IMDB_stanford):
    NESTING_FIELD = Field(batch_first=True, tokenize=clean_string)
    TEXT_FIELD = NestedField(NESTING_FIELD, tokenize=Sentence_Tokenize())
Example #33
def create_dataset(
        config: Config,
        device: torch.device) -> Tuple[Vocab, Iterator, Iterator, Iterator]:

    fields = dict()
    fields[SeqType.ArticleID.value] = (SeqType.ArticleID.value, RawField())

    time_field = Field(use_vocab=False, batch_first=True, sequential=False)
    fields['jst_hour'] = (SeqType.Time.value, time_field)

    token_field = \
        Field(use_vocab=True,
              init_token=SpecialToken.BOS.value,
              eos_token=SpecialToken.EOS.value,
              pad_token=SpecialToken.Padding.value,
              unk_token=SpecialToken.Unknown.value) \
        if config.use_init_token_tag \
        else Field(use_vocab=True,
                   eos_token=SpecialToken.EOS.value,
                   pad_token=SpecialToken.Padding.value,
                   unk_token=SpecialToken.Unknown.value)

    fields['processed_tokens'] = (SeqType.Token.value, token_field)

    seqtypes = [
        SeqType.RawShort, SeqType.RawLong, SeqType.MovRefShort,
        SeqType.MovRefLong, SeqType.NormMovRefShort, SeqType.NormMovRefLong,
        SeqType.StdShort, SeqType.StdLong
    ]

    tensor_type = torch.FloatTensor if device.type == 'cpu' else torch.cuda.FloatTensor
    for (ric, seqtype) in itertools.product(config.rics, seqtypes):
        n = N_LONG_TERM \
            if seqtype.value.endswith('long') \
            else N_SHORT_TERM
        price_field = Field(use_vocab=False,
                            fix_length=n,
                            batch_first=True,
                            pad_token=0.0,
                            preprocessing=lambda xs: [float(x) for x in xs],
                            tensor_type=tensor_type)
        key = stringify_ric_seqtype(ric, seqtype)
        fields[key] = (key, price_field)

    train, val, test = \
        TabularDataset.splits(path=str(config.dir_output),
                              format='json',
                              train='alignment-train.json',
                              validation='alignment-valid.json',
                              test='alignment-test.json',
                              fields=fields)

    token_field.build_vocab(train, min_freq=config.token_min_freq)

    batch_size = config.batch_size
    train_iter, val_iter, test_iter = \
        Iterator.splits((train, val, test),
                        batch_sizes=(batch_size, batch_size, batch_size),
                        device=-1 if device.type == 'cpu' else device,
                        repeat=False,
                        sort=False)

    return (token_field.vocab, train_iter, val_iter, test_iter)
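
# Hedged usage sketch: Config, SeqType and SpecialToken come from the surrounding project,
# so `config` below is assumed to be an already-constructed Config instance.
vocab, train_iter, val_iter, test_iter = create_dataset(config, torch.device("cpu"))
print("token vocabulary size:", len(vocab))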
Example #34
class IMDB_stanford(TabularDataset):
    NAME = 'IMDB_stanford'
    NUM_CLASSES = 2
    IS_MULTILABEL = False

    TEXT_FIELD = Field(batch_first=True, tokenize=clean_string)
    LABEL_FIELD = Field(sequential=False,
                        use_vocab=False,
                        batch_first=True,
                        preprocessing=process_labels)

    @staticmethod
    def sort_key(ex):
        return len(ex.text)

    @classmethod
    def splits(cls,
               path,
               train=os.path.join('IMDB_stanford', 'train.tsv'),
               validation=os.path.join('IMDB_stanford', 'dev.tsv'),
               test=os.path.join('IMDB_stanford', 'test.tsv'),
               **kwargs):
        return super(IMDB_stanford,
                     cls).splits(path,
                                 train=train,
                                 validation=validation,
                                 test=test,
                                 format='tsv',
                                 fields=[('label', cls.LABEL_FIELD),
                                         ('text', cls.TEXT_FIELD)])

    @classmethod
    def iters(cls,
              path,
              vectors_name=None,
              vectors_cache=None,
              batch_size=64,
              shuffle=True,
              device=0,
              vectors=None,
              unk_init=torch.Tensor.zero_,
              onehot_Flag=False,
              max_size=None,
              sort_within_batch=False,
              bucket_size=300):
        """
        :param path: directory containing train, test, dev files
        :param vectors_name: name of word vectors file
        :param vectors_cache: path to directory containing word vectors file
        :param batch_size: batch size
        :param device: GPU device
        :param vectors: custom vectors - either predefined torchtext vectors or your own custom Vector classes
        :param unk_init: function used to generate vector for OOV words
        :return:
        """
        if vectors is None and not onehot_Flag:
            vectors = Vectors(name=vectors_name,
                              cache=vectors_cache,
                              unk_init=unk_init)
        if max_size is not None: max_size = max_size - 2
        train, val, test = cls.splits(path)
        cls.TEXT_FIELD.build_vocab(train,
                                   val,
                                   test,
                                   vectors=vectors,
                                   max_size=max_size)
        return Less_padding_bucket_Iterator.splits(
            (train, val, test),
            batch_size=batch_size,
            repeat=False,
            shuffle=shuffle,
            sort_within_batch=sort_within_batch,
            device=device,
            bucket_size=bucket_size)
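
# Hedged usage sketch: the data directory and vector file names below are placeholders.
train_iter, dev_iter, test_iter = IMDB_stanford.iters(
    path="data",
    vectors_name="glove.6B.300d.txt",
    vectors_cache=".vector_cache",
    batch_size=32)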
Example #35
from .config import DEVICE, DEFAULT_CONFIG

seed = 2019
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)


def light_tokenize(sequence: str):
    return [sequence]


def post_process(arr, _):
    return [[int(item) for item in arr_item] for arr_item in arr]


TEXT = Field(sequential=True, tokenize=light_tokenize, include_lengths=True)
POS = Field(sequential=True, tokenize=light_tokenize)
REL = Field(sequential=True,
            use_vocab=False,
            unk_token=None,
            pad_token=0,
            postprocessing=post_process)
TAG = Field(sequential=True,
            tokenize=light_tokenize,
            is_target=True,
            unk_token=None)
Fields = [('text', TEXT), ('pos', POS), ('rel', REL), ('tag', TAG)]


class SRLTool(Tool):
    def get_dataset(self, path: str, fields=Fields, separator='\t'):