Example #1
 def prepare_fields(pad_t):
     WORD_field = data.Field(use_vocab=False,
                             batch_first=True,
                             sequential=True,
                             pad_token=pad_t)
     WORD_nested_field = NestedField(
         data.Field(use_vocab=False,
                    batch_first=True,
                    sequential=True,
                    pad_token=pad_t))
     PAD_field = data.Field(use_vocab=False,
                            batch_first=True,
                            sequential=True,
                            pad_token=0)
     PAD_nested_field = NestedField(
         data.Field(use_vocab=False,
                    batch_first=True,
                    sequential=True,
                    pad_token=0))
     MASK_nested_field = NestedField(
         data.Field(use_vocab=False,
                    batch_first=True,
                    sequential=True,
                    pad_token=1.))
     fields = {
         'id': data.RawField(),
         'question': data.RawField(),
         'answers': data.RawField(),
         'src': WORD_nested_field,
         'src_mask': PAD_nested_field,
         'doc_mask': MASK_nested_field,
         'target': WORD_field,
         'target_mask': PAD_field,
     }
     return fields
Example #2
    def eval_input_fn():
        path, exts = find_path_and_exts(filenames[0], filenames[1])
        SRC_TEXT = data.RawField(
            preprocessing=lambda s: preprocessing(s, params),
            postprocessing=lambda s: postprocessing(s, params))
        TRG_TEXT = data.RawField(
            preprocessing=lambda s: preprocessing(s, params, add_bos=True),
            postprocessing=lambda s: postprocessing(s, params))
        LABEL_TEXT = data.RawField(
            preprocessing=lambda s: preprocessing(s, params),
            postprocessing=lambda s: postprocessing(s, params))
        MASK = data.Field(sequential=False, use_vocab=False)
        fields = [("source", SRC_TEXT), ("source_mask", MASK),
                  ("target", TRG_TEXT), ("target_mask", MASK),
                  ("label", LABEL_TEXT)]
        dataset = TranslationDataset(path, exts, fields, mode="eval")

        iterator = MTIterator(dataset,
                              params.decode_batch_size,
                              params,
                              mode="eval",
                              continuous=True,
                              sort=False,
                              shuffle=False)

        return iterator
Example #3
 def prepare_fields(pad_t, encode_like_dpr=False):
     WORD_field = data.Field(use_vocab=False,
                             batch_first=True,
                             sequential=True,
                             pad_token=pad_t,
                             fix_length=256 if encode_like_dpr else None)
     return [('id', data.RawField()), ('raw_question', data.RawField()),
             ('input', WORD_field),
             ('segment_mask',
              data.Field(use_vocab=False,
                         batch_first=True,
                         sequential=True,
                         pad_token=0,
                         fix_length=256 if encode_like_dpr else None)),
             ('input_mask',
              data.Field(use_vocab=False,
                         batch_first=True,
                         sequential=True,
                         pad_token=float(encode_like_dpr),
                         fix_length=256 if encode_like_dpr else None)),
             ('pos',
              data.Field(sequential=False,
                         use_vocab=False,
                         batch_first=True,
                         is_target=True)),
             ('hard_neg',
              data.Field(sequential=False,
                         use_vocab=False,
                         batch_first=True,
                         is_target=True)), ('answers', data.RawField()),
             ('human_answer', data.RawField())]
Example #4
def load_dataset(path, include_lengths=True, lower=False, stop_words=None,
                 load_raw=False, load_id=False, float_label=True):
    TEXT = data.Field(include_lengths=include_lengths,
                      lower=lower,
                      stop_words=stop_words)
    label_type = torch.float if float_label else torch.long
    LABEL = data.LabelField(dtype=label_type)
    RAW = data.RawField()
    ID = data.RawField()

    fields = {'text': ('text', TEXT),
              'label': ('label', LABEL)}

    if load_raw:
        fields['raw'] = ('raw', RAW)
        RAW.is_target = True

    if load_id:
        fields['id'] = ('id', ID)
        ID.is_target = True

    splits = data.TabularDataset.splits(
                                path=path,
                                train='train.json',
                                validation='valid.json',
                                test='test.json',
                                format='json',
                                fields=fields)

    return splits, (TEXT, LABEL, RAW, ID)
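
A minimal usage sketch for the loader above (it reuses the torchtext data module the examples assume; the batch size and sort key are illustrative):

(train, valid, test), (TEXT, LABEL, RAW, ID) = load_dataset('data/', load_id=True)
TEXT.build_vocab(train, min_freq=2)
LABEL.build_vocab(train)
train_iter, valid_iter, test_iter = data.BucketIterator.splits(
    (train, valid, test),
    batch_size=32,
    sort_key=lambda x: len(x.text))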
Example #5
def load_nli_dataset(path, lower=False, stop_words=None,
                     load_raw=True, load_id=True,
                     float_label=False):
    TEXT = data.Field(lower=lower,
                      stop_words=stop_words)
    label_type = torch.float if float_label else torch.long
    LABEL = data.LabelField(dtype=label_type)
    RAW = data.RawField()
    ID = data.RawField()

    fields = {'premise': ('premise', TEXT),
              'hypothesis': ('hypothesis', TEXT),
              'label': ('label', LABEL)}

    if load_raw:
        fields['raw_premise'] = ('raw_premise', RAW)
        fields['raw_hypothesis'] = ('raw_hypothesis', RAW)

    if load_id:
        fields['id'] = ('id', ID)

    splits = data.TabularDataset.splits(
                                path=path,
                                train='train.json',
                                validation='valid.json',
                                test='test.json',
                                format='json',
                                fields=fields)

    return splits, (TEXT, LABEL, RAW, ID)
Example #6
File: dataset.py Project: nbrgr/automark
def make_generate_data(config, input_src, input_mt):
    bert_path = config['bert']['path']
    tokenizer = BertTokenizer.from_pretrained(bert_path)
    unk_id = tokenizer.vocab['[UNK]']
    pad_id = tokenizer.vocab['[PAD]']
    unk_fun = lambda: unk_id

    vocab = defaultdict(unk_fun)

    for k, v in tokenizer.vocab.items():
        vocab[k] = v

    src_trg_field = data.Field(eos_token=None,
                               pad_token=pad_id,
                               batch_first=True,
                               include_lengths=True,
                               sequential=True,
                               use_vocab=False)

    mask_field = data.RawField(
        postprocessing=tensorify(batch_fun, torch.float32))

    id_mask = data.RawField(postprocessing=tensorify(batch_fun, torch.long))

    test_data = TestMergeDataset(src_path=input_src,
                                 trg_path=input_mt,
                                 fields=(src_trg_field, mask_field, id_mask),
                                 bos_token='[CLS]',
                                 sep_token='[SEP]',
                                 vocab=vocab)
    return test_data
Example #7
 def mktestset(self, args):
     path = args.path.replace("train", 'test')
     fields = self.fields
     ds = data.TabularDataset(path=path, format='tsv', fields=fields)
     ds.fields["rawent"] = data.RawField()
     for x in ds:
         x.rawent = x.ent.split(" ; ")
         x.ent = self.vec_ents(x.ent, self.ENT)
         x.rel = self.mkGraphs(x.rel, len(x.ent[1]))
         if args.sparse:
             x.rel = (self.adjToSparse(x.rel[0]), x.rel[1])
         x.tgt = x.out
         x.out = [y.split("_")[0] + ">" if "_" in y else y for y in x.out]
         x.sordertgt = torch.LongTensor(
             [int(y) + 3 for y in x.sorder.split(" ")])
         x.sorder = [[int(z) for z in y.strip().split(" ") if len(z) > 0]
                     for y in x.sorder.split("-1")[:-1]]
     ds.fields["tgt"] = self.TGT
     ds.fields["rawent"] = data.RawField()
     ds.fields["sordertgt"] = data.RawField()
     dat_iter = data.Iterator(ds,
                              1,
                              device=args.device,
                              sort_key=lambda x: len(x.src),
                              train=False,
                              sort=False)
     return dat_iter
Example #8
    def __init__(self, args):

        self.args = args

        self.ID = data.RawField()
        self.PID = data.RawField()
        self.TEXT = data.Field(batch_first=True)
        self.POSITION = data.RawField()
Example #9
    def mkVocabs(self, args):
        args.path = args.datadir + args.data
        self.INP = data.Field(sequential=True,
                              batch_first=True,
                              init_token="<start>",
                              eos_token="<eos>",
                              include_lengths=True)
        self.OUTP = data.Field(sequential=True,
                               batch_first=True,
                               init_token="<start>",
                               eos_token="<eos>",
                               include_lengths=True)
        self.TGT = data.Field(sequential=True,
                              batch_first=True,
                              init_token="<start>",
                              eos_token="<eos>")
        self.ENT = data.RawField()
        self.REL = data.RawField()
        self.REL.is_target = False
        self.ENT.is_target = False
        self.fields = [("src", self.INP), ("ent", self.ENT), ("rel", self.REL),
                       ("tgt", self.TGT), ("out", self.OUTP)]
        train = data.TabularDataset(path=args.path,
                                    format='tsv',
                                    fields=self.fields)

        print('building vocab')
        self.OUTP.build_vocab(train, min_freq=args.outunk)
        self.TGT.vocab = copy(self.OUTP.vocab)
        specials = zip(
            "method material otherscientificterm metric task".split(" "),
            range(40))
        for x in specials:
            s = "<" + x[0] + "_" + str(x[1]) + ">"
            self.TGT.vocab.stoi[s] = len(self.TGT.vocab.itos) + x[1]

        self.INP.build_vocab(train, min_freq=args.outunk)
        '''
    self.INP.vocab.stoi['<pad>']=0
    self.INP.vocab.stoi['<unk>']=1
    self.INP.vocab.itos = ['<pad>','<unk>']+self.INP.vocab.itos[2:]
    '''

        self.REL.special = ['<pad>', '<unk>', 'ROOT']
        with open(args.datadir + "/" + args.relvocab) as f:
            rvocab = [x.strip() for x in f.readlines()]
            self.REL.size = len(rvocab)
            rvocab += [x + "_inv" for x in rvocab]
            relvocab = self.REL.special + rvocab
        self.REL.itos = relvocab

        self.ENT.itos, self.ENT.stoi = self.build_ent_vocab(args.path)

        print('done')
Example #10
 def prepare_fields(self):
     return [
         ('id', data.RawField()),
         ('title', data.RawField()),
         ('psg', data.RawField()),
         ('label',
          data.Field(sequential=False,
                     use_vocab=False,
                     batch_first=True,
                     dtype=torch.float,
                     is_target=True)),
     ]
Example #11
 def _create_examples(self, tokenizer, data_dir):
     return PsychDataset(
         data.TabularDataset(
             data_dir, 'json', {
                 'options-for-correct-answers':
                 ('utterance',
                  data.RawField(
                      utterance_processor(tokenizer,
                                          speaker=self.agent[0].upper()
                                          if self.agent else None))),
                 'messages-so-far':
                 ('context', data.RawField(context_processor(tokenizer)))
             }))
Example #12
def load_dataset(batch_size, path_data):
    tokenize = lambda x: x.split()
    captions = data.Field(sequential=True,
                          tokenize=tokenize,
                          lower=True,
                          include_lengths=True,
                          batch_first=True,
                          fix_length=1500)
    category = data.Field(sequential=False)
    channel_id = data.RawField()
    channel_id.is_target = False
    chunk = data.RawField()
    chunk.is_target = False
    video_id = data.RawField()
    video_id.is_target = False

    seeds_captions = data.TabularDataset(path=path_data,
                                         format='csv',
                                         skip_header=True,
                                         fields=[('captions', captions),
                                                 ('category', category),
                                                 ('channel_id', channel_id),
                                                 ('chunk', chunk),
                                                 ('video_id', video_id)])

    train_data, test_data = seeds_captions.split(split_ratio=0.8,
                                                 stratified=True,
                                                 strata_field='category')
    train_data, valid_data = train_data.split(split_ratio=0.8,
                                              stratified=True,
                                              strata_field='category')

    captions.build_vocab(train_data,
                         test_data,
                         valid_data,
                         vectors=GloVe(name='6B', dim=300))
    category.build_vocab(train_data, specials_first=False)

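    # With specials_first=False the <unk> special sits at the end of category.vocab,
    # so subtracting 1 below counts only the real categories.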
    vocab_size = len(captions.vocab)
    class_size = len(category.vocab) - 1

    print(vocab_size)

    train_iter, valid_iter, test_iter = data.BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=batch_size,
        sort_key=lambda x: len(x.captions),
        repeat=False,
        shuffle=True)

    return captions, category, train_iter, valid_iter, test_iter, vocab_size, class_size
Example #13
    def iters(cls, config, **kwargs):
        """
        Create the iterator objects for splits of the SemEval dataset.
        :param batch_size: Batch_size
        :param device: Device to create batches, -1 for CPU and None for GPU.
        :param root: The root directory containing datasets files.
        :param vectors: Load pretrained vectors
        :param kwargs:
        :return:
        """

        vectors = vocab.Vectors(name=config.vectors, cache=config.cache)

        ID = data.RawField()
        TEXT = data.Field(batch_first=True,
                          tokenize=lambda x: x,
                          fix_length=20)
        TAG = data.Field(batch_first=True, tokenize=lambda x: x, fix_length=20)
        RAW = data.RawField()
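        # Note: tensor_type below is the name this argument had in older torchtext
        # releases; later versions renamed it to dtype.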
        REL = data.Field(sequential=False,
                         use_vocab=False,
                         batch_first=True,
                         tensor_type=torch.FloatTensor,
                         postprocessing=data.Pipeline(get_class_probs))
        CONF = data.RawField()

        #TAG.preprocessing = shrink_chunk

        train, val, test = cls.splits(ID,
                                      TEXT,
                                      REL,
                                      CONF,
                                      RAW,
                                      TAG,
                                      root=config.datasets_dir,
                                      **kwargs)

        TEXT.build_vocab(train)
        config.n_embed = len(TEXT.vocab)
        config.d_embed = vectors.dim
        TEXT.vocab.load_vectors(vectors)

        config.weights = TEXT.vocab.vectors

        config.n_classes = 2

        return data.BucketIterator.splits((train, val, test),
                                          batch_size=config.batch_size,
                                          shuffle=config.shuffle,
                                          device=config.device,
                                          repeat=False)
Example #14
    def splits(cls, text_field, root_path, img_dir, **kwargs):
        train, valid, test = 'train.pkl', 'val.pkl', 'test.pkl'
        fields = {
            'image_id': ('image_id', data.RawField()),
            'img_1c_feat': ('img_1c_feat', data.RawField()),
            'img_to_load': ('img_to_load', data.RawField()),
            'caption': ('caption', text_field),
            'caption_id': ('caption_id', data.RawField()),
        }

        train_data = None if train is None else cls(root_path, img_dir, train, fields, True, **kwargs)
        val_data = None if valid is None else cls(root_path, img_dir, valid, fields, False, **kwargs)
        test_data = None if test is None else cls(root_path, img_dir, test, fields, False, **kwargs)
        return tuple(d for d in (train_data, val_data, test_data) if d is not None)
Example #15
    def mkiters(self, train):
        args = self.args
        c = Counter([len(x.out) for x in train])
        t1, t2, t3 = [], [], []
        print("Sorting training data by len")
        for x in train:
            l = len(x.out)
            if l < 100:
                t1.append(x)
            elif 100 <= l < 220:
                t2.append(x)
            else:
                t3.append(x)
        t1d = data.Dataset(t1, self.fields)
        t2d = data.Dataset(t2, self.fields)
        t3d = data.Dataset(t3, self.fields)
        valid = data.TabularDataset(path=args.path.replace("train", "val"),
                                    format='tsv', fields=self.fields)
        print("ds sizes:", end='\t')
        for ds in [t1d, t2d, t3d, valid]:
            print(len(ds.examples), end='\t')
            for x in ds:
                x.rawent = x.ent.split(" ; ")
                x.ent = self.vec_ents(x.ent, self.ENT)
                x.rel = self.mkGraphs(x.rel, len(x.ent[1]))
                if args.sparse:
                    x.rel = (self.adjToSparse(x.rel[0]), x.rel[1])
                x.tgt = x.out
                x.out = [y.split("_")[0] + ">" if "_" in y else y for y in
                         x.out]
                x.sordertgt = torch.LongTensor(
                    [int(y) + 3 for y in x.sorder.split(" ")])
                x.sorder = [[int(z) for z in y.strip().split(" ")] for y in
                            x.sorder.split("-1")[:-1]]
            ds.fields["tgt"] = self.TGT
            ds.fields["rawent"] = data.RawField()
            ds.fields["sordertgt"] = data.RawField()

        self.t1_iter = data.Iterator(t1d, args.t1size, device=args.device,
                                     sort_key=lambda x: len(x.out),
                                     repeat=False, train=True)
        self.t2_iter = data.Iterator(t2d, args.t2size, device=args.device,
                                     sort_key=lambda x: len(x.out),
                                     repeat=False, train=True)
        self.t3_iter = data.Iterator(t3d, args.t3size, device=args.device,
                                     sort_key=lambda x: len(x.out),
                                     repeat=False, train=True)
        self.val_iter = data.Iterator(valid, args.t3size, device=args.device,
                                      sort_key=lambda x: len(x.out), sort=False,
                                      repeat=False, train=False)
Example #16
    def preprocess(self):
        print("\nLoading data...  ", end="", flush=True)

        process = MakeLabelVector()
        set_label_vector = process.set_label_vector
        get_label_vector = process.get_label_vector

        # Define fields for torchtext
        length = self.params["sequence_length"]
        self.ID = data.RawField(is_target=False)
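        # RawField's positional arguments are (preprocessing, postprocessing, is_target)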
        self.LABEL = data.RawField(set_label_vector, get_label_vector, True)
        self.TEXT = data.Field(sequential=True, lower=True, fix_length=length)

        fields = [
            ("id", self.ID),
            ("label", self.LABEL),
            ("text", self.TEXT),
        ]

        datasets = data.TabularDataset.splits(
            path="./",
            train=self.params["train_data_path"],
            validation=self.params["valid_data_path"],
            test=self.params["test_data_path"],
            format="tsv",
            fields=fields,
        )

        if self.params["params_search"]:
            self.train, self.valid = datasets
        else:
            self.train, self.valid, self.test = datasets

        print("Done.", flush=True)

        # Convert words to ID
        print("Converting text to ID...  ", end="", flush=True)
        if self.params["params_search"]:
            self.TEXT.build_vocab(self.train, self.valid)
        else:
            self.TEXT.build_vocab(self.train, self.valid, self.test)

        self.TEXT.vocab.load_vectors("glove.6B.300d")
        print("Done.\n", flush=True)

        # Add parameters that haven't yet been defined
        self.params["uniq_of_cat"] = process.uniq_of_cat
        self.params["num_of_class"] = len(process.uniq_of_cat)
Example #17
    def __init__(self, batch_size, word_dim):
        self.RAW = data.RawField()
        self.TEXT = data.Field(batch_first=True)
        self.LABEL = data.Field(sequential=False, unk_token=None)

        self.train, self.dev, self.test = data.TabularDataset.splits(
            path='.data/quora',
            train='train.tsv',
            validation='dev.tsv',
            test='test.tsv',
            format='tsv',
            fields=[('label', self.LABEL), ('q1', self.TEXT),
                    ('q2', self.TEXT), ('id', self.RAW)])

        self.TEXT.build_vocab(self.train,
                              self.dev,
                              self.test,
                              vectors=GloVe(name='6B', dim=word_dim))
        self.LABEL.build_vocab(self.train)

        sort_key = lambda x: data.interleave_keys(len(x.q1), len(x.q2))

        self.train_iter, self.dev_iter, self.test_iter = \
            data.BucketIterator.splits((self.train, self.dev, self.test),
                                       device=-1,
                                       batch_sizes=[batch_size] * 3,
                                       sort_key=sort_key)

        self.max_word_len = max([len(w) for w in self.TEXT.vocab.itos])
Example #18
    def make_test(self, args):
        path = args.path.replace("train", 'test')
        fields = self.fields
        dataset = data.TabularDataset(path=path, format='tsv', fields=fields)
        dataset.fields["rawent"] = data.RawField()
        dataset.fields["rawent"].is_target = False

        for row in dataset:
            row.rawent = row.ent.split(" ; ")
            row.ent = self.vectorize_entity(row.ent, self.ENT)
            # row.ent: tuple of ((# of entities in the example, max entity len), (# of entities))
            row.rel = self.make_graph(row.rel, len(row.ent[1]))
            # row.rel: tuple of (adj, rel)

            row.tgt = row.out
            row.out = [
                token.split("_")[0] + ">" if "_" in token else token
                for token in row.out
            ]

        dataset.fields["tgt"] = self.TARGET

        test_iter = data.Iterator(dataset,
                                  1,
                                  device=args.device,
                                  sort_key=lambda x: len(x.title),
                                  train=False,
                                  sort=False)
        return test_iter
Example #19
    def __init__(self, args):
        self.RAW = data.RawField()
        self.TEXT = data.Field(batch_first=True)
        self.LABEL = data.Field(sequential=False, unk_token=None)

        self.train, self.dev, self.test = data.TabularDataset.splits(
            path='.data/trecqa',
            train='train.tsv',
            validation='dev.tsv',
            test='test.tsv',
            format='tsv',
            fields=[('label', self.LABEL),
                    ('s1', self.TEXT),
                    ('s2', self.TEXT)])

        self.TEXT.build_vocab(self.train, self.dev, self.test, vectors=GloVe(name='840B', dim=300))
        self.LABEL.build_vocab(self.train)

        sort_key = lambda x: data.interleave_keys(len(x.s1), len(x.s2))

        self.train_iter, self.dev_iter, self.test_iter = \
            data.BucketIterator.splits((self.train, self.dev, self.test),
                                       batch_sizes=[args.batch_size] * 3,
                                       device=args.gpu,
                                       sort_key=sort_key)

        self.max_word_len = max([len(w) for w in self.TEXT.vocab.itos])
        # for <pad>
        self.char_vocab = {'': 0}
        # for <unk> and <pad>
        self.characterized_words = [[0] * self.max_word_len, [0] * self.max_word_len]

        if args.use_char_emb:
            self.build_char_vocab()
Example #20
    def __init__(self, args):
        self.RAW = data.RawField(is_target=False)
        self.TEXT = data.Field(batch_first=True, tokenize='spacy', lower=True)
        # self.LABEL = data.Field(sequential=False, unk_token=None)
        self.LABEL = data.LabelField()

        self.train, self.dev, self.test = data.TabularDataset.splits(
            path='/media/fch/Data/leo/text-similarity/data/quora',
            train='train.tsv',
            validation='dev.tsv',
            test='test.tsv',
            format='tsv',
            fields=[('label', self.LABEL),
                    ('q1', self.TEXT),
                    ('q2', self.TEXT),
                    ('id', self.RAW)])
        vectors = Vectors(name='/media/fch/Data/leo/text-similarity/glove/glove.840B.300d.txt',
                          cache='/media/fch/Data/leo/text-similarity/.vector_cache')
        self.TEXT.build_vocab(self.train, self.dev, self.test, vectors=vectors)
        self.LABEL.build_vocab(self.train)

        sort_key = lambda x: data.interleave_keys(len(x.q1), len(x.q2))

        self.train_iter, self.dev_iter, self.test_iter = data.BucketIterator.splits((self.train, self.dev, self.test),
                                                                                    batch_sizes=[args.batch_size] * 3,
                                                                                    device=args.device,
                                                                                    sort_key=sort_key)

        self.max_word_len = max([len(w) for w in self.TEXT.vocab.itos])
Example #21
File: dataset.py Project: nbrgr/automark
    def __init__(self,
                 src_path,
                 trg_path,
                 fields,
                 bos_token='[CLS]',
                 sep_token='[SEP]',
                 vocab=None,
                 **kwargs):

        src_len = data.RawField(
            postprocessing=tensorify(identity_fun, torch.long))
        trg_len = data.RawField(
            postprocessing=tensorify(identity_fun, torch.long))
        attention_mask = data.RawField(
            postprocessing=tensorify(batch_fun, torch.long))

        if not isinstance(fields[0], (tuple, list)):
            fields = [('src_trg', fields[0]), ('label_mask', fields[1]),
                      ('id_mask', fields[2]), ('src_len', src_len),
                      ('trg_len', trg_len), ('attention_mask', attention_mask)]

        examples = []
        with open(src_path) as src_file, open(trg_path) as trg_file:
            for src_line, trg_line in \
                    zip(src_file, trg_file):
                src_line, trg_line = src_line.strip().split(" "), \
                                     trg_line.strip().split(" ")

                src_line = [vocab[bos_token]] + [vocab[x] for x in src_line] + \
                           [vocab[sep_token]]
                trg_line = [vocab[x] for x in trg_line]
                label_mask = [0.0] * len(src_line) + [1.0] * len(trg_line)

                if src_line != '' and trg_line != '':
                    merged_line = src_line + trg_line
                    att_mask = [1] * len(merged_line)
                    id_mask = [0] * len(src_line) + [1] * len(trg_line)
                    assert len(merged_line) == len(id_mask)
                    assert len(label_mask) == len(merged_line)
                    examples.append(
                        data.Example.fromlist([
                            merged_line, label_mask, id_mask,
                            len(src_line),
                            len(trg_line), att_mask
                        ], fields))

        super(TestMergeDataset, self).__init__(examples, fields, **kwargs)
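
A hypothetical illustration of the mask layout built in the loop above, for a two-token source and a three-token target (the token ids are made up; the real ids come from the BERT vocab):

src_line   = [101, 7, 8, 102]       # [CLS] w1 w2 [SEP]
trg_line   = [9, 10, 11]
label_mask = [0.0] * 4 + [1.0] * 3  # 1.0 marks target positions
id_mask    = [0] * 4 + [1] * 3      # 0 for source tokens, 1 for target tokens
att_mask   = [1] * 7                # every real (unpadded) token is attended to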
Example #22
    def __init__(self, args):
        path = '.data/squad'
        dataset_path = path + '/torchtext/'
        train_examples_path = dataset_path + 'train_examples.pt'
        dev_examples_path = dataset_path + 'dev_examples.pt'

        self.RAW = data.RawField()
        self.CHAR_NESTING = data.Field(batch_first=True,
                                       tokenize=list,
                                       lower=True)
        self.CHAR = data.NestedField(self.CHAR_NESTING, tokenize=word_tokenize)
        self.WORD = data.Field(batch_first=True,
                               tokenize=word_tokenize,
                               lower=True,
                               include_lengths=True)
        self.LABEL = data.Field(sequential=False,
                                unk_token=None,
                                use_vocab=False)

        dict_fields = {
            'id': ('id', self.RAW),
            's_idx': ('s_idx', self.LABEL),
            'e_idx': ('e_idx', self.LABEL),
            'context': [('c_word', self.WORD), ('c_char', self.CHAR)],
            'question': [('q_word', self.WORD), ('q_char', self.CHAR)]
        }

        list_fields = [('id', self.RAW), ('s_idx', self.LABEL),
                       ('e_idx', self.LABEL), ('c_word', self.WORD),
                       ('c_char', self.CHAR), ('q_word', self.WORD),
                       ('q_char', self.CHAR)]

        if os.path.exists(dataset_path):
            print("loading splits...")
            dev_examples = torch.load(dev_examples_path)

            self.dev = data.Dataset(examples=dev_examples, fields=list_fields)
        else:
            print("building splits...")
            self.dev = data.TabularDataset(path=path + f'/dev-v1.1.jsonl',
                                           format='json',
                                           fields=dict_fields)

            os.makedirs(dataset_path)
            torch.save(self.dev.examples, dev_examples_path)

        print("building vocab...")
        self.CHAR.build_vocab(self.dev, min_freq=10000)
        self.WORD.build_vocab(self.dev,
                              vectors=GloVe(name='6B', dim=args.word_dim),
                              max_size=80000)
        device = torch.device(
            f"cuda:0" if torch.cuda.is_available() else "cpu")
        self.dev_iter = \
            data.BucketIterator(self.dev,
                                batch_size=60,
                                device=device,
                                sort=True,
                                sort_key=lambda x: len(x.c_word))
Example #23
File: nli.py Project: swanhtet1992/DME
 def get_fields(cls, text_field, label_field, with_genre=False):
     fields = {
         'label': ('label', label_field),
         'sentence1': ('premise', text_field),
         'sentence2': ('hypothesis', text_field)
     }
     if with_genre:
         fields['genre'] = ('genre', data.RawField())
     return fields
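
A hedged usage sketch: the returned dict maps JSON keys to (attribute name, field) pairs, which is the shape data.TabularDataset expects for format='json'. SNLIDataset and the file name are illustrative stand-ins for whatever class defines get_fields:

TEXT = data.Field(lower=True, batch_first=True)
LABEL = data.LabelField()
fields = SNLIDataset.get_fields(TEXT, LABEL, with_genre=True)
train = data.TabularDataset('snli_train.jsonl', format='json', fields=fields)
TEXT.build_vocab(train)
LABEL.build_vocab(train)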
Example #24
File: data.py Project: jangdn/nlpserver
	def __init__(self,args):
		path = './data/squad'
		dataset_path = path +'/torchtext/'
		train_examples_path = dataset_path + 'train_examples.pt'
		dev_examples_path = dataset_path +'dev_examples.pt'

		print ("[+] Preprocessing data files..")

		if not os.path.exists(f'{path}/{args.train_file}l'):  # the trailing 'l' selects the preprocessed .jsonl file written by preprocess_file
			self.preprocess_file(f'{path}/{args.train_file}')
		if not os.path.exists(f'{path}/{args.dev_file}l'):
			self.preprocess_file(f'{path}/{args.dev_file}')

		self.RAW = data.RawField()
		self.CHAR_NESTING = data.Field(batch_first=True, tokenize=list, lower=True)  # tokenize=list splits a word into characters, e.g. 'char' -> ['c', 'h', 'a', 'r']
		self.CHAR = data.NestedField(self.CHAR_NESTING, tokenize=word_tokenize)
		# NestedField wraps another Field: word_tokenize splits the text into words,
		# and CHAR_NESTING then tokenizes each word into its characters.
		self.WORD = data.Field(batch_first =True,tokenize=word_tokenize,lower=True,include_lengths=True)
		self.LABEL = data.Field(sequential=False,unk_token=None,use_vocab=False)

		dict_field = { 'id' : ('id',self.RAW) ,
					's_idx':('s_idx',self.LABEL),
					'e_idx':('e_idx',self.LABEL),
					'context': [('c_word',self.WORD),('c_char',self.CHAR)],
					'questions':[('q_word',self.WORD),('q_char',self.CHAR)]}
		list_field = [ ('id',self.RAW) ,('s_idx',self.LABEL),('e_idx',self.LABEL),
					('c_word',self.WORD),('c_char',self.CHAR),('q_word',self.WORD),('q_char',self.CHAR)]

		if os.path.exists(dataset_path):
			print ("[+] Loading splits....")
			train_examples = torch.load(train_examples_path)
			dev_examples = torch.load(dev_examples_path)

			self.train = data.Dataset(examples=train_examples,fields=list_field)
			self.dev = data.Dataset(examples=dev_examples,fields=list_field)

		else:
			print ('[+] building splits...')
			self.train,self.dev = data.TabularDataset.splits(
								path=path,train=f'{args.train_file}l',
								validation=f'{args.dev_file}l',
								format='json',
								fields=dict_field)
			os.makedirs(dataset_path)
			torch.save(self.train.examples,train_examples_path)
			torch.save(self.dev.examples,dev_examples_path)
		
		# drop overly long contexts from the training set for efficiency

		if args.context_threshold > 0: 
			self.train.examples = [e for e in self.train.examples if len(e.c_word) <= args.context_threshold]

		print ("building iterators...")
		self.train_tier,self.dev_iter = \
			data.BucketIterator.splits((self.train,self.dev),batch_size = [args.train_batch_size,args.dev_batch_size],
								device=args.gpu_num ,sort_key = lambda x: len(x.c_word))
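
A small sketch of what the nested CHAR field above produces at preprocessing time, answering the question in the comments: the outer NestedField tokenizes the text into words, and the inner Field tokenizes each word into characters (assumes nltk's word_tokenize with its punkt data installed, plus the legacy torchtext data module used throughout these examples):

from nltk import word_tokenize
from torchtext import data

CHAR_NESTING = data.Field(batch_first=True, tokenize=list, lower=True)
CHAR = data.NestedField(CHAR_NESTING, tokenize=word_tokenize)
print(CHAR.preprocess("Squad data"))
# -> [['s', 'q', 'u', 'a', 'd'], ['d', 'a', 't', 'a']]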
Example #25
File: tree.py Project: wconstab/benchmark
def main():
    global WORD
    WORD = data.Field(include_lengths=True,
                      batch_first=True,
                      eos_token=None,
                      init_token=None)
    LABEL = data.Field(sequential=False, batch_first=True)
    TREE = data.RawField(postprocessing=ListOpsDataset.tree_field(WORD))
    TREE.is_target = False
    train = ListOpsDataset(
        "data/train_d20s.tsv",
        (("word", WORD), ("label", LABEL), ("tree", TREE)),
        filter_pred=lambda x: 5 < len(x.word) < config["train_len"],
    )
    WORD.build_vocab(train)
    LABEL.build_vocab(train)
    valid = ListOpsDataset(
        "data/test_d20s.tsv",
        (("word", WORD), ("label", LABEL), ("tree", TREE)),
        filter_pred=lambda x: 5 < len(x.word) < 150,
    )

    train_iter = TokenBucket(train,
                             batch_size=1500,
                             device="cuda:0",
                             key=lambda x: len(x.word))
    train_iter.repeat = False
    valid_iter = data.BucketIterator(train,
                                     batch_size=50,
                                     train=False,
                                     sort=False,
                                     device="cuda:0")

    NT = 1
    T = len(WORD.vocab)
    V = T

    if True:
        tree_lstm = TreeLSTM(config["H"],
                             len(WORD.vocab) + 100, len(LABEL.vocab)).cuda()
        for p in tree_lstm.parameters():
            if p.dim() > 1:
                torch.nn.init.xavier_uniform_(p)

        model = SpanLSTM(NT, len(WORD.vocab), config["H"]).cuda()
        for p in model.parameters():
            if p.dim() > 1:
                torch.nn.init.xavier_uniform_(p)

        wandb.watch((model, tree_lstm))
        print(wandb.config)
        tree = run_train(train_iter, valid_iter, model, tree_lstm, V)
    else:
        print("loading")
        model, tree_lstm = torch.load("cp.yoyo.model")
        print(valid_sup(valid_iter, model, tree_lstm, V))
Example #26
    def __init__(self,
                 path,
                 text_field,
                 label_field,
                 keepneutral=False,
                 neutral=None,
                 size=None,
                 shuffle=True,
                 **kwargs):

        # Get the Standford dataset fields
        SENT_ID = data.RawField()
        DATE = data.RawField()
        QUERY = data.RawField()
        USER = data.RawField()
        fields = [("label", label_field), ("id", SENT_ID), ("date", DATE),
                  ("query", QUERY), ("user", USER), ("text", text_field)]

        # Create the torchtext dataset for all examples
        examples = []
        df = pd.read_csv(
            path,
            encoding='latin-1',
            header=0,
            names=["label", "id", "date", "query", "user", "text"])

        if shuffle:
            df = df.sample(frac=1)

        if neutral is not None:
            df_neutral = pd.read_csv(neutral, index_col=0, header=None).T
            df_neutral.columns = ["text"]
            df_neutral["label"] = [2] * len(df_neutral)
            df = pd.concat([df_neutral, df])

        for (_, entry) in df.iloc[0:size].iterrows():
            if not keepneutral and entry["label"] == 2:
                continue
            example = data.Example.fromlist(entry, fields)
            examples.append(example)

        super(Sentiment140, self).__init__(examples, fields, **kwargs)
Example #27
    def __init__(self,
                 data_path,
                 glove_size,
                 batch_size,
                 train_file='train.csv',
                 dev_file='dev.csv'):
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # Defining the Fields
        self.RAW = data.RawField(is_target=False)
        self.WORDS = data.Field(batch_first=True,
                                tokenize=post_ptbtokenizer,
                                lower=True,
                                include_lengths=True)
        self.CHAR = data.NestedField(data.Field(batch_first=True,
                                                tokenize=list,
                                                lower=True),
                                     tokenize=post_ptbtokenizer)

        self.INDEX = data.Field(sequential=False,
                                unk_token=None,
                                use_vocab=False)

        fields = {
            'id': ('id', self.RAW),
            'context_ptb_tok': [('context_words', self.WORDS),
                                ('context_char', self.CHAR)],
            'question_ptb_tok': [('question_words', self.WORDS),
                                 ('question_char', self.CHAR)],
            'answer_ptb_tok': [('answer_words', self.WORDS),
                               ('answer_char', self.CHAR)],
            'start_idx': ('start_idx', self.INDEX),
            'end_idx': ('end_idx', self.INDEX)
        }

        print('Loading CSV Data Into Torch Tabular Dataset')
        self.train, self.dev = data.TabularDataset.splits(path=data_path,
                                                          train=train_file,
                                                          validation=dev_file,
                                                          format='csv',
                                                          fields=fields)

        print('Building Vocabulary')
        self.CHAR.build_vocab(self.train, self.dev)
        self.WORDS.build_vocab(self.train,
                               self.dev,
                               vectors=GloVe(name='6B', dim=glove_size))

        print('Creating Iterators')
        self.train_iter = PreprocessData.create_train_iterator(
            self.train, device, batch_size)
        self.dev_iter = PreprocessData.create_dev_iterator(
            self.dev, device, batch_size)
Example #28
    def __init__(self, args):
        if args.datastories:
            tokenizer = SocialTokenizer(lowercase=True)
        else:
            tokenizer = TweetTokenizer()
        self.RAW = data.RawField()
        self.TEXT = data.Field(batch_first=True,
                               include_lengths=True,
                               lower=True,
                               tokenize=tokenizer.tokenize)
        self.LABEL = data.Field(sequential=False, unk_token=None)

        self.train, self.dev, self.test = datasets.EMO.splits(
            args, self.RAW, self.TEXT, self.LABEL, args.train_data_path,
            args.valid_data_path, args.test_data_path)

        self.TEXT.build_vocab(self.train,
                              self.dev,
                              self.test,
                              vectors=GloVe(name='840B', dim=300))

        if args.fasttext:
            self.FASTTEXT = data.Field(batch_first=True,
                                       include_lengths=True,
                                       lower=True,
                                       tokenize=tokenizer.tokenize)
            self.FASTTEXT.vocab = copy.deepcopy(self.TEXT.vocab)
            self.FASTTEXT.vocab.set_vectors(self.FASTTEXT.vocab.stoi,
                                            vectors=FastText(language='en'),
                                            dim=300)
        self.LABEL.build_vocab(self.train)

        self.train_iter, self.dev_iter, self.test_iter = \
            data.BucketIterator.splits((self.train, self.dev, self.test),
                                       batch_size=args.batch_size,
                                       device=args.device,
                                       repeat=False)

        self.max_word_len = max([len(w) for w in self.TEXT.vocab.itos])
        # for <pad>
        self.char_vocab = {'': 0}
        # for <unk> and <pad>
        self.characterized_words = [[0] * self.max_word_len,
                                    [0] * self.max_word_len]

        if args.char_emb:
            self.build_char_vocab()

        filehandler = open('./data/vocab.obj', 'wb')
        pickle.dump(self.TEXT.vocab, filehandler)
        filehandler = open('./data/label.obj', 'wb')
        pickle.dump(self.LABEL.vocab, filehandler)
Example #29
def load_dataset_for_transformer(path, tokenizer, lower=False,
                                 stop_words=None, load_raw=True,
                                 load_id=True, float_label=True,
                                 max_len=512):
    
    postpro = lambda xs, _: [tokenizer.convert_tokens_to_ids(x[:max_len])
                             for x in xs]

    TEXT = data.Field(use_vocab=False,
                      postprocessing=postpro,
                      pad_token=tokenizer.pad_token_id,
                      lower=lower,
                      stop_words=stop_words)
    label_type = torch.float if float_label else torch.long
    LABEL = data.LabelField(dtype=label_type)
    RAW = data.RawField()
    ID = data.RawField()

    fields = {'text': ('text', TEXT),
              'label': ('label', LABEL)}

    if load_raw:
        fields['raw'] = ('raw', RAW)
        RAW.is_target = True

    if load_id:
        fields['id'] = ('id', ID)
        ID.is_target = True

    splits = data.TabularDataset.splits(
                                path=path,
                                train='train.json',
                                validation='valid.json',
                                test='test.json',
                                format='json',
                                fields=fields)

    return splits, (TEXT, LABEL, RAW, ID)
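
A hedged usage sketch for the transformer-style loader above (the tokenizer name and batch size are illustrative; the JSON split names come from the code):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
(train, valid, test), (TEXT, LABEL, RAW, ID) = load_dataset_for_transformer('data/', tokenizer)
LABEL.build_vocab(train)
train_iter = data.BucketIterator(train, batch_size=16,
                                 sort_key=lambda x: len(x.text))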
Example #30
    def __init__(self, args):
        #        self.RAW = data.RawField(is_target=False)
        self.RAW = data.RawField()
        self.RAW.is_target = False
        # tokenizer = lambda x:list(jieba.cut(x))
        self.tokenize = lambda x: [char for char in x]
        self.TEXT = data.Field(batch_first=True,
                               fix_length=32,
                               tokenize=self.tokenize)
        self.LABEL = data.LabelField()

        self.train, self.dev, self.test = data.TabularDataset.splits(
            # path = './Bimpm/data/Docomo',
            path='/home/lsy2018/TextClassification/DATA/DATA_DOUBAN/data_1024/',
            train='train.csv',
            validation='dev.csv',
            test='test.csv',
            format='csv',
            fields=[('id', self.RAW), ('q1', self.TEXT), ('q2', self.TEXT),
                    ('label', self.LABEL)])
        # vectors = Vectors(name='/home/fch/leo/text-similarity/BIMPM_new/.vector_cache/glove.840B.300d.txt')
        # vectors = Vectors(name='./data/Glove/glove.6B.300d.txt')
        vectors = Vectors(
            name=
            '/home/lsy2018/wlw/Bimpm/data/Embedding/sgns.financial.bigram-char_cleaned.txt'
        )
        self.TEXT.build_vocab(self.train, self.dev, self.test, vectors=vectors)
        self.LABEL.build_vocab(self.train)

        sort_key = lambda x: data.interleave_keys(len(x.q1), len(x.q2))

        self.train_iter, self.dev_iter, self.test_iter = \
            data.BucketIterator.splits((self.train, self.dev, self.test),
                                       batch_sizes=[args.batch_size] * 3,  # [args.batch_size] * 3,
                                       device=args.device,
                                       sort_key=sort_key)
        #        print('train_iter:',type(self.train_iter),self.train.shape)
        #        print('test_iter:',type(self.test_iter),self.test.shape)
        #        print('dev_iter:',type(self.dev_iter),self.dev.shape)

        self.max_word_len = max([len(w) for w in self.TEXT.vocab.itos])
        # for <pad>
        self.char_vocab = {'': 0}
        # for <unk> and <pad>
        self.characterized_words = [[0] * self.max_word_len,
                                    [0] * self.max_word_len]

        if args.use_char_emb:
            self.build_char_vocab()