def merge_data(self, pos, neg, device):
    # Build a single labelled Dataset from positive and negative examples.
    # FIXME: maybe just Field?
    label_field = RawField(postprocessing=lambda x: torch.cuda.LongTensor(
        x, device=device))
    label_field.is_target = True
    examples = [self._attach_label(ex, POS_LABEL) for ex in pos] + \
               [self._attach_label(ex, NEG_LABEL) for ex in neg]
    return Dataset(examples,
                   [('sent', self.sent_field), ('label', label_field)])
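# A minimal, self-contained sketch of a device-agnostic alternative to the
# torch.cuda.LongTensor call flagged by the FIXME above (illustrative only):
# torch.tensor with an explicit dtype/device also works when `device` is a
# CPU device, whereas torch.cuda.LongTensor always allocates on the GPU.
import torch
from torchtext.data import RawField

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
label_field = RawField(
    postprocessing=lambda x: torch.tensor(x, dtype=torch.long, device=device))
label_field.is_target = True
# e.g. label_field.process([1, 0, 1]) -> tensor([1, 0, 1]) on `device`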
# parse conll dependency data
model_class, tokenizer_class, pretrained_weights = \
    BertModel, BertTokenizer, config['bert']
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)

def batch_num(nums):
    # Pad variable-length lists of head indices into a (batch, max_len)
    # LongTensor and return it together with the original lengths.
    lengths = torch.tensor([len(n) for n in nums]).long()
    max_len = lengths.max()
    out = torch.zeros(len(nums), max_len).long()
    for b, n in enumerate(nums):
        out[b, :len(n)] = torch.tensor(n)
    return out, lengths

HEAD = RawField(preprocessing=lambda x: [int(i) for i in x],
                postprocessing=batch_num)
HEAD.is_target = True
WORD = SubTokenizedField(tokenizer)

def len_filt(x):
    # Keep sentences whose subtokenized length is strictly between 5 and 40.
    return 5 < len(x.word[0]) < 40

train = ConllXDataset('wsj.train.conllx',
                      (('word', WORD), ('head', HEAD)),
                      filter_pred=len_filt)
train_iter = TokenBucket(train, 750)
val = ConllXDataset('wsj.dev.conllx',
                    (('word', WORD), ('head', HEAD)),
                    filter_pred=len_filt)
val_iter = BucketIterator(val, batch_size=20, device='cuda:0')

# make bert model to compute potentials
H = config['H']

class Model(nn.Module):
    def __init__(self, hidden):
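# A quick illustration of what batch_num produces, using made-up head-index
# lists of unequal length; it relies on batch_num and torch from the snippet
# above. Shorter rows are zero-padded to the batch maximum.
heads = [[2, 0, 2], [2, 3, 0, 3]]
out, lengths = batch_num(heads)
# out     -> tensor([[2, 0, 2, 0],
#                    [2, 3, 0, 3]])
# lengths -> tensor([3, 4])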
def create_dataset(config: Config,
                   device: torch.device) -> Tuple[Vocab, Iterator, Iterator, Iterator]:

    fields = dict()
    raw_field = RawField()
    # torchtext 0.3.1
    # AttributeError: 'RawField' object has no attribute 'is_target'
    raw_field.is_target = False
    fields[SeqType.ArticleID.value] = (SeqType.ArticleID.value, raw_field)

    time_field = Field(use_vocab=False, batch_first=True, sequential=False)
    fields['jst_hour'] = (SeqType.Time.value, time_field)

    token_field = \
        Field(use_vocab=True,
              init_token=SpecialToken.BOS.value,
              eos_token=SpecialToken.EOS.value,
              pad_token=SpecialToken.Padding.value,
              unk_token=SpecialToken.Unknown.value) \
        if config.use_init_token_tag \
        else Field(use_vocab=True,
                   eos_token=SpecialToken.EOS.value,
                   pad_token=SpecialToken.Padding.value,
                   unk_token=SpecialToken.Unknown.value)
    fields['processed_tokens'] = (SeqType.Token.value, token_field)

    seqtypes = [SeqType.RawShort, SeqType.RawLong,
                SeqType.MovRefShort, SeqType.MovRefLong,
                SeqType.NormMovRefShort, SeqType.NormMovRefLong,
                SeqType.StdShort, SeqType.StdLong]

    for (ric, seqtype) in itertools.product(config.rics, seqtypes):
        n = N_LONG_TERM if seqtype.value.endswith('long') else N_SHORT_TERM
        price_field = Field(use_vocab=False,
                            fix_length=n,
                            batch_first=True,
                            pad_token=0.0,
                            preprocessing=lambda xs: [float(x) for x in xs],
                            dtype=torch.float)
        key = stringify_ric_seqtype(ric, seqtype)
        fields[key] = (key, price_field)

    train, val, test = \
        TabularDataset.splits(path=str(config.dir_output),
                              format='json',
                              train='alignment-train.json',
                              validation='alignment-valid.json',
                              test='alignment-test.json',
                              fields=fields)

    token_field.build_vocab(train, min_freq=config.token_min_freq)

    batch_size = config.batch_size
    train_iter, val_iter, test_iter = \
        Iterator.splits((train, val, test),
                        batch_sizes=(batch_size, batch_size, batch_size),
                        device=-1 if device.type == 'cpu' else device,
                        repeat=False,
                        sort=False)

    return (token_field.vocab, train_iter, val_iter, test_iter)
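# A minimal usage sketch of create_dataset, assuming a populated Config
# instance named `config` (hypothetical here) and the same imports as above.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
vocab, train_iter, val_iter, test_iter = create_dataset(config, device)

print(len(vocab))                          # vocabulary size after build_vocab
print(vocab.stoi[SpecialToken.EOS.value])  # integer id of the EOS token

for batch in train_iter:
    # Each field registered in `fields` is exposed as a batch attribute
    # under its SeqType value, e.g. getattr(batch, SeqType.Token.value).
    break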