Example #1
    def __init__(
            self,
            question_path,
            paragraph_path,
            ratio,
            batch_size,
            vocab: Vocab = Ref("model.vocab"),
            batch_first=Ref("model.batch_first", True),
    ):
        self.vocab = vocab
        question = Field(include_lengths=True,
                         batch_first=batch_first,
                         pad_token=vocab.pad_token)
        question.vocab = vocab
        paragraph = Field(batch_first=batch_first, pad_token=vocab.pad_token)
        paragraph.vocab = vocab
        paragraphs = NestedField(paragraph, include_lengths=True)
        paragraphs.vocab = vocab
        target = Field(sequential=False, use_vocab=False, is_target=True)

        fields = [("question", question), ("paragraphs", paragraphs),
                  ("target", target)]
        examples = []
        with open(paragraph_path) as paragraph_file, open(
                question_path) as question_file:
            for q in question_file:
                q = q.strip()
                ps = [paragraph_file.readline().strip() for _ in range(ratio)]
                examples.append(Example.fromlist([q, ps, 0], fields))

        BaseIRDataset.__init__(self, ratio, batch_size, batch_first)
        TorchTextDataset.__init__(self, examples, fields)
Example #2
 def build_field_vocab(cls,
                       field: Field,
                       counter: Counter,
                       size_multiple: int = 1,
                       **kwargs):
     # PN: original name was _build_field_vocab
     # this is basically copy-pasted from torchtext.
     all_specials = [
         field.unk_token, field.pad_token, field.init_token,
         field.eos_token, "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
         "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
         "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31",
         "32", "33", "34", "35", "36", "37", "38", "39", "40", "41", "42",
         "43", "44", "45", "46", "47", "48", "49", "50", "51", "52", "53",
         "54", "55", "56", "57", "58", "59", "60", "61", "62", "63", "64",
         "65", "66", "67", "68", "69", "70", "71", "72", "73", "74", "75",
         "76", "77", "78", "79", "80", "81", "82", "83", "84", "85", "86",
         "87", "88", "89", "90", "91", "92", "93", "94", "95", "96", "97",
         "98", "99", "100", "101", "102", "103", "104", "105", "106", "107",
         "108", "109", "110", "111", "112", "113", "114", "115", "116",
         "117", "118", "119", "120", "121", "122", "123", "124", "125",
         "126", "127"
     ]
     specials = [tok for tok in all_specials if tok is not None]
     field.vocab = field.vocab_cls(counter, specials=specials, **kwargs)
     if size_multiple > 1:
         cls.pad_vocab_to_multiple(field.vocab, size_multiple)
     return
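The method above ends by calling cls.pad_vocab_to_multiple, which is not shown in this listing. A minimal standalone sketch, assuming it only appends dummy filler tokens until the vocabulary length is divisible by the multiple; the filler token names are made up here:

from collections import Counter
from torchtext.vocab import Vocab

def pad_vocab_to_multiple(vocab, multiple):
    """Append dummy tokens until len(vocab) is divisible by `multiple` (assumed behaviour)."""
    i = 0
    while len(vocab.itos) % multiple != 0:
        filler = "<filler_{}>".format(i)  # hypothetical filler token name
        vocab.itos.append(filler)
        vocab.stoi[filler] = len(vocab.itos) - 1
        i += 1

vocab = Vocab(Counter({"the": 3, "cat": 1}), specials=["<unk>", "<pad>"])
pad_vocab_to_multiple(vocab, multiple=8)
assert len(vocab.itos) % 8 == 0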
Example #3
def attach_tokenizer(field: Field, tokenizer: PreTrainedTokenizer) -> None:
    """Creates a tokenizer that is attached to a Corpus Field.

    Parameters
    ----------
    field : Field
        Field to which the vocabulary will be attached
    tokenizer : PreTrainedTokenizer
        Tokenizer that will convert tokens to their index.
    """

    def preprocess(value: Union[str, List[str]]) -> List[str]:
        """We only perform the splitting as a preprocessing step.

        This allows us to still have access to the original tokens,
        including those that will be mapped to <unk> later.
        """
        if isinstance(value, list):
            value = " ".join(value)

        return [tokenizer.convert_ids_to_tokens(t) for t in tokenizer.encode(value)]

    field.preprocessing = preprocess
    field.pad_token = tokenizer.pad_token
    field.vocab = tokenizer
    field.vocab.stoi = tokenizer.vocab
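A usage sketch for attach_tokenizer, assuming Hugging Face transformers is installed; the model name and Field arguments are illustrative:

from torchtext.data import Field
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
field = Field(batch_first=True)
attach_tokenizer(field, tokenizer)

# field.preprocess now returns WordPiece tokens, e.g.
# ['[CLS]', 'hello', 'world', '!', '[SEP]'] for "Hello world!".
print(field.preprocess("Hello world!"))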
Example #4
def load_field(vocab_path: str) -> Field:
    field = Field(init_token='<bos>',
                  eos_token='<eos>',
                  batch_first=True,
                  pad_first=True)
    field.vocab = pickle.load(open(vocab_path, 'rb'))

    return field
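The counterpart that produces the pickle load_field expects could look like this; a sketch using an in-memory toy corpus and an illustrative file name:

import pickle
from torchtext.data import Field

field = Field(init_token='<bos>', eos_token='<eos>', batch_first=True, pad_first=True)
field.build_vocab([["hello", "world"], ["hello", "again"]])  # toy tokenized corpus
with open('vocab.pkl', 'wb') as f:
    pickle.dump(field.vocab, f)

restored = load_field('vocab.pkl')
assert restored.vocab.stoi == field.vocab.stoi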
Example #5
def create_datasets(
        data_path: str,
        mode: str,
        word_to_ix=None,
        word_vocab=None,
        tag_vocab=None
) -> Union[createDatasetsReturnType, BucketIterator]:
    """
    Used when BERT embeddings are switched off (i.e. we just use randomly initialized embeddings).
    Compiles the data first into a TabularDataset object and then into a BucketIterator
    (similar to a DataLoader) object via the function to_iter().
    """
    sent_field = Field(lower=True)
    tag_field = Field()
    data_fields = [('sentence', sent_field), ('tags', tag_field)]
    if mode == TRAIN:
        dataSetNames = [TRAIN, VAL]
    elif mode == TEST:
        dataSetNames = [TEST]
    for data_set in dataSetNames:
        create_csv(os.path.join(data_path, data_set))
    if mode == TRAIN:
        train_dataset, val_dataset = TabularDataset.splits(path=data_path,
                                                           train='train.csv',
                                                           validation='val.csv',
                                                           format='csv',
                                                           fields=data_fields,
                                                           skip_header=True)
        # build the vocab over the train set only
        sent_field.build_vocab(train_dataset)
        tag_field.build_vocab(train_dataset)
        char_to_ix = get_char_to_ix(train_dataset)
        train_iter = to_iter(train_dataset, sent_field.vocab.stoi['<pad>'], batch_size)
        val_iter = to_iter(val_dataset, sent_field.vocab.stoi['<pad>'], 1)
        return train_iter, val_iter, sent_field.vocab, tag_field.vocab, char_to_ix
    elif mode == TEST:
        sent_field.vocab = word_vocab
        tag_field.vocab = tag_vocab
        test_dataset = TabularDataset(path=os.path.join(data_path, 'test.csv'),
                                      format='csv',
                                      fields=data_fields,
                                      skip_header=True)
        test_iter = to_iter(test_dataset, word_to_ix['<pad>'], 1)
        return test_iter
Example #6
def read_data_set(file_path, vocab):
    """
    Reads the data set from one of the pre-processed CSVs composed
    of columns `label` and `sentence`.

    Parameters
    ---
    file_path : str
        Path to the CSV file.
    vocab : torchtext.Vocab
        Vocabulary to use.

    Returns
    ---
    X : torch.Tensor[num_labels x num_examples x sen_length]
        Sentences on the dataset grouped by labels.
    y : torch.Tensor[num_labels]
        Labels for each group of sentences.
    """
    sentence = Field(batch_first=True,
                     sequential=True,
                     tokenize=simple_tokenizer)
    sentence.vocab = vocab

    label = Field(is_target=True)
    label.vocab = vocab

    data_set = TabularDataset(path=file_path,
                              format='csv',
                              skip_header=True,
                              fields=[('label', label),
                                      ('sentence', sentence)])

    sentences_tensor = sentence.process(data_set.sentence)
    labels_tensor = label.process(data_set.label).squeeze()

    # Infer num_labels and group sentences by label
    num_labels = labels_tensor.unique().shape[0]
    num_examples = labels_tensor.shape[0] // num_labels
    y = labels_tensor[::num_examples]
    sen_length = sentences_tensor.shape[-1]
    X = sentences_tensor.view(num_labels, num_examples, sen_length)

    return X, y
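read_data_set relies on a simple_tokenizer that is not shown in this listing; a minimal sketch under the assumption that it is a plain whitespace tokenizer:

def simple_tokenizer(text):
    """Assumed behaviour: split on whitespace."""
    return text.split()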
Example #7
def load_data_dict(experiment_name,
                   langs,
                   corpora_type,
                   args,
                   device,
                   src_field=None,
                   trg_field=None):
    if src_field is None or trg_field is None:
        src_field = Field(tokenize=str.split,
                          unk_token=UNK_WORD,
                          pad_token=PAD_WORD,
                          init_token=BOS_WORD,
                          eos_token=EOS_WORD)
        trg_field = Field(tokenize=str.split,
                          unk_token=UNK_WORD,
                          pad_token=PAD_WORD,
                          init_token=BOS_WORD,
                          eos_token=EOS_WORD)
        fields = (src_field, trg_field)
        print('Loading src vocab')
        src_vocab = load_vocab(get_vocab_path(experiment_name, langs[0]))
        src_field.vocab = src_field.vocab_cls(
            src_vocab, specials=[UNK_WORD, PAD_WORD, BOS_WORD, EOS_WORD])
        print('Loading trg vocab')
        trg_vocab = load_vocab(get_vocab_path(experiment_name, langs[1]))
        trg_field.vocab = trg_field.vocab_cls(
            trg_vocab, specials=[UNK_WORD, PAD_WORD, BOS_WORD, EOS_WORD])
        args.src_pad_idx = src_field.vocab.stoi[PAD_WORD]
        args.trg_pad_idx = trg_field.vocab.stoi[PAD_WORD]
        args.trg_bos_idx = trg_field.vocab.stoi[BOS_WORD]
        args.trg_eos_idx = trg_field.vocab.stoi[EOS_WORD]
        args.src_vocab_size = len(src_field.vocab)
        args.trg_vocab_size = len(trg_field.vocab)
    fields = (src_field, trg_field)

    print('Loading data')
    data, total_tokens = load_data(experiment_name=experiment_name,
                                   langs=langs,
                                   fields=fields,
                                   batch_size=args.batch_size,
                                   device=device,
                                   corpora_type=corpora_type,
                                   reduce_size=args.data_reduce_size)
    return data, total_tokens, src_field, trg_field
Example #8
def make_fields(vocab_count, binary=True):
    text_field = Field(batch_first=True,
                       include_lengths=True,
                       tokenize=lambda x: x.split(' '))
    text_field.vocab = Vocab(vocab_count['text'])
    char_nesting_field = Field(batch_first=True, tokenize=list)
    char_field = NestedField(char_nesting_field,
                             tokenize=lambda x: x.split(' '))
    char_nesting_field.vocab = Vocab(vocab_count['chars'])
    char_field.vocab = Vocab(vocab_count['chars'])
    pos1_field = Field(batch_first=True, sequential=False, use_vocab=False)
    pos2_field = Field(batch_first=True, sequential=False, use_vocab=False)
    pos1_rel_field = Field(sequential=True, batch_first=True)
    pos1_rel_field.vocab = Vocab(vocab_count['pos1_rel'])
    pos2_rel_field = Field(sequential=True, batch_first=True)
    pos2_rel_field.vocab = Vocab(vocab_count['pos2_rel'])
    if binary:
        label_field = Field(sequential=False, batch_first=True)
    else:
        label_field = Field(sequential=False, batch_first=True)
    label_field.vocab = Vocab(vocab_count['relation'], specials=[])
    reltype_field = Field(batch_first=True, sequential=False)
    reltype_field.vocab = Vocab(vocab_count['rel_type'])
    fields_dict = {
        'text': [('text', text_field), ('chars', char_field)],
        'pos1': ('pos1', pos1_field),
        'pos2': ('pos2', pos2_field),
        'pos1_rel': ('pos1_rel', pos1_rel_field),
        'pos2_rel': ('pos2_rel', pos2_rel_field),
        'relation': ('relation', label_field),
        'rel_type': ('rel_type', reltype_field)
    }
    return fields_dict
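A sketch of the vocab_count structure make_fields expects: a dict of collections.Counter objects keyed per field. The keys follow the function above, but the tokens and counts are purely illustrative:

from collections import Counter

vocab_count = {
    'text': Counter({'the': 5, 'protein': 2, 'binds': 1}),
    'chars': Counter({'t': 9, 'h': 4, 'e': 7}),
    'pos1_rel': Counter({'-1': 3, '0': 5, '1': 3}),
    'pos2_rel': Counter({'-2': 1, '0': 6, '2': 2}),
    'relation': Counter({'positive': 4, 'negative': 6}),
    'rel_type': Counter({'PART-OF': 2, 'NONE': 8}),
}
fields_dict = make_fields(vocab_count, binary=True)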
Example #9
    def __init__(self, module_name, train_bs, eval_bs, device, log):
        self.module_name = module_name

        # split_chars = lambda x: list("".join(x.split()))
        split_chars = lambda x: list(x)  # keeps whitespaces

        source = Field(tokenize=split_chars,
                       init_token='<sos>',
                       eos_token='<eos>',
                       batch_first=True)

        target = Field(tokenize=split_chars,
                       init_token='<sos>',
                       eos_token='<eos>',
                       batch_first=True)

        log("Loading FULL datasets ...")
        folder = os.path.join(DATASET_TARGET_DIR, module_name)
        train_dataset, eval_dataset, _ = TranslationDataset.splits(
            path=folder,
            root=folder,
            exts=(INPUTS_FILE_ENDING, TARGETS_FILE_ENDING),
            fields=(source, target),
            train=TRAIN_FILE_NAME,
            validation=EVAL_FILE_NAME,
            test=EVAL_FILE_NAME)

        log("Building vocab ...")
        source.build_vocab(train_dataset)
        target.vocab = source.vocab

        log("Creating iterators ...")
        train_iterator = Iterator(dataset=train_dataset,
                                  batch_size=train_bs,
                                  train=True,
                                  repeat=True,
                                  shuffle=True,
                                  device=device)

        eval_iterator = Iterator(dataset=eval_dataset,
                                 batch_size=eval_bs,
                                 train=False,
                                 repeat=False,
                                 shuffle=False,
                                 device=device)

        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset
        self.train_iterator = train_iterator
        self.eval_iterator = eval_iterator
        self.source = source
        self.target = target
Example #10
def load_naive_cl(args):
    """
    Convenience function to load pickle or dataset
    """
    if args.tokenizer == 'spacy':
        maslow_text = Field(tokenize=tokenize_en,
                            init_token='<sos>',
                            eos_token='<eos>',
                            lower=True,
                            include_lengths=True,
                            use_vocab=True)
        reiss_text = maslow_text

    elif args.tokenizer == 'raw':
        maslow_text = Field(tokenize=tokenize_raw,
                            init_token='<sos>',
                            eos_token='<eos>',
                            lower=True,
                            include_lengths=True,
                            use_vocab=True)
        reiss_text = maslow_text

    elif args.tokenizer == 'gpt2':
        maslow_text = args.gpt_maslowfield
        reiss_text = args.gpt_reissfield

    # Maslow dataset
    maslow_label = Field(sequential=False, unk_token=None)
    maslow_path = ".data/stories/story_commonsense/torchtext_class/maslow/"
    maslow_iterators= \
        load_naive_iterators(args, maslow_path, fields=(maslow_text, maslow_label))

    # Reiss dataset
    reiss_label = Field(sequential=False, unk_token=None)
    reiss_path = ".data/stories/story_commonsense/torchtext_class/reiss/"
    reiss_iterators= \
            load_naive_iterators(args, reiss_path, fields=(reiss_text, reiss_label))

    # Load vocab used for previous model from pickle
    print(f"Found data pickle, loading from {args.prepared_data}")
    with open(args.prepared_data, 'rb') as p:
        d = pickle.load(p)
        combined_vocab = d["combined_vocab"]
        args.emb_dim = d["emb_dim"]
        loaded_vectors = d["loaded_vectors"]

    maslow_text.vocab = combined_vocab
    reiss_text.vocab = combined_vocab

    return maslow_iterators, reiss_iterators, maslow_text, loaded_vectors
Example #11
 def build_field_vocab(cls,
                       field: Field,
                       counter: Counter,
                       size_multiple: int = 1,
                       **kwargs) -> NoReturn:
     # PN: original name was _build_field_vocab
     # this is basically copy-pasted from torchtext.
     all_specials = [
         field.unk_token, field.pad_token, field.init_token, field.eos_token
     ]
     specials = [tok for tok in all_specials if tok is not None]
     field.vocab = field.vocab_cls(counter, specials=specials, **kwargs)
     if size_multiple > 1:
         cls.pad_vocab_to_multiple(field.vocab, size_multiple)
     return
Example #12
def predict(sentence, model_path):
    if not os.path.exists(model_path):
        raise Exception("Need to provide model path")
    model = Model(model_path)
    checkpoint = torch.load(model_path,
                            map_location=lambda storage, location: storage)
    vocab = checkpoint['vocab']

    target_field = Field(sequential=True,
                         init_token=START_DECODING,
                         eos_token=STOP_DECODING,
                         pad_token=PAD_TOKEN,
                         batch_first=True,
                         include_lengths=True,
                         unk_token=UNKNOWN_TOKEN,
                         lower=True)

    source_field = Field(sequential=True,
                         init_token=SENTENCE_START,
                         eos_token=SENTENCE_END,
                         pad_token=PAD_TOKEN,
                         batch_first=True,
                         include_lengths=True,
                         unk_token=UNKNOWN_TOKEN,
                         lower=True)

    source_field.vocab = vocab
    target_field.vocab = vocab
    data = [{'src': sentence, 'tgt': ''}]
    predict_data = Mydataset(data=data,
                             fields=(('source', source_field), ('target',
                                                                target_field)))

    setattr(args, 'vectors', source_field.vocab.vectors)
    setattr(args, 'vocab_size', len(source_field.vocab.itos))
    setattr(args, 'emb_dim', source_field.vocab.vectors.size(1))  # embedding dim of the loaded vectors
Example #13
def create_dataset(config: Config,
                   device: torch.device,
                   vocab: Vocab,
                   rics: List[str],
                   seqtypes: List[SeqType]) -> Iterator:

    fields = dict()
    fields[SeqType.ArticleID.value] = (SeqType.ArticleID.value, RawField())

    time_field = Field(use_vocab=False, batch_first=True, sequential=False)
    fields['jst_hour'] = (SeqType.Time.value, time_field)

    token_field = \
        Field(use_vocab=True,
              init_token=SpecialToken.BOS.value,
              eos_token=SpecialToken.EOS.value,
              pad_token=SpecialToken.Padding.value,
              unk_token=SpecialToken.Unknown.value)

    fields['processed_tokens'] = (SeqType.Token.value, token_field)

    tensor_type = torch.FloatTensor if device.type == 'cpu' else torch.cuda.FloatTensor

    for (ric, seqtype) in itertools.product(rics, seqtypes):
        n = N_LONG_TERM if seqtype.value.endswith('long') else N_SHORT_TERM
        price_field = Field(use_vocab=False,
                            fix_length=n,
                            batch_first=True,
                            pad_token=0.0,
                            preprocessing=lambda xs: [float(x) for x in xs],
                            tensor_type=tensor_type)
        key = stringify_ric_seqtype(ric, seqtype)
        fields[key] = (key, price_field)

    # load an alignment for prediction
    predict = TabularDataset(path='output/alignment-predict.json',
                             format='json',
                             fields=fields)

    token_field.vocab = vocab

    # Make an iterator for prediction
    return Iterator(predict,
                    batch_size=1,
                    device=-1 if device.type == 'cpu' else device,
                    repeat=False,
                    sort=False)
Example #14
    def __init__(self, data_file, vocab_file, batch_size=256):
        self.batch_size = batch_size

        smi_field = Field(sequential=True,
                          init_token='<sos>',
                          eos_token=' ',
                          pad_token=' ',
                          include_lengths=True,
                          batch_first=True,
                          tokenize=smi_tokenizer)
        property_field = Field(sequential=False, use_vocab=False)
        # load smile data
        with open(data_file, 'r') as f:
            mol_strs = f.read().strip().split('\n')
            mol_strs = [mol.replace(' ', '') for mol in mol_strs]
        mol_strs = [smi_field.preprocess(mol) for mol in mol_strs]
        smi_examples = []
        fields = [('smile', smi_field), ('property', property_field)]
        for mol in mol_strs:
            ex = Example.fromlist([mol, [1, 2, 3]], fields)
            smi_examples.append(ex)

        # load or build vocab
        if os.path.isfile(vocab_file):
            print('load vocab from:', vocab_file)
            smi_field.vocab = pickle.load(open(vocab_file, 'rb'))
        else:
            print('build and save vocab file:', vocab_file)
            smi_field.build_vocab(mol_strs)
            pickle.dump(smi_field.vocab, open(vocab_file, 'wb'), protocol=2)

        self.vocab = smi_field.vocab
        self.vocab_size = len(smi_field.vocab.itos)
        self.padding_idx = smi_field.vocab.stoi[smi_field.pad_token]
        self.sos_idx = smi_field.vocab.stoi[smi_field.init_token]
        self.eos_idx = smi_field.vocab.stoi[smi_field.eos_token]
        self.unk_idx = smi_field.vocab.stoi[smi_field.unk_token]

        self.dataset_smi = Dataset(smi_examples, fields=fields)
        self.train_smi = Dataset(smi_examples[:-5000], fields=fields)
        self.test_smi = Dataset(smi_examples[-5000:], fields=fields)
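The SMILES loaders in this listing call smi_tokenizer, which is not shown; a common regex-based sketch (an assumption, not necessarily the original implementation):

import re

# One token per bracket atom, two-letter element, bond, ring closure, etc.
SMI_PATTERN = re.compile(
    r"(\[[^\]]+\]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\."
    r"|=|#|-|\+|\\|/|:|~|@|\?|>|\*|\$|%[0-9]{2}|[0-9])")

def smi_tokenizer(smiles):
    """Split a SMILES string into chemically meaningful tokens."""
    return SMI_PATTERN.findall(smiles)

print(smi_tokenizer("CC(=O)Oc1ccccc1C(=O)O"))  # aspirin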
Example #15
def test_loader(path):
    with open("Data/text.pickle", "rb") as fp:
        vocab = pickle.load(fp)
    tokenize = lambda x: x.split()
    TEXT = Field(sequential=True, tokenize=tokenize, lower=True)
    TEXT.vocab = vocab
    tst_datafields = [("titles", TEXT)]
    tst = TabularDataset(path='Data/test.csv',
                         format='csv',
                         skip_header=True,
                         fields=tst_datafields)

    test_iter = Iterator(tst,
                         batch_size=32,
                         sort=False,
                         sort_within_batch=False,
                         repeat=False)

    test_dl = BatchWrapper(test_iter, "titles", None)

    return test_dl, tst
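test_loader wraps its iterator in a BatchWrapper; a minimal sketch of that common torchtext idiom (assumed, not necessarily the original class):

import torch

class BatchWrapper:
    """Yield (x, y) pairs from a torchtext iterator; y is None when y_vars is None."""

    def __init__(self, iterator, x_var, y_vars):
        self.iterator, self.x_var, self.y_vars = iterator, x_var, y_vars

    def __iter__(self):
        for batch in self.iterator:
            x = getattr(batch, self.x_var)
            if self.y_vars is None:
                y = None
            else:
                y = torch.cat([getattr(batch, var).unsqueeze(1).float()
                               for var in self.y_vars], dim=1)
            yield x, y

    def __len__(self):
        return len(self.iterator)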
Example #16
def preprocess_couplet():
    SRC = Field(include_lengths=True,
                init_token="<sos>",
                eos_token="<eos>",
                pad_token="<pad>",
                unk_token="<unk>",
                lower=True,
                batch_first=False,
                tokenize=lambda text: text.split())
    TRG = Field(include_lengths=True,
                init_token="<sos>",
                eos_token="<eos>",
                pad_token="<pad>",
                unk_token="<unk>",
                lower=True,
                batch_first=False,
                tokenize=lambda text: text.split())
    _train, _test = TabularDataset.splits(path="data/couplet", root="data", train="train.tsv", test="test.tsv",
                                    format='csv', skip_header=False, fields=[("src", SRC), ("trg", TRG)],
                                    csv_reader_params={"quoting": csv.QUOTE_NONE, "delimiter": "\t"})
    SRC.build_vocab(_train.src, _train.trg, min_freq=1)
    TRG.vocab = SRC.vocab
    return _train, _test, SRC, TRG
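A usage sketch: wrapping the datasets returned by preprocess_couplet in BucketIterators; the batch size and device are illustrative:

from torchtext.data import BucketIterator

_train, _test, SRC, TRG = preprocess_couplet()
train_iter, test_iter = BucketIterator.splits(
    (_train, _test),
    batch_size=64,
    sort_key=lambda ex: len(ex.src),
    sort_within_batch=True,
    device='cpu')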
Example #17
def init_lm(config_path, state_path, model_cls_name: str):
    model_cls = MODEL_CLASSES[model_cls_name]
    hp = load_config(config_path).get('hp')
    get_path = create_get_path_fn(state_path)

    # Loading vocab
    field = Field(eos_token=EOS_TOKEN,
                  batch_first=True,
                  tokenize=char_tokenize,
                  pad_first=True)
    field.vocab = pickle.load(open(get_path('vocab', 'pickle'), 'rb'))

    print('Loading models..')
    device = None if torch.cuda.is_available() else 'cpu'

    if model_cls is RNNLM:
        lm = cudable(RNNLM(hp.model_size, field.vocab,
                           n_layers=hp.n_layers)).eval()
        lm.load_state_dict(torch.load(get_path('lm'), map_location=device))
    elif model_cls is ConditionalLM:
        lm = cudable(ConditionalLM(hp.model_size, field.vocab)).eval()
        lm.load_state_dict(torch.load(get_path('lm'), map_location=device))
    elif model_cls is CharLMFromEmbs:
        rnn_lm = cudable(
            RNNLM(hp.model_size, field.vocab, n_layers=hp.n_layers))
        style_embed = cudable(nn.Embedding(2, hp.model_size))

        rnn_lm.load_state_dict(torch.load(get_path('lm'), map_location=device))
        style_embed.load_state_dict(
            torch.load(get_path('style_embed'), map_location=device))

        lm = cudable(CharLMFromEmbs(rnn_lm, style_embed,
                                    n_layers=hp.n_layers)).eval()
    else:
        raise NotImplementedError

    return lm, field
Example #18
def load_dataset(config, device):

    label_dict = {"observing": 0, "against": 1, "for": 2}
    LABEL = Field(use_vocab=False, sequential=False,
                  dtype=torch.long, preprocessing=lambda x: label_dict[x.strip()])

    SEQ = Field(dtype=torch.long, lower=True, batch_first=True,
                preprocessing=lambda x: x[:45], include_lengths=True)
    SENT = Field(dtype=torch.long, lower=True, batch_first=True,
                 preprocessing=lambda x: x[:45], include_lengths=False)

    DOC = NestedField(SENT, tokenize=lambda s: s.strip().split(' </s> '),
                      preprocessing=lambda s: [x for x in s[:45] if x],
                      dtype=torch.long, include_lengths=True)

    fields = [('label', LABEL), ('claim', SEQ), ('hline', SEQ),
              ('abst', SEQ), ('body', DOC)]

    train, test = TabularDataset.splits(path="../stance_data/", format="tsv",
                                        fields=fields, train=config.train_file,
                                        test=config.test_file)
    train, val = train.split(split_ratio=0.80)

    vectors = GloVe(name="6B",
                    dim=config.embed_dim,
                    cache='/users4/jwduan/vectors/')
    DOC.build_vocab(train, val, test, vectors=vectors)

    SEQ.build_vocab()
    SEQ.vocab = DOC.vocab

    config.vocab_size = len(DOC.vocab)
    train_loader, val_loader, test_loader = Iterator.splits(
        (train, val, test),
        batch_sizes=(config.batch_size, 256, 256),
        sort_key=lambda x: len(x.body), sort=True,
        device=device, shuffle=True, repeat=False)

    return (train_loader, val_loader, test_loader), DOC.vocab.vectors
Example #19
def reformat_data(data,
                  data_torchaudio,
                  trg_min_freq,
                  trg_max_size,
                  tok_fun,
                  trg_vocab_file=None,
                  trg_vocab=None,
                  lowercase=True):
    train_iter = data

    src_field = Noprocessfield(sequential=False,
                               use_vocab=False,
                               dtype=torch.double,
                               include_lengths=True)
    trg_field = Field(init_token=BOS_TOKEN,
                      eos_token=EOS_TOKEN,
                      pad_token=PAD_TOKEN,
                      tokenize=tok_fun,
                      unk_token=UNK_TOKEN,
                      batch_first=True,
                      lower=lowercase,
                      include_lengths=True)
    if trg_vocab is None:
        trg_vocab = build_vocab(min_freq=trg_min_freq,
                                max_size=trg_max_size,
                                dataset=data_torchaudio,
                                trg_field=trg_field,
                                vocab_file=trg_vocab_file)
    trg_field.vocab = trg_vocab

    entry_list = []
    for i, batch in enumerate(iter(train_iter)):
        # reactivate training
        entry_list.append(Entry(batch[0][0].squeeze(), batch[0][1]))
    train_data = Dataset(entry_list, [('src', src_field), ('trg', trg_field)])
    return train_data, trg_vocab, src_field, trg_field
Example #20
def getData():
    german = Field(tokenize=tokenize_ger,
                   lower=True,
                   init_token="<sos>",
                   eos_token="<eos>",
                   pad_token="<pad>",
                   unk_token="<unk>")

    english = Field(tokenize=tokenize_eng,
                    lower=True,
                    init_token="<sos>",
                    eos_token="<eos>",
                    pad_token="<pad>",
                    unk_token="<unk>")

    print("===============================before ")
    train_data, valid_data, test_data = Multi30k.splits(
        exts=(".ennsw", ".en"),
        fields=(german, english),
        # root='.data',
        train='train',
        validation='val',
        test='test2016',
        path='.data/multi30k')

    # train_data, valid_data, test_data = Multi30k.splits(
    #     exts=(".tgtnsw", ".tgt"), fields=(german, english),
    #     # root='.data',
    #     train='train',
    #     validation='valid',
    #     test='test',
    #     path='/data/chaudhryz/uwstudent1/data_zaid_short'
    # )

    #The study’s questions are carefully worded and chosen.
    # The study questions were carefully worded and chosen.

    # train_data, valid_data, test_data = Multi30k.splits(
    #     exts=(".src", ".tgt"), fields=(german, english),
    #     # root='.data',
    #     train='train',
    #     validation='valid',
    #     test='test',
    #     path = '/data/chaudhryz/uwstudent1/GDATA'
    # )

    #german.build_vocab(train_data, max_size=10000, min_freq=2)
    #english.build_vocab(train_data, max_size=10000, min_freq=2)

    #german.vocab.init_token = "<sos>"
    #german.vocab.eos_token = "<eos>"

    #english.vocab.init_token = "<sos>"
    #english.vocab.eos_token = "<eos>"
    # print("Train")
    # for i in range(10):
    #     #print(train_data[i].src, train_data[i].trg)
    #     printSent(train_data[i].src)
    #     printSent(train_data[i].trg)

    # print("Test")
    # for i in range(10):
    #     #print(train_data[i].src, train_data[i].trg)
    #     printSent(test_data[i].src)
    #     printSent(test_data[i].trg)
    # exit()

    # a = {'GermanVocab': german.vocab, 'EnglishVocab': english.vocab}

    # with open('filename.pickle', 'wb') as handle:
    #     pickle.dump(a, handle, protocol=pickle.HIGHEST_PROTOCOL)
    #
    with open('filename.pickle', 'rb') as handle:
        b = pickle.load(handle)

    german.vocab = b['GermanVocab']
    english.vocab = b['EnglishVocab']

    #
    # print
    # a == b

    return german.vocab, english.vocab, train_data, valid_data, test_data
Example #21
                target.append(labels[file[:-4]])

    return list(zip(sentences, target))


# Define the Fields
TEXT = Field(sequential=True,
             tokenize=lambda x: jb.lcut(x),
             lower=True,
             use_vocab=True)
LABEL = Field(sequential=False, use_vocab=False)
FIELDS = [('text', TEXT), ('category', LABEL)]

# Build the Chinese vocabulary
with open("vocab.pkl", 'rb') as vocab:
    TEXT.vocab = pickle.load(vocab)
# ----------------------------- Please load the model you are most satisfied with -------------------------------
# Load the model (please load the model you consider best)
# When loading, note that model_path is a relative path at the same level as this file.
# If your model is temp.pth inside the results folder, then model_path = 'results/temp.pth'

# Create a model instance
vocab_size = len(TEXT.vocab)
model = Net(vocab_size)
model_path = "results/model.pth"
model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))

# ------------------------- Do not modify the inputs and outputs of the predict function -------------------------


def predict(text):
Example #22
def train_data():
    tokenize = lambda x: x.split()

    Text_src = Field(sequential=True, tokenize=tokenize, eos_token='<EOS>', include_lengths=True, lower=True)
    Answer = Field(sequential=True, tokenize=tokenize, eos_token='<EOS>', include_lengths=True, lower=True)
    Text_tgt = Field(sequential=True, tokenize=tokenize, eos_token='<EOS>',
                     include_lengths=True, init_token='<SOS>', lower=True)


    trn_datafields = [("source",Text_src),
                    ("target", Text_tgt),
                    ("answer", Answer)]
    trn, val = TabularDataset.splits(
        path="../data/"+str(data_name), # the root directory where the data lies
        train='train.json', validation = 'validation.json',
        format='json',
        # skip_header=True, # if your csv file has a header, make sure to pass this to ensure it doesn't get processed as data!
        fields={'source': trn_datafields[0], 'target': trn_datafields[1], 'answer': trn_datafields[2]})

    # Text_src.build_vocab(trn, max_size=vocab_size)
    Text_src.build_vocab(trn, max_size=src_vocab_size)
    Text_tgt.build_vocab(trn, max_size=tgt_vocab_size)
    Answer.build_vocab(trn)
    Text_src.vocab.load_vectors("glove.840B.300d")
    Text_tgt.vocab.load_vectors("glove.840B.300d")

    train_iter, val_iter = BucketIterator.splits(
            (trn, val), # we pass in the datasets we want the iterator to draw data from
            batch_sizes= (batch_size, batch_size),
            device=-1, # if you want to use the GPU, specify the GPU number here
            sort_key=lambda x: len(x.source), # the BucketIterator needs to be told what function it should use to group the data.
            sort_within_batch=True,
            shuffle = True,
            repeat= False)


    Text_tgt_r = ReversibleField(sequential=True, include_lengths=True,
                                 eos_token='<EOS>', init_token='<SOS>', lower=True)
    Text_tgt_r.vocab = Text_tgt.vocab

    Text_src_r = ReversibleField(sequential=True, include_lengths=True,
                                 eos_token='<EOS>', lower=True)
    Text_src_r.vocab = Text_src.vocab

    Text_ans_r = ReversibleField(sequential=True, tokenize=tokenize,
                   eos_token='<EOS>', include_lengths=True, lower=True)
    Text_ans_r.vocab = Answer.vocab

    src_pad = Text_src.vocab.stoi['<pad>']
    src_unk = Text_src.vocab.stoi['<unk>']
    src_eos = Text_src.vocab.stoi['<EOS>']
    src_special = [src_pad, src_unk, src_eos]

    ans_pad = Answer.vocab.stoi['<pad>']
    ans_unk = Answer.vocab.stoi['<unk>']
    ans_eos = Answer.vocab.stoi['<EOS>']
    ans_special = [ans_pad, ans_unk, ans_eos]

    tgt_pad = Text_tgt.vocab.stoi['<pad>']
    tgt_unk = Text_tgt.vocab.stoi['<unk>']
    tgt_eos = Text_tgt.vocab.stoi['<EOS>']
    tgt_sos = Text_tgt.vocab.stoi['<SOS>']
    tgt_special = [tgt_pad, tgt_unk, tgt_eos, tgt_sos]


    # discriminator data iterator
    passage = Field(sequential=True, tokenize=tokenize, eos_token='<EOS>', include_lengths=True, lower=True)
    ans = Field(sequential=True, tokenize=tokenize, eos_token='<EOS>', include_lengths=True, lower=True)
    ques = Field(sequential=True, tokenize=tokenize, eos_token='<EOS>',include_lengths=True, lower=True)
    target = Field(sequential=False, use_vocab=False)

    disc_trn_datafields = [("question", ques),
                      ("answer", ans),
                      ("passage", passage),
                      ("target", target)]

    disc_trn = TabularDataset(
        path="../data/" + str(data_name) + "/disc.json",  # the root directory where the data lies
        # train='disc.json',
        format='json',
        # skip_header=True, # if your csv file has a header, make sure to pass this to ensure it doesn't get processed as data!
        fields={'question': disc_trn_datafields[0], 'answer': disc_trn_datafields[1], 'passage': disc_trn_datafields[2], 'target': disc_trn_datafields[3]})

    passage.vocab = Text_src.vocab
    ans.vocab = Answer.vocab
    ques.vocab = Text_tgt.vocab

    disc_train_iter = BucketIterator(
        dataset=disc_trn,  # we pass in the datasets we want the iterator to draw data from
        batch_size = batch_size,
        device=-1,  # if you want to use the GPU, specify the GPU number here
        sort_key=lambda x: len(x.question),
        # the BucketIterator needs to be told what function it should use to group the data.
        sort_within_batch=True,
        shuffle=True,
        repeat=False)



    # raw data iterator
    Text_tgt_raw = ReversibleField(sequential=True, tokenize=tokenize, include_lengths=True, lower=True)

    trn_datafields = [("source", Text_tgt_raw),
                      ("target", Text_tgt_raw)]
    trn_raw, val_raw = TabularDataset.splits(
        path="../data/"+str(data_name),  # the root directory where the data lies
        train='train.json', validation='validation.json',
        format='json',
        # skip_header=True,
        # if your csv file has a header, make sure to pass this to ensure it doesn't get processed as data!
        fields={'source': trn_datafields[0], 'target': trn_datafields[1]})


    Text_tgt_raw.build_vocab(val_raw)

    train_iter_raw, val_iter_raw = BucketIterator.splits(
        (trn_raw, val_raw),  # we pass in the datasets we want the iterator to draw data from
        batch_sizes=(batch_size, batch_size),
        device=-1,  # if you want to use the GPU, specify the GPU number here
        sort_key=lambda x: len(x.source),
        # the BucketIterator needs to be told what function it should use to group the data.
        sort_within_batch=True,
        shuffle=True,
        repeat=False)


    return train_iter, val_iter, src_special, tgt_special, Text_tgt_r, val_iter_raw, Text_tgt_raw, Text_src_r,\
           Text_src, Text_tgt, ans_special, Text_ans_r, disc_train_iter
Example #23
    def __init__(self,
                 path,
                 batch_size,
                 extensions=(".src", ".trg"),
                 src_vocab: Vocab = Ref("model.src_vocab"),
                 trg_vocab: Vocab = Ref("model.trg_vocab"),
                 level=Ref("model.level"),
                 sort=False,
                 sort_within_batch=False,
                 batch_by_words=True,
                 batch_first=Ref("model.batch_first", True),
                 multiple: int = Ref("exp_global.multiple", 1),
                 max_len=1000,
                 subword_model=None,
                 subword_alpha=0.1,
                 subword_nbest=64):
        self.max_len = max_len
        self.src_vocab = src_vocab
        self.trg_vocab = trg_vocab
        tokenize = None
        if level != "word":
            tokenize = list

        if subword_model is not None:
            import sentencepiece as spm
            self.subword_model = spm.SentencePieceProcessor()
            self.subword_model.load(subword_model)
            tokenize = self.split_subwords
        else:
            self.subword_model = None
        self.subword_alpha = subword_alpha
        self.subword_nbest = subword_nbest

        logger.info(f"Loading {path}")
        src = Field(batch_first=batch_first,
                    tokenize=tokenize,
                    include_lengths=True,
                    preprocessing=None,
                    postprocessing=self.postprocess_src)
        src.vocab = src_vocab
        if os.path.exists(os.path.expanduser(path + extensions[1])):
            has_target = True
            trg = Field(batch_first=batch_first,
                        tokenize=tokenize,
                        include_lengths=True,
                        init_token=src_vocab.bos_token,
                        eos_token=trg_vocab.eos_token,
                        is_target=True,
                        preprocessing=None,
                        postprocessing=self.postprocess_trg)
            trg.vocab = trg_vocab
            fields = [('src', src), ('trg', trg)]

            TorchTextDataset.__init__(
                self,
                self.load_parallel_data(
                    os.path.expanduser(path + extensions[0]),
                    os.path.expanduser(path + extensions[1]), fields), fields)
        else:
            has_target = False
            fields = [('src', src)]

            TorchTextDataset.__init__(
                self,
                self.load_source_data(os.path.expanduser(path + extensions[0]),
                                      fields[0]), fields)
        BaseTranslationDataset.__init__(self, batch_size, level, sort,
                                        sort_within_batch, batch_by_words,
                                        batch_first, multiple, has_target)
Example #24
# %%
# region Dataset & DataLoader
dataset = Dataset(examples, fields)
train, valid, test = dataset.split(split_ratio=[0.6, 0.2, 0.2],
                                   stratified=False,
                                   strata_field='label')
# vocab
vectors = GloVe(name='6B', dim=300)
source = [
    getattr(dataset, item)
    for item in ['title', 'abstr', 'intro', 'relat', 'metho', 'concl']
]
TITLE.build_vocab(source, vectors=vectors, max_size=opt.vocab_size)
TITLE.vocab.vectors.unk_init = init.xavier_uniform
ABSTR.vocab = TITLE.vocab
INTRO.vocab = TITLE.vocab
RELAT.vocab = TITLE.vocab
METHO.vocab = TITLE.vocab
CONCL.vocab = TITLE.vocab
AUTHO.build_vocab(train, max_size=1600)

# Iterator
if not opt.notrain:
    train_iter, valid_iter = BucketIterator.splits((train, valid),
                                                   batch_size=opt.batch_size,
                                                   sort=False)
test_iter = BucketIterator(test,
                           batch_size=opt.batch_size,
                           sort=False,
                           train=False,
Example #25
    def __init__(self,
                 data_file,
                 vocab_file,
                 batch_size=256,
                 property_column=None):
        self.batch_size = batch_size

        smi_field = Field(sequential=True,
                          init_token='<sos>',
                          eos_token=' ',
                          pad_token=' ',
                          include_lengths=True,
                          batch_first=True,
                          tokenize=smi_tokenizer)
        property_field = Field(sequential=False,
                               use_vocab=False,
                               dtype=torch.float)
        # load smile data
        # with open(data_file, 'r') as f:
        #     mol_strs = f.read().strip().split('\n')
        #     mol_strs = [mol.replace(' ', '') for mol in mol_strs]
        # mol_strs = [smi_field.preprocess(mol) for mol in mol_strs]
        mol_strs = []
        smi_examples = []
        fields = [('smile', smi_field), ('property', property_field)]
        for index, row in data_file.iterrows():
            mol_str = smi_field.preprocess(row['smiles'])
            # prop_str = property_field.preprocess(row[property_column].tolist())
            if property_column is not None:
                ex = Example.fromlist([mol_str, row[property_column].tolist()],
                                      fields)
            else:
                ex = Example.fromlist([mol_str, [0]], fields)
            mol_strs.append(mol_str)
            smi_examples.append(ex)

        # load or build vocab
        if os.path.isfile(vocab_file):
            print('load vocab from:', vocab_file)
            smi_field.vocab = pickle.load(open(vocab_file, 'rb'))
        else:
            print('build and save vocab file:', vocab_file)
            smi_field.build_vocab(mol_strs)
            pickle.dump(smi_field.vocab, open(vocab_file, 'wb'), protocol=2)

        self.dset_num = len(mol_strs)
        # self.dset_test_num = int(self.dset_num * 0.2)
        self.vocab = smi_field.vocab
        self.vocab_size = len(smi_field.vocab.itos)
        if property_column is not None:
            self.prop_size = len(property_column)
        else:
            self.prop_size = 0
        self.padding_idx = smi_field.vocab.stoi[smi_field.pad_token]
        self.sos_idx = smi_field.vocab.stoi[smi_field.init_token]
        self.eos_idx = smi_field.vocab.stoi[smi_field.eos_token]
        self.unk_idx = smi_field.vocab.stoi[smi_field.unk_token]

        self.dataset_smi = Dataset(smi_examples, fields=fields)
        self.train_smi, self.test_smi = self.dataset_smi.split(0.8)
        self.dset_test_num = len(self.test_smi)
Example #26
def load_dataset(hparams, is_eval=False, test_data_path=None):
    batch_size = hparams.batch_size
    max_copy_token_num = hparams.max_copy_token_num
    pointer_copy_tokens = hparams.max_copy_token_num if hparams.copy else 0

    def tokenize(text):
        return text.strip('\r').split()

    src_field = Field(tokenize=tokenize,
                      include_lengths=True,
                      init_token='<ssos>',
                      eos_token='<seos>')
    tgt_field = Field(tokenize=tokenize,
                      include_lengths=True,
                      init_token='<sos>',
                      eos_token='<eos>')
    fields = [('src', src_field), ('tgt', tgt_field)]

    if not hparams.share_vocab:
        logger.info('[VOCAB] Constructing two vocabs for the src and tgt')
        logger.info('[VOCAB] Loading src vocab from: %s' %
                    hparams.src_vocab_path)
        load_vocab(hparams.src_vocab_path, src_field)
        logger.info('[VOCAB] src vocab size: %d' % len(src_field.vocab.itos))
        logger.info('[VOCAB] Loading tgt vocab from: %s' %
                    hparams.tgt_vocab_path)
        load_vocab(hparams.tgt_vocab_path,
                   tgt_field,
                   pointer_copy_tokens=pointer_copy_tokens)
        logger.info('[VOCAB] tgt vocab size: %d' % len(tgt_field.vocab.itos))
    else:
        logger.info('[VOCAB] Constructing a sharing vocab for the src and tgt')
        logger.info('[VOCAB] Loading src&tgt vocab from: %s' %
                    hparams.src_vocab_path)
        load_vocab(hparams.vocab_path,
                   src_field,
                   pointer_copy_tokens=pointer_copy_tokens,
                   special_tokens=[
                       tgt_field.unk_token, tgt_field.pad_token,
                       tgt_field.init_token, tgt_field.eos_token
                   ])
        tgt_field.vocab = src_field.vocab
        logger.info('[VOCAB] src vocab size: %d' % len(src_field.vocab.itos))
        logger.info('[VOCAB] tgt vocab size: %d' % len(tgt_field.vocab.itos))

    def sort_key(x):
        return len(x.tgt) + len(x.src) * 100

    device = 'cuda' if hparams.cuda else 'cpu'

    val, max_val_len = get_dataset(hparams.val_data_path_prefix,
                                   fields=fields,
                                   max_src_len=hparams.max_src_len,
                                   max_tgt_len=hparams.max_tgt_len,
                                   pointer_copy=hparams.copy,
                                   word_freq_dict=src_field.vocab.stoi)
    test, max_test_len = get_dataset(hparams.test_data_path_prefix
                                     if not test_data_path else test_data_path,
                                     fields=fields,
                                     pointer_copy=hparams.copy,
                                     max_src_len=hparams.max_src_len,
                                     max_tgt_len=hparams.max_tgt_len,
                                     word_freq_dict=src_field.vocab.stoi)

    if hparams.copy:
        assert max_val_len + 1 < max_copy_token_num, max_val_len
        assert max_test_len + 1 < max_copy_token_num, max_test_len

    if not is_eval:
        logger.info('[DATASET] Training Mode')
        train, max_train_len = get_dataset(hparams.train_data_path_prefix,
                                           fields=fields,
                                           pointer_copy=hparams.copy,
                                           max_src_len=hparams.max_src_len,
                                           max_tgt_len=hparams.max_tgt_len,
                                           word_freq_dict=src_field.vocab.stoi)
        if hparams.copy:
            assert max_train_len + 1 < max_copy_token_num, max_train_len
        train_iter = BucketIterator(train,
                                    batch_size=batch_size,
                                    repeat=False,
                                    shuffle=True,
                                    sort_key=sort_key,
                                    sort=False,
                                    train=True,
                                    sort_within_batch=True,
                                    device=device)
        val_iter = BucketIterator(val,
                                  batch_size=batch_size,
                                  repeat=False,
                                  shuffle=True,
                                  sort_key=sort_key,
                                  sort=False,
                                  train=False,
                                  sort_within_batch=True,
                                  device=device)
        test_iter = Iterator(test,
                             batch_size=batch_size,
                             repeat=False,
                             shuffle=False,
                             sort_key=sort_key,
                             sort=False,
                             train=False,
                             sort_within_batch=False,
                             device=device)
        return train_iter, val_iter, test_iter, src_field, tgt_field
    else:
        logger.info('[DATASET] Eval/Inference Mode')
        val_iter = Iterator(val,
                            batch_size=batch_size,
                            repeat=False,
                            shuffle=False,
                            sort_key=sort_key,
                            sort=False,
                            train=False,
                            sort_within_batch=False,
                            device=device)
        test_iter = Iterator(test,
                             batch_size=batch_size,
                             repeat=False,
                             shuffle=False,
                             sort_key=sort_key,
                             sort=False,
                             train=False,
                             sort_within_batch=False,
                             device=device)
        return None, val_iter, test_iter, src_field, tgt_field
Example #27
                                             validation=val_path,
                                             test=test_path,
                                             format='tsv',
                                             fields=data_fields)
    #TR.build_vocab(train, min_freq=MIN_FREQ, max_size=params.vocab_size)
    #EN.build_vocab(train, min_freq=MIN_FREQ, max_size=params.vocab_size)

print("Building vocab...")

MIN_FREQ = 1
if USE_NEW_DOUBLE_TR and args.bpe:
    TR_CONTEXT.build_vocab(train.src,
                           train.src_context,
                           min_freq=MIN_FREQ,
                           max_size=VOCAB_SIZE)
    TR_SRC.vocab = TR_CONTEXT.vocab
    TR = TR_SRC
else:
    TR.build_vocab(train, min_freq=MIN_FREQ, max_size=VOCAB_SIZE)

EN.build_vocab(train, min_freq=MIN_FREQ, max_size=VOCAB_SIZE)
pad_idx = EN.vocab.stoi[PAD]

print('making validation iterator')
valid_iter = Iterator(val,
                      batch_size=BATCH_SIZE,
                      device=device,
                      repeat=False,
                      sort=False,
                      train=False)
print('done')
Example #28
import json
import string
from pathlib import Path
from typing import Tuple

from torchtext.data import Field, RawField
import numpy as np

from utils.entities_list import Entities_list
from utils.class_utils import keys_vocab_cls, iob_labels_vocab_cls, entities_vocab_cls

MAX_BOXES_NUM = 130  # limit max number boxes of every documents
MAX_TRANSCRIPT_LEN = 70  # limit max length text of every box

# text string label converter
TextSegmentsField = Field(sequential=True, use_vocab=True, include_lengths=True, batch_first=True)
TextSegmentsField.vocab = keys_vocab_cls
# iob string label converter
IOBTagsField = Field(sequential=True, is_target=True, use_vocab=True, batch_first=True)
IOBTagsField.vocab = iob_labels_vocab_cls


class Document:
    def __init__(self, boxes_and_transcripts_file: Path, image_file: Path,
                 resized_image_size: Tuple[int, int] = (560,784),
                 iob_tagging_type: str = 'box_level', entities_file: Path = None, training: bool = True,
                 image_index=None, max_boxes_num = MAX_BOXES_NUM, max_transcript_len = MAX_TRANSCRIPT_LEN):
        '''

        :param boxes_and_transcripts_file: gt or ocr results file
        :param image_file: whole images file
        :param resized_image_size: resize whole image size, (w, h)
Example #29
def translate(cfg_file, ckpt: str, output_path: str = None) -> None:
    """
    Interactive translation function.
    Loads model from checkpoint and translates either the stdin input or
    asks for input to translate interactively.
    The input has to be pre-processed according to the data that the model
    was trained on, i.e. tokenized or split into subwords.
    Translations are printed to stdout.

    :param cfg_file: path to configuration file
    :param ckpt: path to checkpoint to load
    :param output_path: path to output file
    """
    def _load_line_as_data(line):
        """ Create a dataset from one line via a temporary file. """
        # write src input to temporary file
        tmp_name = "tmp"
        tmp_suffix = ".src"
        tmp_filename = tmp_name + tmp_suffix
        with open(tmp_filename, "w") as tmp_file:
            tmp_file.write("{}\n".format(line))

        test_data = MonoDataset(path=tmp_name, ext=tmp_suffix, field=src_field)

        # remove temporary file
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)

        return test_data

    logger = make_logger()

    def _translate_data(test_data):
        """ Translates given dataset, using parameters from outer scope. """
        # pylint: disable=unused-variable
        score, loss, ppl, sources, sources_raw, references, hypotheses, \
        hypotheses_raw, attention_scores = validate_on_data(
            model, data=test_data, batch_size=batch_size,
            batch_type=batch_type, level=level,
            max_output_length=max_output_length, eval_metric="",
            use_cuda=use_cuda, loss_function=None, beam_size=beam_size,
            beam_alpha=beam_alpha, logger=logger)
        return hypotheses

    cfg = load_config(cfg_file)

    # when checkpoint is not specified, take the latest from the model dir
    if ckpt is None:
        model_dir = cfg["training"]["model_dir"]
        ckpt = get_latest_checkpoint(model_dir)

    batch_size = cfg["training"].get("eval_batch_size",
                                     cfg["training"].get("batch_size", 1))
    batch_type = cfg["training"].get(
        "eval_batch_type", cfg["training"].get("batch_type", "sentence"))
    use_cuda = cfg["training"].get("use_cuda", False)
    level = cfg["data"]["level"]
    max_output_length = cfg["training"].get("max_output_length", None)

    # read vocabs
    src_vocab_file = cfg["data"].get(
        "src_vocab", cfg["training"]["model_dir"] + "/src_vocab.txt")
    trg_vocab_file = cfg["data"].get(
        "trg_vocab", cfg["training"]["model_dir"] + "/trg_vocab.txt")
    src_vocab = Vocabulary(file=src_vocab_file)
    trg_vocab = Vocabulary(file=trg_vocab_file)

    data_cfg = cfg["data"]
    level = data_cfg["level"]
    lowercase = data_cfg["lowercase"]

    tok_fun = lambda s: list(s) if level == "char" else s.split()

    src_field = Field(init_token=None,
                      eos_token=EOS_TOKEN,
                      pad_token=PAD_TOKEN,
                      tokenize=tok_fun,
                      batch_first=True,
                      lower=lowercase,
                      unk_token=UNK_TOKEN,
                      include_lengths=True)
    src_field.vocab = src_vocab

    # load model state from disk
    model_checkpoint = load_checkpoint(ckpt, use_cuda=use_cuda)

    # build model and load parameters into it
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)
    model.load_state_dict(model_checkpoint["model_state"])

    if use_cuda:
        model.cuda()

    # whether to use beam search for decoding, <2: greedy decoding
    if "testing" in cfg.keys():
        beam_size = cfg["testing"].get("beam_size", 1)
        beam_alpha = cfg["testing"].get("alpha", -1)
    else:
        beam_size = 1
        beam_alpha = -1

    if not sys.stdin.isatty():
        # input file given
        test_data = MonoDataset(path=sys.stdin, ext="", field=src_field)
        hypotheses = _translate_data(test_data)

        if output_path is not None:
            # write to outputfile if given
            output_path_set = "{}".format(output_path)
            with open(output_path_set, mode="w", encoding="utf-8") as out_file:
                for hyp in hypotheses:
                    out_file.write(hyp + "\n")
            logger.info("Translations saved to: %s.", output_path_set)
        else:
            # print to stdout
            for hyp in hypotheses:
                print(hyp)

    else:
        # enter interactive mode
        batch_size = 1
        batch_type = "sentence"
        while True:
            try:
                src_input = input("\nPlease enter a source sentence "
                                  "(pre-processed): \n")
                if not src_input.strip():
                    break

                # every line has to be made into dataset
                test_data = _load_line_as_data(line=src_input)

                hypotheses = _translate_data(test_data)
                print("JoeyNMT: {}".format(hypotheses[0]))

            except (KeyboardInterrupt, EOFError):
                print("\nBye.")
                break
Example #30
    def __init__(self,
                 module_name,
                 train_bs,
                 eval_bs,
                 device,
                 vocab=None,
                 base_folder=None,
                 train_name=None,
                 eval_name=None,
                 x_ext=None,
                 y_ext=None,
                 tokens=None,
                 specials=None,
                 tokenizer=None,
                 sort_within_batch=None,
                 shuffle=None):

        self.module_name = module_name

        # split_chars = lambda x: list("".join(x.split()))
        split_chars = lambda x: list(x)  # keeps whitespaces

        if not tokenizer:
            tokenizer = split_chars

        # NOTE: on Jul-20-2020, removed fix_length=200 since it forces
        # all batches to be of size (batch_size, 200) which
        # really wastes GPU memory
        source = Field(tokenize=tokenizer,
                       init_token='<sos>',
                       eos_token='<eos>',
                       batch_first=True)

        target = Field(tokenize=tokenizer,
                       init_token='<sos>',
                       eos_token='<eos>',
                       batch_first=True)

        base_folder = os.path.expanduser(base_folder)

        folder = os.path.join(base_folder, module_name)

        # fix slashes
        folder = os.path.abspath(folder)

        print("loading FULL datasets from folder={}".format(folder))

        train_dataset, eval_dataset, _ = TranslationDataset.splits(
            path=folder,
            root=folder,
            exts=(x_ext, y_ext),
            fields=(source, target),
            train=train_name,
            validation=eval_name,
            test=eval_name)

        if vocab:
            print("Setting vocab to prebuilt file...")
            source.vocab = vocab
            target.vocab = vocab
        elif tokens:
            print("Building vocab from tokens...")
            #source.build_vocab(tokens, specials)
            counter = Counter(tokens)
            source.vocab = source.vocab_cls(counter, specials=specials)
            target.vocab = source.vocab
        else:
            print("Building vocab from TRAIN and EVAL datasets...")
            source.build_vocab(train_dataset, eval_dataset)
            target.vocab = source.vocab

        print("Creating iterators ...")
        do_shuffle = True if shuffle is None else shuffle
        train_iterator = Iterator(dataset=train_dataset,
                                  batch_size=train_bs,
                                  train=True,
                                  repeat=True,
                                  shuffle=do_shuffle,
                                  sort_within_batch=sort_within_batch,
                                  device=device)

        eval_iterator = Iterator(dataset=eval_dataset,
                                 batch_size=eval_bs,
                                 train=False,
                                 repeat=False,
                                 shuffle=False,
                                 sort_within_batch=sort_within_batch,
                                 device=device)

        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset

        self.train_iterator = train_iterator
        self.eval_iterator = eval_iterator

        self.source = source
        self.target = target