Example #1
    fields=fields,
    path=filepath,
    separator=' ',
    train='train.txt',
    validation='valid.txt',
    test='test.txt')
BATCH_SIZE = 16
WORD.build_vocab(train_data,
                 vectors='glove.6B.100d',
                 unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)
# print(word_frqs[:20])
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iter = data.Iterator(dataset=train_data,
                           batch_size=BATCH_SIZE,
                           device=device,
                           repeat=False,
                           sort_within_batch=True,
                           shuffle=True)
valid_iter = data.Iterator(dataset=valid_data,
                           batch_size=BATCH_SIZE,
                           device=device,
                           repeat=False,
                           sort_within_batch=True,
                           shuffle=False)
test_iter = data.Iterator(dataset=test_data,
                          batch_size=BATCH_SIZE,
                          device=device,
                          repeat=False,
                          sort_within_batch=True,
                          shuffle=False)
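
The batches produced by these legacy torchtext iterators expose one tensor attribute per field registered on the dataset. Below is a minimal, self-contained sketch of that behaviour (it assumes the same pre-0.9 torchtext.data API used above; the toy sentences and the field names 'word' and 'label' are made up for illustration):

import torch
from torchtext import data

WORD = data.Field(lower=True)
LABEL = data.Field(unk_token=None)
fields = [('word', WORD), ('label', LABEL)]

examples = [data.Example.fromlist([text, tags], fields)
            for text, tags in [('john lives in london', 'B-PER O O B-LOC'),
                               ('the cat sat', 'O O O')]]
dataset = data.Dataset(examples, fields)
WORD.build_vocab(dataset)
LABEL.build_vocab(dataset)

toy_iter = data.Iterator(dataset, batch_size=2, repeat=False, sort=False,
                         device=torch.device('cpu'))
for batch in toy_iter:
    print(batch.word.shape, batch.label.shape)  # LongTensors of shape [seq_len, batch_size]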
Example #2
def test(args):
    train_data, val_data, test_data, SRC, TGT = prepare_data(args)

    BATCH_SIZE = args.batch_size
    best_bleu_loss = 0
    pad_idx = TGT.vocab.stoi["<pad>"]
    print("Size of source vocabulary:", len(SRC.vocab))
    print("Size of target vocabulary:", len(TGT.vocab))

    print("FC matrix:", args.hidden_dim, args.ff_dim)
    model = transformer.make_model(len(SRC.vocab), len(TGT.vocab),
                                   d_model=args.hidden_dim, d_ff=args.ff_dim,
                                   N=args.num_blocks, compress=args.compress, compress_att=args.compress_attn,
                                   compress_mode=args.compress_mode,
                                   num_compress_enc = args.num_enc_blocks_comp,
                                   num_compress_dec = args.num_dec_blocks_comp,)
    model.to(device)
    if args.load_model:
        print('load model from [%s]' % args.load_model, file=sys.stderr)
        params = torch.load(args.load_model, map_location=lambda storage, loc: storage)
        state_dict = params['model']
        # opts = params['']
        model.load_state_dict(state_dict, strict=False)
    if args.debug:
        # quick check of the number of parameters
        model_full = transformer.make_model(len(SRC.vocab), len(TGT.vocab),
                                       d_model=args.hidden_dim, d_ff=args.ff_dim, \
                                       N=6, compress=False, \
                                       num_compress_enc=0,
                                       num_compress_dec=0)
        debug_compress_info(model_full,model)
        # exit()

    criterion = train_utils.LabelSmoothing(size=len(TGT.vocab), padding_idx=pad_idx, smoothing=0.1)
    criterion.to(device)

    if args.multi_gpu:
        devices = list(np.arange(args.num_devices))
        model_parallel = nn.DataParallel(model, device_ids=devices)

    test_iter = data.Iterator(test_data, batch_size=args.batch_size, train=False, sort=False, repeat=False,
                                  device=device)
    print("Number of examples in test: ", args.batch_size*len([_ for _ in test_iter]))

    # test_loss_fn = train_utils.LossCompute(model.generator, criterion, model_opt)

    os.makedirs(args.save_to_file, exist_ok=True)
    if args.multi_gpu:
        model_parallel.eval()
        start_infer_time = time.time()

        bleu_loss = train_utils.test_decode(model_parallel.module, SRC, TGT, test_iter, 10000,
                                            to_words=True,
                                            file_path=os.path.join(args.save_to_file, args.exp_name))
        print("Time for inference: ", time.time() - start_infer_time)

    else:
        model.eval()
        bleu_loss = train_utils.test_decode(model, SRC, TGT, test_iter, -1,\
                                            to_words=True,
                                            file_path=os.path.join(args.save_to_file, args.exp_name))
    print()
    # print("Test perplexity ", np.exp(loss))
    print("Total bleu:", bleu_loss)
Example #3
    #############################
    # define iterator
    train_iter = data.BucketIterator(train_data,
                                     batch_size=params['BATCH_SIZE'],
                                     device=DEVICE,
                                     sort_within_batch=True,
                                     sort_key=lambda x: len(x.text),
                                     train=True,
                                     repeat=False)

    # train_iter = data.Iterator(train_data, batch_size=1, train=False, sort=False, repeat=False, device=DEVICE)

    valid_iter = data.Iterator(valid_data,
                               batch_size=1,
                               train=False,
                               sort=False,
                               repeat=False,
                               device=DEVICE)

    test_iter = data.Iterator(test_data,
                              batch_size=1,
                              train=False,
                              sort=False,
                              repeat=False,
                              device=DEVICE)

    print_data_info(train_data, valid_data, test_data, SRC, LABEL)

    #############################

    run_lrp(test_iter, vocab=SRC.vocab, model_file='sa_model4.pt')
Example #4
    PAD_INDEX = AMR_SRC.vocab.stoi[PAD_TOKEN]

    print_data_info(my_data, NL_SRC, AMR_SRC)
    train_iter = data.BucketIterator(my_data["train"],
                                     batch_size=BATCH_SIZE,
                                     train=True,
                                     sort_within_batch=True,
                                     sort_key=lambda x:
                                     (len(x.src), len(x.trg)),
                                     repeat=False,
                                     device=DEVICE)

    valid_iter = data.Iterator(my_data["val"],
                               batch_size=1,
                               train=False,
                               sort=False,
                               repeat=False,
                               device=DEVICE)

    model = make_autoencoder(len(NL_SRC.vocab),
                             len(AMR_SRC.vocab),
                             emb_size=500,
                             hidden_size=500,
                             num_layers=2,
                             dropout=0.5)
    dev_perplexities = train(model,
                             num_epochs=NUM_EPOCHS,
                             print_every=500,
                             num_batches=NUM_BATCHES,
                             error_per=error_per)
    torch.save(model, f'{exp_name}.pt')
Example #5
            TEXT.vocab.vectors[i] = vectors[wv_index]
            match_embedding += 1
        else:
            TEXT.vocab.vectors[i] = torch.FloatTensor(dim).uniform_(
                -0.25, 0.25)
else:
    print("Error: Need word embedding pt file")
    exit(1)

print("Embedding match number {} out of {}".format(match_embedding,
                                                   len(TEXT.vocab)))

train_iter = data.Iterator(train,
                           batch_size=args.batch_size,
                           device="cuda",
                           train=True,
                           repeat=False,
                           sort=False,
                           shuffle=True,
                           sort_within_batch=False)
dev_iter = data.Iterator(dev,
                         batch_size=args.batch_size,
                         device="cuda",
                         train=False,
                         repeat=False,
                         sort=False,
                         shuffle=False,
                         sort_within_batch=False)
test_iter = data.Iterator(test,
                          batch_size=args.batch_size,
                          device="cuda",
                          train=False,
Example #6
# BucketIterator
# BPTTIterator
# If you only need an iterator for the training set:
# train_iter = data.BucketIterator(dataset=train, batch_size=8, shuffle=True, sort_within_batch=False, repeat=False)

# Build iterators for the training and validation sets at the same time
train_iter, valid_iter = data.BucketIterator.splits(
    (train, valid),  # the Datasets to build the iterators from
    batch_sizes=(train_batch_size, valid_batch_size),
    device=torch_device,  # when using a GPU, set this to the GPU device id
    sort_key=lambda x: len(x.sentence),  # the BucketIterator needs the text length for bucketing
    sort_within_batch=False,
    repeat=False
)

test_iter = data.Iterator(test, batch_size=test_batch_size, device=torch_device, sort=False, sort_within_batch=False, repeat=False)


from torchtext.vocab import Vectors
import os


cache='../vector_cache'
if not os.path.exists(cache):
    os.mkdir(cache)
vectors = Vectors(name='glove.6B.'+str(vocab_dimension)+'d.txt', cache=cache)

print("build vocab: start")
TEXT.build_vocab(train, vectors=vectors)

vocab = TEXT.vocab
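
Once the vocabulary has been built with pretrained vectors, TEXT.vocab.vectors is a float tensor of shape [len(vocab), vocab_dimension] that is normally copied into the model's embedding layer. A sketch of that step, continuing from the variables defined above (the embedding layer itself is not part of the original snippet):

import torch.nn as nn

# copy the pretrained GloVe weights into an embedding layer (sketch only)
embedding = nn.Embedding(len(vocab), vocab_dimension,
                         padding_idx=vocab.stoi[TEXT.pad_token])
embedding.weight.data.copy_(vocab.vectors)
# embedding.weight.requires_grad = False  # optionally freeze the pretrained weights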
Example #7
def preprocess(question, equation, lQueryVars, sni_model, fields, use_sni):
    # handle $'s
    question = question.replace('$', ' $ ')
    question = question.replace('. ', ' . ')
    question = question.replace('?', ' ? ')
    question = re.sub(r',([\d\d\d])', r'\1', question)  # drop commas inside numbers, e.g. '1,000' -> '1000'

    # join equations if needed
    equation = ' , '.join(equation)

    # separate the equation at operators
    equation = equation.replace('[', ' ( ')
    equation = equation.replace(']', ' ) ')
    equation = equation.replace('+', ' + ')
    equation = equation.replace('-', ' - ')
    equation = equation.replace('*', ' * ')
    equation = equation.replace('/', ' / ')
    equation = equation.replace('(', ' ( ')
    equation = equation.replace(')', ' ) ')
    equation = equation.replace('=', ' = ')
    equation = equation.replace('^', ' ^ ')

    equation = equation.split()
    question = question.split()

    # prevent inplace changes on question
    question_copy = [t for t in question]
    # prepend and postpend null tokens to question to allow for sni window size
    # of three
    question_copy = ['null', 'null', 'null'
                     ] + question_copy + ['null', 'null', 'null']

    # find and replace constants in question and equation
    i = 0
    constants = dict()
    for j, token in enumerate(question):
        if isFloat(token):
            example = question_copy[j - 3:j + 4]
            ex = data.Example.fromlist([' '.join(example), ''], fields)
            dataset = data.Dataset([ex], fields)
            inp = None
            iterator = data.Iterator(dataset, batch_size=1)
            iterator.repeat = False
            for batch in iterator:
                inp = batch.text.t()  #.cuda()
                #inp = inp.cuda(device=0)

            if (not use_sni) or (use_sni and isSignificant(inp, sni_model)):
                token = float(token)
                character = '[' + chr(97 + i) + ']'
                for symbol in equation:
                    if isFloat(symbol) and float(symbol) == float(token):
                        equation[equation.index(symbol)] = character
                constants[character] = str(token)
                for q in question:
                    if isFloat(q) and float(q) == token:
                        question[question.index(q)] = character
                i += 1

    # find and replace variables in equation
    variables = [
        x for x in equation
        if x not in ['+', '-', '*', '/', ',', '**', '(', ')', '=']
        and not isFloat(x) and not re.match(r'\[[a-z]\]', x)
    ]
    variables = np.unique(variables)
    i = 0
    for v in variables:
        #equation = [x if x!=v else 'VAR_' + str(i) for x in equation]
        equation = [x if x != v else 'VAR' for x in equation]
        #equation = [x if x!=v else '[a]' for x in equation]
        i += 1

    question = ' '.join(question)
    equation = ''.join(equation)

    # simplify equation
    print('equation (before):', equation)
    equation = equation.split(',')
    for i, x in enumerate(equation):
        x = x.replace('[', '')
        x = x.replace(']', '')
        x = x.split('=')
        x = str('(' + str(x[0]) + ')' + '-' + '(' + str(x[1]) + ')')
        parse_expr(x, evaluate=False)
        x = sympy.simplify(x)
        x = str(x)
        x = x.replace(' ', '')

        for k in constants.keys():
            x = x.replace(k.strip('[').strip(']'), k)
        equation[i] = x + '=0'

    equation = sorted(equation)
    equation = ','.join(equation)

    j = 0
    print('EQUATION:', equation)
    constants_in_equation = re.findall(r'\[[a-z]\]', equation)
    print(constants_in_equation)
    for k in sorted(constants_in_equation, reverse=False):
        #equation = equation.replace(k, '[' + chr(107 + j) + ']')
        equation = equation.replace(k, '[]')
        j += 1
        print('EQUATION_:', equation)

    #print('constants:', constants)

    print('equation (after): ', equation)

    return question, equation, constants
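
A hedged sketch of how preprocess might be called (the question/equation pair below is made up; fields must be the same [('text', TEXT), ...]-style field list the SNI classifier was trained with, and sni_model may be None when use_sni is False because the significance check is then skipped):

q, eq, consts = preprocess(
    question='John has 1,000 apples and gives away 250 . How many are left ?',
    equation=['x = 1000 - 250'],
    lQueryVars=['x'],
    sni_model=None,
    fields=fields,
    use_sni=False)
# q:      the question with its numbers replaced by placeholders such as [a], [b]
# eq:     the simplified equation rewritten into the form '...=0'
# consts: a mapping from placeholders to the original number strings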
Example #8
def main():
    src_dir = "data/src"
    model_dir = "data/model"
    eval_dir = "data/eval"

    corpus = "lang8_small"

    en_emb = "glove"
    de_emb = "glove"

    seq_train = False

    emb_dim = 200
    batch_size = 1500

    # Data Loading
    vocab_file = os.path.join(model_dir, "%s.vocab" % (corpus))
    model_file = os.path.join(
        model_dir, "%s.%s.%s.transformer.pt" % (corpus, en_emb, de_emb))

    if not os.path.exists(eval_dir):
        os.makedirs(eval_dir)

    # Computing Unit
    device = torch.device("cpu")

    # Loading Data
    bos_word = '<s>'
    eos_word = '</s>'

    blank_word = '<blank>'
    min_freq = 2

    spacy_en = spacy.load('en')

    def tokenize(text):
        return [tkn.text for tkn in spacy_en.tokenizer(text)]

    TEXT = data.Field(tokenize=tokenize,
                      init_token=bos_word,
                      eos_token=eos_word,
                      pad_token=blank_word)

    test = datasets.TranslationDataset(path=os.path.join(src_dir, corpus),
                                       exts=('.test.src', '.test.trg'),
                                       fields=(TEXT, TEXT))
    # use the same order as original data
    test_iter = data.Iterator(test,
                              batch_size=batch_size,
                              device=device,
                              sort=False,
                              repeat=False,
                              train=False)

    random_idx = random.randint(0, len(test) - 1)
    print(test[random_idx].src)
    print(test[random_idx].trg)

    # Vocabulary

    TEXT.vocab = torch.load(vocab_file)
    pad_idx = TEXT.vocab.stoi["<blank>"]

    print("Load %s vocabuary; vocab size = %d" % (corpus, len(TEXT.vocab)))

    # Word Embedding

    encoder_emb, decoder_emb = get_emb(en_emb,
                                       de_emb,
                                       TEXT.vocab,
                                       device,
                                       d_model=emb_dim)

    # Translation
    model = BuildModel(len(TEXT.vocab),
                       encoder_emb,
                       decoder_emb,
                       d_model=emb_dim).to(device)
    model.load_state_dict(torch.load(model_file))
    model.eval()

    print("Predicting %s ..." % (corpus))

    src, trg, pred = [], [], []
    for batch in (rebatch(pad_idx, b) for b in test_iter):
        out = greedy_decode(model, TEXT.vocab, batch.src, batch.src_mask)
        # print("SRC OUT: ", src.shape, out.shape)
        probs = model.generator(out)
        _, prediction = torch.max(probs, dim=-1)

        source = [[TEXT.vocab.itos[word] for word in words[1:]]
                  for words in batch.src]
        target = [[TEXT.vocab.itos[word] for word in words[1:]]
                  for words in batch.trg]
        translation = [[TEXT.vocab.itos[word] for word in words]
                       for words in prediction]

        for i in range(len(translation)):
            src.append(' '.join(source[i]).split('</s>')[0])
            trg.append(' '.join(target[i]).split('</s>')[0])
            pred.append(' '.join(translation[i]).split('</s>')[0])

            # skip examples that contain unknown words in src or trg
            if '<unk>' in src[-1] or '<unk>' in trg[-1]:
                continue

            print("Source:", src[-1])
            print("Target:", trg[-1])
            print("Translation:", pred[-1])
            print()

    prefix = os.path.join(eval_dir, '%s.%s.%s.eval' % (corpus, en_emb, de_emb))
    for sentences, ext in zip([src, trg, pred], ['.src', '.trg', '.pred']):
        with open(prefix + ext, 'w+') as f:
            f.write('\n'.join(sentences))
Example #9
    logging.info(
        f'Unique tokens in TARGET vocab: {len(target_variable.vocab)}')

    # Automatically shuffles and buckets the input sequences into
    # sequences of similar length
    train_iter, valid_iter = data.BucketIterator.splits(
        (train_data, valid_data),
        sort_key=lambda x: len(
            x.tweet),  # what function/field to use to group the data
        batch_size=BATCH_SIZE,
        device=device)

    # Don't want to shuffle test data, so use a standard iterator
    dev_iter = data.Iterator(dev_data,
                             batch_size=BATCH_SIZE,
                             device=device,
                             train=False,
                             sort=False,
                             sort_within_batch=False)

    test_iter = data.Iterator(test_data,
                              batch_size=BATCH_SIZE,
                              device=device,
                              train=False,
                              sort=False,
                              sort_within_batch=False)

    emb_shape = text_variable.vocab.vectors.shape
    input_dim = emb_shape[0]
    embedding_dim = emb_shape[1]
    output_dim = 1
    pretrained_embeddings = text_variable.vocab.vectors
Example #10
def get_vocabularies_and_iterators(experiment, data_dir=None, max_len=30):
    """
    Creates vocabularies and iterators for the experiment
    :param experiment: the Experiment object including all settings about the experiment
    :param data_dir: the directory where data is stored in. If None, default is applied
    :param max_len: the max length, default is the sentence max length considered during tokenization process
    :return: src vocabulary, trg vocabulary, datasets and iteratotrs + sample iterator if dataset europarl is used
    """

    device = experiment.get_device()

    #### Create torchtext fields
    ####### SRC, TRG
    voc_limit = experiment.voc_limit
    min_freq = experiment.min_freq

    corpus = experiment.corpus
    language_code = experiment.lang_code
    reduce = experiment.reduce
    print("Vocabulary limit:", voc_limit)

    reverse_input = experiment.reverse_input
    print("Source reversed:", reverse_input)

    print("Required samples:")
    print(experiment.train_samples, experiment.val_samples,
          experiment.test_samples)

    PREPRO = False if corpus == "europarl" else True
    MODE = "w"

    src_tokenizer, trg_tokenizer = get_custom_tokenizer(
        "en", mode=MODE, prepro=PREPRO), get_custom_tokenizer(language_code,
                                                              mode=MODE,
                                                              prepro=PREPRO)

    src_vocab = Field(tokenize=lambda s: src_tokenizer.tokenize(s),
                      include_lengths=False,
                      init_token=None,
                      eos_token=None,
                      pad_token=PAD_TOKEN,
                      unk_token=UNK_TOKEN,
                      lower=True)
    trg_vocab = Field(tokenize=lambda s: trg_tokenizer.tokenize(s),
                      include_lengths=False,
                      init_token=SOS_TOKEN,
                      eos_token=EOS_TOKEN,
                      pad_token=PAD_TOKEN,
                      unk_token=UNK_TOKEN,
                      lower=True)
    print("Fields created!")

    ####### create splits ##########

    if corpus == "europarl":

        root = os.path.expanduser(DATA_DIR_PREPRO)
        if not data_dir:
            data_dir = os.path.join(root, corpus, language_code, "splits",
                                    str(max_len))  # local directory

        # check if files have been preprocessed
        try:
            files = os.listdir(data_dir)
            if len(files) < 8:
                print(
                    "ERROR: Not enough training files found at {}!\nTraining the model on the Europarl dataset requires train, val, test and samples splits for each language!"
                    .format(data_dir))
                print(
                    "Please drerun the script 'preprocess.py' for the given <lang_code>!"
                )
        except FileNotFoundError:
            print("ERROR: Training files not found at {}!".format(data_dir))
            print(
                "Please run the 'preprocess.py' script for the given <lang_code> before training the model!"
            )
            exit(-1)

        print("Loading data...")
        start = time.time()
        file_type = experiment.tok
        exts = ("." + experiment.get_src_lang(),
                "." + experiment.get_trg_lang())
        train, val, test = Seq2SeqDataset.splits(fields=(src_vocab, trg_vocab),
                                                 exts=exts,
                                                 train="train." + file_type,
                                                 validation="val." + file_type,
                                                 test="test." + file_type,
                                                 path=data_dir,
                                                 reduce=reduce,
                                                 truncate=experiment.truncate)

        ### samples is used to check translations during the training phase
        samples = Seq2SeqDataset.splits(fields=(src_vocab, trg_vocab),
                                        exts=exts,
                                        train="samples." + file_type,
                                        validation="",
                                        test="",
                                        path=data_dir)
        end = time.time()
        print("Duration: {}".format(convert_time_unit(end - start)))
        print("Total number of sentences: {}".format(
            (len(train) + len(val) + len(test))))

    else:
        #### Training on IWSLT torchtext corpus #####
        print("Loading data...")
        start = time.time()
        path = os.path.expanduser(os.path.join(DATA_DIR_PREPRO, "iwslt"))
        os.makedirs(path, exist_ok=True)
        exts = (".en", ".de") if experiment.get_src_lang() == "en" else (".de",
                                                                         ".en")
        ## see: https://lukemelas.github.io/machine-translation.html
        train, val, test = datasets.IWSLT.splits(
            root=path,
            exts=exts,
            fields=(src_vocab, trg_vocab),
            filter_pred=lambda x: max(len(vars(x)['src']), len(vars(x)['trg'])
                                      ) <= experiment.truncate)

        samples = None
        end = time.time()
        print("Duration: {}".format(convert_time_unit(end - start)))
        print("Total number of sentences: {}".format(
            (len(train) + len(val) + len(test))))

    if voc_limit > 0:
        src_vocab.build_vocab(train, min_freq=min_freq, max_size=voc_limit)
        trg_vocab.build_vocab(train, min_freq=min_freq, max_size=voc_limit)
        print("Vocabularies created!")
    else:
        src_vocab.build_vocab(train, min_freq=min_freq)
        trg_vocab.build_vocab(train, min_freq=min_freq)
        print("Vocabularies created!")

    #### Iterators #####
    # Create iterators to process text in batches of approx. the same length
    train_iter = data.BucketIterator(train,
                                     batch_size=experiment.batch_size,
                                     device=device,
                                     repeat=False,
                                     sort_key=lambda x:
                                     (len(x.src), len(x.trg)),
                                     shuffle=True)
    val_iter = data.BucketIterator(val,
                                   1,
                                   device=device,
                                   repeat=False,
                                   sort_key=lambda x: (len(x.src)),
                                   shuffle=True)
    test_iter = data.Iterator(test,
                              batch_size=1,
                              device=device,
                              repeat=False,
                              sort_key=lambda x: (len(x.src)),
                              shuffle=False)

    if samples is not None and samples[0].examples:
        samples_iter = data.Iterator(samples[0],
                                     batch_size=1,
                                     device=device,
                                     repeat=False,
                                     shuffle=False,
                                     sort_key=lambda x: (len(x.src)))
    else:
        samples_iter = None

    return src_vocab, trg_vocab, train_iter, val_iter, test_iter, train, val, test, samples, samples_iter
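
The returned iterators yield batches whose .src and .trg attributes follow the field names used by the Seq2SeqDataset/IWSLT splits, as the sort_key lambdas above suggest. A hedged consumption sketch (experiment is the Experiment object assumed to be configured elsewhere):

src_vocab, trg_vocab, train_iter, val_iter, test_iter, *rest = \
    get_vocabularies_and_iterators(experiment)

for batch in val_iter:                 # batch_size=1 for the validation iterator
    src, trg = batch.src, batch.trg    # LongTensors of shape [seq_len, 1]
    # run the model / compute validation metrics here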
Example #11
with open(args.dic, 'rb') as dic_file:
    dictionary = pickle.load(dic_file)

# Reconstruct the dictionary in torchtext.
counter = Counter({'<unk>': 0, '</s>': 0})
TEXT.vocab = vocab.Vocab(counter, specials=['<unk>', '</s>'])
TEXT.vocab.itos = dictionary.idx2word
TEXT.vocab.stoi = defaultdict(vocab._default_unk_index, dictionary.word2idx)

TEXT.vocab.load_vectors('glove.6B.%dd' % args.embedding_dim)
itos = TEXT.vocab.itos if args.p else None
print('Vocab size %d' % len(TEXT.vocab))

train_iter = data.Iterator(dataset=train,
                           batch_size=args.batch_size,
                           sort_key=lambda x: len(x.context),
                           sort=True,
                           repeat=False)
valid_iter = data.Iterator(dataset=valid,
                           batch_size=args.batch_size,
                           sort_key=lambda x: len(x.context),
                           sort=True,
                           repeat=False)

print('Initializing the model')

if args.load_model != '':
    with open(args.load_model, 'rb') as f:
        model = torch.load(f).cuda()
elif args.decider_type == 'cnncontext':
    model = CNNContextClassifier(len(TEXT.vocab),
Example #12
    text_field = data.Field(lower=True, fix_length=40)
    label_field = data.Field(unk_token=None, pad_token=None)
    train_data, valid_data, test_data = load_data(text_field, label_field)

    args.vocab_size = len(text_field.vocab)
    args.target_size = len(label_field.vocab)
    args.weight_matrix = text_field.vocab.vectors
    print(label_field.vocab.itos)
    #print(label_field.vocab.itos)

    print("\nParameters:")
    for attr, value in sorted(args.__dict__.items()):
        print("\t{}={}".format(attr.upper(), value))

    train_iter = data.Iterator(dataset=train_data,
                               batch_size=args.batch_size,
                               shuffle=True)
    valid_iter = data.Iterator(dataset=valid_data,
                               batch_size=args.batch_size,
                               shuffle=False)
    test_iter = data.Iterator(dataset=test_data,
                              batch_size=args.batch_size,
                              shuffle=False)

    if args.train is True:
        print("Start training...")
        esim = model.ESIM(args)
        if args.cuda:
            esim = esim.cuda()
        train.train(train_iter, valid_iter, esim, args)
    else:
Example #13
        path='../data/tweet/multi/top{}/train.csv'.format(emoji_num), format='csv',
        fields=[('Id', ID), ('Text', TEXT), ('Label', LABEL)], skip_header=True)
valid = data.TabularDataset(
        path='../data/tweet/multi/top{}/valid.csv'.format(emoji_num), format='csv',
        fields=[('Id', ID), ('Text', TEXT), ('Label', LABEL)], skip_header=True)
test = data.TabularDataset(
        path='../data/tweet/multi/top{}/test.csv'.format(emoji_num), format='csv',
        fields=[('Id', ID), ('Text', TEXT), ('Label', LABEL)], skip_header=True)

TEXT.build_vocab(train,valid,test, min_freq=5)
print('Building vocabulary Finished.')



train_iter = data.BucketIterator(dataset=train, batch_size=batch_size, sort_key=lambda x: len(x.Text), device=device, repeat=False)
valid_iter = data.Iterator(dataset=valid, batch_size=batch_size, device=device, shuffle=False, repeat=False)
test_iter = data.Iterator(dataset=test, batch_size=batch_size, device=device, shuffle=False, repeat=False)


train_dl = datahelper.BatchWrapper(train_iter, ["Text", "Label"])
valid_dl = datahelper.BatchWrapper(valid_iter, ["Text", "Label"])
test_dl = datahelper.BatchWrapper(test_iter, ["Text", "Label"])
print('Reading data done.')


word_matrix = datahelper.wordlist_to_matrix("../data/embedding/top5embedding.txt", TEXT.vocab.itos, device, embedding_dim)

def predict_on(model, data_dl, loss_func, device ,model_state_path=None):
    if model_state_path:
        model.load_state_dict(torch.load(model_state_path))
        print('Start predicting...')
Example #14
File: cove.py, Project: abishekarun/CLAFF
TEXT.build_vocab(trainds,valds, max_size=100000,vectors=vec)
# build vocab for labels
LABEL.build_vocab(trainds)

outputs_last_layer_cove = MTLSTM(n_vocab=len(TEXT.vocab), vectors=TEXT.vocab.vectors)
outputs_both_layer_cove = MTLSTM(n_vocab=len(TEXT.vocab), vectors=TEXT.vocab.vectors, layer0=True)
outputs_both_layer_cove_with_glove = MTLSTM(n_vocab=len(TEXT.vocab), vectors=TEXT.vocab.vectors, layer0=True, residual_embeddings=True)


traindl, valdl = data.BucketIterator.splits(datasets=(trainds, valds),  # the train and validation TabularDatasets
                                            batch_sizes=(64, len(valid)),  # batch sizes for train and validation
                                            sort_key=lambda x: len(x.moment),  # attribute used to sort the text
                                            device=None,  # -1 means CPU; 0 or None means GPU
                                            sort_within_batch=True,
                                            repeat=False)
test_iter = data.Iterator(tst, batch_size=64, device=None, sort=False, sort_within_batch=False, repeat=False)

class BatchGenerator:
    def __init__(self, dl, x_field, y_field):
        self.dl, self.x_field, self.y_field = dl, x_field, y_field
        
    def __iter__(self):
        for batch in self.dl:
            X = getattr(batch, self.x_field)
            y = getattr(batch, self.y_field)
            yield (X,y)

train_batch_it = BatchGenerator(traindl, 'moment', 'social')
valid_batch_it = BatchGenerator(valdl, 'moment', 'social')
test_batch_it = BatchGenerator(test_iter,'moment','hmid')
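
The wrapped iterators can then be consumed as plain (X, y) mini-batch generators. A sketch of one training pass (model, criterion and optimizer are placeholders, not defined in this snippet):

for X, y in train_batch_it:
    optimizer.zero_grad()
    logits = model(X)              # X: [seq_len, batch] tensor of token indices
    loss = criterion(logits, y)
    loss.backward()
    optimizer.step()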
Example #15
dev, test = data.TabularDataset.splits(path=args.output,
                                       validation='valid.txt',
                                       test='test.txt',
                                       format='tsv',
                                       fields=field)
TEXT.build_vocab(train, dev, test)
ED.build_vocab(train, dev)
total_num = len(test)
print('total num of example: {}'.format(total_num))

# load the model
if args.gpu == -1:  # Load all tensors onto the CPU
    test_iter = data.Iterator(test,
                              batch_size=args.batch_size,
                              train=False,
                              repeat=False,
                              sort=False,
                              shuffle=False,
                              sort_within_batch=False)
    model = torch.load(args.dete_model,
                       map_location=lambda storage, loc: storage)
    model.config.cuda = False
else:
    test_iter = data.Iterator(test,
                              batch_size=args.batch_size,
                              device=torch.device('cuda', args.gpu),
                              train=False,
                              repeat=False,
                              sort=False,
                              shuffle=False,
                              sort_within_batch=False)
Example #16
def train(data_path,
          train_path,
          val_path,
          test_path,
          mf,
          epochs,
          bs,
          opt,
          net_type,
          ly,
          hs,
          num_dir,
          emb_dim,
          embfix,
          pretrained_emb,
          dropout,
          pred_filter,
          save_path,
          save,
          verbose=False):
    ############################################################################
    # Load data
    ############################################################################

    embfix = False  # Delete this line later
    pretrained_emb = False  # Delete this line later

    cuda = int(torch.cuda.is_available()) - 1

    TEXT = data.Field(lower=True, init_token="<start>", eos_token="<end>")
    LABELS = data.Field(sequential=False)
    VAR_VALUES_VAL = data.Field(sequential=False)
    VAR_VALUES_TEST = data.Field(sequential=False)
    ANS_VAL = data.Field(sequential=False)
    ANS_TEST = data.Field(sequential=False)
    """
    train, val, test = data.TabularDataset.splits(
        path=data_path, train=train_path,
        validation=val_path, test=test_path, format='tsv',
        fields=[('text', TEXT), ('label', LABELS), ('var_values', VAR_VALUES_VAL), ('ans', ANS)])
    """

    train = data.TabularDataset(path=data_path + train_path,
                                format='tsv',
                                fields=[('text', TEXT), ('label', LABELS),
                                        ('var_values', VAR_VALUES_VAL),
                                        ('ans', ANS_VAL)])

    val = data.TabularDataset(path=data_path + val_path,
                              format='tsv',
                              fields=[('text', TEXT), ('label', LABELS),
                                      ('var_values', VAR_VALUES_VAL),
                                      ('ans', ANS_VAL)])

    test = data.TabularDataset(path=data_path + test_path,
                               format='tsv',
                               fields=[('text', TEXT), ('label', LABELS),
                                       ('var_values', VAR_VALUES_TEST),
                                       ('ans', ANS_TEST)])

    prevecs = None
    if (pretrained_emb == True):
        print('USING PRETRAINED EMB')
        TEXT.build_vocab(train,
                         vectors=GloVe(name='6B', dim=emb_dim),
                         min_freq=mf)
        prevecs = TEXT.vocab.vectors
    else:
        TEXT.build_vocab(train)
    LABELS.build_vocab(train)
    print(len(LABELS.vocab.itos))
    VAR_VALUES_VAL.build_vocab(val)
    VAR_VALUES_TEST.build_vocab(test)
    ANS_VAL.build_vocab(val)
    ANS_TEST.build_vocab(test)

    if not os.path.isdir(save_path): os.makedirs(save_path)
    torch.save(LABELS.vocab.itos, save_path + 'LABELS_vocab_itos.pt')

    snis = [eq.count('[') for eq in LABELS.vocab.itos]
    """
    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train, val, test), batch_sizes=(bs, bs, bs),
        sort_key=lambda x: len(x.text))
    """

    train_iter = data.BucketIterator(train,
                                     batch_size=bs,
                                     sort_key=lambda x: len(x.text),
                                     train=True)
    val_iter = data.Iterator(val,
                             batch_size=bs,
                             repeat=False,
                             train=False,
                             sort=False,
                             shuffle=False)
    test_iter = data.Iterator(test,
                              batch_size=len(test),
                              repeat=False,
                              train=False,
                              sort=False,
                              shuffle=False)

    num_classes = len(LABELS.vocab)
    input_size = len(TEXT.vocab)
    ############################################################################
    # Build the model
    ############################################################################

    model = m.Model(input_size=input_size,
                    hidden_size=hs,
                    num_classes=num_classes,
                    prevecs=prevecs,
                    num_layers=ly,
                    num_dir=num_dir,
                    batch_size=bs,
                    emb_dim=emb_dim,
                    embfix=embfix,
                    dropout=dropout,
                    net_type=net_type)

    criterion = nn.CrossEntropyLoss()

    # Select optimizer
    if (opt == 'adamax'):
        optimizer = torch.optim.Adamax(model.parameters())
    elif (opt == 'adam'):
        optimizer = torch.optim.Adam(model.parameters())
    elif (opt == 'sgd'):
        optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.5)
    else:
        #print('Optimizer unknown, defaulting to adamax')
        optimizer = torch.optim.Adamax(model.parameters())

    ############################################################################
    # Training the Model
    ############################################################################
    if cuda == 0:
        model = model.cuda()

    hyperparams = {
        'mf': mf,
        'epochs': epochs,
        'bs': bs,
        'opt': opt,
        'net_type': net_type,
        'ly': ly,
        'hs': hs,
        'num_dir': num_dir,
        'emb_dim': emb_dim,
        'embfix': embfix,
        'pretrained_emb': pretrained_emb,
        'dropout': dropout,
        'pred_filter': pred_filter
    }
    print('Training:', hyperparams)
    #print('pretrained_emb:', pretrained_emb)
    #print('embfix:', embfix)
    results = []

    best_true_acc = 0

    for epoch in range(epochs):

        tot_loss = 0
        train_iter.repeat = False
        for batch_count, batch in enumerate(train_iter):
            model.zero_grad()
            inp = batch.text.t()

            preds = model(inp)
            #print(F.softmax(preds))
            loss = criterion(preds, batch.label)
            loss.backward()
            optimizer.step()
            tot_loss += loss.item()  # .data[0] indexing is no longer valid on 0-dim tensors

        # load correct solver
        solver = None
        if 'tencent' in data_path: solver = tencent_solver
        if 'kushman' in data_path: solver = kushman_solver
        if 'ms_draw' in data_path: solver = msdraw_solver
        if 'mawps' in data_path: solver = mawps_solver

        (avg_loss, accuracy, true_acc, corrects, size, t5_acc, t5_corrects,
         mrr, eval_preds) = evaluate(val_iter,
                                     model,
                                     TEXT,
                                     emb_dim,
                                     LABELS,
                                     VAR_VALUES_VAL,
                                     ANS_VAL,
                                     snis,
                                     pred_filter=pred_filter,
                                     solver=solver)
        print('Classification acc (VAL):', accuracy)
        (_, test_acc, test_true_acc, _, _, _, _, _,
         test_eval_preds) = evaluate(test_iter,
                                     model,
                                     TEXT,
                                     emb_dim,
                                     LABELS,
                                     VAR_VALUES_TEST,
                                     ANS_TEST,
                                     snis,
                                     pred_filter=pred_filter,
                                     solver=solver)

        # save best preds file
        if true_acc > best_true_acc:
            best_true_acc = true_acc
            if not os.path.isdir(save_path): os.makedirs(save_path)
            predictions_file = open(save_path + 'predictions.txt', 'w')
            for line in eval_preds:
                predictions_file.write(line + '\n')
            predictions_file.close()

        if save:
            if not os.path.isdir(save_path): os.makedirs(save_path)
            torch.save(model, save_path + '{}_e{}.pt'.format(accuracy, epoch))

        results = np.append(
            results, {
                'epoch': epoch,
                'avg_loss': avg_loss,
                'accuracy': accuracy,
                'true_acc': true_acc,
                'corrects': corrects,
                'size': size,
                't5_acc': t5_acc,
                't5_corrects': t5_corrects,
                'mrr': mrr,
                'preds': eval_preds,
                'test_eval_preds': test_eval_preds,
                'test_true_acc': test_true_acc,
                'test_acc': test_acc
            })
        if verbose:
            print('\nEvaluation - loss: {:.6f}  acc: {:.4f}%({}/{}) '
                  'true_acc: {:.4f}%(todo/todo) t5_acc: {:.4f}%({}/{}) MRR: '
                  '{:.6f}\n'.format(avg_loss, accuracy, corrects, size,
                                    true_acc, t5_acc, t5_corrects, size, mrr))

    #print('Best Accuracy:', np.sort([i['accuracy'] for i in results])[-1])
    #print('Best True Accuracy:', np.sort([i['true_acc'] for i in results])[-1])
    return results
Example #17
def tokenizer(txt):
    return list(jieba.cut(txt))

TEXT = data.Field(sequential=True, tokenize=tokenizer, pad_token='<pad>')
LABEL = data.Field(sequential=False, use_vocab=False)

ftrain = 'train3.tsv'
train = data.TabularDataset(path=os.path.join(DATA,ftrain),format='tsv',
                    fields=[
                        ('seq1',TEXT),
                        ('seq2',TEXT),
                        ('lbl',LABEL)
                    ])
TEXT.build_vocab(train)
train_iter = data.Iterator(train,batch_size=4,sort=False,repeat=False)
# vocab = TEXT.vocab

embedding = torch.nn.Embedding(num_embeddings = len(TEXT.vocab.itos),
                          embedding_dim=10,
                          padding_idx=TEXT.vocab.stoi[TEXT.pad_token]
                          )

for sample in train_iter:
    seq1,seq2,lbl = [getattr(sample, name)
                     for name in ['seq1','seq2','lbl']]
    embedding(seq1.unsqueeze(-1))
    embedding(seq2.unsqueeze(-1))

fvalid = 'train2.tsv'
TEXT2 = data.Field(sequential=True, tokenize=tokenizer, pad_token='<pad>')
Example #18
    path='/content/drive/My Drive/dataset/Cornell-Movie-Quotes-Corpus/', train='train.csv', validation='validation.csv',
    test='test.csv', format='csv', fields=[('src', SRC), ('trg', TRG), ('label_src', LABEL_SRC), ('label_trg', LABEL_TRG)])
"""

SRC.build_vocab(train_ds, vectors=english_fasttext_vectors)
TRG.build_vocab(train_ds, vectors=english_fasttext_vectors)
#SRC.build_vocab(train_ds)
#TRG.build_vocab(train_ds)
print(TRG.vocab.stoi)
print(len(TRG.vocab.stoi))

from torchtext import data

batch_size = 64

train_dl = data.Iterator(train_ds, batch_size=batch_size, train=True)
val_dl = data.Iterator(val_ds, batch_size=batch_size, train=False, sort=False)
batch = next(iter(val_dl))
print(batch.src[0].shape)
print(batch.trg[0].shape)
print(batch.label_src.shape)
print(batch.trg[0][:, 1:])
print(batch.trg[0])

class EncoderRNN(nn.Module):
  def __init__(self, emb_size, hidden_size, vocab_size, text_embedding_vectors, emotion_size, dropout=0):
    super(EncoderRNN, self).__init__()
    self.hidden_size = hidden_size
    if text_embedding_vectors is None:
      self.embedding = nn.Embedding(vocab_size, emb_size)
    else:
Example #19
print('Valid Example: {}'.format('\n'.join([
    '{} ---- {}'.format(example.text, example.label)
    for example in valid_data.examples[:5]
])))
print('Test Example: {}'.format('\n'.join([
    '{} ---- {}'.format(example.text, example.label)
    for example in test_data.examples[:5]
])))

train_iter = data.BucketIterator(dataset=train_data,
                                 batch_size=BATCH_SIZE,
                                 sort_key=lambda x: len(x.text))
valid_iter = data.BucketIterator(dataset=valid_data,
                                 batch_size=BATCH_SIZE,
                                 sort_key=lambda x: len(x.text))
test_iter = data.Iterator(dataset=test_data, batch_size=BATCH_SIZE, sort=False)

# build model
from text_classify.model import RNN, WordAVGModel, TextCNN
from text_classify.transformer import Transformer
embedding_size = TEXT.vocab.vectors.shape[1] if USE_PRE_TRAIN_MODEL else EMBEDDING_SIZE

# model = RNN(input_size=len(TEXT.vocab), embedding_size=embedding_size, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS, output_size=len(LABEL.vocab))
# model = TextCNN(input_size=len(TEXT.vocab), embedding_size=embedding_size, output_size=len(LABEL.vocab), pooling_method='avg')
model = WordAVGModel(vocab_size=len(TEXT.vocab),
                     embedding_dim=embedding_size,
                     output_dim=len(LABEL.vocab))
# model = Transformer(input_size=len(TEXT.vocab), d_model=embedding_size, num_head=4, d_ff=HIDDEN_SIZE, output_size=len(LABEL.vocab), pad=TEXT.vocab.stoi['<pad>'], use_mask=True)

utils.weight_init(model)
Example #20
    TEXT, LABEL, train_iter, valid_iter = \
        iters.build_iters_lm(ftrain=opt.ftrain, fvalid=opt.fvalid, bsz=opt.batch_size, level=opt.level)

    ftest = change_file_encoding(opt.ftest)

    test = data.TabularDataset(path=ftest,
                               format='tsv',
                               fields=[
                                   ('index', INDEX),
                                   ('seq1', TEXT),
                                   ('seq2', TEXT),
                               ])

    test_iter = data.Iterator(test,
                              batch_size=opt.batch_size,
                              sort=False,
                              repeat=False)
    location = opt.gpu if torch.cuda.is_available() and opt.gpu != -1 else 'cpu'
    device = torch.device(location)

    encoder = Encoder(len(TEXT.vocab.stoi), opt.rnn_size,
                      TEXT.vocab.stoi[PAD_WORD], opt.enc_layers, opt.dropout,
                      opt.bidirection)
    model = PhraseSim(encoder, opt.dropout).to(device)
    init_model(opt, model)

    if opt.load_idx != -1:
        basename = "{}-epoch-{}".format(opt.exp, opt.load_idx)
        model_fname = basename + ".model"
        location = {
Example #21
def prepare_data_and_model(Model, args, using_gpu=True):

    if args.test:
        ## # narvi
        #train_path = "/home/zhouy/thesis/data/text_classification_data/train_try.csv"
        #test_path = "/home/zhouy/thesis/data/text_classification_data/test_try.csv"

        # tut thinkstation
        # train_path = "/media/yi/harddrive/codes/thesis_sentimentAnalysis/data/text_classification_data/train_try.csv"
        # test_path = "/media/yi/harddrive/codes/thesis_sentimentAnalysis/data/text_classification_data/test_try.csv"

        # # tripadvisor dataset
        # # xps
        test_path = "D:\\sentimentAnalysis\\data\\text_classification_data\\test_model_data\\rev_sent_5_score_train_test\\tripadvisor\\test_try.csv"
        train_path = "D:\\sentimentAnalysis\\data\\text_classification_data\\test_model_data\\rev_sent_5_score_train_test\\tripadvisor\\train_try.csv"

    else:
        # original dataset

        # # narvi
        #train_path = "/home/zhouy/thesis/data/text_classification_data/tripadvisor_train_dataset.csv"
        #test_path = "/home/zhouy/thesis/data/text_classification_data/tripadvisor_test_dataset.csv"

        # # tut thinkstation
        # train_path = "/home/yi/sentimentAnalysis/algos/5_ToxicCommentClassification-pytorch/data/train.csv"
        # test_path = "/home/yi/sentimentAnalysis/algos/5_ToxicCommentClassification-pytorch/data/test.csv"

        # # xps
        # train_path = "D:/sentimentAnalysis/algos/5_ToxicCommentClassification-pytorch/data/train.csv"
        # test_path = "D:/sentimentAnalysis/algos/5_ToxicCommentClassification-pytorch/data/test.csv"

        # tripadvisor dataset
        # xps
        train_path = "D:/sentimentAnalysis/data/text_classification_data/tripadvisor_train_dataset.csv"
        test_path = "D:/sentimentAnalysis/data/text_classification_data/tripadvisor_test_dataset.csv"

    def tokenize(text):
        filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
        trans_map = str.maketrans(filters, " " * len(filters))
        text = text.translate(trans_map)
        text = [
            tok.text for tok in spacy_en.tokenizer(text) if tok.text != ' '
        ]

        tokenized_text = []
        auxiliary_verbs = ['am', 'is', 'are', 'was', 'were', "'s"]
        for token in text:
            if token == "n't":
                tmp = 'not'
            elif token == "'ll":
                tmp = 'will'
            elif token in auxiliary_verbs:
                tmp = 'be'
            else:
                tmp = token
            tokenized_text.append(tmp)
        return tokenized_text

    if args.dataset == 'tripadvisor':

        TEXT = data.Field(tokenize=tokenize,
                          lower=True,
                          batch_first=True,
                          truncate_first=True)
        LABEL = data.Field(sequential=False, use_vocab=False, batch_first=True)

        test = CustomDataset(test_path,
                             text_field=TEXT,
                             label_field=LABEL,
                             test=True)

        train = CustomDataset(train_path, text_field=TEXT, label_field=LABEL)
        # should save the above train, test, these two variables.

        if args.wordembedding == "glove-6b":
            vectors = GloVe(name='6B', dim=args.embed_dim)
        elif args.wordembedding == "FastText":
            vectors = FastText(language='en')

        else:
            raise NotImplementedError

        # # FastText
        # vectors = FastText(name='6B', dim=args.embed_dim)

        vectors.unk_init = init.xavier_uniform

        # the line below raised an error:
        # TEXT.build_vocab(train, vectors=vectors, max_size=30000)

        TEXT.build_vocab(train, vectors=vectors, max_size=10000, min_freq=10)
        LABEL.build_vocab(train)
        print('train.fields', train.fields)
        print('train.name', getattr(train, 'text'))
        print('len(train)', len(train))
        print('vars(train[0])', vars(train[0]))

        # using the training corpus to create the vocabulary

        train_iter = data.Iterator(dataset=train,
                                   batch_size=args.batch_size,
                                   train=True,
                                   repeat=False,
                                   device=0 if using_gpu else -1)
        test_iter = data.Iterator(dataset=test,
                                  batch_size=args.batch_size,
                                  train=False,
                                  sort=False,
                                  device=0 if using_gpu else -1)

        # the number of unique words
        num_tokens = len(TEXT.vocab.itos)
        args.num_tokens = num_tokens

        dev_iter = test_iter

    elif args.dataset == 'SST':
        text_field = data.Field(batch_first=True,
                                lower=True,
                                tokenize=tokenize)
        label_field = data.Field(sequential=False, batch_first=True)

        train_data, dev_data, test_data = datasets.SST.splits(
            text_field, label_field, fine_grained=True)

        vectors = GloVe(name='6B', dim=args.embed_dim)

        text_field.build_vocab(train_data, vectors=vectors, min_freq=1)
        label_field.build_vocab(train_data)

        train_iter = data.Iterator(train_data,
                                   batch_size=args.batch_size,
                                   device=0 if using_gpu else -1,
                                   train=True,
                                   repeat=False,
                                   sort=False,
                                   shuffle=True)
        dev_iter = data.Iterator(dev_data,
                                 batch_size=args.batch_size,
                                 device=0 if using_gpu else -1,
                                 train=False,
                                 repeat=False,
                                 sort=False,
                                 shuffle=False)
        test_iter = data.Iterator(test_data,
                                  batch_size=args.batch_size,
                                  device=0 if using_gpu else -1,
                                  train=False,
                                  repeat=False,
                                  sort=False,
                                  shuffle=False)

        # train_iter, dev_iter, test_iter = sst(text_field, label_field)
        # train_iter, dev_iter, test_iter = SST.iters(batch_size=16, device=0 if using_gpu else -1, vectors="glove.6B.300d")

        # config.target_class = train_iter.dataset.NUM_CLASSES
        args.num_tokens = len(text_field.vocab)
        args.num_classes = len(label_field.vocab) - 1

        print("num_classes: ", args.num_classes)

    if args.model == "VDCNN":
        net = Model(depth=29,
                    vocabulary_size=args.num_tokens,
                    embed_size=16,
                    n_classes=args.num_classes,
                    k=2,
                    optional_shortcut=True)
    else:
        net = Model(args)
    # # copy pretrained glove word embedding into the model
    # net.embedding.weight.data.copy_(TEXT.vocab.vectors)
    if using_gpu:
        net.cuda()

    return train_iter, test_iter, net
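
A hedged call sketch for this helper (the argparse-style args namespace below is hypothetical; only attributes the function actually reads are filled in, and MyTextModel stands in for whatever Model class is passed):

from argparse import Namespace

args = Namespace(test=True, dataset='tripadvisor', wordembedding='glove-6b',
                 embed_dim=100, batch_size=32, model='LSTM')
train_iter, test_iter, net = prepare_data_and_model(MyTextModel, args, using_gpu=False)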
Example #22
    def train(self, args):
        ws = self.ws
        records = self.records
        logger = ws.logger('DeepSPEnv.train')

        model = self.sp_model
        model.train()
        optim = torch.optim.Adam(model.parameters())
        train_iter = data.Iterator(records, 1)  # one sequence at a time
        epoch_size = len(train_iter)

        state = self.load_training_state()
        if not args.restart and state:
            self.load_model('int')
            train_iter.load_state_dict(state['train_iter'])
            optim.load_state_dict(state['optim'])
            current_run = state['current_run']
            loss_avg, mae_avg, acc_avg = state['avg']
            start_epoch = train_iter.epoch
            n_samples = state['n_samples']
            initial = train_iter._iterations_this_epoch
        else:
            if not args.restart:
                logger.info('nothing to resume, starting from scratch')
            n_samples = 0  # track total #samples for plotting
            now = datetime.datetime.now().strftime("%Y-%m-%d-%H:%M:%S")
            current_run = str(self.ws.log_path /
                              ('DeepSPEnv.train/run-%s/' % now))
            loss_avg = []
            mae_avg = []
            acc_avg = []
            start_epoch = 0
            initial = 0

        writer = SummaryWriter(str(current_run))

        for epoch in range(start_epoch, args.n_epochs):
            epoch_iter = iter(tqdm(islice(train_iter, epoch_size - initial),
                                   total=epoch_size,
                                   initial=initial,
                                   desc=f'Epoch {epoch+1:3d}: ',
                                   unit='bz'))

            initial = 0

            try:
                # training
                for batch in critical(epoch_iter):
                    # critical section on one batch
                    i = train_iter._iterations_this_epoch
                    n_samples += len(batch)

                    # backprop on one batch
                    optim.zero_grad()

                    hidden = None
                    losses = []
                    maes = []
                    for q, s in zip(batch.question, batch.score):
                        q_index = q[0].item()
                        if q_index == -1:
                            continue
                        q = self.questions[q_index]
                        q['text'] = torch.tensor(q['text'])
                        q['knowledge'] = torch.tensor(q['knowledge'])
                        q['difficulty'] = torch.tensor([q['difficulty']])
                        s = s.float()
                        s_, hidden = model(q, s, hidden)
                        losses.append(F.mse_loss(s_.view(1), s).view(1))
                        maes.append(F.l1_loss(s_.view(1), s).item())

                    if not losses:
                        continue

                    loss = torch.cat(losses).mean()
                    loss.backward()
                    optim.step()

                    # log loss
                    loss_avg.append(loss.item())
                    mae_avg.extend(maes)
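                    # count a prediction as correct when its absolute error is under 0.5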
                    acc_avg.extend(np.asarray(maes) < 0.5)
                    if args.log_every == len(loss_avg):
                        writer.add_scalar('DeepSPEnv.train/loss',
                                          np.mean(loss_avg),
                                          n_samples)
                        writer.add_scalar('DeepSPEnv.train/mae',
                                          np.mean(mae_avg), n_samples)
                        writer.add_scalar('DeepSPEnv.train/acc',
                                          np.mean(acc_avg), n_samples)
                        loss_avg = []
                        mae_avg = []
                        acc_avg = []

                    # save model
                    if args.save_every > 0 and i % args.save_every == 0:
                        self.save_model(f'{epoch}.{i}')

                # save after one epoch
                self.save_model(epoch + 1)

            except KeyboardInterrupt:
                self.save_training_state({
                    'current_run': current_run,
                    'optim': optim.state_dict(),
                    'train_iter': train_iter.state_dict(),
                    'n_samples': n_samples,
                    'avg': (loss_avg, mae_avg, acc_avg)
                })
                self.save_model('int')
                raise
Example #23
0
def load_data(opt):
    # word- or character-level tokens; fix_length pads/truncates each example to opt.max_text_len
    TEXT = data.Field(sequential=True, fix_length=opt.max_text_len)
    LABEL = data.Field(sequential=False, use_vocab=False)

    # load
    # word/ or article/
    train_path = opt.data_path + opt.text_type + '/train_set.csv'
    val_path = opt.data_path + opt.text_type + '/val_set.csv'
    test_path = opt.data_path + opt.text_type + '/test_set.csv'
    # NOTE: local debug override below points all three splits at the same validation file
    train_path = 'D:/git/dataset/val_set.csv'
    test_path = 'D:/git/dataset/val_set.csv'
    val_path = 'D:/git/dataset/val_set.csv'

    # aug for data augmentation
    if opt.aug:
        print('make augmentation datasets!')
    train = GrandDataset(train_path,
                         text_field=TEXT,
                         label_field=LABEL,
                         text_type=opt.text_type,
                         test=False,
                         aug=opt.aug)
    val = GrandDataset(val_path,
                       text_field=TEXT,
                       label_field=LABEL,
                       text_type=opt.text_type,
                       test=False)
    test = GrandDataset(test_path,
                        text_field=TEXT,
                        label_field=None,
                        text_type=opt.text_type,
                        test=True)

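    # cache the pre-trained embedding vectors locally so they only need to be parsed once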
    cache = '.vector_cache'
    if not os.path.exists(cache):
        os.mkdir(cache)
    embedding_path = '{}/{}_{}.txt'.format(opt.embedding_path, opt.text_type,
                                           opt.embedding_dim)
    vectors = Vectors(name=embedding_path, cache=cache)
    print('load word2vec vectors from {}'.format(embedding_path))
    vectors.unk_init = init.xavier_uniform_  # initialization for tokens not found in the pre-trained vectors

    # build the vocabulary
    print('building {} vocabulary......'.format(opt.text_type))
    TEXT.build_vocab(train, val, test, min_freq=5, vectors=vectors)
    # LABEL.build_vocab(train)

    # build the Iterators
    # For test_iter, shuffle, sort and repeat must all be False, otherwise torchtext scrambles the sample order
    # For variable-length inputs, set sort_within_batch=True so the data in each batch is sorted in descending order by sort_key
    train_iter = data.BucketIterator(dataset=train,
                                     batch_size=opt.batch_size,
                                     shuffle=True,
                                     sort_within_batch=False,
                                     repeat=False,
                                     device=opt.device)
    # val_iter = data.BucketIterator(dataset=val, batch_size=opt.batch_size, sort_within_batch=False, repeat=False,
    #                                device=opt.device)
    # train_iter = data.Iterator(dataset=train, batch_size=opt.batch_size, train=True, repeat=False, device=opt.device)
    val_iter = data.Iterator(dataset=val,
                             batch_size=opt.batch_size,
                             shuffle=False,
                             sort=False,
                             repeat=False,
                             device=opt.device)
    test_iter = data.Iterator(dataset=test,
                              batch_size=opt.batch_size,
                              shuffle=False,
                              sort=False,
                              repeat=False,
                              device=opt.device)

    return train_iter, val_iter, test_iter, len(TEXT.vocab), TEXT.vocab.vectors
Example #24
0
File: main.py  Project: cantbesure/BuboQA
labels.build_vocab(train, dev, test)

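# reuse a previously cached tensor of word vectors if one exists; otherwise load and cache them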
if os.path.isfile(args.vector_cache):
    questions.vocab.vectors = torch.load(args.vector_cache)
else:
    questions.vocab.load_vectors(wv_dir=args.data_cache,
                                 wv_type=args.word_vectors,
                                 wv_dim=args.d_embed)
    os.makedirs(os.path.dirname(args.vector_cache), exist_ok=True)
    torch.save(questions.vocab.vectors, args.vector_cache)

# get iterators
train_iter = data.Iterator(train,
                           batch_size=args.batch_size,
                           device=args.gpu,
                           train=True,
                           repeat=False,
                           sort=False,
                           shuffle=False)
dev_iter = data.Iterator(dev,
                         batch_size=args.batch_size,
                         device=args.gpu,
                         train=True,
                         repeat=False,
                         sort=False,
                         shuffle=False)
test_iter = data.Iterator(test,
                          batch_size=args.batch_size,
                          device=args.gpu,
                          train=True,
                          repeat=False,
                          sort=False,
                          shuffle=False)
Example #25
0
    def mkiters(self, train):
        args = self.args
        c = Counter([len(x.out) for x in train])
        t1, t2, t3 = [], [], []
        print("Sorting training data by len")
        for x in train:
            l = len(x.out)
            if l < 100:
                t1.append(x)
            elif l < 220:
                t2.append(x)
            else:
                t3.append(x)
        t1d = data.Dataset(t1, self.fields)
        t2d = data.Dataset(t2, self.fields)
        t3d = data.Dataset(t3, self.fields)
        valid = data.TabularDataset(path=args.path.replace("train", "val"),
                                    format='tsv',
                                    fields=self.fields)
        print("ds sizes:", end='\t')
        for ds in [t1d, t2d, t3d, valid]:
            print(len(ds.examples), end='\t')
            for x in ds:
                x.rawent = x.ent.split(" ; ")
                x.ent = self.vec_ents(x.ent, self.ENT)
                x.rel = self.mkGraphs(x.rel, len(x.ent[1]))
                if args.sparse:
                    x.rel = (self.adjToSparse(x.rel[0]), x.rel[1])
                x.tgt = x.out
                x.out = [
                    y.split("_")[0] + ">" if "_" in y else y for y in x.out
                ]
                x.sordertgt = torch.LongTensor(
                    [int(y) + 3 for y in x.sorder.split(" ")])
                x.sorder = [[
                    int(z) for z in y.strip().split(" ") if len(z) > 0
                ] for y in x.sorder.split("-1")[:-1]]
            ds.fields["tgt"] = self.TGT
            ds.fields["rawent"] = data.RawField()
            ds.fields["sordertgt"] = data.RawField()

        self.t1_iter = data.Iterator(t1d,
                                     args.t1size,
                                     device=args.device,
                                     sort_key=lambda x: len(x.out),
                                     repeat=False,
                                     train=True)
        self.t2_iter = data.Iterator(t2d,
                                     args.t2size,
                                     device=args.device,
                                     sort_key=lambda x: len(x.out),
                                     repeat=False,
                                     train=True)
        self.t3_iter = data.Iterator(t3d,
                                     args.t3size,
                                     device=args.device,
                                     sort_key=lambda x: len(x.out),
                                     repeat=False,
                                     train=True)
        self.val_iter = data.Iterator(valid,
                                      args.t3size,
                                      device=args.device,
                                      sort_key=lambda x: len(x.out),
                                      sort=False,
                                      repeat=False,
                                      train=False)
Example #26
0
model.load_state_dict(torch.load('age-3features-model.pt'))
model = model.to(device)

USER_ID = data.Field()
test_data = data.TabularDataset(path=base_dir + "embedding/test_3features.csv",
                                format='csv',
                                skip_header=True,
                                fields=[('user_id', USER_ID),
                                        ('creative_id', creative_id_TEXT),
                                        ('ad_id', ad_id_TEXT),
                                        ('advertiser_id', advertiser_id_TEXT),
                                        ('product_id', product_id_TEXT)])
USER_ID.build_vocab(test_data)
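# keep the original sample order at inference time: no shuffling or sorting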
test_iterator = data.Iterator(test_data,
                              batch_size=BATCH_SIZE,
                              sort=False,
                              sort_within_batch=False,
                              device=device)

result_list = []
user_id_list = []
total_predictions = None
model.eval()
with torch.no_grad():
    for i, batch in enumerate(test_iterator):
        print(i)
        creative_id_text, creative_id_text_length = batch.creative_id
        advertiser_id_text, advertiser_id_text_length = batch.advertiser_id
        ad_id_text, ad_id_text_length = batch.ad_id
        product_id_text, product_id_text_length = batch.product_id
        predictions = model(creative_id_text, creative_id_text_length,
Example #27
0
def train(args):
    train_data, val_data, test_data, SRC, TGT = prepare_data(args)

    BATCH_SIZE = args.batch_size
    best_bleu_loss = 0
    pad_idx = TGT.vocab.stoi["<pad>"]

    print("Size of source vocabulary:", len(SRC.vocab))
    print("Size of target vocabulary:", len(TGT.vocab))

    print("FC matrix:", args.hidden_dim, args.ff_dim)
    print(args.compress)
    model = transformer.make_model(len(SRC.vocab), len(TGT.vocab),
                                   d_model=args.hidden_dim, d_ff=args.ff_dim, N=args.num_blocks,
                                   compress=args.compress, compress_att=args.compress_attn,
                                   compress_mode=args.compress_mode,
                                   num_compress_enc=args.num_enc_blocks_comp,
                                   num_compress_dec=args.num_dec_blocks_comp
                                   )
    model.to(device)
    if args.load_model:
        print('load model from [%s]' % args.load_model, file=sys.stderr)
        params = torch.load(args.load_model, map_location=lambda storage, loc: storage)
        # TODO args = params['args']
        state_dict = params['model']
        # opts = params['']
        model.load_state_dict(state_dict)

    criterion = train_utils.LabelSmoothing(size=len(TGT.vocab), padding_idx=pad_idx, smoothing=0.1)
    # criterion = nn.NLLLoss(reduction="sum", ignore_index=0)
    criterion.to(device)
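    # BucketIterator groups source/target pairs of similar length to reduce padding within each batch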
    train_iter = data.BucketIterator(train_data, batch_size=BATCH_SIZE, train=True,
                                     sort_within_batch=True,
                                     sort_key=lambda x: (len(x.src), len(x.trg)), repeat=False,
                                     device=device)
    valid_iter = data.Iterator(val_data, batch_size=BATCH_SIZE, train=False, sort=False, repeat=False,
                               device=device)

    model_opt = opt.WrapperOpt(model.src_embed[0].d_model, 2, 4000,
                                     torch.optim.Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.98), eps=1e-9))

    # train_time = begin_time = time.time()
    valid_params = (SRC, TGT, valid_iter)

    print("Number of examples in train: ", BATCH_SIZE * len([_ for _ in train_iter]))
    print("Number of examples in validation: ", BATCH_SIZE * len([_ for _ in valid_iter]))

    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([np.prod(p.size()) for p in model_parameters])
    print("Number of parameters: ", params)
    if args.debug:
        model2 = transformer.make_model(len(SRC.vocab), len(TGT.vocab),
                                d_model=args.hidden_dim, d_ff=args.ff_dim,
                                N=args.num_blocks, compress=True, compress_att=True,
                                compress_mode=args.compress_mode,
                                num_compress_enc=args.num_enc_blocks_comp,
                                num_compress_dec=args.num_dec_blocks_comp)


        # print("Tranable parameters in fc module ", params2)
        debug_compress_info(model, model2)

        exit()

    os.makedirs(os.path.dirname(args.save_to), exist_ok=True)

    if args.multi_gpu:
        devices = list(np.arange(args.num_devices))
        model_parallel = nn.DataParallel(model, device_ids=devices)

    logger_file = {}  # Logger(name=args.exp_name)
    logger_file['bleu'] = []
    logger_file['loss'] = []

    for epoch in range(args.max_epoch):
        print("=" * 80)
        print("Epoch ", epoch + 1)
        print("=" * 80)
        print("Train...")
        if args.multi_gpu:
            model_parallel.train()
            train_loss_fn = MultiGPULossCompute(model.generator, criterion,
                                                      devices=devices, opt=model_opt)
            train_model = model_parallel

        else:
            train_loss_fn = train_utils.LossCompute(model.generator, criterion, model_opt)

            model.train()

        _, logger_file = train_utils.run_epoch(args, (train_utils.rebatch(pad_idx, b) for b in train_iter),
                                  model_parallel if args.multi_gpu else model, train_loss_fn,
                                  valid_params=valid_params,
                                  epoch_num=epoch, logger=logger_file)

        if args.multi_gpu:
            model_parallel.eval()
            val_loss_fn = MultiGPULossCompute(model.generator, criterion, devices=devices, opt=model_opt)
        else:
            model.eval()
            val_loss_fn = train_utils.LossCompute(model.generator, criterion, model_opt)

        print("Validation...")
        loss, bleu_loss = train_utils.run_epoch(args, (train_utils.rebatch(pad_idx, b) for b in valid_iter),
                                        model_parallel if args.multi_gpu else model,
                                        val_loss_fn, valid_params=valid_params, is_valid=True)

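        # checkpoint only when validation BLEU improves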
        if bleu_loss > best_bleu_loss:
            best_bleu_loss = bleu_loss

            model_state_dict = model.state_dict()
            model_file = args.save_to + args.exp_name + 'valid.bin'
            checkpoint = {
                'model': model_state_dict,
            }

            print('save model without optimizer [%s]' % model_file, file=sys.stderr)

            torch.save(checkpoint, model_file)

        print()
        print("Validation perplexity ", np.exp(loss))

    with open("./logs/"+args.exp_name, 'wb') as f_out:
        pickle.dump(logger_file, f_out)
Example #28
0
    def getIter(self, dataset, **kwargs):
        if 'device' not in kwargs:
            kwargs = dict(kwargs, device=self.device)
        else:
            kwargs = dict(kwargs)  # just in case
        return data.Iterator(dataset, **kwargs)
Example #29
0
    skip_header=True,
    fields=[('Text', TEXT), ('Label', LABEL)])

TEXT.build_vocab(train_data, vectors=vectors)
vocab_size = len(TEXT.vocab)
weight_matrix = TEXT.vocab.vectors

train_iter, valid_iter = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size=batch_size,
    shuffle=True,
    device=device,
    sort_key=lambda x: len(x.Text))
test_iter = data.Iterator(test_data,
                          batch_size=batch_size,
                          shuffle=False,
                          device=device,
                          sort=False,
                          repeat=False)


def evaluate_accuracy(data_iter, net):
    acc_sum, valid_loss, n = 0.0, 0.0, 0
    valid_batch_num = 0
    net.eval()
    for context in data_iter:
        valid_batch_num += 1
        X = context.Text
        X = X.to(device).long()
        y = context.Label
        y = y.to(device).long()
        y_hat = net(X)
Example #30
0
def load_data(option):
    #======
    Text_filed = data.Field(sequential=True, fix_length=option.max_text_len)
    Label_field = data.Field(sequential=False, use_vocab=False)

    #======
    train_path = option.data_path + option.text_type + '/train_set.csv'
    val_path = option.data_path + option.text_type + '/val_set.csv'
    test_path = option.data_path + option.text_type + '/test_set.csv'
    if option.aug:
        print('make augmentation datasets!')

    train = buildDataset(train_path,
                         text_field=Text_filed,
                         label_field=Label_field,
                         text_type=option.text_type,
                         test=False,
                         aug=option.aug)
    val = buildDataset(val_path,
                       text_field=Text_filed,
                       label_field=Label_field,
                       text_type=option.text_type,
                       test=False)
    test = buildDataset(test_path,
                        text_field=Text_filed,
                        label_field=None,
                        text_type=option.text_type,
                        test=True)

    #======
    cache = '.vector_cache'
    if not os.path.exists(cache):
        os.mkdir(cache)
    embedding_path = '{}/{}_{}_.txt'.format(option.embedding_path,
                                            option.text_type, option.emb_size)
    print('embedding_path:', embedding_path)

    vectors = Vectors(name=embedding_path, cache=cache)
    print('load word2vec vectors from {}'.format(embedding_path))
    vectors.unk_init = init.xavier_uniform_
    # specify how tokens missing from the pre-trained vectors are initialized: set vectors.unk_init = init.xavier_uniform_ before passing vectors to build_vocab

    #====== build the vocabulary
    print('building {} vocabulary......'.format(option.text_type))
    Text_filed.build_vocab(train,
                           val,
                           test,
                           min_freq=option.min_freq,
                           vectors=vectors)
    print('vocabulary has been made!\n')

    #====== build the Iterators
    '''
    1. For test_iter, shuffle, sort and repeat must all be False, otherwise torchtext scrambles the sample order
    2. For variable-length input sequences, set sort_within_batch=True so the data in each batch is sorted in descending order by sort_key
    '''
    print('building {} Iterator......'.format(option.text_type))
    train_iter = data.BucketIterator(dataset=train,
                                     batch_size=option.batch_size,
                                     shuffle=True,
                                     sort_within_batch=False,
                                     repeat=False,
                                     device=option.device)
    val_iter = data.Iterator(dataset=val,
                             batch_size=option.batch_size,
                             shuffle=False,
                             sort=False,
                             repeat=False,
                             device=option.device)
    test_iter = data.Iterator(dataset=test,
                              batch_size=option.batch_size,
                              shuffle=False,
                              sort=False,
                              repeat=False,
                              device=option.device)
    print('Iterator has been made!\n')

    return train_iter, val_iter, test_iter, len(
        Text_filed.vocab), Text_filed.vocab.vectors