Example #1
def load_dataset(config, train_pos='train.pos', train_neg='train.neg',
                 dev_pos='dev.pos', dev_neg='dev.neg',
                 test_pos='test.pos', test_neg='test.neg'):

    root = config.data_path
    
    roots = re.split(', +', root)
    if len(roots) > 1:
        logger.info("Combining datasets...")
        files = {'train.pos': [], 'train.neg': [], 'dev.pos': [],
                 'dev.neg': [], 'test.pos': [], 'test.neg': []}
        for dir_path in roots:
            for file in files.keys():
                with open(dir_path + file, 'r', encoding='utf8') as f:
                    files[file].extend(f.readlines())
        
        for file, sents in files.items():
            with open('./data/style_transfer/%s' % file, 'w', encoding='utf8') as f:
                for sent in sents:
                    f.write('%s' % sent)
        root = './data/style_transfer/'
    
    TEXT = data.Field(batch_first=True, eos_token='<eos>')
    
    dataset_fn = lambda name: data.TabularDataset(
        path=root + name,
        format='tsv',
        fields=[('text', TEXT)]
    )

    train_pos_set, train_neg_set = map(dataset_fn, [train_pos, train_neg])
    dev_pos_set, dev_neg_set = map(dataset_fn, [dev_pos, dev_neg])
    test_pos_set, test_neg_set = map(dataset_fn, [test_pos, test_neg])

    TEXT.build_vocab(train_pos_set, train_neg_set, min_freq=config.min_freq)

    if config.load_pretrained_embed:
        start = time.time()
        
        vectors = torchtext.vocab.GloVe('6B', dim=config.embed_size, cache=config.pretrained_embed_path)
        TEXT.vocab.set_vectors(vectors.stoi, vectors.vectors, vectors.dim)
        print('vectors', TEXT.vocab.vectors.size())
        
        print('load embedding took {:.2f} s.'.format(time.time() - start))

    vocab = TEXT.vocab
        
    dataiter_fn = lambda dataset, train: data.BucketIterator(
        dataset=dataset,
        batch_size=config.batch_size,
        shuffle=train,
        repeat=train,
        sort_key=lambda x: len(x.text),
        sort_within_batch=False,
        device=config.device
    )

    train_pos_iter, train_neg_iter = map(lambda x: dataiter_fn(x, True), [train_pos_set, train_neg_set])
    dev_pos_iter, dev_neg_iter = map(lambda x: dataiter_fn(x, False), [dev_pos_set, dev_neg_set])
    test_pos_iter, test_neg_iter = map(lambda x: dataiter_fn(x, False), [test_pos_set, test_neg_set])

    train_iters = DatasetIterator(train_pos_iter, train_neg_iter)
    dev_iters = DatasetIterator(dev_pos_iter, dev_neg_iter)
    test_iters = DatasetIterator(test_pos_iter, test_neg_iter)
    
    return train_iters, dev_iters, test_iters, vocab
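
The DatasetIterator wrapper used above is defined elsewhere in the project; a minimal sketch, assuming it simply pairs up batches from the positive-style and negative-style iterators:

# Hypothetical sketch of the DatasetIterator wrapper assumed by the example above.
class DatasetIterator(object):
    def __init__(self, pos_iter, neg_iter):
        self.pos_iter = pos_iter
        self.neg_iter = neg_iter

    def __iter__(self):
        # Yield one positive-style batch and one negative-style batch per step.
        for batch_pos, batch_neg in zip(iter(self.pos_iter), iter(self.neg_iter)):
            yield batch_pos.text, batch_neg.text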
Example #2
train_fields = [
    ('id', id_variable),  # we process this as id field
    ('tweet', text_variable),  # process it as text
    ('subtask_a', None),  # process it as label
    ('encoded_subtask_a', None)
]

test_fields = [
    ('id', id_variable),  # we process this as id field
    ('tweet', text_variable),  # process it as text
    ('subtask_a', None),  # process it as label
    ('encoded_subtask_a', None)
]

# Creating our train and test data
train_data = data.TabularDataset(path=os.path.join(TEMP_DIRECTORY, TRAIN_FILE),
                                 format='tsv',
                                 skip_header=True,
                                 fields=train_fields)

dev_data = data.TabularDataset(path=os.path.join(TEMP_DIRECTORY, DEV_FILE),
                               format='tsv',
                               skip_header=True,
                               fields=dev_fields)

test_data = data.TabularDataset(path=os.path.join(TEMP_DIRECTORY, TEST_FILE),
                                format='tsv',
                                skip_header=True,
                                fields=dev_fields)

vec = vocab.Vectors(DANISH_EMBEDDING_PATH, cache=VECTOR_CACHE)

dev_preds = np.zeros((len(dev_data), N_FOLD))
Example #3
def get_dataset(fix_length=100, lower=False, vectors=None):
    
    if vectors is not None:
        lower=True
        
    LOGGER.debug('Preparing CSV files...')
    # prepare_csv(train, test)


    
    TEXT = data.Field(sequential=True, 
                      lower=True, 
                      include_lengths=True, 
                      batch_first=True, 
                      fix_length=25)
    NUM_FEATURE = data.Field(use_vocab=False,
                       sequential=False,
                       dtype=torch.float16)
    KEYWORD = data.Field(use_vocab=True,
                         sequential=True)

    LOCATION = data.Field(use_vocab=True,
                          sequential=True)


    ID = data.Field(use_vocab=False,
                    sequential=False,
                    dtype=torch.float16)

    # LABEL = data.LabelField(dtype = torch.float)
    LABEL = data.Field(use_vocab=True,
                       sequential=False,
                       dtype=torch.float16)
    
    tv_datafields = [
        ("id", None), # we won't be needing the id, so we pass in None as the field
        ("keyword", None),
        ("location", None),
        ("text", TEXT),
        ("word_count", NUM_FEATURE),
        ("char_count", NUM_FEATURE),
        ("stop_word_count", NUM_FEATURE),
        ("punctuation_count", NUM_FEATURE),
        ("mention_count", NUM_FEATURE),
        ("hashtag_count", NUM_FEATURE),
        ("target", LABEL)]
        


    
    LOGGER.debug('Reading train csv files...')

    train_temp, val_temp = data.TabularDataset.splits(
        path='data/', format='csv', skip_header=True,
        train='train_train.csv', validation='val_val.csv',
        fields=tv_datafields
    )
    
    LOGGER.debug('Reading test csv file...')


    test_temp = data.TabularDataset(
        path='data/prepared_df_test.csv', format='csv',
        skip_header=True,
        fields=tv_datafields[:-1]
    )
    
    LOGGER.debug('Building vocabulary...')

    MAX_VOCAB_SIZE = 25000

    # TODO: check whether there is a leak here,
    # since the vocabulary is built on the validation and test sets as well.
    TEXT.build_vocab(
        train_temp, val_temp, test_temp,
        max_size=MAX_VOCAB_SIZE,
        min_freq=10,
        vectors=GloVe(name='6B', dim=300)  # We use it for getting vocabulary of words
    )


    LABEL.build_vocab(train_temp)


    # KEYWORD.build_vocab(
    #     train_temp, val_temp, test_temp,
    #     max_size=MAX_VOCAB_SIZE,
    # )


    # LOCATION.build_vocab(
    #     train_temp, val_temp, test_temp,
    #     max_size=MAX_VOCAB_SIZE,
    # )


    
    word_embeddings = TEXT.vocab.vectors
    vocab_size = len(TEXT.vocab)
    
    train_iter = get_iterator(train_temp, batch_size=32, 
                              train=True, shuffle=True,
                              repeat=False)
    val_iter = get_iterator(val_temp, batch_size=32, 
                            train=True, shuffle=True,
                            repeat=False)
    test_iter = get_iterator(test_temp, batch_size=32, 
                             train=False, shuffle=False,
                             repeat=False)
    
    
    LOGGER.debug('Done preparing the datasets')
    
    return TEXT, vocab_size, word_embeddings, train_iter, val_iter, test_iter
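
get_iterator is not defined in this snippet; a plausible sketch, assuming it is a thin wrapper around data.BucketIterator (the name and call signature come from the snippet above, the body is a guess):

from torchtext import data  # torchtext.legacy.data on torchtext >= 0.9

def get_iterator(dataset, batch_size, train=True, shuffle=True, repeat=False):
    # Bucket examples of similar text length together to reduce padding.
    return data.BucketIterator(
        dataset,
        batch_size=batch_size,
        train=train,
        shuffle=shuffle,
        repeat=repeat,
        sort_key=lambda x: len(x.text),
        sort_within_batch=False,
    )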
Example #4
    # Prepare data
    text_field = data.Field(
        # tokenize=apply_preprocessing,
        lower=True)
    label_field = data.Field(sequential=False, use_vocab=False, is_target=True)

    print("Creating TabularDatasets for training ({}) and validation ({})...".
          format(SPLIT_RATIO, 1.0 - SPLIT_RATIO))

    trainds, valds = data.TabularDataset(
        path=data_file_path,
        format='csv',
        csv_reader_params={
            'delimiter': '|'
        },
        fields=[
            ('', None),
            # ('Unnamed: 0', None),
            ('anon_id', None),
            ('text', text_field),
            ('label', label_field)
        ],
        skip_header=True).split(split_ratio=SPLIT_RATIO)

    print("Loading vocab from embedding file: {}".format(embedding_file_path))

    # Load/prepare pre-trained embedding vectors (FastText)
    vectors = vocab.Vectors(name=embedding_file_path)
    text_field.build_vocab(trainds, valds, vectors=vectors)

    print("Vocab size: {}".format(len(text_field.vocab)))
Example #5
    def __init__(
        self,
        train_fn,
        batch_size=64,
        valid_ratio=.2,
        device=-1,
        max_vocab=999999,
        min_freq=1,
        use_eos=False,
        shuffle=True,
    ):
        '''
        DataLoader initialization.
        :param train_fn: Train-set filename
        :param batch_size: Batchify data for the given batch size.
        :param valid_ratio: Fraction of the training set held out for validation.
        :param device: Device id to load data on (-1 for CPU)
        :param max_vocab: Maximum vocabulary size
        :param min_freq: Minimum frequency for a word to be kept in the vocabulary.
        :param use_eos: If True, append <EOS> to the end of every sentence.
        :param shuffle: If True, randomly shuffle the input data.
        '''
        super().__init__()

        # Define field of the input file.
        # The input file consists of two fields.
        self.label = data.Field(sequential=False,
                                use_vocab=True,
                                unk_token=None)
        self.text = data.Field(
            use_vocab=True,
            batch_first=True,
            include_lengths=False,
            eos_token='<EOS>' if use_eos else None,
        )

        # The two fields defined above correspond to TAB-delimited columns,
        # so we use TabularDataset to load them from the input file.
        # The single input file (train_fn) is split into train and validation sets below.
        # Each row consists of two columns: a label field and a text field.
        train, valid = data.TabularDataset(
            path=train_fn,
            format='tsv',
            fields=[
                ('label', self.label),
                ('text', self.text),
            ],
        ).split(split_ratio=(1 - valid_ratio))

        # The loaded datasets are fed into their respective iterators:
        # a train iterator and a valid iterator.
        # We sort input sentences by length to group sentences of similar length together.
        self.train_loader, self.valid_loader = data.BucketIterator.splits(
            (train, valid),
            batch_size=batch_size,
            device='cuda:%d' % device if device >= 0 else 'cpu',
            shuffle=shuffle,
            sort_key=lambda x: len(x.text),
            sort_within_batch=True,
        )

        # Finally, build a vocabulary for the label and text fields,
        # i.e. the mapping tables between tokens and indices.
        self.label.build_vocab(train)
        self.text.build_vocab(train, max_size=max_vocab, min_freq=min_freq)
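
A hedged usage sketch for this loader (the class name DataLoader and the file name are assumptions; only __init__ is shown above):

# Hypothetical usage, assuming the __init__ above belongs to a class named DataLoader.
loaders = DataLoader(
    train_fn='train.tsv',  # placeholder path: one "label<TAB>text" row per line
    batch_size=64,
    valid_ratio=.2,
    device=-1,             # -1 keeps everything on the CPU
)

for batch in loaders.train_loader:
    x = batch.text   # LongTensor of token indices, shape (batch_size, seq_len)
    y = batch.label  # LongTensor of label indices, shape (batch_size,)
    break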
Example #6
cudnn.benchmark = True  # fire on all cylinders

# go through this rigmarole to do `from ..utils.display_results import get_performance`
if __package__ is None:
    import sys
    from os import path

    sys.path.append(path.dirname(path.dirname(path.abspath(__file__))))
    from utils.display_results import get_performance

# ============================ 20 Newsgroups ============================ #
TEXT_20ng = data.Field(pad_first=True, lower=True, fix_length=100)
LABEL_20ng = data.Field(sequential=False)

train_20ng = data.TabularDataset(path='./.data/20newsgroups/20ng-train.txt',
                                 format='csv',
                                 fields=[('label', LABEL_20ng),
                                         ('text', TEXT_20ng)])

test_20ng = data.TabularDataset(path='./.data/20newsgroups/20ng-test.txt',
                                format='csv',
                                fields=[('label', LABEL_20ng),
                                        ('text', TEXT_20ng)])

TEXT_20ng.build_vocab(train_20ng, max_size=10000)
LABEL_20ng.build_vocab(train_20ng, max_size=10000)
print('vocab length (including special tokens):', len(TEXT_20ng.vocab))

train_iter_20ng = data.BucketIterator(train_20ng,
                                      batch_size=args.batch_size,
                                      repeat=False)
test_iter_20ng = data.BucketIterator(test_20ng,
Example #7
def main():
    print("Using device: {}" "\n".format(str(device)))

    # Load the training dataset, and create a dataloader to generate a batch.
    textField = data.Field(lower=True,
                           include_lengths=True,
                           batch_first=True,
                           tokenize=student.tokenise,
                           preprocessing=student.preprocessing,
                           postprocessing=student.postprocessing,
                           stop_words=student.stopWords)
    labelField = data.Field(sequential=False, use_vocab=False, is_target=True)

    dataset = data.TabularDataset(
        'train.json', 'json', {
            'reviewText': ('reviewText', textField),
            'rating': ('rating', labelField),
            'businessCategory': ('businessCategory', labelField)
        })

    textField.build_vocab(dataset, vectors=student.wordVectors)
    #print(len(textField.vocab))
    # Allow training on the entire dataset, or split it for training and validation.
    if student.trainValSplit == 1:
        trainLoader = data.BucketIterator(dataset,
                                          shuffle=True,
                                          batch_size=student.batchSize,
                                          sort_key=lambda x: len(x.reviewText),
                                          sort_within_batch=True)
    else:
        train, validate = dataset.split(split_ratio=student.trainValSplit)

        trainLoader, valLoader = data.BucketIterator.splits(
            (train, validate),
            shuffle=True,
            batch_size=student.batchSize,
            sort_key=lambda x: len(x.reviewText),
            sort_within_batch=True)

    # Get model and optimiser from student.
    net = student.net.to(device)
    lossFunc = student.lossFunc
    optimiser = student.optimiser

    # Train.
    for epoch in range(student.epochs):
        runningLoss = 0

        for i, batch in enumerate(trainLoader):
            # Get a batch and potentially send it to GPU memory.
            inputs = textField.vocab.vectors[batch.reviewText[0]].to(device)
            length = batch.reviewText[1].to(device)
            rating = batch.rating.to(device)
            businessCategory = batch.businessCategory.to(device)

            # PyTorch calculates gradients by accumulating contributions to them
            # (useful for RNNs).  Hence we must manually set them to zero before
            # calculating them.
            optimiser.zero_grad()

            # Forward pass through the network.
            ratingOutput, categoryOutput = net(inputs, length)
            loss = lossFunc(ratingOutput, categoryOutput, rating,
                            businessCategory)

            # Calculate gradients.
            loss.backward()

            # Minimise the loss according to the gradient.
            optimiser.step()

            runningLoss += loss.item()

            if i % 32 == 31:
                print("Epoch: %2d, Batch: %4d, Loss: %.3f" %
                      (epoch + 1, i + 1, runningLoss / 32))
                runningLoss = 0

    # Save model.
    torch.save(net.state_dict(), 'savedModel.pth')
    print("\n" "Model saved to savedModel.pth")

    # Test on validation data if it exists.
    if student.trainValSplit != 1:
        net.eval()

        correctRatingOnlySum = 0
        correctCategoryOnlySum = 0
        bothCorrectSum = 0
        with torch.no_grad():
            for batch in valLoader:
                # Get a batch and potentially send it to GPU memory.
                inputs = textField.vocab.vectors[batch.reviewText[0]].to(
                    device)
                length = batch.reviewText[1].to(device)
                rating = batch.rating.to(device)
                businessCategory = batch.businessCategory.to(device)

                # Convert network output to integer values.
                ratingOutputs, categoryOutputs = student.convertNetOutput(
                    *net(inputs, length))

                # Calculate performance
                #print("rating = ", rating)
                #print("rating outputs = ", ratingOutputs)

                #print("category = ", businessCategory)
                #print("category_outputs = ", categoryOutputs)

                correctRating = rating == ratingOutputs.flatten()
                correctCategory = businessCategory == categoryOutputs.flatten()

                correctRatingOnlySum += torch.sum(correctRating
                                                  & ~correctCategory).item()
                correctCategoryOnlySum += torch.sum(correctCategory
                                                    & ~correctRating).item()
                bothCorrectSum += torch.sum(correctRating
                                            & correctCategory).item()

        correctRatingOnlyPercent = correctRatingOnlySum / len(validate)
        correctCategoryOnlyPercent = correctCategoryOnlySum / len(validate)
        bothCorrectPercent = bothCorrectSum / len(validate)
        neitherCorrectPer = 1 - correctRatingOnlyPercent \
                              - correctCategoryOnlyPercent \
                              - bothCorrectPercent

        score = 100 * (bothCorrectPercent + 0.5 * correctCategoryOnlyPercent +
                       0.1 * correctRatingOnlyPercent)

        print("\n"
              "Rating incorrect, business category incorrect: {:.2%}\n"
              "Rating correct, business category incorrect: {:.2%}\n"
              "Rating incorrect, business category correct: {:.2%}\n"
              "Rating correct, business category correct: {:.2%}\n"
              "\n"
              "Weighted score: {:.2f}".format(neitherCorrectPer,
                                              correctRatingOnlyPercent,
                                              correctCategoryOnlyPercent,
                                              bothCorrectPercent, score))
Example #8
def create_dataset(opt, SRC, TRG):
    print("creating dataset and iterator... ")

    if opt.task == 'toy_task' or opt.task == 'e_snli_o':
        # Load in validation data
        f_in, f_out = open(opt.data_path + '/val_in.txt',
                           'r',
                           encoding='utf-8'), open(opt.data_path +
                                                   '/val_out.txt',
                                                   'r',
                                                   encoding='utf-8')
        in_ = [x.replace('\n', '') for x in f_in.readlines()]
        out_ = [x.replace('\n', '') for x in f_out.readlines()]

        raw_data = {'src': in_, 'trg': out_}
        df = pd.DataFrame(raw_data, columns=["src", "trg"])

        mask = (df['src'].str.count(' ') <
                opt.max_strlen) & (df['trg'].str.count(' ') < opt.max_strlen)
        df = df.loc[mask]

        df.to_csv("translate_transformer_temp.csv", index=False)
        data_fields = [('src', SRC), ('trg', TRG)]
        val = data.TabularDataset('./translate_transformer_temp.csv',
                                  format='csv',
                                  fields=data_fields,
                                  skip_header=True)
        os.remove('translate_transformer_temp.csv')

        val_iter = MyIterator(val,
                              batch_size=opt.batchsize,
                              repeat=False,
                              sort_key=lambda x: (len(x.src), len(x.trg)),
                              train=False,
                              shuffle=False)
    elif opt.task == 'e_snli_r':
        # Load in validation data
        f_in, f_out = open(opt.data_path + '/val_in.txt',
                           'r',
                           encoding='utf-8'), open(opt.data_path +
                                                   '/val_out.txt',
                                                   'r',
                                                   encoding='utf-8')
        if opt.label_path is None:
            raise AssertionError(
                'Need to provide a path to label data for validation checks')

        f_label = open(opt.label_path + '/val_out.txt', 'r', encoding='utf-8')

        in_ = [x.replace('\n', '') for x in f_in.readlines()]
        out_ = [x.replace('\n', '') for x in f_out.readlines()]
        labels_ = [x.replace('\n', '') for x in f_label.readlines()]
        out1, out2, out3 = [], [], []
        for o in out_:
            split = o.split(' @@SEP@@ ')
            out1.append(split[0])
            out2.append(split[1])
            out3.append(split[2])

        raw_data = {
            'src': in_,
            'trg1': out1,
            'trg2': out2,
            'trg3': out3,
            'labels': labels_
        }
        df = pd.DataFrame(raw_data,
                          columns=["src", "trg1", "trg2", "trg3", "labels"])

        mask = (df['src'].str.count(' ') < opt.max_strlen) & (df['trg1'].str.count(' ') < opt.max_strlen) & \
               (df['trg2'].str.count(' ') < opt.max_strlen) & (df['trg3'].str.count(' ') < opt.max_strlen)
        df = df.loc[mask]

        df.to_csv("translate_transformer_temp.csv", index=False)
        data_fields = [('src', SRC), ('trg1', TRG), ('trg2', TRG),
                       ('trg3', TRG), ('label', opt.classifier_TRG)]
        val = data.TabularDataset('./translate_transformer_temp.csv',
                                  format='csv',
                                  fields=data_fields,
                                  skip_header=True)
        os.remove('translate_transformer_temp.csv')

        val_iter = MyIterator(
            val,
            batch_size=opt.batchsize,
            repeat=False,
            sort_key=lambda x:
            (len(x.src), len(x.trg1), len(x.trg2), len(x.trg3)),
            train=False,
            shuffle=False)

    else:
        # cos_e
        raise NotImplementedError(
            "No implementation provided in process.py for cos-e (yet)")

    ##### TRAIN DATA #####
    raw_data = {
        'src': [line for line in opt.src_data],
        'trg': [line for line in opt.trg_data]
    }
    df = pd.DataFrame(raw_data, columns=["src", "trg"])

    mask = (df['src'].str.count(' ') <
            opt.max_strlen) & (df['trg'].str.count(' ') < opt.max_strlen)
    df = df.loc[mask]

    df.to_csv("translate_transformer_temp.csv", index=False)
    data_fields = [('src', SRC), ('trg', TRG)]
    train = data.TabularDataset('./translate_transformer_temp.csv',
                                format='csv',
                                fields=data_fields,
                                skip_header=True)
    print('desired batch size', opt.batchsize)

    train_iter = MyIterator(
        train,
        batch_size=opt.batchsize,  # device=opt.device,
        repeat=False,
        sort_key=lambda x: (len(x.src), len(x.trg)),
        train=True,
        shuffle=True)
    os.remove('translate_transformer_temp.csv')

    if opt.load_weights is None:
        if opt.checkpoint > 0:
            try:
                os.mkdir("weights")
            except:
                print(
                    "weights folder already exists, run program with -load_weights weights to load them"
                )
                quit()
            pickle.dump(SRC, open('weights/SRC.pkl', 'wb'))
            pickle.dump(TRG, open('weights/TRG.pkl', 'wb'))

    opt.src_pad = SRC.vocab.stoi['<pad>']
    opt.trg_pad = TRG.vocab.stoi['<pad>']

    opt.train_len = get_len(train_iter)
    print('number of train batches:', opt.train_len)
    print('number of val batches:', get_len(val_iter))
    return train_iter, val_iter
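
MyIterator and get_len are defined elsewhere in the original project; a minimal sketch of get_len, assuming it simply counts batches:

def get_len(data_iter):
    # Assumed helper: iterate once over the (non-repeating) iterator and count its batches.
    return sum(1 for _ in data_iter)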
Example #9
def load_data_pair_task(path_file_data,
                        name_file_train,
                        name_file_test=None,
                        device_set="cuda:0",
                        min_freq_word=1,
                        min_freq_char=1,
                        batch_size=2,
                        cache_folder=None,
                        name_vocab=None,
                        path_vocab_pre_built=None):

    inputs_word_query = data.Field(init_token="<bos>",
                                   eos_token="<eos>",
                                   batch_first=True,
                                   include_lengths=True)
    inputs_char_query_nesting = data.Field(tokenize=list,
                                           init_token="<bos>",
                                           eos_token="<eos>",
                                           batch_first=True)
    inputs_char_query = data.NestedField(inputs_char_query_nesting,
                                         init_token="<bos>",
                                         eos_token="<eos>")

    inputs_word_document = data.Field(init_token="<bos>",
                                      eos_token="<eos>",
                                      batch_first=True,
                                      include_lengths=True)
    inputs_char_document_nesting = data.Field(tokenize=list,
                                              init_token="<bos>",
                                              eos_token="<eos>",
                                              batch_first=True)
    inputs_char_document = data.NestedField(inputs_char_document_nesting,
                                            init_token="<bos>",
                                            eos_token="<eos>")

    labels = data.LabelField(sequential=False)

    fields = ([(('inputs_word_query', 'inputs_char_query'),
                (inputs_word_query, inputs_char_query)),
               (('inputs_word_document', 'inputs_char_document'),
                (inputs_word_document, inputs_char_document)),
               ('labels', labels)])

    if name_file_test is not None:
        train, test = data.TabularDataset.splits(path=path_file_data,
                                                 train=name_file_train,
                                                 test=name_file_test,
                                                 fields=tuple(fields),
                                                 format='csv',
                                                 skip_header=False,
                                                 csv_reader_params={
                                                     'delimiter': '\t',
                                                     'quoting': 3
                                                 })

        if path_vocab_pre_built is None:
            if cache_folder is not None and name_vocab is not None:
                inputs_word_document.build_vocab(
                    train.inputs_word_document,
                    test.inputs_word_document,
                    min_freq=min_freq_word,
                    vectors=[MyPretrainedVector(name_vocab, cache_folder)])
            else:
                inputs_word_document.build_vocab(train.inputs_word_document,
                                                 test.inputs_word_document,
                                                 min_freq=min_freq_word)

            inputs_char_document.build_vocab(train.inputs_char_document,
                                             test.inputs_char_document,
                                             min_freq=min_freq_char)

            inputs_word_query.vocab = inputs_word_document.vocab
            inputs_char_query.vocab = inputs_char_query_nesting.vocab = \
                inputs_char_document_nesting.vocab = inputs_char_document.vocab
            labels.build_vocab(train.labels)
        else:
            vocabs = torch.load(path_vocab_pre_built)
            inputs_word_document.vocab = inputs_word_query.vocab = vocabs[0]
            inputs_char_document.vocab = inputs_char_query.vocab = \
                inputs_char_document_nesting.vocab = inputs_char_query_nesting.vocab = vocabs[1]
            labels.vocab = vocabs[2]

        train_iter, test_iter = data.BucketIterator.splits(
            datasets=(train, test),
            batch_size=batch_size,
            shuffle=True,
            sort=False,
            device=torch.device(
                device_set if torch.cuda.is_available() else "cpu"))
        dict_return = {
            'iters': (train_iter, test_iter),
            'vocabs': (inputs_word_document.vocab, inputs_char_document.vocab,
                       labels.vocab)
        }
    else:
        path_file_data_train = os.path.join(path_file_data, name_file_train)
        train = data.TabularDataset(path_file_data_train,
                                    fields=tuple(fields),
                                    format='csv',
                                    skip_header=True,
                                    csv_reader_params={
                                        'delimiter': '\t',
                                        'quoting': 3
                                    })

        if path_vocab_pre_built is None:
            if cache_folder is not None and name_vocab is not None:
                inputs_word_document.build_vocab(
                    train.inputs_word_document,
                    min_freq=min_freq_word,
                    vectors=[MyPretrainedVector(name_vocab, cache_folder)])
            else:
                inputs_word_document.build_vocab(train.inputs_word_document,
                                                 min_freq=min_freq_word)

            inputs_char_document.build_vocab(train.inputs_char_document,
                                             min_freq=min_freq_char)

            inputs_word_query.vocab = inputs_word_document.vocab
            inputs_char_query.vocab = inputs_char_query_nesting.vocab = \
                inputs_char_document_nesting.vocab = inputs_char_document.vocab

            labels.build_vocab(train.labels)

        else:
            vocabs = torch.load(path_vocab_pre_built)
            inputs_word_document.vocab = inputs_word_query.vocab = vocabs[0]
            inputs_char_document.vocab = inputs_char_query.vocab = \
                inputs_char_document_nesting.vocab = inputs_char_query_nesting.vocab = vocabs[1]
            labels.vocab = vocabs[2]

        train_iter = data.BucketIterator(
            train,
            batch_size=batch_size,
            shuffle=True,
            sort=False,
            device=torch.device(
                device_set if torch.cuda.is_available() else "cpu"))

        dict_return = {
            'iters': [train_iter],
            'vocabs': (inputs_word_document.vocab, inputs_char_document.vocab,
                       labels.vocab)
        }

    return dict_return
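
A hedged sketch of how the returned dictionary might be consumed (the batch attribute names follow the fields definition above; the paths and sizes are placeholders):

# Hypothetical usage of load_data_pair_task.
loaded = load_data_pair_task('data/', 'train.tsv',
                             name_file_test='test.tsv',
                             batch_size=32)
train_iter, test_iter = loaded['iters']
word_vocab, char_vocab, label_vocab = loaded['vocabs']

for batch in train_iter:
    # include_lengths=True makes the word fields return (tensor, lengths) pairs.
    query_words, query_lens = batch.inputs_word_query
    doc_words, doc_lens = batch.inputs_word_document
    query_chars = batch.inputs_char_query  # char indices from the NestedField
    labels = batch.labels
    break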
Example #10
def train():
    INPUTS_DIR = os.getenv(
        'VH_INPUTS_DIR', '/valohai/inputs/'
    )  #,'/stockage/Research_Team_Ressources/valohai_test/')
    dataset_path = get_first_file(os.path.join(INPUTS_DIR, 'dataset'))
    word_vectors_path = get_first_file((os.path.join(INPUTS_DIR,
                                                     'word_vectors')))

    try:
        with open(word_vectors_path, 'rb') as my_pickle:
            TEXT = pickle.load(my_pickle)
    except IOError:
        print("IOError")
        pass

    LABEL = data.Field(sequential=False,
                       preprocessing=custom_preprocess_label,
                       use_vocab=False)
    dataset = data.TabularDataset(path=dataset_path,
                                  format='csv',
                                  fields=[('Num', None), ('Label', LABEL),
                                          ('id', None), ('date', None),
                                          ('flag', None), ('user', None),
                                          ('Text', TEXT)],
                                  skip_header=True)

    nb_train = 1000000
    ratio_train = nb_train / len(dataset)
    nb_test = 500000
    ratio_test = nb_test / len(dataset)
    ratio_other = 1 - ratio_train - ratio_test

    train_dataset, other_dataset, test_dataset = dataset.split(
        split_ratio=[ratio_train, ratio_test, ratio_other])

    train_iter, test_iter = BucketIterator.splits(
        # we pass in the datasets we want the iterator to draw data from
        (train_dataset, test_dataset),
        batch_sizes=(FLAGS.batch_size, FLAGS.batch_size),
        # if you want to use the GPU, specify the GPU number here
        device=num_device,
        # the BucketIterator needs to be told what function it should use to group the data.
        sort_key=lambda x: len(x.Text),
        sort_within_batch=False,
        # we pass repeat=False because we want to wrap this Iterator layer.
        repeat=False
    )

    n_vocab = len(TEXT.vocab)
    model = ConvNet(n_vocab, embed_size=FLAGS.embedding_size,
                    num_classes=2).to(device)
    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=FLAGS.learning_rate)
    criterion.to(device)

    num_epoch = FLAGS.epochs
    for epoch in range(num_epoch):
        print("epoch : ", epoch)
        model.train()
        print(run_epoch(train_iter, model, criterion, TEXT, optimizer))
        model.eval()
        print(run_epoch(test_iter, model, criterion, TEXT, None))

    model.eval()
    print(run_epoch(test_iter, model, criterion, TEXT, None))

    # Saving weights and biases as outputs of the task.
    outputs_dir = os.getenv('VH_OUTPUTS_DIR', '/valohai/outputs/')
    filename = os.path.join(outputs_dir, 'model.pth')
    torch.save(model, filename)
    filename_text = os.path.join(outputs_dir, 'text.pickle')
    with open(filename_text, 'wb') as f:
        pickle.dump(TEXT, f)
Example #11
def caption_iterator(start_token, end_token, pad_token, train_meta_path, val_1_meta_path,
                     val_2_meta_path, min_freq, batch_size, device, phase, use_categories, 
                     use_subs):
    spacy_en = spacy.load('en')
    print(f'Preparing dataset for {phase}')
    
    def tokenize_en(txt):
        return [token.text for token in spacy_en.tokenizer(txt)]
    
    CAPTION = data.ReversibleField(
        tokenize='spacy', init_token=start_token, 
        eos_token=end_token, pad_token=pad_token, lower=True, 
        batch_first=True, is_target=True
    )
    INDEX = data.Field(
        sequential=False, use_vocab=False, batch_first=True
    )
    if use_categories:
        # preprocessing: if there is no category, replace it with -1 (a unique value)
        CATEGORY = data.Field(
            sequential=False, use_vocab=False, batch_first=True, 
            preprocessing=data.Pipeline(lambda x: -1 if len(x) == 0 else int(float(x)))
        )
        # filter out examples whose category is missing, and drop category 31 (it occurs only once)
        filter_pred = lambda x: vars(x)['category_32'] != -1 and vars(x)['category_32'] != 31
    else:
        CATEGORY = None
        filter_pred = None
    
    if use_subs:
        SUBS = data.ReversibleField(
            tokenize='spacy', init_token=start_token, 
            eos_token=end_token, pad_token=pad_token, lower=True, 
            batch_first=True
        )
    else:
        SUBS = None
    
    # the order has to be the same as in the table
    fields = [
        ('video_id', None),
        ('caption', CAPTION),
        ('start', None),
        ('end', None),
        ('duration', None),
        ('category_32', CATEGORY),
        ('subs', SUBS),
        ('phase', None),
        ('idx', INDEX),
    ]

    dataset = data.TabularDataset(
        path=train_meta_path, format='tsv', skip_header=True, fields=fields,
        filter_pred=filter_pred
    )
    CAPTION.build_vocab(dataset.caption, min_freq=min_freq)
    train_vocab = CAPTION.vocab
    
    train_subs_vocab = None
    if use_subs:
        SUBS.build_vocab(dataset.subs, min_freq=min_freq)
        train_subs_vocab = SUBS.vocab
        
    if phase == 'val_1':
        dataset = data.TabularDataset(
            path=val_1_meta_path, format='tsv', skip_header=True, fields=fields,
            filter_pred=filter_pred
        )
    elif phase == 'val_2':
        dataset = data.TabularDataset(
            path=val_2_meta_path, format='tsv', skip_header=True, fields=fields, 
            filter_pred=filter_pred
        )
    # sort_key = lambda x: data.interleave_keys(len(x.caption), len(x.caption))
    sort_key = lambda x: 0 #len(x.caption)
    datasetloader = data.BucketIterator(
        dataset, batch_size, sort_key=sort_key, device=device, repeat=False, shuffle=True
    )
    return train_vocab, train_subs_vocab, datasetloader
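
A hedged helper sketch showing how the returned train_vocab could be used to turn predicted caption indices back into tokens (the end-of-sequence token is whatever was passed as end_token; this helper is not part of the original):

def decode_caption(idx_row, vocab, end_token):
    # Assumed helper: map a 1-D tensor of vocabulary indices back to a caption string,
    # stopping at the end-of-sequence token.
    tokens = []
    for idx in idx_row.tolist():
        token = vocab.itos[idx]
        if token == end_token:
            break
        tokens.append(token)
    return ' '.join(tokens)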
Example #12

def tokenizer(text):
    return [token.text for token in nlp.tokenizer(text)]


qid = None
text_field = data.Field(sequential=True, tokenize=tokenizer, lower=True)
target_field = data.Field(sequential=False,
                          use_vocab=False,
                          is_target=True,
                          dtype=torch.long)

df = data.TabularDataset(path=PATH_TO_TRAINING_DATA,
                         format='CSV',
                         fields=[('qid', qid), ('question_text', text_field),
                                 ('target', target_field)],
                         skip_header=True)

df_test = data.TabularDataset(path=PATH_TO_TEST_DATA,
                              format='CSV',
                              fields=[('qid', qid),
                                      ('question_text', text_field)],
                              skip_header=True)

vec = vocab.Vectors(PATH_TO_EMB_FILE)
text_field.build_vocab(df, df_test, vectors=vec)

train, val = df.split(split_ratio=[0.8, 0.2])
train_dl, val_dl = data.Iterator.splits(
    (train, val),
Example #13
def data_loader(opt):
    TEXT = data.Field(sequential=True,
                      tokenize=tokenizer,
                      batch_first=True,
                      pad_first=False,
                      lower=True,
                      include_lengths=False,
                      pad_token='<pad>',
                      fix_length=opt.seq_len
                      )
    LABEL = data.Field(sequential=False, unk_token=None)

    fields = [
        ('APP_ID', None),
        ('LABEL', LABEL),
        ('ACTION', TEXT)]

    # read datasets
    print('reading data ...')
    train = data.TabularDataset(
        path=DirConfig.train_path,
        format='tsv',
        skip_header=True,
        fields=fields)

    dev = data.TabularDataset(
        path=DirConfig.dev_path,
        format='tsv',
        skip_header=True,
        fields=fields)

    test = data.TabularDataset(
        path=DirConfig.test_path,
        format='tsv',
        skip_header=True,
        fields=fields)

    TEXT.build_vocab(train, dev, test)
    LABEL.build_vocab(train)

    print('ACTION:')
    print('\tvocab size:', len(TEXT.vocab))

    print('LABEL:')
    print('\tvocab size:', len(LABEL.vocab))
    print('\t', LABEL.vocab.stoi.items())
    print('\t', LABEL.vocab.itos)

    print('\tDataset:')
    print('\t# Train:', len(train.examples))
    print('\t\tLABEL:', train.examples[0].LABEL)
    print('\t\tACTION:', train.examples[0].ACTION)

    print('\t# Dev:', len(dev.examples))
    print('\t\tLABEL:', dev.examples[0].LABEL)
    print('\t\tACTION:', dev.examples[0].ACTION)

    print('\t# Test:', len(test.examples))
    print('\t\tLABEL:', test.examples[0].LABEL)
    print('\t\tACTION:', test.examples[0].ACTION)

    print('=========================')

    batch = data.BucketIterator.splits(datasets=[train, dev, test],
                                       batch_sizes=[opt.batch_size] * 3,
                                       sort_key=lambda x: len(x.ACTION),
                                       device=opt.device,
                                       sort_within_batch=True,
                                       repeat=False)

    batch = [list(b) for b in batch]

    return batch, TEXT.vocab
Example #14
if not args.cuda:
    args.gpu = -1
if torch.cuda.is_available() and args.cuda:
    print("Note: You are using GPU for training")
    torch.cuda.set_device(args.gpu)
    torch.cuda.manual_seed(args.seed)
if torch.cuda.is_available() and not args.cuda:
    print(
        "Warning: You have CUDA available but are not using it. You are using the CPU for training."
    )

# Set up the data for training
TEXT = data.Field(lower=True)
ED = data.Field()
train = data.TabularDataset(path=os.path.join(args.output, 'dete_train.txt'),
                            format='tsv',
                            fields=[('text', TEXT), ('ed', ED)])
field = [('id', None), ('sub', None), ('entity', None), ('relation', None),
         ('obj', None), ('text', TEXT), ('ed', ED)]
dev, test = data.TabularDataset.splits(path=args.output,
                                       validation='valid.txt',
                                       test='test.txt',
                                       format='tsv',
                                       fields=field)
TEXT.build_vocab(train, dev, test)
ED.build_vocab(train, dev)

match_embedding = 0
if os.path.isfile(args.vector_cache):
    stoi, vectors, dim = torch.load(args.vector_cache)
    TEXT.vocab.vectors = torch.Tensor(len(TEXT.vocab), dim)
Example #15
LABEL = data.LabelField()
SEED = 1234
MAX_VOCAB_SIZE = 25_000
BATCH_SIZE = 64
EMBEDDING_DIM = 50
N_FILTERS = 50
FILTER_SIZES = [1]
DROPOUT = 0.5
N_EPOCHS = 100

torch.manual_seed(SEED)

fields = {'question': ('text', TEXT), 'name': ('label', LABEL)}

train_data = data.TabularDataset(
                            path = os.path.join(os.path.dirname(__file__),'eva_nlp_training.json'),
                            format = 'json',
                            fields = fields)

TEXT.build_vocab(train_data,
                 max_size = MAX_VOCAB_SIZE,
                 vectors = "glove.6B.50d",
                 unk_init = torch.Tensor.normal_)

LABEL.build_vocab(train_data)

INPUT_DIM = len(TEXT.vocab)
OUTPUT_DIM = len(LABEL.vocab)
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

train_iterator = data.BucketIterator(
Example #16
        ("threat", LABEL),
        ("obscene", LABEL),
        ("insult", LABEL),
        ("identity_hate", LABEL)
    ]
    test_datafields = [
        ("id",
         None),  # we won't be needing the id, so we pass in None as the field
        ("comment_text", TEXT)
    ]
    SEED = 1
    BATCH_SIZE = 64

    data_dir = '/media/feng/storage/Downloads/jigsaw'
    train_data = data.TabularDataset(path=os.path.join(data_dir, 'train.csv'),
                                     format='csv',
                                     skip_header=True,
                                     fields=trainval_datafields)
    # valid_data = data.TabularDataset(path=os.path.join(data_dir, 'train.csv'),
    #                                  format='csv', skip_header=True, fields=tv_datafields)
    train_data, valid_data = train_data.split(split_ratio=0.8,
                                              stratified=False,
                                              strata_field='toxic',
                                              random_state=random.seed(SEED))

    test_data = data.TabularDataset(
        path=os.path.join(data_dir, "test.csv"),  # the file path
        format='csv',
        skip_header=True,
        # if your csv file has a header, make sure to pass skip_header=True so it doesn't get processed as data!
        fields=test_datafields)
Example #17
def convert_text_to_idx(args):

    path_text = os.path.join(cwd, args.data_dir, args.text_file)
    path_out = os.path.join(cwd, args.data_dir, args.idx_file)
    if args.write_data_idx:
        f_out = open(path_out, 'w')

    ## --------------------------------------- ##
    ## -- Tokenize by BERT-- ##
    """
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    data = []
    with open(path_text) as f:
        lines = f.read().split('\n')
    aa = lines[2].split('\t')[-1]
    pdb.set_trace()
    data = [tokenizer.tokenize(line) for line in lines]
    pdb.set_trace()
    """

    ## --Tokenize by torchtext-- ##
    """
    tokenize = lambda x: x.split()
    TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True, include_lengths=True, batch_first=True, fix_length=30)
    datafields = [('eid', None),('idxP',None),('idxC',None),('MaxDegree',None),('MaxL',None),('text', TEXT)]
    train_data = data.TabularDataset(path=path_text, format='tsv', skip_header=False, fields=datafields)
    TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))
    table = build_loopup_table(TEXT.vocab.freqs.most_common(5000))
    """
    ## --Build table by idf-- ##
    tokenize = lambda x: x
    TEXT = data.Field(sequential=True,
                      tokenize=tokenize,
                      lower=True,
                      include_lengths=True,
                      batch_first=True,
                      fix_length=30)

    datafields = [('eid', None), ('idxP', None), ('idxC', None),
                  ('MaxDegree', None), ('MaxL', None), ('interval', None),
                  ('text', TEXT)]
    train_data = data.TabularDataset(path=path_text,
                                     format='tsv',
                                     skip_header=False,
                                     fields=datafields)

    corpus = []
    for dd in train_data:
        dd_text = dd.text
        corpus.append(dd_text)

    vectorizer = TfidfVectorizer(token_pattern=r'\S+')
    X = vectorizer.fit_transform(corpus)
    indices = np.argsort(vectorizer.idf_)  # argsort is ascending, so the lowest-IDF (most common) terms come first
    feature_names = vectorizer.get_feature_names()
    top_n = 5000  # keep the top n terms
    top_features = [feature_names[i] for i in indices[:top_n]]
    table = {}
    idx = 0
    for feature in top_features:
        table[feature] = idx
        idx += 1
    ## ======================================== ##
    #with open(os.path.join(cwd, 'data/{}/dict_0.json'.format(args.data_dir.split('/')[-1])), 'w') as f_dict:
    #    f_dict.write(json.dumps(table)) # Save the vocabulary dictionary
    with open(path_text, 'r') as f:
        raw_lines = f.read().rstrip().split('\n')

    print('Writing idx:count data file')
    cnt = 0
    for line in tqdm(raw_lines):
        text = line.split('\t')[-1]
        idx_count = text_to_idx_count(text, table)
        temp = line.split('\t')[:-1]
        temp.append(idx_count)
        new_line = '\t'.join(temp)
        cnt += 1
        if args.write_data_idx:
            if cnt == len(raw_lines):
                f_out.write('{}'.format(new_line))
            else:
                f_out.write('{}\n'.format(new_line))

    if args.write_data_idx:
        f_out.close()
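
text_to_idx_count and build_loopup_table come from elsewhere in the project; a plausible sketch of text_to_idx_count, assuming it emits space-separated idx:count tokens for the words that appear in the table:

from collections import Counter

def text_to_idx_count(text, table):
    # Assumed behaviour: count in-vocabulary tokens and emit a sparse
    # "idx:count idx:count ..." string; tokens missing from the table are dropped.
    counts = Counter(tok for tok in text.lower().split() if tok in table)
    return ' '.join('{}:{}'.format(table[tok], cnt) for tok, cnt in counts.items())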
Example #18
if not args.save_dir.is_dir(): args.save_dir.mkdir()
# creating dated path for saving updated datasets later
if not (args.path/args.now).is_dir(): (args.path/args.now).mkdir()
# creating dataframes
print('\nCreating DataFrames ... \n')
train_df = pd.read_csv(args.path/'train.csv', header=None, names=args.names)
valid_df = pd.read_csv(args.path/'val.csv', header=None, names=args.names)
test_df = pd.read_csv(args.path/'test.csv', header=None, names=args.names)
test_df = helpers.check_batch_size(test_df, len(test_df['text']), args)
# copying validation set to new dated path
print('Copying validation set to time specific folder. \n')
valid_df.to_csv(args.path/args.now/'val.csv', index=False, header=False)


# creating datasets 
train_ds = data.TabularDataset(path=args.path/'train.csv', format='csv', fields=args.datafields)
label_field.build_vocab(train_ds)
args.class_num = len(label_field.vocab) - 1
# creating DataBunch objects for language modelling and classification
print('\nCreating DataBunch objects...')
data_lm = TextLMDataBunch.from_df(args.path, train_df=train_df, valid_df=valid_df, test_df=test_df, text_cols=0, label_cols=1)
data_clas = TextClasDataBunch.from_df(args.path, train_df=train_df, valid_df=valid_df, test_df=test_df, text_cols=0, label_cols=1,
                                      vocab=data_lm.train_ds.vocab, bs=args.bs)

# fine-tuning language model
print('\nFine-tuning language model ...')
helpers.language_model(data_lm, args)
# creating a classifier
print('\nTraining classifier ...')
model = helpers.classifier(data_clas, args)
Example #19
def main():
    # if a GPU is available, use it
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print("Use " + str(device))

    # Load the training dataset, and create a dataloader to generate a batch
    # (lowercasing and length computation are handled automatically).
    textField = data.Field(
        lower=True,
        include_lengths=True,
        batch_first=True,
        preprocessing=preprocessing,  # word-level preprocessing, e.g. stripping past-tense forms
        postprocessing=postprocessing,
        stop_words=get_stopwords())  # drop every word that appears in the stopword list
    labelField = data.Field(sequential=False, use_vocab=False, is_target=True)

    dataset = data.TabularDataset('train.csv', 'csv', {
        'text': ('text', textField),
        'target': ('target', labelField)
    })

    textField.build_vocab(
        dataset, vectors=config.wordVectors)  # convert the data to vectors using the textField defined above

    # split the dataset into a training set and a validation set
    train_dataset, validate_dataset = dataset.split(
        split_ratio=config.proportion_of_val_dataset,
        stratified=True,
        strata_field='target')

    train_loader, val_loader = data.BucketIterator.splits(
        (train_dataset, validate_dataset),
        shuffle=True,
        batch_size=config.batchSize,
        sort_key=lambda x: len(x.text),
        sort_within_batch=True)

    net = get_model(config.dim, config.from_old_model,
                    config.model_path).to(device)

    criterion = config.criterion

    params = net.parameters()
    # create optimizer
    if config.optimizer_name == "SGD":
        optimizer = toptim.SGD(params, lr=config.learning_rate)
    elif config.optimizer_name == "Adam":
        optimizer = toptim.Adam(params, lr=config.learning_rate)
    elif config.optimizer_name == "AdamW":
        optimizer = AdamW(params, lr=config.learning_rate, weight_decay=1e-6)

    # mixed-precision acceleration
    if config.use_apex:
        net, optimizer = amp.initialize(net, optimizer, opt_level="O1")

    train_start = time.time()

    for epoch in range(config.epochs):
        '''
        # change lr by epoch
        adjust_learning_rate(optimizer, epoch)
        '''

        # start train
        train(net, train_loader, config.criterion, optimizer, epoch, device,
              log, textField)

        # start val
        val(net, val_loader, config.criterion, optimizer, epoch, device, log,
            train_start, textField)

    print("Final saved model is epoch " + str(best_val_acc[0]) + ", acc: " +
          str(best_val_acc[1]) + ".")
    log.write("Final saved model is epoch " + str(best_val_acc[0]) +
              ", acc: " + str(best_val_acc[1]) + "\n")

    print("Done.")
    log.write("Done.\n")
Example #20
    def create_datasets(self):
        """
        Load data, build vocabulary and create Iterator objects
        for train, validation and test data.

        Returns:
        - train_iter : Iterator object for train batches of size self.train_batch_size 
          to iterate over.
        - val_iter : Iterator object for val batches of size self.val_batch_size 
          to iterate over.
        - test_iter : Iterator object for test batches of size self.test_batch_size 
          to iterate over.
        """
        if self.seed:
            random.seed(14)

        # Create fields
        tokenizer = lambda x: x.split()
        ID = data.Field()
        TEXT = data.Field(tokenize=tokenizer,
                          init_token='<bos>',
                          eos_token='<eos>',
                          lower=True)
        TARGET = data.LabelField(dtype=torch.float)
        train_fields = [('id', None), ('text', TEXT), ('target', TARGET)]

        # Data
        train_data = data.TabularDataset(path=self.path,
                                         format='csv',
                                         skip_header=True,
                                         fields=train_fields)

        # Split
        train, val, test = train_data.split(split_ratio=[0.6, 0.2, 0.2],
                                            random_state=random.getstate())

        # Vocab
        if self.use_embedding:
            TEXT.build_vocab(train_data,
                             vectors=GloVe(name='6B', dim=300),
                             min_freq=5)
            self.embedding_matrix = TEXT.vocab.vectors
        else:
            TEXT.build_vocab(train_data, min_freq=5)
        TARGET.build_vocab(train_data)

        # Iterators
        train_iter = data.BucketIterator(
            train,
            sort_key=lambda x: len(
                x.text),  # sort sequences by length (dynamic padding)
            batch_size=self.train_batch_size,  # batch size
            device=self.device  # select device (e.g. CPU)
        )

        val_iter = data.BucketIterator(val,
                                       sort_key=lambda x: len(x.text),
                                       batch_size=self.val_batch_size,
                                       device=self.device)

        test_iter = data.Iterator(test,
                                  batch_size=self.test_batch_size,
                                  device=self.device,
                                  train=False,
                                  sort=False,
                                  sort_within_batch=False)

        return train_iter, val_iter, test_iter
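
A hedged usage sketch of the iterators returned above (the attribute names follow train_fields; the surrounding object name is an assumption):

# Hypothetical usage, assuming the method above belongs to an object named `loaders`.
train_iter, val_iter, test_iter = loaders.create_datasets()

for batch in train_iter:
    tokens = batch.text     # LongTensor of token indices, shape (seq_len, batch_size)
    targets = batch.target  # FloatTensor of labels (TARGET uses dtype=torch.float)
    break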
Example #21
train.drop_duplicates(subset="text", inplace = True)
SEED = 1234

torch.manual_seed(SEED)

TEXT = data.Field(tokenize = 'spacy', batch_first=True, include_lengths=True)
LABEL = data.LabelField(dtype = torch.float, batch_first=True)

train.to_csv("train_formatted.csv", index=False)

fields = [('text',TEXT), ('label',LABEL)]


train = data.TabularDataset(path='train_formatted.csv',
                            format='csv',
                            fields=fields,
                            skip_header=True)

import random

train, valid = train.split(split_ratio=0.9, random_state = random.seed(SEED))
TEXT.build_vocab(train, vectors ="glove.6B.100d") 

LABEL.build_vocab(train)



BATCH_SIZE = 64

train_iterator, valid_iterator = data.BucketIterator.splits(
Example #22
    def loader(self):
        tokenize = lambda x: self.lemmatizer.lemmatize(
            re.sub(r'<.*?>|[^\w\s]|\d+', '', x)).split()

        TEXT = data.Field(sequential=True,
                          tokenize=tokenize,
                          include_lengths=True,
                          batch_first=True,
                          dtype=torch.long)
        PRONOUN = data.Field(sequential=False, batch_first=True)
        P_OFFSET = data.Field(sequential=False, batch_first=True)
        A = data.Field(sequential=False, batch_first=True)
        B = data.Field(sequential=False, batch_first=True)
        A_OFFSET = data.Field(sequential=False, batch_first=True)
        B_OFFSET = data.Field(sequential=False, batch_first=True)
        A_COREF = data.Field(sequential=False, batch_first=True)
        B_COREF = data.Field(sequential=False, batch_first=True)

        NE_LABEL = data.LabelField(
            batch_first=True,
            sequential=False)  # tokenize is omitted since the default is None

        input_fields = [('ID', None), ('Text', TEXT), ('Pronoun', PRONOUN),
                        ('Pronoun_off', P_OFFSET),
                        ('A', A), ('A_off', A_OFFSET), ('A_coref', A_COREF),
                        ('B', B), ('B_off', B_OFFSET), ('B_coref', B_COREF),
                        ('URL', None)]

        train = data.TabularDataset(path=self.train_path,
                                    format='tsv',
                                    fields=input_fields,
                                    skip_header=True)
        valid = data.TabularDataset(path=self.valid_path,
                                    format='tsv',
                                    fields=input_fields,
                                    skip_header=True)
        test = data.TabularDataset(path=self.test_path,
                                   format='tsv',
                                   fields=input_fields,
                                   skip_header=True)

        ##MAP WORDS & FIGURE OUT THE MAX SIZE FOR BUILDING VOCAB

        TEXT.build_vocab(train,
                         max_size=30000,
                         vectors=GloVe(name='6B', dim=300))  # Glove Embedding
        PRONOUN.build_vocab(train)

        # NE emb
        list_of_A = [x for x in train.A]
        list_of_B = [x for x in train.B]
        AB_concat = list_of_A + list_of_B
        NE_LABEL.build_vocab(AB_concat)

        word_emb = TEXT.vocab.vectors
        #pro_emb = PRONOUN.vocab.vectors
        #NE_emb = NE_LABEL.vocab.vectors
        vocab_size = len(TEXT.vocab)

        # if want to use bucket iterator (batching)
        train_data, valid_data, test_data = data.BucketIterator.splits(
            (train, valid, test),
            batch_size=self.batch_size,
            repeat=False,
            shuffle=True)

        print("Length of Text Vocabulary: " + str(len(TEXT.vocab)))
        print("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size())
        print("NE Length: " + str(len(NE_LABEL.vocab)))
        print(
            "\nSize of train set: {} \nSize of validation set: {} \nSize of test set: {}"
            .format(len(train_data.dataset), len(valid_data.dataset),
                    len(test_data.dataset)))

        return TEXT, PRONOUN, NE_LABEL, word_emb, train_data, valid_data, test_data, train, valid, test
Example #23
0
LABEL_snli.build_vocab(train_snli, max_size=10000)
print('vocab length (including special tokens):', len(TEXT_snli.vocab))

# make iterators
train_iter_snli, val_iter_snli, test_iter_snli = data.BucketIterator.splits(
    (train_snli, val_snli, test_snli),
    batch_size=args.batch_size,
    repeat=False)

# ============================ SNLI ============================ #

# ============================ Multi30K ============================ #
TEXT_m30k = data.Field(pad_first=True, lower=True)

m30k_data = data.TabularDataset(path='./.data/multi30k/train.txt',
                                format='csv',
                                fields=[('text', TEXT_m30k)])

TEXT_m30k.build_vocab(train_sst.text, max_size=10000)
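# NOTE: the Multi30K vocabulary above is built from train_sst (defined earlier,
# outside this snippet), not from m30k_data itself.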
print('vocab length (including special tokens):', len(TEXT_m30k.vocab))

train_iter_m30k = data.BucketIterator(m30k_data,
                                      batch_size=args.batch_size,
                                      repeat=False)
# ============================ Multi30K ============================ #

# ============================ WMT16 ============================ #
TEXT_wmt16 = data.Field(pad_first=True, lower=True)

wmt16_data = data.TabularDataset(path='./.data/wmt16/wmt16_sentences',
                                 format='csv',
Example #24
0
    logger.info(labels)

    TEXT = ReversibleField(sequential=True, include_lengths=True, lower=False)

    label_size = 42  # 18 if args.dataset != "multi_top_snomed_no_des" else 42

    LABEL = MultiLabelField(sequential=True,
                            use_vocab=False,
                            label_size=label_size,
                            tensor_type=torch.FloatTensor)

    # load in adobe
    if args.abbr:
        adobe_test = data.TabularDataset(
            path=
            '../../data/csu/adobe_abbr_matched_snomed_multi_label_no_des_test.tsv',
            format='tsv',
            fields=[('Text', TEXT), ('Description', LABEL)])
    else:
        adobe_test = data.TabularDataset(
            path='../../data/csu/adobe_snomed_multi_label_no_des_test.tsv',
            format='tsv',
            fields=[('Text', TEXT), ('Description', LABEL)])

    if args.dataset == 'multi_top_snomed_no_des':
        train, val, test = data.TabularDataset.splits(
            path='../../data/csu/',
            train='snomed_multi_label_no_des_train.tsv',
            validation='snomed_multi_label_no_des_valid.tsv',
            test='snomed_multi_label_no_des_test.tsv',
            format='tsv',
Example #25
0
PATH = '/media/ubuntu/1TO/DTU/courses/DeepLearning/DeepLearning_summarization/saved_network'
glove_dim = 50

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#TODO: How do we enable sort in dataloader?

#%%
"""
Data loader part
"""

TEXT = data.Field(init_token='<bos>', eos_token='<eos>', sequential=True)
LABEL = data.Field(init_token='<bos>', eos_token='<eos>', sequential=True)
train_set = data.TabularDataset(path,
                                'CSV',
                                fields=[('data', TEXT), ('label', LABEL)],
                                skip_header=True)
validation_set = data.TabularDataset(path_val,
                                     'CSV',
                                     fields=[('data', TEXT), ('label', LABEL)],
                                     skip_header=True)

TEXT.build_vocab(train_set,
                 max_size=vocab_size,
                 vectors="glove.6B." + str(glove_dim) + "d")
LABEL.vocab = TEXT.vocab

vocab = TEXT.vocab
#GloVe embedding function
embed = torch.nn.Embedding(len(vocab), glove_dim)
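# A common follow-up step (an assumption, not shown in this snippet) is to copy
# the pretrained vectors into the embedding layer's weight matrix:
#     embed.weight.data.copy_(vocab.vectors)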
Example #26
0
def subjective_bot():

    # these are for debugging
    # game_name_list = ['Counter-Strike Global Offensive', 'Transformice', 'Dead Island Epidemic', 'Dota 2', 'Team Fortress 2', 'War Thunder', "Garry's Mod", 'Injustice Gods Among Us Ultimate Edition', 'Loadout', 'Geometry Dash']
    # hour_list = [6.0, 3.0, 2.0, 820.0, 250.0, 50.0, 36.0, 25.0, 14.0, 13.0]
    # # SpeedRunners

    # game_name_list = ['Dota 2','Warframe','The Elder Scrolls V Skyrim','DayZ','DARK SOULS II','Trove','Fallout 4','Starbound','Endless Legend','Warhammer 40,000 Dawn of War II']
    # hour_list = [600.0, 300.0, 200.0, 820.0, 250.0, 500.0, 360.0, 250.0, 54.0, 130.0]
    # # Endless Space

    # game_name_list = ['Dota 2' ,'Counter-Strike Global Offensive' ,'Warhammer 40,000 Dawn of War II - Chaos Rising' ,"NOBUNAGA'S AMBITION Sphere of Influence",'Endless Space','Shadowrun Hong Kong' ,'The Dark Eye Chains of Satinav','Demonicon' ,"Shadowrun Dragonfall - Director's Cut",'Total War SHOGUN 2' ]
    # hour_list = [100,100,100,100,5,20,20,5,5,10]
    # # new: The Elder Scrolls V Skyrim

    # game_name_list= ['Dota 2','Dota 2','Dota 2','Dota 2','Dota 2','Dota 2','Dota 2','Dota 2','Dota 2','Dota 2']
    # hour_list = [100, 500, 500, 500, 700, 200, 200, 500, 500, 10]

    newplayer = False
    model = torch.load('shuffledmodel_0.52.pt')
    # model.cuda()
    print('Hello There! Welcome to Check This Out!')
    print('Loading Essential Tools...')
    TEXT = data.Field(sequential=True, include_lengths=True, tokenize='spacy')
    LABEL = data.Field(sequential=False, use_vocab=False)
    abstract_data = data.TabularDataset(path='./data/abstract_tsv.tsv',
                                        skip_header=True,
                                        format='tsv',
                                        fields=[('text', TEXT),
                                                ('label', LABEL)])
    TEXT.build_vocab(abstract_data)
    Vocab = TEXT.vocab

    TEXTn = data.Field(sequential=True, include_lengths=True, tokenize='spacy')
    LABELn = data.Field(sequential=False, use_vocab=False)
    abstract_datan = data.TabularDataset(path='./data/full_abstract_tsv.tsv',
                                         skip_header=True,
                                         format='tsv',
                                         fields=[('text', TEXTn),
                                                 ('label', LABELn)])
    TEXTn.build_vocab(abstract_datan)
    Vocabfull = TEXTn.vocab

    glove = torchtext.vocab.GloVe(name='6B', dim=100)
    Vocabfull.load_vectors(glove)
    embeds = nn.Embedding.from_pretrained(Vocabfull.vectors)
    abstract_dictionary = convert_csv_to_dict('./data/abstracts_final.csv')

    game_name_list = []
    hour_list = []
    print('Complete!\n')
    for i in range(10):
        # this is for entering the name of games
        name_true = 0
        while name_true != 1:
            name = input('Please enter NAME of game #{}:'.format(i + 1))
            if name not in abstract_dictionary.keys():
                print('Sorry! The game is not recognized, please try again!')
            else:
                name_true = 1
                game_name_list.append(name)

        # this is for entering the number of hour
        hour_true = 0
        while hour_true == 0:
            hour = input('Enter in HOURS, how much you have played this game:')
            try:
                float(hour)
                hour_list.append(float(hour))
                hour_true = 1
                print('\n')
            except ValueError:
                print('Sorry! The input is not valid, please try again!')

    while not newplayer:
        newgamelist = game_name_list[:]
        newhours = hour_list[:]
        name_true = 0
        print('\n')
        while name_true != 1:
            name = input('Please enter NAME of the NEW GAME:')
            # if name == "newplayer!":
            #     newplayer = True
            #     break
            if name not in abstract_dictionary.keys():
                print('Sorry! The game is not recognized, please try again!')
            else:
                name_true = 1
        newgamelist.append(name)
        newhours.append(0)

        #==========================================================#
        # print('\n')
        print('Let us think about it!')
        temp_input = []
        for i in range(11):
            temp = [newgamelist[i], newhours[i]]
            temp_input.append(temp)

        net_cnn = torch.load('cnn_model_epoch0.pkl')
        abstract_list_cnn, hour_list_cnn, label_cnn = convert_data_cnn(
            temp_input, Vocabfull, embeds, abstract_dictionary)
        prediction_cnn = net_cnn.forward(abstract_list_cnn, hour_list_cnn)
        prediction_cnn = prediction_cnn.detach().numpy()
        max_cnn = prediction_cnn.argmax()

        results = ['%.3f' % elem for elem in prediction_cnn.tolist()]

        print('CNN:')
        print(results)
        # print(
        #     'the prediction of the cnn model is:' + str(prediction_cnn[0]) + ', ' + str(prediction_cnn[1]) + ', ' + str(
        #         prediction_cnn[2]) + ', ' + str(prediction_cnn[3]))
        if max_cnn == 0:
            print(
                'I believe the player will be playing this new game for: 0 - 10 hours'
            )
        if max_cnn == 1:
            print(
                'I believe the player will be playing this new game for: 10 - 35 hours'
            )
        if max_cnn == 2:
            print(
                'I believe the player will be playing this new game for: 35 - 85 hours'
            )
        if max_cnn == 3:
            print(
                'I believe the player will be playing this new game for: above 85 hours'
            )


        #==========================================================#

        intomodel = []
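        # Build a flat feature list: each game name followed by a one-hot
        # encoding of its hour bucket (<10, 10-35, 35-85, >85 hours).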
        for i in range(11):
            intomodel.append(newgamelist[i])
            if newhours[i] < 10:
                intomodel.extend([1, 0, 0, 0])
            elif newhours[i] < 35:
                intomodel.extend([0, 1, 0, 0])
            elif newhours[i] < 85:
                intomodel.extend([0, 0, 1, 0])
            else:
                intomodel.extend([0, 0, 0, 1])
        intomodel = intomodel[:-4]
        nameindex = [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
        absfeatures = []
        for l in nameindex:
            absfeatures.append(
                sentence_preprocess_rnn(abstract_dictionary[intomodel[l]],
                                        Vocab).cuda())
        for k in range(50):
            if k not in nameindex:
                absfeatures.append(torch.tensor(float(intomodel[k])).cuda())
        predict = model(absfeatures)
        # print(predict)
        results_rnn = ['%.3f' % elem for elem in predict.detach().tolist()]
        predict = predict.argmax()
        #===============================================================#
        print('RNN:')
        print(results_rnn)
        if predict == 0:
            print(
                "Got it! I think you will play this game for less than 10 hours!"
            )
        elif predict == 1:
            print(
                "Got it! I think you will play this game for 10 to 35 hours!")
        elif predict == 2:
            print(
                "Got it! I think you will play this game for 35 to 85 hours!")
        else:
            print(
                "Got it! I think you will play this game for more than 85 hours!"
            )
Example #27
0
TEXT = data.Field(include_lengths=True, tokenize='spacy')
LABEL = data.LabelField()
OTHER = data.RawField()
OTHER.is_target = False

devset_fields = {"sentence": ("sentence", TEXT), "claim": ("claim", TEXT),
                 "org_sentence": ("org_sentence", OTHER),
                 "docid_claimid_sentno": ("docid_claimid_sentno", OTHER)}

with open("/content/gdrive/My Drive/TEXT_VOCAB_5EPOCH", "rb") as f:
    TEST_TEXT = dill.load(f)
    print("Text Load Successfull")
with open("/content/gdrive/My Drive/LABEL_VOCAB_5EPOCH", "rb") as f:
    TEST_LABEL = dill.load(f)
    print("Label Load Successfull")

devset = data.TabularDataset(dev_path, format="CSV", fields=devset_fields, skip_header=False)

print(len(devset))
print(vars(devset.examples[0]))

TEXT.build_vocab(devset)

LABEL.build_vocab(devset)
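# The vocabularies built above are immediately replaced below by the ones
# loaded from disk (TEST_TEXT / TEST_LABEL).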

TEXT.vocab = TEST_TEXT.vocab
TEXT.vocab.itos = TEST_TEXT.vocab.itos
TEXT.vocab.stoi = TEST_TEXT.vocab.stoi

LABEL.vocab = TEST_LABEL.vocab
LABEL.vocab.itos = TEST_LABEL.vocab.itos
LABEL.vocab.stoi = TEST_LABEL.vocab.stoi
Example #28
0
    def __init__(self,
                 path='data',
                 glove_p='glove',
                 train_file='train.csv',
                 valid_file='valid.csv',
                 test_file='test.csv',
                 vocab_file=None,
                 batch_size=32,
                 embed_dim=100,
                 max_vocab_size=None,
                 min_freq=1,
                 max_seq_len=None,
                 gpu=False,
                 use_fasttext=False,
                 padded=False):
        self.batch_size = batch_size
        self.device = 0 if gpu else -1
        self.sort_key = lambda x: len(x.context)
        #print (self.sort_key)

        if not padded:
            self.TEXT = data.Field(lower=True,
                                   pad_token='__pad__',
                                   unk_token='<UNK>',
                                   batch_first=True,
                                   tokenize=clean_str)
        else:
            self.TEXT = data.Field(lower=True,
                                   include_lengths=True,
                                   fix_length=max_seq_len,
                                   unk_token='<UNK>',
                                   batch_first=True,
                                   tokenize=clean_str)

        self.LABEL = data.Field(sequential=False,
                                tensor_type=torch.FloatTensor,
                                unk_token=None,
                                batch_first=True)

        file_format = train_file[-3:]

        # Only take data with max length 160
        # f = lambda ex: len(ex.context) <= max_seq_len and len(ex.response)
        f = None

        self.train = data.TabularDataset(path='{}/{}'.format(path, train_file),
                                         format=file_format,
                                         skip_header=True,
                                         fields=[('context', self.TEXT),
                                                 ('response', self.TEXT),
                                                 ('label', self.LABEL)],
                                         filter_pred=f)

        self.valid, self.test = data.TabularDataset.splits(
            path=path,
            validation=valid_file,
            test=test_file,
            format=file_format,
            skip_header=True,
            fields=[('context', self.TEXT), ('positive', self.TEXT),
                    ('negative_1', self.TEXT), ('negative_2', self.TEXT),
                    ('negative_3', self.TEXT), ('negative_4', self.TEXT),
                    ('negative_5', self.TEXT), ('negative_6', self.TEXT),
                    ('negative_7', self.TEXT), ('negative_8', self.TEXT),
                    ('negative_9', self.TEXT)])

        if vocab_file is None:

            if use_fasttext:
                print("building vocabulary")
                # self.TEXT.build_vocab(
                #     self.train, max_size=max_vocab_size, min_freq=3,
                #     vectors="fasttext.en.300d"
                # )
                self.TEXT.build_vocab(self.train,
                                      max_size=max_vocab_size,
                                      min_freq=5,
                                      vectors="fasttext.en.300d")
            else:
                self.TEXT.build_vocab(self.train,
                                      max_size=max_vocab_size,
                                      min_freq=min_freq,
                                      vectors=GloVe('6B', dim=embed_dim))
            vocab = self.TEXT.vocab
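            # NOTE: this second build_vocab call replaces the vectors chosen
            # above with 840B GloVe vectors (torchtext provides these only at
            # 300 dimensions, so embed_dim must be 300 for it to resolve).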

            self.TEXT.build_vocab(self.train,
                                  max_size=max_vocab_size,
                                  min_freq=min_freq,
                                  vectors=GloVe('840B', dim=embed_dim))

        else:
            specials = list(
                OrderedDict.fromkeys(tok for tok in [
                    self.TEXT.unk_token, self.TEXT.pad_token,
                    self.TEXT.init_token, self.TEXT.eos_token
                ] if tok is not None))

            with open(f'{path}/{vocab_file}', 'r') as f:
                counter = Counter(f.read().split('\n'))

            if use_fasttext:
                print("Using fasttext")
                vocab = Vocab(counter,
                              specials=specials,
                              vectors="fasttext.en.300d")
            else:
                vocab = Vocab(counter,
                              specials=specials,
                              vectors=GloVe('6B', dim=embed_dim))

            self.TEXT.vocab = vocab

        self.LABEL.build_vocab(self.train)
        print(vocab.stoi['__pad__'])
        print(vocab.itos[25], vocab.itos[32])
        self.dataset_size = len(self.train.examples)
        self.vocab_size = len(self.TEXT.vocab.itos)
        self.embed_dim = embed_dim
        #self.vectors = self.load_glove_embeddings(glove_p+'/glove.6B.50d.txt', self.TEXT.vocab.stoi)
        self.vectors = self.TEXT.vocab.vectors
Example #29
0
def main():

    opt = parse_args()

    src_field = data.Field()
    label_field = data.Field(pad_token=None, unk_token=None)
    train = data.TabularDataset(path=opt.train_path,
                                format='tsv',
                                fields=[('text', src_field),
                                        ('label', label_field)])
    test = data.TabularDataset(path=opt.test_path,
                               format='tsv',
                               fields=[('text', src_field),
                                       ('label', label_field)])
    src_field.build_vocab(train,
                          max_size=100000,
                          min_freq=2,
                          vectors="glove.6B.300d")
    label_field.build_vocab(train)

    print("Training size: {0}, Testing size: {1}".format(
        len(train), len(test)))

    classifier = LSTMClassifier(300, 512, len(label_field.vocab),
                                src_field.vocab.vectors)

    if torch.cuda.is_available():
        classifier.cuda()

    train_iter = data.BucketIterator(dataset=train,
                                     batch_size=opt.batch_size,
                                     device=device,
                                     repeat=False)
    test_iter = data.BucketIterator(dataset=test,
                                    batch_size=5,
                                    device=device,
                                    repeat=False)
    for param in classifier.parameters():
        param.data.uniform_(-0.08, 0.08)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(classifier.parameters())

    step = 0
    for epoch in range(15):
        test_acc = 0
        for batch in test_iter:
            test_acc += evaluate(classifier, batch)
        print('Test accuracy: {0}'.format(test_acc / len(test)))
        running_loss = 0.0
        for batch in train_iter:
            optimizer.zero_grad()
            pred = classifier(batch.text)
            loss = criterion(pred, batch.label.view(-1))
            running_loss += loss.item()  # .data[0] indexing was removed in modern PyTorch
            loss.backward()
            optimizer.step()
            step += 1
            if step % opt.log_every == 0:
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, step + 1, running_loss / opt.log_every))
                running_loss = 0.0
        torch.save(classifier, "model_{0}".format(epoch + 1))
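    # The `evaluate` helper called above is not defined in this snippet. A
    # minimal sketch of one plausible implementation (counting correct
    # predictions in a batch) could be:
    #
    #     def evaluate(classifier, batch):
    #         with torch.no_grad():
    #             pred = classifier(batch.text)
    #             return (pred.argmax(dim=1) == batch.label.view(-1)).sum().item()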
Example #30
0
    def make_vocab(self, args):
        args.path = args.datadir + args.data
        self.INPUT = data.Field(sequential=True,
                                batch_first=True,
                                init_token="<start>",
                                eos_token="<eos>",
                                include_lengths=True)  # Title
        self.OUTPUT = data.Field(
            sequential=True,
            batch_first=True,
            init_token="<start>",
            eos_token="<eos>",
            include_lengths=True)  # Gold Abstract, preprocessed
        self.TARGET = data.Field(sequential=True,
                                 batch_first=True,
                                 init_token="<start>",
                                 eos_token="<eos>")
        self.ENT_TYPE = data.Field(sequential=True,
                                   batch_first=True,
                                   eos_token="<eos>")  # Entity Type
        self.ENT = data.RawField()  # Entity
        self.REL = data.RawField()  # Relation between entities
        self.REL.is_target = False
        self.ENT.is_target = False
        self.fields = [("title", self.INPUT), ("ent", self.ENT),
                       ("nerd", self.ENT_TYPE), ("rel", self.REL),
                       ("out", self.OUTPUT)]
        train = data.TabularDataset(path=args.path,
                                    format='tsv',
                                    fields=self.fields)

        print('Building Vocab... ', end='')

        # Output Vocab
        # mapping from generics to indices are at the last of the vocab
        # also includes indexed generics, (e.g. <method_0>) but in mixed order
        self.OUTPUT.build_vocab(train, min_freq=args.outunk)
        generics = [
            '<method>', '<material>', '<otherscientificterm>', '<metric>',
            '<task>'
        ]  # Entity Types
        self.OUTPUT.vocab.itos.extend(generics)
        for generic in generics:
            self.OUTPUT.vocab.stoi[generic] = self.OUTPUT.vocab.itos.index(
                generic)

        # Target Vocab
        # Same as Output Vocab, except for the indexed generics' indices
        # len(vocab) = 11738 / <method_0>, <material_0> ... : 11738, <method_1>, ... : 11739 and so on.
        self.TARGET.vocab = copy(self.OUTPUT.vocab)
        entity_types = [
            'method', 'material', 'otherscientificterm', 'metric', 'task'
        ]
        for entity_type in entity_types:
            for idx in range(40):
                s = "<" + entity_type + "_" + str(idx) + ">"
                self.TARGET.vocab.stoi[s] = len(self.TARGET.vocab.itos) + idx

        # Entity Type Vocab
        # Indices for not-indexed generics are same with those of output vocab
        self.ENT_TYPE.build_vocab(train, min_freq=0)
        for x in generics:
            self.ENT_TYPE.vocab.stoi[x] = self.OUTPUT.vocab.stoi[x]

        # Title Vocab
        self.INPUT.build_vocab(train, min_freq=args.entunk)

        # Relation Vocab
        # Adds relations.vocab + inverse of relations.vocab
        self.REL.special = ['<pad>', '<unk>', 'ROOT']
        with open(args.datadir + "/" + args.relvocab) as f:
            rel_vocab = [x.strip() for x in f.readlines()]
            self.REL.size = len(rel_vocab)
            rel_vocab += [x + "_inv" for x in rel_vocab]
            rel_vocab += self.REL.special
        self.REL.itos = rel_vocab

        self.ENT.itos, self.ENT.stoi = self.build_ent_vocab(args.path)

        print('Done')

        if not self.args.eval:
            self.make_iterator(train)