def train_pos(args):
    ######################################################
    #
    # Data preprocessing
    #
    ######################################################
    datasets = {
        args.datasetName:  #Name of the dataset
        {
            'columns': {
                0: 'tokens',
                1: 'POS',
                2: 'chunk_BIO'
            },  #CoNLL format for the input data. Column 0 contains tokens, column 1 contains POS and column 2 contains chunk information using BIO encoding
            'label': 'POS',  #Which column we would like to predict
            'evaluate': True,  #Should we evaluate on this task? Set true always for single task setups
            'commentSymbol': None
        }  #Lines in the input data starting with this string will be skipped. Can be used to skip comments
    }
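
    # A minimal sketch of the expected input: CoNLL-style files with one token per
    # line, whitespace-separated columns matching the 'columns' mapping above, and a
    # blank line between sentences. The tokens and tags shown are illustrative only:
    #
    #   Confidence   NN   B-NP
    #   in           IN   B-PP
    #   the          DT   B-NP
    #   pound        NN   I-NP
    #   .            .    O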

    # :: Path on your computer to the word embeddings. Embeddings by Komninos et al. will be downloaded automatically ::
    embeddingsPath = args.embeddings  # e.g. 'komninos_english_embeddings.gz'

    # :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
    pickleFile = perpareDataset(embeddingsPath, datasets)

    ######################################################
    #
    # The training of the network starts here
    #
    ######################################################

    #Load the embeddings and the dataset
    embeddings, mappings, data = loadDatasetPickle(pickleFile)

    # Some network hyperparameters
    params = {
        'classifier': ['CRF'],
        'LSTM-Size': [100],
        'dropout': (0.25, 0.25)
    }
    model = BiLSTM(params)
    model.setMappings(mappings, embeddings)
    model.setDataset(datasets, data)

    model.modelSavePath = args.model_save + '/[ModelName]_[Epoch].h5'
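    # ([ModelName] and [Epoch] in the save path above are placeholders that the
    #  library fills in when it writes each epoch's model, which is why the file
    #  handled below is '<datasetName>_1.h5')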
    model.fit(epochs=25)

    fpath = args.model_save + '/' + args.datasetName + '_1.h5'
    save_dir, model_init = os.path.split(fpath)
    print(save_dir)
    print(model_init)
    remove_except_last_model(save_dir, model_init)
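

# A hypothetical command-line wrapper for the train_pos example above; the flag names
# simply mirror the attributes the function reads (args.datasetName, args.embeddings,
# args.model_save), and the defaults are illustrative, not taken from the original script.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Train a BiLSTM-CRF POS tagger')
    parser.add_argument('--datasetName', required=True,
                        help='dataset folder name, e.g. unidep_pos (illustrative)')
    parser.add_argument('--embeddings', default='komninos_english_embeddings.gz')
    parser.add_argument('--model_save', default='models')  # illustrative default
    train_pos(parser.parse_args())
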
######################################################
#
# Data preprocessing
#
######################################################
datasets = read_dict(args.input_dataset_conf)
print("DATASET CONF {} {}".format(type(datasets), datasets))
target_task = get_target_task(datasets)
print("TARGET TASK {} {}".format(type(target_task), target_task))
aux_task = get_auxiliary_task(datasets)
print("AUX TASK {} {}".format(type(aux_task), aux_task))

prepare_training_data(datasets)

embeddingsPath = 'komninos_english_embeddings.gz'  #Word embeddings by Komninos et al., dependency-based following Levy et al.: https://levyomer.wordpress.com/2014/04/25/dependency-based-word-embeddings/

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
pickleFile = perpareDataset(embeddingsPath, datasets)

######################################################
#
# The training of the network starts here
#
######################################################

#Load the embeddings and the dataset
embeddings, mappings, data = loadDatasetPickle(pickleFile)

# Some network hyperparameters
#params = {'classifier': ['CRF'], 'LSTM-Size': [100], 'dropout': (0.25, 0.25),'charEmbeddings': 'CNN',
#          'customClassifier': {'unidep_pos': ['Softmax'], 'conll2000_chunking': [('LSTM', 50), 'CRF']}}

# TODO: Replace customClassifier with the main task + auxiliary task
datasets = {
    'conll2000_chunking':  #Name of the dataset
    {
        'columns': {
            0: 'tokens',
            1: 'POS',
            2: 'chunk_BIO'
        },  #CoNLL format for the input data. Column 0 contains tokens, column 1 contains POS and column 2 contains chunk information using BIO encoding
        'label': 'chunk_BIO',  #Which column we would like to predict
        'evaluate': True,  #Should we evaluate on this task? Set true always for single task setups
        'commentSymbol': None
    }  #Lines in the input data starting with this string will be skipped. Can be used to skip comments
}

# :: Transform datasets to a pickle file ::
pickleFile = perpareDataset(datasets)

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
embeddings_file = 'embeddings/komninos_english_embeddings.gz'
elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'
elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
elmo_mode = 'weighted_average'

#Which GPU to use for ELMo. -1 for CPU
if torch.cuda.is_available():
    elmo_cuda_device = 0
else:
    elmo_cuda_device = -1

embLookup = ELMoWordEmbeddings(embeddings_file, elmo_options_file,
                               elmo_weight_file, elmo_mode, elmo_cuda_device)

Example #4
}

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::

embeddings_file = 'embeddings/bioEmbeddings.txt'
elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'
elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
elmo_mode = 'average'
elmo_cuda_device = 0  #Which GPU to use. -1 for CPU

embLookup = ELMoWordEmbeddings(embeddings_file, elmo_options_file,
                               elmo_weight_file, elmo_mode, elmo_cuda_device)
# You can use a cache to precompute the ELMo embeddings once. See Create_ELMo_Cache.py for an example.
embLookup.loadCache('embeddings/elmo_cache_deid.pkl')
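# A rough sketch of how such a cache could be precomputed; the addToCache / storeCache
# names are assumptions based on the Create_ELMo_Cache.py reference above, so check
# that script for the exact API before relying on this:
#
# for sentence_tokens in all_sentences:      # tokenised train/dev/test sentences
#     embLookup.addToCache(sentence_tokens)  # assumed helper: compute and keep ELMo vectors
# embLookup.storeCache('embeddings/elmo_cache_deid.pkl')  # assumed helper: write the cache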

pickleFile = perpareDataset(datasets, embLookup)

######################################################
#
# The training of the network starts here
#
######################################################

#Load the embeddings and the dataset
mappings, data = loadDatasetPickle(pickleFile)

# Some network hyperparameters
params = {
    'classifier': ['CRF'],
    'LSTM-Size': [100, 100],
    'dropout': (0.5, 0.5)
}

Example #5
# for k in k_shot:
#     datasets[args.mod+'__'+k]=seed
if args.k_shot not in ('1.0', '2.0', '4.0', '8.0', '16.0'):
    print("k_shot doesn't exist")
    exit()

datasets[args.mod + '__' + args.k_shot] = seed
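# e.g. with args.mod == 'conll2003' (a hypothetical value) and args.k_shot == '8.0',
# this registers a dataset entry named 'conll2003__8.0' that maps to the chosen seed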

# datasets={}
# datasets[args.mod+'__'+args.shot]=seeds[args.mod]

# :: Path on your computer to the word embeddings. Embeddings by Reimers et al. will be downloaded automatically ::
embeddingsPath = '/datastore/liu121/nosqldb2/emnlp_ukplab/skipgram'

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
pickleFile = perpareDataset(embeddingsPath, datasets, args.k_shot)
print('data prepare successful: %s' % pickleFile)
######################################################
#
# The training of the network starts here
#
######################################################

#Load the embeddings and the dataset
embeddings, mappings, data = loadDatasetPickle(pickleFile)
# print('mappings type: ',type(mappings))
# for key in mappings:
#     print(key)
#     print(mappings[key])
#     print('===============')

Example #6
# USER ACTION NEEDED
# specify name of small data set (target data!)
dataSmallName = "Stab201X"

dataSmallColumns = dataSmallName + 'TARGET_BIO'
dataSetFiles = [(dataSmallName,{0:'tokens', 1:dataSmallColumns})]

# USER ACTION NEEDED
# put embeddings here (GloVe and [Komninos & Manandhar 2016])
embeddingsPath = dname + "/embeddings"

# USER ACTION NEEDED
# make sure that dataPath contains the small train, dev, test files for dataSmallName and
# the full size train,dev,test for all other datasets (as created by splitsFullData.py)
# if needed, change path name
dataPath = dname + "/data_multiTask"

for dataAllName in os.listdir(dataPath):
    if dataAllName != dataSmallName:
        dataAllColumns = dataAllName + '_BIO'
        dataSetFiles.append((dataAllName,{0:'tokens', 1:dataAllColumns}))
        print(dataSetFiles)

for embeddingsName in os.listdir(embeddingsPath):
    if embeddingsName == "glove.txt" or embeddingsName == "wiki_extvec":
        embeddingsFull = embeddingsPath + "/" + embeddingsName
        if os.path.isfile(embeddingsFull):
            print(embeddingsName)
            pickleFile = perpareDataset(embeddingsFull, dataSetFiles)

Example #7
datasets = {
    'model_conseil_doctrine':  #Name of the dataset
    {
        'columns': {0: 'tokens', 1: 'NER_BIO'},  #CoNLL format for the input data. Column 0 contains tokens, column 1 contains NER information using BIO encoding
        # 'columns': {0: 'tokens', 1: "is_name", 2: 'NER_BIO'},
        'label': 'NER_BIO',  #Which column we would like to predict
        'evaluate': True,  #Should we evaluate on this task? Set true always for single task setups
        'commentSymbol': None
    }
}

# :: Path on your computer to the word embeddings ::
embeddingsPath = '/home/pavel/code/conseil_detat/anonymisation_software/train/embeddings.vec'
embeddingsPath = 'jurinet_parsed_100.vec.gz'
# embeddingsPath =  'embeddings.vec'

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
pickleFile = perpareDataset(embeddingsPath, datasets, useExistent=False)


######################################################
#
# The training of the network starts here
#
######################################################


#Load the embeddings and the dataset
embeddings, mappings, data = loadDatasetPickle(pickleFile)

# Some network hyperparameters
params = {'classifier': ['CRF'], 'LSTM-Size': [100, 100], 'dropout': (0.5, 0.5), 'charEmbeddings':'LSTM',
          'optimizer': 'adam', 'featureNames': ['tokens', 'casing']}

Example #8
datasets = read_dict(args.input_dataset_conf)
print("{} {}".format(type(datasets), datasets))

# :: Needed for simulating the low resource scenarios
if args.nb_sentence is not None:
    datasets[list(datasets.keys())[0]]['nb_sentence'] = args.nb_sentence

prepare_training_data(datasets)

# :: Path on your computer to the word embeddings. Embeddings by Komninos et al. will be downloaded automatically ::
embeddingsPath = 'komninos_english_embeddings.gz'

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
pickleFile = perpareDataset(embeddingsPath,
                            datasets,
                            reducePretrainedEmbeddings=True)
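# reducePretrainedEmbeddings=True is understood here to keep only the embeddings of
# words that actually occur in the dataset, which shrinks the generated pickle
# (a description of the flag's assumed behaviour, not verified against the library)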

######################################################
#
# The training of the network starts here
#
######################################################

#Load the embeddings and the dataset
embeddings, mappings, data = loadDatasetPickle(pickleFile)

# Some network hyperparameters
params = read_dict(args.param_conf)
if args.tune == 0:

Example #9
def pickleData(embeddingsPath, datasetName, dataColumns):
    datasetFiles = [
        (datasetName, dataColumns),
    ]
    pickleFile = perpareDataset(embeddingsPath, datasetFiles)
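
# A hypothetical call, reusing the CoNLL-style column layout from the other examples
# on this page:
#
# pickleData('komninos_english_embeddings.gz', 'conll2000_chunking',
#            {0: 'tokens', 1: 'POS', 2: 'chunk_BIO'})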

Example #10
def train_pos(args):
    ######################################################
    #
    # Data preprocessing
    #
    ######################################################
    datasets = {
        args.datasetName:  #Name of the dataset
        {
            'columns': {
                0: 'tokens',
                1: 'POS',
                2: 'chunk_BIO'
            },  #CoNLL format for the input data. Column 0 contains tokens, column 1 contains POS and column 2 contains chunk information using BIO encoding
            'label': 'POS',  #Which column we would like to predict
            'evaluate': True,  #Should we evaluate on this task? Set true always for single task setups
            'commentSymbol': None
        }  #Lines in the input data starting with this string will be skipped. Can be used to skip comments
    }

    # :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
    embeddings_file = None
    elmo_options_file = args.elmo_options
    elmo_weight_file = args.elmo_weights
    elmo_mode = 'weighted_average'
    #elmo_options_file= 'pretrained/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'
    #elmo_weight_file = 'pretrained/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'

    elmo_cuda_device = args.cuda_device  #Which GPU to use. -1 for CPU

    embLookup = ELMoWordEmbeddings(embeddings_file, elmo_options_file,
                                   elmo_weight_file, elmo_mode,
                                   elmo_cuda_device)
    # You can use a cache to precompute the ELMo embeddings once. See Create_ELMo_Cache.py for an example.
    embLookup.loadCache(args.pkl_path)

    pickleFile = perpareDataset(datasets, embLookup)

    ######################################################
    #
    # The training of the network starts here
    #
    ######################################################

    #Load the embeddings and the dataset
    mappings, data = loadDatasetPickle(pickleFile)

    # Some network hyperparameters
    params = {
        'classifier': ['CRF'],
        'LSTM-Size': [100, 100],
        'dropout': (0.5, 0.5)
    }

    model = ELMoBiLSTM(embLookup, params)
    model.setMappings(mappings)
    model.setDataset(datasets, data)
    model.modelSavePath = args.model_save + "/[ModelName]_[Epoch].h5"
    model.fit(epochs=25)

    fpath = args.model_save + '/' + args.datasetName + '_1.h5'
    save_dir, model_init = os.path.split(fpath)
    print(save_dir)
    print(model_init)
    # remove saved model files except for the last one
    remove_except_last_model(save_dir, model_init)

Example #11
def main():
    pkl_path = perpareDataset(embeddingsPath, datasets_config)

    data_holder, task2id, id2task, num_feat, num_voc, num_char, tgt_dict, embeddings = Dataloader_elmo1.multitask_dataloader(
        pkl_path, num_task=num_task, batch_size=BATCH_SIZE)
    para = model_para
    task2label = {"conll2000": "chunk", "unidep": "POS", "conll2003": "NER"}
    logger = Logger('./logs/' + str(args.gpu))
    para["id2task"] = id2task
    para["n_feats"] = num_feat
    para["n_vocs"] = num_voc
    para["n_tasks"] = num_task
    para["out_size"] = [
        len(tgt_dict[task2label[id2task[ids]]]) for ids in range(num_task)
    ]
    para["n_chars"] = num_char
    model = Model_s.build_model_cnn(para)
    model.Word_embeddings.apply_weights(embeddings)

    params = list(filter(lambda p: p.requires_grad, model.parameters()))
    num_params = sum(p.numel() for p in model.parameters())
    print(model)

    # inverse-time decay: lr = init_lr / (1 + decay_rate * epoch)
    def lr_decay(optimizer, epoch, decay_rate=0.05, init_lr=0.015):
        lr = init_lr / (1 + decay_rate * epoch)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
        return optimizer

    # exponential decay: lr = init_lr * decay_rate ** epoch
    def exp_lr_decay(optimizer, epoch, decay_rate=0.05, init_lr=0.015):
        lr = init_lr * decay_rate**epoch
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
        return optimizer

    if args.optim == "adam":
        model_optim = optim_custorm.adam(
            para["d_hid"],
            DenseSparseAdam(params, lr=0.0015, betas=(0.9, 0.98), eps=1e-9))
        args.decay = None
    elif args.optim == "sgd":
        model_optim = optim.SGD(params,
                                lr=0.015,
                                momentum=args.momentum,
                                weight_decay=1e-8)
    if args.mode == "train":
        best_F1 = 0
        if not para["crf"]:
            calculate_loss = nn.NLLLoss()
        else:
            calculate_loss = None
        print("Start training...")
        print('-' * 60)
        KLLoss = None
        start_point = time.time()
        for epoch_idx in range(NUM_EPOCH):

            if args.optim == "sgd":
                if args.decay == "exp":
                    model_optim = exp_lr_decay(model_optim, epoch_idx)
                elif args.decay == "normal":
                    model_optim = lr_decay(model_optim, epoch_idx)
            Pre, Rec, F1, loss_list = run_epoch(model, data_holder,
                                                model_optim, calculate_loss,
                                                KLLoss, para, epoch_idx,
                                                id2task, logger)

            use_time = time.time() - start_point
            print("Time using: %f mins" % (use_time / 60))
            if not best_F1 or best_F1 < F1:
                best_F1 = F1
                Model_s.save_model(model_path, model, para)
                print('*' * 60)
                print(
                    "Save model with average Pre: %f, Rec: %f, F1: %f on dev set."
                    % (Pre, Rec, F1))
                save_idx = epoch_idx
                print('*' * 60)
        print("save model at epoch:", save_idx)
    elif args.mode == "finetune":
        para_path = os.path.join(path, 'para.pkl')
        with open(para_path, "rb") as f:
            para_save = pickle.load(f)
        model = Model_s.build_model(para_save)
        model = Model_s.read_model(model_path, model)

        params = list(filter(lambda p: p.requires_grad, model.parameters()))
        model_optim = optim_custorm.adam(
            para["d_hid"], 1, 800, torch.optim.SGD(params,
                                                   lr=0.0,
                                                   momentum=0.9))

    else:
        para_path = os.path.join(model_path, 'para.pkl')
        with open(para_path, "rb") as f:
            para_save = pickle.load(f)
        model = Model_s.build_model_cnn(para_save)
        model = Model_s.read_model(model_path, model)
        prec_list_test, rec_list_test, f1_list_test, acc_list_test = infer(
            model, data_holder, "test")

Example #12
# for k in k_shot:
#     datasets[args.mod+'__'+k]=seed
# if args.k_shot!='16.0' and args.k_shot!='1.0' and args.k_shot!='2.0' and args.k_shot!='4.0' and args.k_shot!='8.0':
#     print('k_shot doesn\'t exist')
#     exit()

datasets[args.mod + '__' + args.k_shot] = seed

# datasets={}
# datasets[args.mod+'__'+args.shot]=seeds[args.mod]

# :: Path on your computer to the word embeddings. Embeddings by Reimers et al. will be downloaded automatically ::
embeddingsPath = '/datastore/liu121/nosqldb2/emnlp_ukplab/skipgram'

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
pickleFile_train, pickleFile_dev, pickleFile_test = perpareDataset(
    embeddingsPath, datasets, args.k_shot)
print('data prepare successful: %s, %s, and %s' %
      (pickleFile_train, pickleFile_dev, pickleFile_test))
######################################################
#
# The training of the network starts here
#
######################################################

#Load the embeddings and the dataset
embeddings, mappings, data_train = loadDatasetPickle(pickleFile_train)
embeddings, mappings, data_dev = loadDatasetPickle(pickleFile_dev)
embeddings, mappings, data_test = loadDatasetPickle(pickleFile_test)

# print('mappings type: ',type(mappings))
# for key in mappings: