import os

def train_pos(args):
    ######################################################
    #
    # Data preprocessing
    #
    ######################################################
    datasets = {
        args.datasetName:            # Name of the dataset
        {
            'columns': {0: 'tokens', 1: 'POS', 2: 'chunk_BIO'},  # CoNLL format for the input data. Column 0 contains tokens, column 1 contains POS and column 2 contains chunk information using BIO encoding
            'label': 'POS',          # Which column we like to predict
            'evaluate': True,        # Should we evaluate on this task? Set true always for single task setups
            'commentSymbol': None    # Lines in the input data starting with this string will be skipped. Can be used to skip comments
        }
    }

    # :: Path on your computer to the word embeddings. Embeddings by Komninos et al. will be downloaded automatically ::
    embeddingsPath = args.embeddings  # 'komninos_english_embeddings.gz'

    # :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
    pickleFile = perpareDataset(embeddingsPath, datasets)

    ######################################################
    #
    # The training of the network starts here
    #
    ######################################################

    # Load the embeddings and the dataset
    embeddings, mappings, data = loadDatasetPickle(pickleFile)

    # Some network hyperparameters
    params = {'classifier': ['CRF'], 'LSTM-Size': [100], 'dropout': (0.25, 0.25)}

    model = BiLSTM(params)
    model.setMappings(mappings, embeddings)
    model.setDataset(datasets, data)
    model.modelSavePath = args.model_save + '/[ModelName]_[Epoch].h5'
    model.fit(epochs=25)

    # keep only the final checkpoint of this training run
    fpath = args.model_save + '/' + args.datasetName + '_1.h5'
    save_dir, model_init = os.path.split(fpath)
    print(save_dir)
    print(model_init)
    remove_except_last_model(save_dir, model_init)
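# remove_except_last_model is called above but not defined in this file.
# A minimal sketch of such a helper, assuming checkpoints follow the
# '[ModelName]_[Epoch].h5' naming used for model.modelSavePath:
import re

def remove_except_last_model(save_dir, model_init):
    # derive the dataset prefix from a checkpoint name like 'unidep_pos_1.h5'
    prefix = model_init.rsplit('_', 1)[0]
    pattern = re.compile(re.escape(prefix) + r'_(\d+)\.h5$')

    # collect (epoch, filename) pairs for every matching checkpoint
    checkpoints = []
    for fname in os.listdir(save_dir):
        match = pattern.match(fname)
        if match:
            checkpoints.append((int(match.group(1)), fname))

    # delete everything except the checkpoint with the highest epoch number
    for _, fname in sorted(checkpoints)[:-1]:
        os.remove(os.path.join(save_dir, fname))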
prepare_training_data(datasets)

# Word embeddings by Komninos et al.; based on the dependency embeddings of Levy et al.: https://levyomer.wordpress.com/2014/04/25/dependency-based-word-embeddings/
embeddingsPath = 'komninos_english_embeddings.gz'

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
pickleFile = perpareDataset(embeddingsPath, datasets)

######################################################
#
# The training of the network starts here
#
######################################################

# Load the embeddings and the dataset
embeddings, mappings, data = loadDatasetPickle(pickleFile)

# Some network hyperparameters
# params = {'classifier': ['CRF'], 'LSTM-Size': [100], 'dropout': (0.25, 0.25), 'charEmbeddings': 'CNN',
#           'customClassifier': {'unidep_pos': ['Softmax'], 'conll2000_chunking': [('LSTM', 50), 'CRF']}}

# TODO: replace customClassifier with the main task + auxiliary tasks
custom_classifier = {}
custom_classifier[target_task] = [('LSTM', 100), 'CRF']
for task in aux_task:
    custom_classifier[task] = ['CRF']

params = {'classifier': ['CRF'], 'LSTM-Size': [100], 'dropout': (0.25, 0.25),
          'charEmbeddings': 'CNN', 'customClassifier': custom_classifier}
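# target_task and aux_task are used above but not defined in this fragment.
# A hypothetical definition consistent with the commented-out example above
# (the dataset names are illustrative): the main task gets the deeper
# [('LSTM', 100), 'CRF'] head, each auxiliary task a plain CRF.
target_task = 'unidep_pos'
aux_task = ['conll2000_chunking']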
datasetFiles = [
    (datasetName, dataColumns),
]

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
pickleFile = perpareDataset(embeddingsPath, datasetFiles)

######################################################
#
# The training of the network starts here
#
######################################################

# Load the embeddings and the dataset
embeddings, word2Idx, datasets = loadDatasetPickle(pickleFile)
data = datasets[datasetName]

print("Dataset:", datasetName)
print(data['mappings'].keys())
print("Label key: ", labelKey)
print("Train Sentences:", len(data['trainMatrix']))
print("Dev Sentences:", len(data['devMatrix']))
print("Test Sentences:", len(data['testMatrix']))

model = BiLSTM(params)
model.setMappings(embeddings, data['mappings'])
model.setTrainDataset(data, labelKey)
model.verboseBuild = True
# Enable this line to save the model to the disk
model.modelSavePath = "models/%s/%s/[DevScore]_[TestScore]_[Epoch].h5" % (datasetName, labelKey)
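# This fragment assumes datasetName, dataColumns, labelKey, embeddingsPath and
# params are defined earlier. One hypothetical configuration in the style of
# the other snippets (all values illustrative):
datasetName = 'unidep_pos'
dataColumns = {0: 'tokens', 1: 'POS'}  # column layout of the CoNLL input file
labelKey = 'POS'                       # which column to predict
embeddingsPath = 'komninos_english_embeddings.gz'
params = {'classifier': ['CRF'], 'LSTM-Size': [100], 'dropout': (0.25, 0.25)}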
model_name = model_path.stem

# obtain dataset ID and task from model name
dataset_id, task = model_name.rsplit('_', 1)
task = task.upper()

# obtain dataset language from dataset ID
lang = dataset_id.split('_')[0]
lang = lang.upper()

if lang not in loaded_datasets:
    # select fasttext word embeddings
    embeddings_path = embeddings_dir / f'{lang.lower()}.fasttext.oov.vec.gz'

    # load and cache the embeddings, mappings and datasets
    loaded_datasets[lang] = loadDatasetPickle(embeddings_path, lang)

# unpack the embeddings, mappings and datasets
embeddings, mappings, data = loaded_datasets[lang]

# evaluate model in a separate process so that memory is released at the end
proc_args = (model_path, dataset_id, task, evaluator, embeddings, mappings, data)
proc = Process(target=eval_single_task, args=proc_args)
proc.start()
proc.join()

# write the evaluation tables
evaluator.write_tables(tables_dir / 'single_task')
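# A minimal sketch of what eval_single_task might do. BiLSTM.loadModel and
# tagSentences exist in the BiLSTM-CNN-CRF framework; the evaluator interface
# (add_result) is a hypothetical stand-in for the result-collecting object.
def eval_single_task(model_path, dataset_id, task, evaluator,
                     embeddings, mappings, data):
    # restore the trained network from its .h5 checkpoint
    model = BiLSTM.loadModel(model_path)

    # tag the test split of the model's own dataset
    test_sentences = data[dataset_id]['testMatrix']
    tags = model.tagSentences(test_sentences)

    # hand the predictions to the (hypothetical) evaluator for aggregation
    evaluator.add_result(dataset_id, task, tags)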
# build and train the model
model.buildModel()
model.fit(epochs=500)  # do not limit training by epochs - use early stopping

for lang in ['PT', 'ES', None]:
    # select fasttext word embeddings
    lang_prefix = lang.lower() if lang is not None else 'es2pt'
    embeddings_path = embeddings_dir / f'{lang_prefix}.fasttext.oov.vec.gz'

    # prepare the datasets to be used with the LSTM network
    prepareDatasets(embeddings_path, lang)

    # load the embeddings and the datasets
    embeddings, mappings, data = loadDatasetPickle(embeddings_path, lang)

    # iterate through the multiple dataset combinations of language and task
    for task in ['POS', 'NER', None]:
        if lang is None and task is not None:
            continue

        # obtain datasets for the experiment
        datasets = Datasets(exclude=['pt_colonia'], lang=lang, task=task)
        datasets_dict = datasets.to_dict()

        # run experiment in a separate process so that memory is released at the end
        proc_args = (datasets_dict, lang, task, embeddings, mappings, data)
        proc = Process(target=run_experiment, args=proc_args)
        proc.start()
        proc.join()

        logger.info(f'Completed experiment: lang {"all" if lang is None else lang} - '
                    f'task {"all" if task is None else task}')
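# A minimal sketch of what run_experiment might look like, following the
# single-task training pattern of the snippets above; the hyperparameters and
# save path are illustrative.
def run_experiment(datasets_dict, lang, task, embeddings, mappings, data):
    params = {'classifier': ['CRF'], 'LSTM-Size': [100], 'dropout': (0.25, 0.25)}

    model = BiLSTM(params)
    model.setMappings(mappings, embeddings)
    model.setDataset(datasets_dict, data)
    model.modelSavePath = 'models/%s_%s/[ModelName]_[Epoch].h5' % (lang or 'all', task or 'all')

    # build and train with early stopping, as above
    model.buildModel()
    model.fit(epochs=500)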
def train_pos(args):
    ######################################################
    #
    # Data preprocessing
    #
    ######################################################
    datasets = {
        args.datasetName:            # Name of the dataset
        {
            'columns': {0: 'tokens', 1: 'POS', 2: 'chunk_BIO'},  # CoNLL format for the input data. Column 0 contains tokens, column 1 contains POS and column 2 contains chunk information using BIO encoding
            'label': 'POS',          # Which column we like to predict
            'evaluate': True,        # Should we evaluate on this task? Set true always for single task setups
            'commentSymbol': None    # Lines in the input data starting with this string will be skipped. Can be used to skip comments
        }
    }

    # :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
    embeddings_file = None
    elmo_options_file = args.elmo_options
    elmo_weight_file = args.elmo_weights
    elmo_mode = 'weighted_average'
    # elmo_options_file = 'pretrained/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'
    # elmo_weight_file = 'pretrained/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
    elmo_cuda_device = args.cuda_device  # Which GPU to use. -1 for CPU

    embLookup = ELMoWordEmbeddings(embeddings_file, elmo_options_file, elmo_weight_file, elmo_mode, elmo_cuda_device)
    # You can use a cache to precompute the ELMo embeddings once. See Create_ELMo_Cache.py for an example.
    embLookup.loadCache(args.pkl_path)

    pickleFile = perpareDataset(datasets, embLookup)

    ######################################################
    #
    # The training of the network starts here
    #
    ######################################################

    # Load the embeddings and the dataset
    mappings, data = loadDatasetPickle(pickleFile)

    # Some network hyperparameters
    params = {'classifier': ['CRF'], 'LSTM-Size': [100, 100], 'dropout': (0.5, 0.5)}

    model = ELMoBiLSTM(embLookup, params)
    model.setMappings(mappings)
    model.setDataset(datasets, data)
    model.modelSavePath = args.model_save + "/[ModelName]_[Epoch].h5"
    model.fit(epochs=25)

    fpath = args.model_save + '/' + args.datasetName + '_1.h5'
    save_dir, model_init = os.path.split(fpath)
    print(save_dir)
    print(model_init)

    # remove trained files except from the last file
    remove_except_last_model(save_dir, model_init)
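# For reference, the argparse flags this train_pos variant reads could be
# wired up as below; only the attribute names are taken from the code above,
# the flag spellings and defaults are assumptions.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--datasetName', required=True)
parser.add_argument('--elmo_options', required=True)
parser.add_argument('--elmo_weights', required=True)
parser.add_argument('--cuda_device', type=int, default=-1)  # -1 runs ELMo on CPU
parser.add_argument('--pkl_path', required=True)            # precomputed ELMo cache
parser.add_argument('--model_save', default='models')
args = parser.parse_args()

train_pos(args)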
# :: Path on your computer to the word embeddings. Embeddings by Reimers et al. will be downloaded automatically ::
embeddingsPath = '/datastore/liu121/nosqldb2/emnlp_ukplab/skipgram'

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
pickleFile_train, pickleFile_dev, pickleFile_test = perpareDataset(embeddingsPath, datasets, args.k_shot)
print('data prepare successful: %s, %s, and %s' % (pickleFile_train, pickleFile_dev, pickleFile_test))

######################################################
#
# The training of the network starts here
#
######################################################

# Load the embeddings and the dataset
embeddings, mappings, data_train = loadDatasetPickle(pickleFile_train)
embeddings, mappings, data_dev = loadDatasetPickle(pickleFile_dev)
embeddings, mappings, data_test = loadDatasetPickle(pickleFile_test)

# print('mappings type: ', type(mappings))
# for key in mappings:
#     print(key)
#     print(mappings[key])
#     print('===============')
# print('embeddings type:', type(data))
# for key in data:
#     print(key)
#     for subkey in data[key]:
#         print('--', subkey)
#         for subsubkey in data[key][subkey]:
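# A hypothetical continuation in the style of the earlier snippets: merge the
# dev/test splits back into the training data dict (the merge logic is an
# assumption about how the three k-shot pickles relate) and train as before.
for name in data_train:
    data_train[name]['devMatrix'] = data_dev[name]['devMatrix']
    data_train[name]['testMatrix'] = data_test[name]['testMatrix']

params = {'classifier': ['CRF'], 'LSTM-Size': [100], 'dropout': (0.25, 0.25)}

model = BiLSTM(params)
model.setMappings(mappings, embeddings)
model.setDataset(datasets, data_train)
model.fit(epochs=25)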