def train_pos(args):
    ######################################################
    #
    # Data preprocessing
    #
    ######################################################
    datasets = {
        args.datasetName:  #Name of the dataset
        {
            'columns': {
                0: 'tokens',
                1: 'POS',
                2: 'chunk_BIO'
            },  #CoNLL format of the input data: column 0 contains the tokens, column 1 the POS tags, column 2 the chunk tags in BIO format
            'label': 'POS',  #Which column we want to predict
            'evaluate': True,  #Should we evaluate on this task? Always set to True for single-task setups
            'commentSymbol': None  #Lines in the input data starting with this string will be skipped. Can be used to skip comments
        }
    }

    # :: Path on your computer to the word embeddings. Embeddings by Komninos et al. will be downloaded automatically ::
    embeddingsPath = args.embeddings  # e.g. 'komninos_english_embeddings.gz'

    # :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
    pickleFile = perpareDataset(embeddingsPath, datasets)

    ######################################################
    #
    # The training of the network starts here
    #
    ######################################################

    #Load the embeddings and the dataset
    embeddings, mappings, data = loadDatasetPickle(pickleFile)

    # Some network hyperparameters
    params = {
        'classifier': ['CRF'],
        'LSTM-Size': [100],
        'dropout': (0.25, 0.25)
    }
    model = BiLSTM(params)
    model.setMappings(mappings, embeddings)
    model.setDataset(datasets, data)

    model.modelSavePath = args.model_save + '/[ModelName]_[Epoch].h5'
    model.fit(epochs=25)

    fpath = args.model_save + '/' + args.datasetName + '_1.h5'
    save_dir, model_init = os.path.split(fpath)
    print(save_dir)
    print(model_init)
    remove_except_last_model(save_dir, model_init)
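
# For reference, a minimal sketch (not from the original project) of the
# three-column, tab-separated CoNLL-style input that the `columns` mapping in
# train_pos() describes: token, POS tag, chunk tag in BIO format, with a blank
# line between sentences. The concrete tags below are purely illustrative.
SAMPLE_CONLL_LINES = (
    "The\tDT\tB-NP\n"
    "cat\tNN\tI-NP\n"
    "sat\tVBD\tB-VP\n"
    ".\t.\tO\n"
    "\n"
)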
Example #2
def run(model_path, input_path, config_path):
    """
    This script loads a pretrained model and a input file in CoNLL format (each line a token, words separated by an empty line).
    The input words are passed to the model for tagging. Prints the tokens and the tags in a CoNLL format to stdout
    model_path (string): path to a pretrained model in .h5 format.
    input_path (string): path to the input file in CoNLL format of words to be syllabified.
    """
    words = read_conll_single(input_path)  # words: [ { 'tokens': [ raw_tokens, ... ] }, ... ]

    model = BiLSTM.load_model(model_path, config_path)
    data_matrix = create_data_matrix(words, model.mappings)
    tags = model.tagWords(data_matrix)["english"]

    print("\nTagged Words: ")
    for i, word in enumerate(words):
        joined = []
        # pad tags with 0 to the length of the word
        if len(tags[i]) < len(word["tokens"]):
            tags[i] += [0] * (len(word["tokens"]) - len(tags[i]))
        for j, ch in enumerate(word["tokens"]):
            joined.append((ch, tags[i][j]))

        for tup in joined:
            print(tup[0], end="")
            if tup[1] == 1:
                print("-", end="")

        print("")
Example #3
def eval_single_task(model_path, dataset_id, task, evaluator, embeddings,
                     mappings, data):
    # load the BiLSTM model
    model = BiLSTM.loadModel(model_path)

    # create dataset dictionary
    dataset = Dataset(dataset_id)
    dataset_dict = dataset.to_dict(task)

    # set the model mappings and datasets
    model.setMappings(mappings, embeddings)
    model.setDataset(dataset_dict, data)

    # obtain mapping of indices to POS/NER labels
    label = task + '_BIO' if task == 'NER' else task
    idx2label = model.idx2Labels[label]

    # obtain train and test data
    train_data = data[dataset_id]['trainMatrix']
    test_data = data[dataset_id]['testMatrix']

    # obtain correct and predicted sentences
    corr_idxs = [sentence[label] for sentence in test_data]
    pred_idxs = model.predictLabels(test_data)[label]

    # convert indices to labels (POS tags or NER tags in BIO format)
    corr_labels = [[idx2label[idx] for idx in sent] for sent in corr_idxs]
    pred_labels = [[idx2label[idx] for idx in sent] for sent in pred_idxs]

    evaluator.eval(dataset.name, dataset.lang, task, corr_labels, pred_labels,
                   train_data, test_data)
    print(f'Evaluated single_task - {dataset_id} - {task}')
Example #4
    def __init__(self, path):
        print(path)
        self.lstmModel = BiLSTM.loadModel("models/" + path)
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt')
Example #5
File: Model.py  Project: webis-de/targer
    def __init__(self, path):
        print("Init Model")
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt')
        if self.lstmModel is None:
            self.lstmModel = BiLSTM.loadModel(path)
Example #6
def eval_multi_task(model_path, lang, task, evaluators, embeddings, mappings,
                    data):
    # load the BiLSTM model
    model = BiLSTM.loadModel(model_path)
    print(f'Loaded model {model_path}')

    # obtain the evaluator based on the transfer setting
    if model_path.parent.name == 'single_task':
        transfer_setting = 'out_of_domain'
    elif lang is not None and task is not None:
        transfer_setting = 'cross_domain'
    elif lang is not None and task is None:
        transfer_setting = 'multi_task'
    elif lang is None and task is None:
        transfer_setting = 'cross_lingual'
    else:
        raise ValueError('Unknown transfer setting')

    evaluator = evaluators[transfer_setting]

    # create datasets dictionary
    datasets = Datasets(lang=lang, task=task)
    datasets_dict = datasets.to_dict()

    # set the model mappings and datasets
    model.setMappings(mappings, embeddings)
    model.setDataset(datasets_dict, data)

    # evaluate each dataset separately
    for dataset_id, dataset in datasets:
        # obtain train and test data
        train_data = data[dataset_id]['trainMatrix']
        test_data = data[dataset_id]['testMatrix']

        # predict labels for the POS and NER tasks
        task_predictions = model.predictLabels(test_data)

        # iterate through the available output tasks
        for label in model.labelKeys[dataset_id]:
            # obtain mapping of indices to POS/NER labels
            task = label.replace('_BIO', '')
            idx2label = model.idx2Labels[label]

            # obtain correct and predicted sentences
            corr_idxs = [sentence[label] for sentence in test_data]
            pred_idxs = task_predictions[label]

            # convert indices to labels (POS tags or NER tags in BIO format)
            corr_labels = [[idx2label[idx] for idx in sent]
                           for sent in corr_idxs]
            pred_labels = [[idx2label[idx] for idx in sent]
                           for sent in pred_idxs]

            evaluator.eval(dataset.name, dataset.lang, task, corr_labels,
                           pred_labels, train_data, test_data)
            print(f'Evaluated {transfer_setting} - {dataset_id} - {task}')
def load_model():
    # load the pre-trained Keras model
    global MODEL
    global TRAINING_TAGS

    MODEL = BiLSTM.loadModel("./model/rest_model.h5")
    loaded_model = list(MODEL.models.values())[0]
    loaded_model._make_predict_function()  # avoid issues with Flask's multiple threads
    TRAINING_TAGS = get_model_tags(MODEL)
    logging.info("Loaded NER model...")
Example #8
def train_and_eval_model(cfg):
    """
    Load data and train model
    args:
        cfg (YACS YAML config)
    """

    # Data preprocessing
    dataset = {
        "columns": {
            0: "raw_tokens",
            1: "boundaries"
        },
        # CoNLL format (tab-delimited)
        #   Column 0: phones
        #   Column 1: syllable boundary
        "label": "boundaries",  # Which column we want to predict
    }

    # Load the embeddings and the dataset. Choose whether or not to pad the words.
    # Padding is currently required if CRF is chosen for the output layer,
    # since the CRF layer does not support masking.
    embeddings, data, mappings, vocab_size, n_class_labels, word_length = load_dataset(
        dataset, dataset_name=cfg.TRAINING.DATASET, do_pad_words=True)

    create_directory(cfg.CONFIG_NAME)
    logger.info(
        f"Starting training of `{cfg.CONFIG_NAME}` on dataset `{dataset}`")

    for training_repeat in range(cfg.TRAINING.TRAINING_REPEATS):
        model = BiLSTM(cfg)
        model.set_vocab(vocab_size, n_class_labels, word_length, mappings)
        model.set_dataset(dataset, data)

        # Path to store performance scores for dev / test
        model.store_results(PATH + "/" + cfg.CONFIG_NAME + "/" +
                            str(training_repeat) + ".csv")
        model.fit(epochs=cfg.TRAINING.EPOCHS)
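
# Illustrative input sample (an assumption, not project data) for the
# two-column format declared in `dataset` above: one phone per line with its
# syllable-boundary label, and a blank line between words.
SAMPLE_BOUNDARY_LINES = (
    "k\t0\n"
    "ae\t0\n"
    "t\t1\n"
    "\n"
)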
def main():
    if len(sys.argv) < 3:
        print("Usage: python RunModel_modified.py modelPath inputPath")
        exit()

    modelPath = sys.argv[1]
    inputPath = sys.argv[2]

    # :: Read input ::
    with open(inputPath, 'r') as f:
        text = f.read()

    # :: Load vocabulary for is_name features ::
    from flashtext import KeywordProcessor
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_list(list(load_names(FR_NAMES_PATH).keys()))

    # :: Load the model ::
    lstmModel = BiLSTM.loadModel(modelPath)

    # :: Prepare the input ::
    pre_treated_lines, _ = pre_treat_text(text)
    tokenized_sentences = tokenize_text(pre_treated_lines)
    sentences = [{'tokens': sent} for sent in tokenized_sentences]
    addCharInformation(sentences)
    addCasingInformation(sentences)
    addIsNameInformation(sentences, keyword_processor=keyword_processor)
    dataMatrix = createMatrices(sentences, lstmModel.mappings, True)

    # :: Tag the input ::
    tags = lstmModel.tagSentences(dataMatrix)

    # :: Output to stdout ::
    for sentenceIdx in range(len(sentences)):
        tokens = sentences[sentenceIdx]['tokens']

        for tokenIdx in range(len(tokens)):
            tokenTags = []
            for modelName in sorted(tags.keys()):
                tokenTags.append(tags[modelName][sentenceIdx][tokenIdx])

            print("%s\t%s" % (tokens[tokenIdx], "\t".join(tokenTags)))
        print("")
def evaluate(args):
    fpath = args.model_save + '/' + args.datasetName + '_1.h5'
    #fpath = 'models/'+args.datasetName+'_1.h5'
    save_dir, model_init = os.path.split(fpath)

    modelPath, _ = get_last_model_path(save_dir, model_init)
    print(modelPath)
    inputPath = args.testFile
    inputColumns = {0: "tokens", 1: 'POS', 2: 'chunk_BIO'}

    resfpath = args.result_save + '/' + args.task + '/' + args.testSetting
    resfile = open(resfpath, 'w')

    # :: Load the model ::
    #lstmModel = ELMoBiLSTM.loadModel(modelPath)
    lstmModel = BiLSTM.loadModel(modelPath)

    # :: Prepare the input ::
    sentences = readCoNLL(inputPath, inputColumns)
    addCharInformation(sentences)
    addCasingInformation(sentences)

    # :: Map casing and character information to integer indices ::
    dataMatrix = createMatrices(sentences, lstmModel.mappings, True)

    if (args.task == "pos"):
        # Evaluation of POS tagging
        test_acc = lstmModel.computeAcc(args.datasetName, dataMatrix)
        print("Test-Data: Accuracy: %.4f" % (test_acc))
        resfile.write("Test-Data: Accuracy: %.4f" % (test_acc))
    elif (args.task == "chunking"):
        # Evaluation of Chunking
        test_pre, test_rec, test_f1 = lstmModel.computeF1(
            args.datasetName, dataMatrix)
        print("Test-Data: Prec: %.3f, Rec: %.3f, F1: %.4f" %
              (test_pre, test_rec, test_f1))
        resfile.write("Test-Data: Prec: %.3f, Rec: %.3f, F1: %.4f" %
                      (test_pre, test_rec, test_f1))

    resfile.close()
Example #11
def run_experiment(dataset_id, dataset_dict, lang, task, data):
    # load the pre-trained BiLSTM model
    lang_prefix = f'{lang.lower()}_' if lang is not None else ''
    model = BiLSTM.loadModel(multi_task_models_dir /
                             f'{lang_prefix}datasets.h5')

    # set the single task dataset and select both tasks
    model.setDataset(dataset_dict, data)
    model.tasks = ['POS', 'NER_BIO']

    # path to store the trained model and model results
    experiment_name = f'{dataset_id}_{task.lower()}'
    pretrain_type = 'multi_task' if lang is not None else 'cross_lingual'

    model.modelSavePath = models_dir / f'pretrain_{pretrain_type}/{experiment_name}.h5'
    model.storeResults(results_dir /
                       f'pretrain_{pretrain_type}/{experiment_name}.csv')

    # train the model - no need to build model here
    model.fit(
        epochs=500)  # do not limit training by epochs - use early stopping
}
# :: Path on your computer to the word embeddings. Embeddings by Reimers et al. will be downloaded automatically ::
embeddingsPath = 'final.txt'

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
pickleFile = perpareDataset(embeddingsPath, datasets)

######################################################
#
# The training of the network starts here
#
######################################################

#Load the embeddings and the dataset
embeddings, mappings, data = loadDatasetPickle(pickleFile)

# Some network hyperparameters
params = {
    'classifier': ['CRF'],
    'LSTM-Size': [100, 100],
    'dropout': (0.25, 0.25),
    'charEmbeddings': 'LSTM',
    'maxCharLength': 30
}

model = BiLSTM(params)
model.setMappings(mappings, embeddings)
model.setDataset(datasets, data)
model.modelSavePath = "models/[ModelName]_[DevScore]_[TestScore]_[Epoch].h5"
model.fit(epochs=25)
elif model_name == "blstm-crf":
    params = {
        'classifier': ['CRF'],
        'LSTM-Size': [100, 100],
        'dropout': (0.25, 0.25),
        'charEmbeddings': 'LSTM',
        'maxCharLength': 50
    }

elif model_name == "cnn-crf":
    params = {
        'classifier': ['CRF'],
        'LSTM-Size': [100, 100],
        'dropout': (0.25, 0.25),
        'charEmbeddings': 'CNN',
        'maxCharLength': 50
    }

else:
    print("existing model names are (1) crf, (2) blstm-crf, (3) cnn-crf")
    exit()

model = BiLSTM(params)
model.setMappings(mappings, embeddings)
model.setDataset(datasets, data)

#model.storeResults('ler-results.csv') #Path to store performance scores for dev / test

# pickle the model
model.modelSavePath = 'models/blstm-' + model_name + '.h5'
model.fit(epochs=100)
# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
pickleFile = prepareDataset(embeddingsPath, datasets)


############################################################################################################
#
# 2.Network training
#
############################################################################################################
# :: Load the embeddings and the dataset ::
embeddings, mappings, data = loadDatasetPickle(pickleFile)
params = {'classifier': ['CRF'], 'LSTM-Size': [100], 'dropout': (0.25, 0.25)}

print("***** Train the model with 1 Epoch and store to disk")
model = BiLSTM(params)
model.setMappings(mappings, embeddings)
model.setDataset(datasets, data)
model.modelSavePath = "models/my_model_[Epoch].h5"
model.fit(epochs=1)

print("\n\n\n\n------------------------")
print("***** Load the model and continue training")
newModel = BiLSTM.loadModel('models/my_model_1.h5')
newModel.setDataset(datasets, data)
newModel.modelSavePath = "models/my_reloaded_model_[Epoch].h5"
newModel.fit(epochs=1)
print("***** retrained model store at "+newModel.modelSavePath)
Example #15
class ModelIBM:

    lstmModel = BiLSTM.loadModel(modelPath)

    def __init__(self):
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt')

    def label(self, input):
        #prepare input
        sentences = [{
            'tokens': nltk.word_tokenize(sent)
        } for sent in nltk.sent_tokenize(input)]
        addCharInformation(sentences)
        addCasingInformation(sentences)
        dataMatrix = createMatrices(sentences, self.lstmModel.mappings, True)

        #tag input
        tags = self.lstmModel.tagSentences(dataMatrix)

        #prepare output
        result = []
        for sentenceIdx in range(len(sentences)):
            tokens = sentences[sentenceIdx]['tokens']
            sentence = []
            for tokenIdx in range(len(tokens)):
                tokenTags = []
                currentWord = {}
                for modelName in sorted(tags.keys()):
                    tokenTags.append(tags[modelName][sentenceIdx][tokenIdx])

                currentWord['token'] = tokens[tokenIdx]
                currentWord['label'] = tokenTags[0]
                sentence.append(currentWord)
            result.append(sentence)

        return json.dumps(result)

    def label_with_probs(self, input):
        #prepare input
        sentences = [{
            'tokens': nltk.word_tokenize(sent)
        } for sent in nltk.sent_tokenize(input)]
        addCharInformation(sentences)
        addCasingInformation(sentences)
        dataMatrix = createMatrices(sentences, self.lstmModel.mappings, True)

        #tag input
        tags, probs = self.lstmModel.tagSentences_with_probs(dataMatrix)

        #prepare output
        result = []
        for sentenceIdx in range(len(sentences)):
            tokens = sentences[sentenceIdx]['tokens']
            sentence = []
            for tokenIdx in range(len(tokens)):
                tokenTags = []
                probTags = []
                currentWord = {}
                for modelName in sorted(tags.keys()):
                    tokenTags.append(tags[modelName][sentenceIdx][tokenIdx])
                    probTags.append(probs[modelName][sentenceIdx][tokenIdx])

                currentWord['token'] = tokens[tokenIdx]
                currentWord['label'] = tokenTags[0]
                currentWord['prob'] = probTags[0]
                sentence.append(currentWord)
            result.append(sentence)

        return result
Example #16
# for subsubkey in data[key][subkey]:
#     print('----',subsubkey)

# print('=========================')

# Some network hyperparameters
params = {
    'classifier': ['CRF'],
    'LSTM-Size': [100, 100],
    'dropout': (0.25, 0.25),
    'charEmbeddings': 'CNN',
    'maxCharLength': 50
}

print('#######################' + args.mod + ' #######################')
model = BiLSTM(params)
model.setMappings(mappings, embeddings)
model.setDataset(datasets, data)
model.modelSavePath = "/datastore/liu121/nosqldb2/emnlp_ukplab/models/[ModelName]_bbn.h5"
eval_result = model.fit(epochs=100)


def report(eval_result, filePath):
    with open(filePath, 'w+') as f:
        for key in eval_result:
            info = eval_result[key]
            f.write('====================' + key + '====================\n')
            f.write(str(info['epoch']) + '\n')
            f.write(str(info['per_f1']) + '\n')
            f.write(str(info['per_pre']) + '\n')
            f.write(str(info['per_recall']) + '\n')
Example #17
pickleFile = perpareDataset(embeddingsPath, datasets)

######################################################
#
# The training of the network starts here
#
######################################################

#Load the embeddings and the dataset
embeddings, mappings, data = loadDatasetPickle(pickleFile)

# Some network hyperparameters
params = {
    'classifier': ['CRF'],
    'LSTM-Size': [500],
    'dropout': (0.25, 0.25),
    'charEmbeddings': 'LSTM',
    'maxCharLength': 150,
    'charEmbeddingsSize': 200,
    'charLSTMSize': 200,
    'charFilterLength': 20
}

model = BiLSTM(params)
model.setMappings(mappings, embeddings)
model.setDataset(datasets, data)
model.storeResults('results/sentiment_results.csv')  #Path to store performance scores for dev / test
model.modelSavePath = "models/[ModelName]_[DevScore]_[TestScore]_[Epoch].h5"  #Path to store models
model.fit(epochs=70)
Example #18
# :: Path on your computer to the word embeddings. Embeddings by Komninos et al. will be downloaded automatically ::
embeddingsPath = 'komninos_english_embeddings.gz'

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
pickleFile = perpareDataset(embeddingsPath, datasets)


######################################################
#
# The training of the network starts here
#
######################################################


#Load the embeddings and the dataset
embeddings, mappings, data = loadDatasetPickle(pickleFile)

# Some network hyperparameters
params = {'classifier': ['CRF'], 'LSTM-Size': [100, 100], 'dropout': (0.25, 0.25)}


model = BiLSTM(params)
model.setMappings(mappings, embeddings)
model.setDataset(datasets, data)
model.storeResults('results/conll2000_chunking.csv') #Path to store performance scores for dev / test
model.modelSavePath = "models/[ModelName]_[DevScore]_[TestScore]_[Epoch].h5"
model.fit(epochs=25)



pickleFile = perpareDataset(embeddingsPath,
                            datasets,
                            reducePretrainedEmbeddings=True)

######################################################
#
# The training of the network starts here
#
######################################################

#Load the embeddings and the dataset
embeddings, mappings, data = loadDatasetPickle(pickleFile)

if args.tune == 0:
    if args.nb_run == 1:
        model = BiLSTM(params)
        if args.batch_range is not None:
            model.setBatchRangeLength(args.batch_range)
        model.setMappings(mappings, embeddings)
        model.setDataset(datasets, data,
                         mainModelName=args.target_task)  # specific to the multi-task setup

        model.storeResults("/".join([
            args.root_dir_result, args.directory_name, "performance.out"
        ]))  #Path to store performance scores for dev / test
        model.predictionSavePath = "/".join([
            args.root_dir_result, args.directory_name, "predictions",
            "[ModelName]_[Data].conll"
        ])  #Path to store predictions
        model.modelSavePath = "/".join([
            args.root_dir_result, args.directory_name, "models/[ModelName].h5"
#if len(sys.argv) < 4:
#    print("Usage: python RunModel.py modelPath inputPathToConllFile outputPathToConllFile")
#    exit()

#modelPath = sys.argv[1]
#inputPath = sys.argv[2]
#outputPath = sys.argv[3]
inputColumns = {0: "tokens", 1: "gold"}

# :: Prepare the input ::
sentences = readCoNLL(args.input_file, inputColumns)
addCharInformation(sentences)
addCasingInformation(sentences)

# :: Load the model ::
lstmModel = BiLSTM.loadModel(args.model_path)
params = lstmModel.get_params()
#print("params : {}".format(params))

dataMatrix = createMatrices(sentences, lstmModel.mappings, True)

# :: Tag the input ::
tags = lstmModel.tagSentences(dataMatrix)

# :: Output to stdout ::
f = None
if args.output_file is not None:
    f = open(args.output_file, "w")

for sentenceIdx in range(len(sentences)):
    tokens = sentences[sentenceIdx]['tokens']
Example #21
# embeddingsPath =  'embeddings.vec'

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
pickleFile = perpareDataset(embeddingsPath, datasets, useExistent=False)


######################################################
#
# The training of the network starts here
#
######################################################


#Load the embeddings and the dataset
embeddings, mappings, data = loadDatasetPickle(pickleFile)

# Some network hyperparameters
params = {'classifier': ['CRF'], 'LSTM-Size': [100, 100], 'dropout': (0.5, 0.5), 'charEmbeddings':'LSTM',
          'optimizer': 'adam', 'featureNames': ['tokens', 'casing']}


MODEL = BiLSTM(params)
MODEL.setMappings(mappings, embeddings)
MODEL.setDataset(datasets, data)
MODEL.storeResults('results/Jurica_NER.csv') #Path to store performance scores for dev / test
MODEL.modelSavePath = "models/[ModelName]_[DevScore]_[TestScore]_[Epoch].h5"
MODEL.fit(epochs=100)



if len(sys.argv) < 4:
    print(
        "Usage: python3 runModel_singleOutput.py modelPath inputPath outputPath"
    )
    exit()

modelPath = sys.argv[1]
inputPath = sys.argv[2]
outputPath = sys.argv[3]

if not os.path.exists(outputPath):
    os.makedirs(outputPath)

# :: Load the model ::
lstmModel = BiLSTM()
lstmModel.loadModel(modelPath)

for textName in os.listdir(inputPath):
    with open(inputPath + "/" + textName, 'r') as f:
        text = f.read()

    # :: Prepare the input ::
    sentences = [{'tokens': nltk.word_tokenize(text)}]
    #addCharInformation(sentences)
    addCasingInformation(sentences)

    dataMatrix = createMatrices(sentences, lstmModel.mappings)

    # :: Tag the input ::
    tags = lstmModel.tagSentences(dataMatrix)
Example #23
#Parameters of the network
params = {
    'dropout': dropout,
    'classifier': 'CRF',
    'LSTM-Size': layers,
    'optimizer': optimizer,
    'charEmbeddings': charEmbedding,
    'miniBatchSize': 32,
    'detailedOutput': detailedPath
}

######################################################
#
# The training of the network starts here
#
######################################################

#Load the embeddings and the dataset from the already created pickle file
embeddings, word2Idx, datasets = loadDatasetPickle(pickledData)
data = datasets[datasetName]

model = BiLSTM(params)
model.setMappings(embeddings, data['mappings'])
model.setTrainDataset(data, labelKey)
model.verboseBuild = True
#model.modelSavePath = "models/%s/[DevScore]_[TestScore]_[Epoch].h5" % modelName #Enable this line to save the model to the disk
model.storeResults(resultsPath)
# number is the batch size
model.evaluate(50)
Example #24
    (datasetName, dataColumns),
]

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
pickleFile = perpareDataset(embeddingsPath, datasetFiles)

######################################################
#
# The training of the network starts here
#
######################################################

#Load the embeddings and the dataset
embeddings, word2Idx, datasets = loadDatasetPickle(pickleFile)
data = datasets[datasetName]

print("Dataset:", datasetName)
print(data['mappings'].keys())
print("Label key: ", labelKey)
print("Train Sentences:", len(data['trainMatrix']))
print("Dev Sentences:", len(data['devMatrix']))
print("Test Sentences:", len(data['testMatrix']))

model = BiLSTM(params)
model.setMappings(embeddings, data['mappings'])
model.setTrainDataset(data, labelKey)
model.verboseBuild = True
model.modelSavePath = "models/%s/%s/[DevScore]_[TestScore]_[Epoch].h5" % (
    datasetName, labelKey)  #Enable this line to save the model to the disk
model.evaluate(50)
Example #25
def run_experiment(datasets_dict, lang, task, embeddings, mappings, data):
    # set network hyperparameters and mappings/datasets
    model = BiLSTM(network_params)
    model.setMappings(mappings, embeddings)
    model.setDataset(datasets_dict, data)

    # define the experiment name
    lang_prefix = f'{lang.lower()}_' if lang is not None else ''
    task_suffix = f'_{task.lower()}' if task is not None else ''
    experiment_name = lang_prefix + 'datasets' + task_suffix

    # path to store the trained model and model results
    model.modelSavePath = models_dir / f'{experiment_name}.h5'
    model.storeResults(results_dir / f'{experiment_name}.csv')

    # build and train the model
    model.buildModel()
    model.fit(
        epochs=500)  # do not limit training by epochs - use early stopping
Example #26
# The training of the network starts here
#
######################################################

#Load the embeddings and the dataset
embeddings, mappings, data = loadDatasetPickle(pickleFile)

# Some network hyperparameters
params = {
    'classifier': ['CRF'],
    'LSTM-Size': [100],
    'dropout': (0.25, 0.25),
    'charEmbeddings': 'CNN'
}

model = BiLSTM(params)
model.setMappings(mappings, embeddings)
model.setDataset(datasets, data,
                 mainModelName='MIT_Restaurant')  # specific to the multi-task setup

model.storeResults("/".join(
    ["results", args.directory_name,
     "performance.out"]))  #Path to store performance scores for dev / test
model.predictionSavePath = "/".join([
    "results", args.directory_name, "predictions",
    "[ModelName]_[Epoch]_[Data].conll"
])  #Path to store predictions
model.modelSavePath = "/".join([
    "results", args.directory_name,
    "models/model_[DevScore]_[TestScore]_[Epoch].h5"
])  #Path to store models
        "Usage: python RunModel_CoNLL_Format.py modelPath inputPathToConllFile"
    )
    exit()

modelPath = sys.argv[1]
inputPath = sys.argv[2]
inputColumns = {0: "tokens", 1: "NER_BIO"}
#inputColumns = {0: "tokens", 1: "is_name", 2: "NER_BIO"}

# :: Prepare the input ::
sentences = readCoNLL(inputPath, inputColumns)
addCharInformation(sentences)
addCasingInformation(sentences)

# :: Load the model ::
lstmModel = BiLSTM.loadModel(modelPath)

dataMatrix = createMatrices(sentences, lstmModel.mappings, True)

# :: Tag the input ::
tags = lstmModel.tagSentences(dataMatrix)

# :: Output to stdout ::
all_sentences_preds = []
for sentenceIdx in range(len(sentences)):
    tokens = sentences[sentenceIdx]['tokens']
    correct_tag = sentences[sentenceIdx]['NER_BIO']
    for tokenIdx in range(len(tokens)):
        tokenTags = []
        for modelName in sorted(tags.keys()):
            tokenTags.append(correct_tag[tokenIdx])  # gold tag from the input file, not the model prediction
Example #28
}

# :: Path on your computer to the word embeddings. Embeddings by Komninos et al. will be downloaded automatically ::
embeddingsPath = 'more_embedding.tsv'

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
pickleFile = perpareDataset(embeddingsPath, datasets)

######################################################
#
# The training of the network starts here
#
######################################################

modelPath = sys.argv[1]

#Load the embeddings and the dataset
embeddings, mappings, data = loadDatasetPickle(pickleFile)
# Some network hyperparameters

print("\n\n\n\n------------------------")
print("Load the model and continue training")
newModel = BiLSTM.loadModel(modelPath)
print('load model ' + modelPath)
newModel.setDataset(datasets, data)
newModel.params['earlyStopping'] = 25
newModel.modelSavePath = "models/[ModelName]_[DevScore]_[TestScore]_[Epoch].h5"
newModel.fit(epochs=70)

print("retrained model store at " + newModel.modelSavePath)
# TODO: replace customClassifier with the main task + auxiliary tasks
custom_classifier = {}
custom_classifier[target_task] = [('LSTM', 100), 'CRF']
for task in aux_task:
    custom_classifier[task] = ['CRF']

params = {
    'classifier': ['CRF'],
    'LSTM-Size': [100],
    'dropout': (0.25, 0.25),
    'charEmbeddings': 'CNN',
    'customClassifier': custom_classifier
}

model = BiLSTM(params)

model.setMappings(mappings, embeddings)
model.setDataset(datasets, data)
model.storeResults("/".join(
    [args.root_dir_result, args.directory_name,
     "performance.out"]))  # Path to store performance scores for dev / test
model.predictionSavePath = "/".join([
    args.root_dir_result, args.directory_name, "predictions",
    "[ModelName]_[Data].conll"
])  # Path to store predictions
model.modelSavePath = "/".join(
    [args.root_dir_result, args.directory_name,
     "models/[ModelName].h5"])  # Path to store models
model.fit(epochs=args.nb_epoch)
Example #30
from __future__ import print_function
import os
import logging
import sys
from neuralnets.BiLSTM import BiLSTM
from util.preprocessing import perpareDataset, loadDatasetPickle

# :: Change into the working dir of the script ::
abspath = os.path.abspath(__file__)
dname = os.path.dirname(abspath)
os.chdir(dname)

# :: Logging level ::
loggingLevel = logging.INFO
logger = logging.getLogger()
logger.setLevel(loggingLevel)

ch = logging.StreamHandler(sys.stdout)
ch.setLevel(loggingLevel)
formatter = logging.Formatter('%(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)

bilstm = BiLSTM.loadModel('results_emnlp/SCRATCH_2_4/models/ATIS.h5')
print(bilstm.models['ATIS'].summary())