Example #1
    def label_with_probs(self, text):
        # Prepare the input: split text into sentences, then into tokens
        sentences = [{
            'tokens': nltk.word_tokenize(sent)
        } for sent in nltk.sent_tokenize(text)]
        addCharInformation(sentences)
        addCasingInformation(sentences)
        dataMatrix = createMatrices(sentences, self.lstmModel.mappings, True)

        # Tag the input
        tags, probs = self.lstmModel.tagSentences_with_probs(dataMatrix)

        # Prepare the output: one dict per token with its label and probability
        result = []
        for sentenceIdx in range(len(sentences)):
            tokens = sentences[sentenceIdx]['tokens']
            sentence = []
            for tokenIdx in range(len(tokens)):
                tokenTags = []
                probTags = []
                currentWord = {}
                for modelName in sorted(tags.keys()):
                    tokenTags.append(tags[modelName][sentenceIdx][tokenIdx])
                    probTags.append(probs[modelName][sentenceIdx][tokenIdx])

                # Keep the first model's prediction for this token
                currentWord['token'] = tokens[tokenIdx]
                currentWord['label'] = tokenTags[0]
                currentWord['prob'] = probTags[0]
                sentence.append(currentWord)
            result.append(sentence)

        return result
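A hypothetical calling sketch for the method above; the wrapper class (here called Tagger) and the model path are assumptions, only label_with_probs itself comes from the example:

# Hypothetical usage; Tagger is an assumed wrapper exposing label_with_probs.
import nltk
nltk.download('punkt')  # sentence/word tokenizers used above

tagger = Tagger('models/EN_NER_1.h5')  # illustrative model path
for sentence in tagger.label_with_probs("Barack Obama visited Paris."):
    for word in sentence:
        print(word['token'], word['label'], word['prob'])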
Example #2
def run_model(file):
    # :: Read input ::
    with open(file, 'r') as f:
        text = f.read()

    # :: Prepare the input ::
    sentences = tokenize(text)

    addCharInformation(sentences)
    addCasingInformation(sentences)
    dataMatrix = createMatrices(sentences, lstmModel.mappings, True)

    # :: Tag the input ::
    tags = lstmModel.tagSentences(dataMatrix)

    conll = []
    for sentenceIdx in range(len(sentences)):
        tokens = sentences[sentenceIdx]['tokens']

        for tokenIdx in range(len(tokens)):
            tokenTags = []
            for modelName in sorted(tags.keys()):
                tokenTags.append(tags[modelName][sentenceIdx][tokenIdx])

            # Columns: 1-based sentence index, token, then one tag per model
            conll.append(
                "%s\t%s\t%s" %
                (sentenceIdx + 1, tokens[tokenIdx], "\t".join(tokenTags)))
        conll.append("")  # blank line between sentences

    conll = "\n".join(conll)

    output_filename = 'system_output/system-' + file.split('/')[-1]
    with open(output_filename, 'w') as outfile:
        outfile.write(conll)
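A usage sketch; it assumes the module-level lstmModel and tokenize helpers are already initialised, and creates the output directory run_model writes into:

import os

os.makedirs('system_output', exist_ok=True)  # run_model writes here
run_model('data/example.txt')  # illustrative input path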
Example #3
def prepare_input(text):
    text = text.strip()
    pre_treated_lines, _ = pre_treat_text(text)
    tokenized_sentences = tokenize_text(pre_treated_lines)
    sentences = [{'tokens': sent} for sent in tokenized_sentences]
    addCharInformation(sentences)
    addCasingInformation(sentences)
    addIsNameInformation(sentences, keyword_processor=KEYWORD_PROCESSOR)
    data_matrix = createMatrices(sentences, MODEL.mappings, True)
    return data_matrix, sentences
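A tagging sketch built on prepare_input, assuming MODEL exposes the same tagSentences interface used in the other examples:

data_matrix, sentences = prepare_input("Jean Dupont habite à Paris.")
tags = MODEL.tagSentences(data_matrix)
for sent_idx, sent in enumerate(sentences):
    for tok_idx, token in enumerate(sent['tokens']):
        labels = [tags[name][sent_idx][tok_idx] for name in sorted(tags)]
        print(token, '\t'.join(labels))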
Example #4
def evaluate(args):
    fpath = os.path.join(args.model_save, args.datasetName + '_1.h5')
    #fpath = 'models/'+args.datasetName+'_1.h5'
    save_dir, model_init = os.path.split(fpath)

    modelPath, _ = get_last_model_path(save_dir, model_init)
    print(modelPath)
    inputPath = args.testFile
    inputColumns = {0: "tokens", 1: 'POS', 2: 'chunk_BIO'}

    resfpath = os.path.join(args.result_save, args.task, args.testSetting)
    resfile = open(resfpath, 'w')

    # :: Load the model ::
    lstmModel = ELMoBiLSTM.loadModel(modelPath)

    # :: Prepare the input ::
    sentences = readCoNLL(inputPath, inputColumns)
    addCharInformation(sentences)
    addCasingInformation(sentences)

    # :: Map casing and character information to integer indices ::
    dataMatrix = createMatrices(sentences, lstmModel.mappings, True)

    # :: Perform the word embedding / ELMo embedding lookup ::
    embLookup = lstmModel.embeddingsLookup
    embLookup.elmo_cuda_device = 0  #Cuda device for pytorch - elmo embedding, -1 for CPU
    addEmbeddings(dataMatrix, embLookup.sentenceLookup)

    if (args.task == "pos"):
        # Evaluation of POS tagging
        test_acc = lstmModel.computeAcc(args.datasetName, dataMatrix)
        print("Test-Data: Accuracy: %.4f" % (test_acc))
        resfile.write("Test-Data: Accuracy: %.4f" % (test_acc))
    elif (args.task == "chunking"):
        # Evaluation of Chunking
        test_pre, test_rec, test_f1 = lstmModel.computeF1(
            args.datasetName, dataMatrix)
        print("Test-Data: Prec: %.3f, Rec: %.3f, F1: %.4f" %
              (test_pre, test_rec, test_f1))
        resfile.write("Test-Data: Prec: %.3f, Rec: %.3f, F1: %.4f" %
                      (test_pre, test_rec, test_f1))

    resfile.close()
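evaluate() pulls its configuration from an argparse namespace; below is a minimal sketch matching the attributes it accesses (all default values are illustrative):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--model_save', default='models')
parser.add_argument('--result_save', default='results')
parser.add_argument('--datasetName', default='conll2000_chunking')
parser.add_argument('--testFile', default='data/test.txt')
parser.add_argument('--task', choices=['pos', 'chunking'], default='chunking')
parser.add_argument('--testSetting', default='test_run')
evaluate(parser.parse_args())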
def main():
    if len(sys.argv) < 3:
        print("Usage: python RunModel_modified.py modelPath inputPath")
        sys.exit(1)

    modelPath = sys.argv[1]
    inputPath = sys.argv[2]

    # :: Read input ::
    with open(inputPath, 'r') as f:
        text = f.read()

    # :: Load vocabulary for is_name features ::
    from flashtext import KeywordProcessor
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_list(
        list(load_names(FR_NAMES_PATH).keys()))

    # :: Load the model ::
    lstmModel = BiLSTM.loadModel(modelPath)

    # :: Prepare the input ::
    pre_treated_lines, _ = pre_treat_text(text)
    tokenized_sentences = tokenize_text(pre_treated_lines)
    sentences = [{'tokens': sent} for sent in tokenized_sentences]
    addCharInformation(sentences)
    addCasingInformation(sentences)
    addIsNameInformation(sentences, keyword_processor=keyword_processor)
    dataMatrix = createMatrices(sentences, lstmModel.mappings, True)

    # :: Tag the input ::
    tags = lstmModel.tagSentences(dataMatrix)

    # :: Output to stdout ::
    for sentenceIdx in range(len(sentences)):
        tokens = sentences[sentenceIdx]['tokens']

        for tokenIdx in range(len(tokens)):
            tokenTags = []
            for modelName in sorted(tags.keys()):
                tokenTags.append(tags[modelName][sentenceIdx][tokenIdx])

            print("%s\t%s" % (tokens[tokenIdx], "\t".join(tokenTags)))
        print("")
if len(sys.argv) < 3:
    print(
        "Usage: python RunModel_CoNLL_Format.py modelPath inputPathToConllFile"
    )
    sys.exit(1)

modelPath = sys.argv[1]
inputPath = sys.argv[2]
inputColumns = {0: "tokens", 1: "NER_BIO"}
#inputColumns = {0: "tokens", 1: "is_name", 2: "NER_BIO"}

# :: Prepare the input ::
sentences = readCoNLL(inputPath, inputColumns)
addCharInformation(sentences)
addCasingInformation(sentences)

# :: Load the model ::
lstmModel = BiLSTM.loadModel(modelPath)

dataMatrix = createMatrices(sentences, lstmModel.mappings, True)

# :: Tag the input ::
tags = lstmModel.tagSentences(dataMatrix)

# :: Output to stdout ::
all_sentences_preds = []
for sentenceIdx in range(len(sentences)):
    tokens = sentences[sentenceIdx]['tokens']
    correct_tag = sentences[sentenceIdx]['NER_BIO']
    for tokenIdx in range(len(tokens)):