Пример #1
0
def get_complex_words(tokenised_string):
    """Score tokens with the module-level ``model`` and return one value each.

    The tokens are written to ``./complex_word.txt`` as tab-separated rows
    (token, constant ``'N'``), without header or index. That file is read
    back with ``experiment.read_input_files``, split into batches according
    to ``config`` and passed to ``model.process_batch`` in inference mode.

    Args:
        tokenised_string: Sequence of tokens to score.

    Returns:
        A list holding element ``[1]`` of every entry in ``predicted_probs[0]``
        from the last processed batch. Returns ``[]`` when there are no tokens
        or when no batch was produced.
    """
    dataframe = pd.DataFrame()
    dataframe['word'] = tokenised_string
    dataframe['binary'] = 'N'

    # Nothing to score: previously this path left the loop variables unbound
    # and crashed with NameError further down.
    if dataframe.empty:
        return []

    dataframe.to_csv('./complex_word.txt',
                     sep='\t',
                     index=False,
                     header=False,
                     quotechar=' ')

    sentences_test = experiment.read_input_files('./complex_word.txt')
    batches_of_sentence_ids = experiment.create_batches_of_sentence_ids(
        sentences_test, config["batch_equal_size"], config['max_batch_size'])

    # NOTE(review): only the results of the final batch are kept, exactly as
    # before. Presumably the file always holds a single sentence - confirm.
    last_result = None
    for sentence_ids_in_batch in batches_of_sentence_ids:
        batch = [sentences_test[i] for i in sentence_ids_in_batch]
        _cost, predicted_labels, predicted_probs = model.process_batch(
            batch, is_training=False, learningrate=0.0)
        last_result = (sentence_ids_in_batch, predicted_labels, predicted_probs)

    if last_result is None:
        return []

    sentence_ids_in_batch, predicted_labels, predicted_probs = last_result

    # Explicit check instead of ``assert`` inside a bare ``except``: it still
    # runs under ``python -O`` and no longer hides unrelated exceptions.
    # Reporting stays best-effort (message only), as in the original.
    if len(sentence_ids_in_batch) != len(predicted_labels):
        print('cw error')

    return [prob_pair[1] for prob_pair in predicted_probs[0]]
Пример #2
0
    def get_prob_labels(self):
        """Run the model over ``self.temp_file`` and return one value per token.

        The file is read with ``experiment.read_input_files``, batched
        according to ``self.config`` and passed to ``self.model.process_batch``
        in inference mode.

        Returns:
            A list holding element ``[1]`` of every entry in
            ``predicted_probs[0]`` from the last processed batch. The string
            ``'error'`` is returned instead when the input cannot be read or
            batched, when no batch was produced, or when the number of
            predicted label sequences differs from the batch size.
        """
        try:
            sentences_test = experiment.read_input_files(self.temp_file)

            batches_of_sentence_ids = experiment.create_batches_of_sentence_ids(
                sentences_test, self.config["batch_equal_size"],
                self.config['max_batch_size'])
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            return 'error'

        # NOTE(review): only the results of the final batch are used, exactly
        # as before - confirm that the temp file holds a single batch.
        sentence_ids_in_batch = None
        for sentence_ids_in_batch in batches_of_sentence_ids:
            batch = [sentences_test[i] for i in sentence_ids_in_batch]
            _cost, predicted_labels, predicted_probs = self.model.process_batch(
                batch, is_training=False, learningrate=0.0)

        # No batch at all. The original reached 'error' here only through a
        # NameError raised inside an ``assert``, which ``python -O`` removes.
        if sentence_ids_in_batch is None:
            return 'error'

        if len(sentence_ids_in_batch) != len(predicted_labels):
            return 'error'

        return [prob_pair[1] for prob_pair in predicted_probs[0]]
    def get_dataframe():
        """Run the model over ``temp_file`` and tabulate the last batch.

        Reads ``temp_file`` with ``experiment.read_input_files``, batches it
        according to ``config`` and passes every batch to
        ``model.process_batch`` in inference mode. ``temp_file``, ``config``
        and ``model`` are taken from the enclosing scope.

        Returns:
            A ``pandas.DataFrame`` with the columns ``index`` (sentence ids),
            ``sentences``, ``labels`` and ``probs`` for the last processed
            batch. An empty frame with the same columns is returned when no
            batch was produced.
        """
        sentences_test = experiment.read_input_files(temp_file)
        batches_of_sentence_ids = experiment.create_batches_of_sentence_ids(
            sentences_test, config["batch_equal_size"],
            config['max_batch_size'])

        # NOTE(review): results of earlier batches are overwritten, exactly as
        # before - only the final batch ends up in the frame.
        sentence_ids_in_batch = None
        for sentence_ids_in_batch in batches_of_sentence_ids:
            batch = [sentences_test[i] for i in sentence_ids_in_batch]
            _cost, predicted_labels, predicted_probs = model.process_batch(
                batch, is_training=False, learningrate=0.0)

        # Without any batch the original failed with NameError; hand back an
        # empty frame of the expected shape instead.
        if sentence_ids_in_batch is None:
            return pd.DataFrame(
                columns=['index', 'sentences', 'labels', 'probs'])

        # Explicit check instead of ``assert`` inside a bare ``except``: it
        # survives ``python -O`` and does not mask unrelated errors. Still
        # only reported, not raised, as in the original.
        if len(sentence_ids_in_batch) != len(predicted_labels):
            print('batch size error')

        annotated_sentences = pd.DataFrame()
        annotated_sentences['index'] = sentence_ids_in_batch
        annotated_sentences['sentences'] = [
            sentences_test[i] for i in sentence_ids_in_batch]
        annotated_sentences['labels'] = predicted_labels
        annotated_sentences['probs'] = predicted_probs

        return annotated_sentences
Пример #4
0
import sys


from model import MLTModel
from evaluator import MLTEvaluator
from experiment import read_input_files


if __name__ == "__main__":
    # Command line: argv[1] is passed to MLTModel.load, argv[2] to
    # read_input_files.
    model = MLTModel.load(sys.argv[1])
    data = read_input_files(sys.argv[2], -1)
    batch_size = 32
    evaluator = MLTEvaluator(model.config)

    for start in range(0, len(data), batch_size):
        batch = data[start:start + batch_size]
        cost, sentence_scores, token_scores_list = model.process_batch_inference(
            batch, False, 0.0)

        # One output row per token: the token's fields joined by spaces, then
        # its score from token_scores_list[0], then the sentence score.
        # A blank line closes each sentence.
        for j, sentence in enumerate(batch):
            for k, token in enumerate(sentence):
                columns = [
                    " ".join(map(str, token)),
                    str(token_scores_list[0][j][k]),
                    str(sentence_scores[j]),
                ]
                print("\t".join(columns))
            print("")

        # Accumulate this batch for the final evaluation summary.
        evaluator.append_data(cost, batch, sentence_scores, token_scores_list)

    # The summary goes to stderr, separate from the per-token rows on stdout.
    results = evaluator.get_results("test")
    for key in results:
        sys.stderr.write(key + ": " + str(results[key]) + "\n")
def print_predictions(print_probs, model_path, input_file):
    """Label ``input_file`` with a saved model and print the annotated rows.

    Loads a ``labeler.SequenceLabeler`` from ``model_path`` and reads the
    sentences and their additional features for ``input_file``. Every batch
    is run through the model in inference mode, then the input file is
    re-read and echoed to stdout with a prediction appended to each
    non-empty line. Timing information is written to stderr.

    Args:
        print_probs: When equal to ``False`` the predicted label is printed
            per token; when equal to ``True`` a tab-separated list of
            ``label:probability`` pairs is printed instead.
        model_path: Location handed to ``SequenceLabeler.load``.
        input_file: Tab-separated input; empty lines separate sentences.
    """
    time_loading = time.time()
    model = labeler.SequenceLabeler.load(model_path)

    time_noloading = time.time()
    config = model.config
    predictions_cache = {}

    num_additional_features = config['num_additional_features']
    num_additional_feature_vectors = config.get('num_additional_feature_vectors', 1)

    # Inverse of model.label2id: label id -> label.
    id2label = collections.OrderedDict(
        (model.label2id[label], label) for label in model.label2id)

    sentences_test = experiment.read_input_files(input_file)
    batches_of_sentence_ids = experiment.create_batches_of_sentence_ids(
        sentences_test, config["batch_equal_size"], config['max_batch_size'])

    feature_path = experiment.read_input_features(input_file, 'models/features/')

    for sentence_ids_in_batch in batches_of_sentence_ids:
        # Each sentence is concatenated along axis 1 with its additional
        # features, loaded by sentence id.
        batch = []
        for sent_id in sentence_ids_in_batch:
            sentence = sentences_test[sent_id]
            extra_features = experiment.load_sentence_id(
                feature_path,
                sent_id,
                num_additional_features,
                num_additional_feature_vectors)
            batch.append(numpy.concatenate((sentence, extra_features), axis=1))

        cost, predicted_labels, predicted_probs = model.process_batch(
            batch, is_training=False, learningrate=0.0)

        assert len(predicted_labels) == len(sentence_ids_in_batch)

        for pos, sent_id in enumerate(sentence_ids_in_batch):
            predictions = []
            # Equality (not truthiness) is intentional: a value equal to
            # neither True nor False leaves the prediction list empty.
            if print_probs == False:
                predictions = [id2label[label_id]
                               for label_id in predicted_labels[pos]]
            elif print_probs == True:
                for token_probs in predicted_probs[pos]:
                    rendered = "\t".join(
                        str(id2label[k]) + ":" + str(prob)
                        for k, prob in enumerate(token_probs))
                    predictions.append(rendered.strip())
            predictions_cache[str(sent_id)] = predictions

    # Second pass: walk the input file again and pair each token line with
    # the cached prediction at the same (sentence, word) position.
    sentence_id = 0
    word_id = 0
    with open(input_file, "r") as f:
        for line in f:
            content = line.strip()
            if not content:
                print("")
                if word_id != 0:
                    # End of a sentence: every prediction must be consumed.
                    cached = len(predictions_cache[str(sentence_id)])
                    assert cached == word_id, str(cached) + " " + str(word_id)
                    sentence_id += 1
                    word_id = 0
                continue

            cache_key = str(sentence_id)
            assert cache_key in predictions_cache
            assert word_id < len(predictions_cache[cache_key])
            t, g, *_ = content.split('\t')
            print('{}\t{}\tNaN\t{}'.format(
                t, g, predictions_cache[cache_key][word_id].strip()))
            word_id += 1

    sys.stderr.write("Processed: " + input_file + "\n")
    sys.stderr.write("Elapsed time with loading: " + str(time.time() - time_loading) + "\n")
    sys.stderr.write("Elapsed time without loading: " + str(time.time() - time_noloading) + "\n")