Example #1
    def post(self):
        """
        Parse multiple strings and return the associated entity for each token in each string.
        """
        args = self.parser.parse_args()
        ref_strings = args.get('strings')

        tokens = [[[token] for token in ref_string.split(" ")]
                  for ref_string in ref_strings]
        data = prepare_dataset(tokens, current_app.word_to_id,
                               current_app.char_to_id, {},
                               current_app.model.parameters['lower'], True)

        tagged = []

        for index, datum in enumerate(data):
            model_inputs = create_input(datum, current_app.model.parameters,
                                        False)
            y_pred = np.array(current_app.inference[1](*model_inputs))[1:-1]
            tags = [
                current_app.model.id_to_tag[y_pred[i]]
                for i in range(len(y_pred))
            ]

            tagged.append([
                Entity(term=term, entity=entity)
                for term, entity in zip(ref_strings[index].split(" "), tags)
            ])

        response = ParseBatchResponse(reference_strings=ref_strings,
                                      data=tagged)
        return response
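For context, a hedged client-side sketch of how this batch endpoint might be called (the URL path and port are assumptions; the 'strings' payload key is the one the handler's parser reads):

# Hypothetical client call; only the 'strings' key comes from the handler above.
import requests

payload = {"strings": ["John lives in Berlin", "Mary works at Acme Corp"]}
resp = requests.post("http://localhost:5000/parse/batch", json=payload)
print(resp.json())  # reference strings plus per-token (term, entity) pairs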
Example #2
def training_loop():

    taskB_out_label = []
    for i, file in enumerate(taskB_in):
        output = []
        for j, sent in enumerate(file):
            with torch.no_grad():
                precheck_sent = utils.create_input(taskB_in[i][j].split(),
                                                   tokenizer).to(device)
                sent_out_label = model(precheck_sent)[1]
                sent_str_label = [biluo_decode[t] for t in sent_out_label]
                output.append(sent_str_label)
        taskB_out_label.append(output)

    true_pos = 0
    total_ph_pred = 0
    for i, file in enumerate(taskB_label):
        file_true_pos, file_total_ph_pred = eval_func(
            taskB_out_label[i],
            [["O"] + s.split() + ["O"] for s in taskB_label[i]])
        true_pos = true_pos + file_true_pos
        total_ph_pred = total_ph_pred + file_total_ph_pred

    precision = 0
    recall = 0
    F1score = 0
    if (total_ph_pred != 0):
        precision = true_pos / total_ph_pred
    if (total_phrases_truth != 0):
        recall = true_pos / total_phrases_truth
    if ((precision + recall) != 0):
        F1score = 2 * precision * recall / (precision + recall)
    print("Precision : {} | Recall : {} | F1 Score : {}".format(
        precision, recall, F1score))
Example #3
    def post(self):
        """
        Parse a single string and return the associated entity for each token in the string.
        """
        args = self.parser.parse_args()
        ref_string = args.get('string')
        if ref_string is None or ref_string == "":
            # Hackish way as reqparse can't catch empty string
            abort(400, description='string is empty or not provided.')

        tokens = ref_string.split(" ")

        data = prepare_dataset([[[token] for token in tokens]],
                               current_app.word_to_id, current_app.char_to_id,
                               {}, current_app.model.parameters['lower'], True)

        model_inputs = create_input(data[0], current_app.model.parameters,
                                    False)
        y_pred = np.array(current_app.inference[1](*model_inputs))[1:-1]
        tags = [
            current_app.model.id_to_tag[y_pred[i]] for i in range(len(y_pred))
        ]

        response = ParseResponse(reference_string=ref_string,
                                 data=[
                                     Entity(term=term, entity=entity)
                                     for term, entity in zip(tokens, tags)
                                 ])
        return response
Example #4
    def next_batch(self, bz):
        index = np.random.choice(len(self.memory), bz)
        memory = [self.memory[i] for i in index]
        state = [translate_state(i.get("state")) for i in memory]
        state_next = [translate_state(i.get("state_next")) for i in memory]
        action = [i.get("action") for i in memory]
        finish = [int(i.get("finish")) for i in memory]
        reward = [i.get("reward") for i in memory]
        result = {
            "state": create_input(state),
            "state_next": create_input(state_next),
            "action": action,
            "finish": finish,
            "reward": reward
        }
        return result
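The batch above is returned as plain Python lists; a minimal consumption sketch (assuming a PyTorch model, and mirroring the tensor conversion used in the torch-based evaluate() example further down) could look like this:

# Hypothetical usage; 'replay' and 'model' are assumed to exist elsewhere.
import torch

batch = replay.next_batch(32)
state = [torch.FloatTensor(x) for x in batch["state"]]  # create_input() output
reward = torch.FloatTensor(batch["reward"])
q_values = model.forward(state)  # same call style as in the evaluate() example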
Example #5
def validation_loop():

    valid_taskB_out_label = []
    for i, file in enumerate(valid_taskB_in):
        output = []
        for j, sent in enumerate(file):
            with torch.no_grad():
                precheck_sent = utils.create_input(
                    valid_taskB_in[i][j].split(), tokenizer).to(device)
                sent_out_label = model(precheck_sent)[1]
                sent_str_label = [biluo_decode[t] for t in sent_out_label]
                output.append(sent_str_label)
        valid_taskB_out_label.append(output)

    valid_true_pos = 0
    valid_total_ph_pred = 0
    for i, file in enumerate(valid_taskB_label):
        valid_file_true_pos, valid_file_total_ph_pred = eval_func(
            valid_taskB_out_label[i],
            [["O"] + s.split() + ["O"] for s in valid_taskB_label[i]])
        valid_true_pos = valid_true_pos + valid_file_true_pos
        valid_total_ph_pred = valid_total_ph_pred + valid_file_total_ph_pred

    valid_precision = 0
    valid_recall = 0
    valid_F1score = 0
    if (valid_total_ph_pred != 0):
        valid_precision = valid_true_pos / valid_total_ph_pred
    if (valid_total_phrases_truth != 0):
        valid_recall = valid_true_pos / valid_total_phrases_truth
    if ((valid_precision + valid_recall) != 0):
        valid_F1score = 2 * valid_precision * \
            valid_recall/(valid_precision+valid_recall)
    print("Precision : {} | Recall : {} | F1 Score : {}".format(
        valid_precision, valid_recall, valid_F1score))
Example #6
def prepare_dataset(sentences,
                    word_to_id,
                    char_to_id,
                    gazetteer_list,
                    brown_dict,
                    tag_to_id,
                    l1_model,
                    l1_f_eval,
                    lower=False):
    """
    Prepare the dataset. Return a list of lists of dictionaries containing:
        - word indexes
        - word char indexes
        - tag indexes
    """
    def f(x):
        return x.lower() if lower else x

    data = []
    for s in sentences:
        str_words = [w[0] for w in s]

        words = [
            word_to_id[f(w) if f(w) in word_to_id else '<UNK>']
            for w in str_words
        ]
        # Skip characters that are not in the training set
        chars = [[char_to_id[c] for c in w if c in char_to_id]
                 for w in str_words]
        caps = [cap_feature(w) for w in str_words]
        gazetteer = [gazetteer_feature(w, gazetteer_list) for w in str_words]
        brown = [brown_feature(w, brown_dict) for w in str_words]
        sent = {
            'str_words': str_words,
            'words': words,
            'chars': chars,
            'caps': caps,
            'gazetteer': gazetteer,
            'brown': brown,
        }

        if l1_model is not None:
            input = create_input(sent, l1_model.parameters, False)
            try:
                if l1_model.parameters['crf']:
                    y_preds = np.array(l1_f_eval(*input))[1:-1]
                else:
                    y_preds = l1_f_eval(*input).argmax(axis=1)
                y_preds = [l1_model.id_to_tag[y_pred] for y_pred in y_preds]
            except Exception as e:
                y_preds = ["O"] * len(str_words)

            sent['pred'] = [0 if y_pred == "O" else 1 for y_pred in y_preds]

        tags = [tag_to_id[w[-1]] for w in s]
        sent['tags'] = tags
        data.append(sent)

    return data
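For reference, each element of the returned list is shaped roughly as follows (an illustrative, hand-written example; all index values are invented):

# Illustrative structure of one prepared sentence (values are invented):
sent = {
    'str_words': ['John', 'lives', 'in', 'Berlin'],
    'words': [12, 45, 7, 0],      # word_to_id indexes (OOV words map to '<UNK>')
    'chars': [[3, 8, 2, 9], [5, 1, 6, 2, 4], [1, 9], [7, 2, 0, 5, 1, 9]],
    'caps': [1, 0, 0, 1],         # cap_feature() per word
    'gazetteer': [0, 0, 0, 1],    # gazetteer_feature() per word
    'brown': [17, 5, 2, 44],      # brown_feature() per word
    'pred': [1, 0, 0, 1],         # only present when an l1_model is given
    'tags': [1, 0, 0, 3],         # tag_to_id indexes
}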
Example #7
def tag():
    if request.method == 'POST':
        data = request.get_json()
        text = data['text']
        if data['split_sentences']:
            sentences = split_sentences(text)
        else:
            sentences = text

        if data['tokenize'] or data['split_sentences']:
            tokenized_sentences = [tokenize(s) for s in sentences]
        else:
            tokenized_sentences = text

        count = 0
        output = []
        for words in tokenized_sentences:
            if len(words) == 0:
                continue
            # Lowercase sentence
            if model.parameters['lower']:
                words = [w.lower() for w in words]
            # Replace all digits with zeros
            if model.parameters['zeros']:
                words = [zero_digits(w) for w in words]
            # Prepare input
            sentence = prepare_sentence(words,
                                        word_to_id,
                                        char_to_id,
                                        lower=model.parameters['lower'])
            input = create_input(sentence, model.parameters, False)
            # Decoding
            if model.parameters['crf']:
                y_preds = np.array(f_eval(*input))[1:-1]
            else:
                y_preds = f_eval(*input).argmax(axis=1)
            y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]
            # Output tags in the IOB2 format
            if model.parameters['tag_scheme'] == 'iobes':
                y_preds = iobes_iob(y_preds)
            # Write tags
            assert len(y_preds) == len(
                words
            ), "Predictions have different length than sentence. Something went wrong."
            output.append(list(zip(words, y_preds)))
            count += 1
            if count % 100 == 0:
                logging.info(count)

        return jsonify(output)
Example #8
    def create_input(self):
        imgs = []
        wides = []

        for i, box in enumerate(self.all_after_row_connect):
            result = Box_Cell(box.bbox, self.img, self.types[i])
            if result.img_wide > self.max_wide:
                self.max_wide = result.img_wide
            wides.append(result.img_wide)
            imgs.append(result.normal_img)
            self.results.append(result)

        inputs, wides = utils.create_input(imgs, self.max_wide, wides)

        return inputs, wides
Example #9
def tag(model, line):
    # Load existing model
    print("Loading model...")
    model = Model(model_path=model)
    parameters = model.parameters

    # Load reverse mappings
    word_to_id, char_to_id, tag_to_id = [{
        v: k
        for k, v in x.items()
    } for x in [model.id_to_word, model.id_to_char, model.id_to_tag]]

    # Load the model
    _, f_eval = model.build(training=False, **parameters)
    model.reload()

    start = time.time()

    print('Tagging...')
    words_ini = line.rstrip().split()

    # Replace all digits with zeros
    if parameters['zeros']:
        line = zero_digits(line)
    words = line.rstrip().split()
    # Prepare input
    sentence = prepare_sentence(words,
                                word_to_id,
                                char_to_id,
                                lower=parameters['lower'])
    input = create_input(sentence, parameters, False)
    # Decoding
    if parameters['crf']:
        y_preds = np.array(f_eval(*input))[1:-1]
    else:
        y_preds = f_eval(*input).argmax(axis=1)
    y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]
    # Output tags in the IOB2 format
    if parameters['tag_scheme'] == 'iobes':
        y_preds = iobes_iob(y_preds)
    # Write tags
    assert len(y_preds) == len(words)

    print('---- sentence tagged in %.4fs ----' % (time.time() - start))

    return ' '.join(w + '__' + str(y) for w, y in zip(words_ini, y_preds))
Example #10
    def create_input(self):
        imgs = []
        wides = []

        def add_img(result):
            if result.img_wide > self.max_wide:
                self.max_wide = result.img_wide
            wides.append(result.img_wide)
            imgs.append(result.normal_img)

        for result in self.all_after_row_connect:
            add_img(result)

        inputs, wides = utils.create_input(imgs, self.max_wide, wides)

        return inputs, wides
Example #11
def tag_document(doc, parameters, model, f_eval, word_to_id, char_to_id):
    count = 0
    all_ypreds = list()
    all_tokens = list()
    for line in doc.sentences:
        toks_text = [x.orth_ for x in line.tokens]
        # line = ' '.join(toks_text)
        if toks_text:  # WL edit: used to be 'if line', was crashing on '\n' lines
            # Lowercase sentence
            if parameters['lower']:
                toks_text = [line.lower() for line in toks_text]
            # Replace all digits with zeros
            if parameters['zeros']:
                toks_text = [zero_digits(line) for line in toks_text]
            # Prepare input
            sentence = prepare_sentence(toks_text,
                                        word_to_id,
                                        char_to_id,
                                        lower=parameters['lower'])
            input = create_input(sentence, parameters, False)
            # Decoding
            if parameters['crf']:
                y_preds = np.array(f_eval(*input))[1:-1]
            else:
                y_preds = f_eval(*input).argmax(axis=1)
            y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]
            # Output tags in the IOB2 format
            if parameters['tag_scheme'] == 'iobes':
                y_preds = iobes_iob(y_preds)
            # Write tags
            assert len(y_preds) == len(toks_text)

            # strip IOB prefixes
            y_preds = [x.split('-')[-1] for x in y_preds]

            all_ypreds.append(y_preds)
            all_tokens.append(toks_text)

        count += 1
        if count % 100 == 0:
            print count

    return (all_ypreds, all_tokens)
Example #12
def prepare_sentence(str_words,
                     word_to_id,
                     char_to_id,
                     gazetteer_list={},
                     brown_dict={},
                     l1_model=None,
                     l1_f_eval=None,
                     lower=False):
    """
    Prepare a sentence for evaluation.
    """
    def f(x):
        return x.lower() if lower else x

    words = [
        word_to_id[f(w) if f(w) in word_to_id else '<UNK>'] for w in str_words
    ]
    chars = [[char_to_id[c] for c in w if c in char_to_id] for w in str_words]
    caps = [cap_feature(w) for w in str_words]
    gazetteer = [gazetteer_feature(w, gazetteer_list) for w in str_words]
    brown = [brown_feature(w, brown_dict) for w in str_words]
    sent = {
        'str_words': str_words,
        'words': words,
        'chars': chars,
        'caps': caps,
        'gazetteer': gazetteer,
        'brown': brown
    }
    if l1_model is not None:
        input = create_input(sent, l1_model.parameters, False)
        try:
            if l1_model.parameters['crf']:
                y_preds = np.array(l1_f_eval(*input))[1:-1]
            else:
                y_preds = l1_f_eval(*input).argmax(axis=1)
            y_preds = [l1_model.id_to_tag[y_pred] for y_pred in y_preds]
        except Exception as e:
            y_preds = ["O"] * len(str_words)

        sent['pred'] = [0 if y_pred == "O" else 1 for y_pred in y_preds]
    return sent
Example #13
    def predicts(self, line):
        if line:
            # Save original bigrams
            bigram_sent = self.to_bigram(line, 0).strip().split()

            # Replace all digits with zeros
            line = zero_digits(line)
            input_seq = self.to_bigram(line, 0).strip().split()

            # Prepare input
            sentence = prepare_sentence(input_seq,
                                        self.word_to_id,
                                        self.char_to_id,
                                        lower=self.parameters['lower'])
            input = create_input(sentence, self.parameters, False)
            if self.parameters['crf']:
                y_preds = np.array(self.f_eval(*input))[1:-1]
            else:
                y_preds = self.f_eval(*input).argmax(axis=1)
            tags = [self.id_to_tag[y_pred] for y_pred in y_preds]

            # Output tags in the IOB2 format
            if self.parameters['tag_scheme'] == 'iobes':
                tags = iobes_iob(tags)
            print(tags)
            # Make output form
            out_form = ""
            unigram_sent = self.bigrams_to_unigrams(bigram_sent)

            for i in range(len(tags)):
                if tags[i].startswith('B'):
                    out_form += '<' + unigram_sent[i]
                elif tags[i].startswith('I'):
                    if i == len(tags) - 1:
                        out_form += unigram_sent[i] + ':' + tags[i][2:] + '>'
                    elif tags[i + 1] == 'O':
                        out_form += unigram_sent[i] + ':' + tags[i][2:] + '>'
                    else:
                        out_form += unigram_sent[i]
                else:
                    out_form += unigram_sent[i]
            return out_form
Example #14
	def parseString(self, string):
		#TO DO
		#To be consumed by web-service
		test_file = "test_file"
		file = open(test_file, 'w')
		file.write('\n'.join(string.encode('utf-8').split()))
		file.close()
		test_sentences = load_sentences(test_file, self.lower, self.zeros)
		data = self.prepare_dataset(test_sentences)
		result = ''
		for citation in data:
			input = create_input(citation, self.model.parameters, False)
			y_pred = np.array(self.f[1](*input))[1:-1]
			tags = []
			for i in xrange(len(y_pred)):
				tags.append(self.model.id_to_tag[y_pred[i]])
			for num, word in enumerate(string.encode('utf-8').split()):
				#print word.decode('utf-8')+'\t'+tags[num]
				result += word.decode('utf-8')+'\t'+tags[num]+'\n'
		return result
Example #15
def evaluate(ctx, model, env, rounds=5, print_action=False, save=None):
    env.reset_env()
    for epoch in range(rounds):
        env.reset_env()
        done = 0
        step = 0
        while not done:
            step += 1
            data = create_input([translate_state(env.map.state())])
            data = [torch.FloatTensor(i).to(ctx) for i in data]
            pred = model.forward(data)
            action = int(torch.argmax(pred).cpu().numpy())
            old, new, reward, done = env.step(action)
            if print_action:
                print(pred, reward, env.map.battery)
            if save is not None:
                img = Image.fromarray(env.map.render(), 'RGB')
                pred = [str(x)[0:5] for x in pred.detach().numpy().tolist()[0]]
                filename = "torch-" + str(epoch) + "-" + str(step) + "-" + str(
                    reward) + "-" + "_".join(pred) + ".jpg"
                img.save(save + "/" + filename)
    return env.detect_rate
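A minimal invocation sketch for the function above ('model' and 'env' are assumed to be defined elsewhere; the device argument matches how ctx is used inside the function):

# Hypothetical call; 'model' and 'env' are assumptions.
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
detect_rate = evaluate(device, model, env, rounds=3,
                       print_action=False, save=None)
print(detect_rate)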
Example #16
def evaluate(ctx, model, env, rounds=5, print_action=False, save=None):
    for epoch in range(rounds):
        env.reset_env()
        done = 0
        step = 0
        while not done:
            step += 1
            data = create_input([translate_state(env.map.state())])
            data = [nd.array(i, ctx=ctx) for i in data]
            pred = model(data)
            action = int(nd.argmax(pred, axis=1).asnumpy()[0])
            old, new, reward, done = env.step(action)
            if print_action:
                print(pred, reward, env.map.battery)
            if save is not None:
                img = Image.fromarray(
                    env.map.grid.render(20, env.map.agent_pos,
                                        env.map.agent_dir), 'RGB')
                pred = [str(x)[0:5] for x in pred.asnumpy().tolist()[0]]
                filename = str(epoch) + "-" + str(step) + "-" + str(
                    reward) + "-" + "_".join(pred) + ".jpg"
                img.save(save + "/" + filename)
    return env.detect_rate
Example #17
def evaluate(parameters, f_eval, raw_sentences, parsed_sentences,
             id_to_tag, input_file_path):
    """
    Evaluate current model using CoNLL script.
    """
    predictions = []
    for raw_sentence, data in zip(raw_sentences, parsed_sentences):
        input = create_input(data, parameters, False)
        if parameters['crf']:
            y_preds = np.array(f_eval(*input))[1:-1]
        else:
            y_preds = f_eval(*input).argmax(axis=1)
        p_tags = [id_to_tag[y_pred] for y_pred in y_preds]

        if parameters['tag_scheme'] == 'iobes':
            p_tags = iobes_iob(p_tags)

        for i, y_pred in enumerate(y_preds):
            new_line = "%s %s" % (raw_sentence[i][0], p_tags[i])
            predictions.append(new_line)
        predictions.append("")
    output_path = os.path.join(opts.output, os.path.basename(input_file_path[:-4] + "_Tagged.txt"))
    with codecs.open(output_path, 'w', 'utf8') as f:
        f.write("\n".join(predictions))
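The file written above holds one "word predicted-tag" pair per line, with a blank line between sentences; a hand-written illustration of that format (tags invented):

# Illustrative content of a *_Tagged.txt file produced by evaluate():
example_block = "\n".join([
    "John B-PER",
    "lives O",
    "in O",
    "Berlin B-LOC",
    "",  # blank line separates sentences
])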
Example #18
        if tags[i][0] == 'O':
            if len(preTag):
                res.append("</" + preTag + ">")
                preTag = ""
        res.append(sentence[i])
    if len(preTag):
        res.append("</" + preTag + ">")
    return res


#}}}
print 'Tagging...'
for line in test_data:
    # Prepare input
    input = create_input(line,
                         parameters,
                         False,
                         useAttend=parameters['useAttend'])
    words = line['str_words']
    # Decoding
    if parameters['crf']:
        y_preds = np.array(f_eval(*input))
    else:
        y_preds = f_eval(*input).argmax(axis=1)
    y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]
    # Output tags in the IOB2 format
    if parameters['tag_scheme'] == 'iobes':
        y_preds = iobes_iob(y_preds)
    # Write tags
    assert len(y_preds) == len(words)

    #    print words
Example #19
with codecs.open(opts.input, 'r', 'utf-8') as f_input:
    count = 0
    for line in f_input:
        words_ini = line.rstrip().split()
        if line:
            # Lowercase sentence
            if parameters['lower']:
                line = line.lower()
            # Replace all digits with zeros
            if parameters['zeros']:
                line = zero_digits(line)
            words = line.rstrip().split()
            # Prepare input
            sentence = prepare_sentence(words, word_to_id, char_to_id,
                                        lower=parameters['lower'])
            input = create_input(sentence, parameters, False)
            # Decoding
            if parameters['crf']:
                y_preds = np.array(f_eval(*input))[1:-1]
            else:
                y_preds = f_eval(*input).argmax(axis=1)
            y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]
            # Output tags in the IOB2 format
            if parameters['tag_scheme'] == 'iobes':
                y_preds = iobes_iob(y_preds)
            # Write tags
            assert len(y_preds) == len(words)
            
            if opts.outputFormat == 'json':
                f_output.write(json.dumps({ "text": ' '.join(words), "ranges": iob_ranges(y_preds) }))
            else:
Example #20
#
# Train network
#
singletons = set([word_to_id[k] for k, v
                  in dico_words_train.items() if v == 1])
n_epochs = 100  # number of epochs over the training set
freq_eval = 1000  # evaluate on dev every freq_eval steps
best_dev = -np.inf
best_test = -np.inf
count = 0
for epoch in xrange(n_epochs):
    epoch_costs = []
    print "Starting epoch %i..." % epoch
    for i, index in enumerate(np.random.permutation(len(train_data))):
        count += 1
        input = create_input(train_data[index], parameters, True, singletons)
        new_cost = f_train(*input)
        epoch_costs.append(new_cost)
        if i % 50 == 0 and i > 0:
            print "%i, cost average: %f" % (i, np.mean(epoch_costs[-50:]))
        if count % freq_eval == 0:
            dev_score = evaluate(parameters, f_eval, dev_sentences,
                                 dev_data, id_to_tag, dico_tags)
            test_score = evaluate(parameters, f_eval, test_sentences,
                                  test_data, id_to_tag, dico_tags)
            print "Score on dev: %.5f" % dev_score
            print "Score on test: %.5f" % test_score
            if dev_score > best_dev:
                best_dev = dev_score
                print "New best score on dev."
                print "Saving model to disk..."
Example #21
    for line in f_input:
        words_ini = line.rstrip().split()
        if line:
            # Lowercase sentence
            if parameters['lower']:
                line = line.lower()
            # Replace all digits with zeros
            if parameters['zeros']:
                line = zero_digits(line)
            words = line.rstrip().split()
            # Prepare input
            sentence = prepare_sentence(words,
                                        word_to_id,
                                        char_to_id,
                                        lower=parameters['lower'])
            input = create_input(sentence, parameters, False)
            # Decoding
            if parameters['crf']:
                y_preds = np.array(f_eval(*input))[1:-1]
            else:
                y_preds = f_eval(*input).argmax(axis=1)
            y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]
            # Output tags in the IOB2 format
            if parameters['tag_scheme'] == 'iobes':
                y_preds = iobes_iob(y_preds)
            # Write tags
            assert len(y_preds) == len(words)

            if opts.outputFormat == 'json':
                f_output.write(
                    json.dumps({
Example #22
def run_tagging(model,
                f_eval,
                parameters,
                word_to_id,
                char_to_id,
                tag_to_id,
                opts_input="",
                opts_output="",
                opts_delimiter="__",
                opts_outputFormat=""):
    # Check parameters validity
    assert opts_delimiter
    assert os.path.isfile(opts_input)

    #set environment to use gpu

    f_output = codecs.open(opts_output, 'w', 'utf-8')
    start = time.time()
    logger.info('Tagging...')
    with codecs.open(opts_input, 'r', 'utf-8') as f_input:
        count = 0
        for line in f_input:
            words_ini = line.rstrip().split()
            if line:
                # Lowercase sentence
                if parameters['lower']:
                    line = line.lower()
                # Replace all digits with zeros
                if parameters['zeros']:
                    line = zero_digits(line)
                words = line.rstrip().split()
                # Prepare input
                sentence = prepare_sentence(words,
                                            word_to_id,
                                            char_to_id,
                                            lower=parameters['lower'])
                input = create_input(sentence, parameters, False)
                # Decoding
                if parameters['crf']:
                    y_preds = np.array(f_eval(*input))[1:-1]
                else:
                    y_preds = f_eval(*input).argmax(axis=1)
                y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]
                # Output tags in the IOB2 format
                if parameters['tag_scheme'] == 'iobes':
                    y_preds = iobes_iob(y_preds)
                # Write tags
                assert len(y_preds) == len(words)

                if opts_outputFormat == 'json':
                    f_output.write(
                        json.dumps({
                            "text": ' '.join(words),
                            "ranges": iob_ranges(y_preds)
                        }))
                else:
                    #logger.info( "write out tags..."
                    f_output.write(
                        '%s\n' % ' '.join('%s%s%s' % (w, opts_delimiter, y)
                                          for w, y in zip(words_ini, y_preds)))
            else:
                f_output.write('\n')
            count += 1
            # if count % 100 == 0:
            #     logger.info( count

    logger.info('---- %i lines tagged in %.4fs ----' %
                (count, time.time() - start))
    f_output.close()
    logger.info(opts_output)
    logger.info("")
    return opts_output + " has been tagged!"


# def main():
#     logger.info( "executed"

# if __name__ == '__main__':
#     main()
Example #23
# Train network
#

count = 0

fvs = []
words = []
for i, index in enumerate(np.random.permutation(len(train_data))):
    fv = []
    
    if (train_data[index]['str_words'][0] in words):
        continue
    
    count += 1
    words.append(train_data[index]['str_words'][0])
    input = create_input(train_data[index], parameters, False)

    # Get gradients vector from model
    grads = f_eval(*input)
    grads_rev = f_eval_rev(*input)
    
    # Concatenate all gradients
    for grad in grads_rev:
        for g in grad:
            try:
                for s in g:
                    fv.append(s)
            except:
                fv.append(g)
                
    for grad in grads:
Example #24
        string = raw_input("Enter the citation string: ")
        strings = [string]

    test_file = "test_file"
    if os.path.exists(test_file):
        os.remove(test_file)
    file = open(test_file, 'a')
    for string in strings:
        file.write('\n'.join(string.split()) + '\n')
    file.close()
    test_sentences = load_sentences(test_file, lower, zeros)
    data = prepare_dataset(test_sentences, word_to_id, char_to_id, {}, lower,
                           True)

    for citation in data:
        inputs = create_input(citation, model.parameters, False)
        y_pred = np.array(f[1](*inputs))[1:-1]

        tags = [model.id_to_tag[y_pred[i]] for i in range(len(y_pred))]

        output = [
            w + '\t' + tags[i] for i, w in enumerate(citation['str_words'])
        ]

        if opts.run == 'file':
            with closing(open(output_file, 'w')) as fh:
                fh.write('\n'.join(output))
        else:
            print('\n'.join(output))

    if opts.run == 'file':
Example #25
best_test = -np.inf
count = 0
# costfile = './cost_vec_' + str(parameters['word_dim']) + '_' + str(parameters['word_hidden_dim']) + str(parameters['L2_reg']) + '.txt'
# fw = codecs.open(costfile, 'w', 'utf-8')
# fw.write('epoch\t\ttrain_loss\t\tdev_los\t\ttest_loss\t\tdev_F1\t\ttest_F1\n')
F1_file = './vec_' + opts.tagger + "gaze_" + opts.dictionary + str(parameters['use_gaze']) + "char" + str(parameters['char_dim']) + "_" + str(parameters['char_hidden_dim']) \
          + "_word" + str(parameters['word_dim']) + "_" + str(parameters['word_hidden_dim']) + 'taggerhidden' + str(parameters['tagger_hidden_dim']) + '.txt'
fw = codecs.open(F1_file, 'w', 'utf-8')
fw.write("epoch\t\tdev_F1\t\ttest_F1\n")
for epoch in xrange(n_epochs):
    epoch_costs = []
    print "Starting epoch %i..." % epoch
        
    for i, index in enumerate(np.random.permutation(len(train_data))):
        count += 1
        input = create_input(train_data[index], parameters, True, use_gaze, pos, singletons)
        new_cost = f_train(*input)

        epoch_costs.append(new_cost)
        if i % 50 == 0 and i > 0:
            print "%i, cost average: %f" % (i, np.mean(epoch_costs[-50:]))
        if count % freq_eval == 0:
            #train_score = evaluate(parameters, f_eval, train_sentences,
            #                     train_data, id_to_tag, dico_tags)
            # train_cost = []
            # for i, data in enumerate(train_data):
            #     input = create_input(data, parameters, True, use_gaze, pos, singletons)
            #     train_cost.append(f_plot_cost(*input))
            # dev_cost = []
            # for i, data in enumerate(dev_data):
            #     input = create_input(data, parameters, True, use_gaze, pos, singletons)
Example #26
    hidden_layers=parameters['hidden_layer'],
    padding=parameters['padding'],
    max_seq_len=max_seq_len,
    train_size=len(train_data))

if parameters['reload']:
    gramcnn.load(models_path, model_name)

for epoch in xrange(n_epochs):
    epoch_costs = []
    print "Starting epoch %i..." % epoch
    for i, index in enumerate(np.random.permutation(len(train_data))):
        inputs, word_len = create_input(train_data[index],
                                        parameters,
                                        True,
                                        singletons,
                                        padding=parameters['padding'],
                                        max_seq_len=max_seq_len,
                                        use_pts=parameters['pts'])

        assert inputs['char_for']
        assert inputs['word']
        assert inputs['label']

        # break
        if len(inputs['label']) == 1:
            continue
        train_loss = []
        temp = []
        temp.append(word_len)
        batch_loss = gramcnn.train(inputs, temp)
Example #27
def runModelInLoop(dropout,char_dim,char_lstm_dim,word_dim,word_lstm_dim):
    #results File
    resultsPath = "/Users/Ehsan/Documents/Ehsan_General/HMQ/HMQ_Projects/DNR2/COLING-2016-Code/i2b2-2010/results/"
    for u_dropout in dropout:
        for v_char_dim in char_dim:
            for w_char_lstm_dim in char_lstm_dim:
                for x_word_dim in word_dim:
                    for y_word_lstm_dim in word_lstm_dim:
                        for dataset in datasets:
                            print "+++++++++++++++"
                            print u_dropout,v_char_dim,w_char_lstm_dim,x_word_dim,y_word_lstm_dim,dataset
                            parameters['dropout'] = u_dropout

                            parameters['char_dim'] = v_char_dim
                            parameters['char_lstm_dim'] = w_char_lstm_dim
                            parameters['word_dim'] = x_word_dim
                            parameters['word_lstm_dim'] = y_word_lstm_dim

                            # If dataset is DrugBank assign predefined path

                            if(dataset == "i2b2-2010"):
                                opts.train = i2b2BasePath+"train.txt"
                                opts.dev = i2b2BasePath+ "dev.txt"
                                opts.test = i2b2BasePath+ "test.txt"
                                resultsFile = resultsPath +"i2b2_2010_Results.txt"



                            # Initialize model
                            model = Model(parameters=parameters, models_path=models_path)
                            print "Model location: %s" % model.model_path

                            # Data parameters
                            lower = parameters['lower']
                            zeros = parameters['zeros']
                            tag_scheme = parameters['tag_scheme']

                            # Load sentences
                            train_sentences = loader.load_sentences(opts.train, lower, zeros)
                            dev_sentences = loader.load_sentences(opts.dev, lower, zeros)
                            test_sentences = loader.load_sentences(opts.test, lower, zeros)

                            # Use selected tagging scheme (IOB / IOBES)
                            update_tag_scheme(train_sentences, tag_scheme)
                            update_tag_scheme(dev_sentences, tag_scheme)
                            update_tag_scheme(test_sentences, tag_scheme)

                            # Create a dictionary / mapping of words
                            # If we use pretrained embeddings, we add them to the dictionary.
                            if parameters['pre_emb']:
                                dico_words_train = word_mapping(train_sentences, lower)[0]
                                dico_words, word_to_id, id_to_word = augment_with_pretrained(
                                    dico_words_train.copy(),
                                    parameters['pre_emb'],
                                    list(itertools.chain.from_iterable(
                                        [[w[0] for w in s] for s in dev_sentences + test_sentences])
                                    ) if not parameters['all_emb'] else None
                                )
                            else:
                                dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
                                dico_words_train = dico_words

                            # Create a dictionary and a mapping for words / POS tags / tags
                            dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
                            dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

                            print "Calling the prepare_dataset :--"
                            # Index data
                            train_data = prepare_dataset(
                                train_sentences, word_to_id, char_to_id, tag_to_id, lower
                            )
                            dev_data = prepare_dataset(
                                dev_sentences, word_to_id, char_to_id, tag_to_id, lower
                            )
                            test_data = prepare_dataset(
                                test_sentences, word_to_id, char_to_id, tag_to_id, lower
                            )

                            print "%i / %i / %i sentences in train / dev / test." % (
                                len(train_data), len(dev_data), len(test_data))

                            # Save the mappings to disk
                            print 'Saving the mappings to disk...'
                            model.save_mappings(id_to_word, id_to_char, id_to_tag)

                            # Build the model
                            f_train, f_eval = model.build(**parameters)

                            # Reload previous model values
                            if opts.reload:
                                print 'Reloading previous model...'
                                model.reload()


                            # Train network
                            #
                            singletons = set([word_to_id[k] for k, v
                                              in dico_words_train.items() if v == 1])
                            n_epochs = 2  # number of epochs over the training set
                            freq_eval = 1000  # evaluate on dev every freq_eval steps
                            best_dev = -np.inf
                            best_test = -np.inf
                            count = 0
                            for epoch in xrange(n_epochs):
                                epoch_costs = []
                                print "Starting epoch %i..." % epoch
                                for i, index in enumerate(np.random.permutation(len(train_data))):
                                    count += 1
                                    input = create_input(train_data[index], parameters, True, singletons)
                                    new_cost = f_train(*input)
                                    epoch_costs.append(new_cost)
                                    #if i % 50 == 0 and i > 0 == 0:
                                    #    print "%i, cost average: %f" % (i, np.mean(epoch_costs[-50:]))
                                    if count % freq_eval == 0:
                                        dev_score = evaluate(parameters, f_eval, dev_sentences,
                                                             dev_data, id_to_tag, dico_tags)
                                        test_score = evaluate(parameters, f_eval, test_sentences,
                                                              test_data, id_to_tag, dico_tags)
                                        print "Score on dev: %.5f" % dev_score
                                        print "Score on test: %.5f" % test_score
                                        if dev_score > best_dev:
                                            best_dev = dev_score
                                            print "New best score on dev."+str(best_dev)
                                            # print "Saving model to disk..."
                                            # model.save()
                                        if test_score > best_test:
                                            best_test = test_score
                                            print "New best score on test."+str(best_test)
                                        # print "Config values used are : "


                                print "Epoch %i done. Average cost: %f" % (epoch, np.mean(epoch_costs))
                            # Write the best dev and test scores to the file
                            del model


                            with open(resultsFile, 'a') as f:
                                    f.write("dropout: "+ str(parameters['dropout'] ) +"| char_dim:  |"+str(parameters['char_dim'])+ "| char_lstm_dim:  "+str(parameters['char_lstm_dim']) +" word_dim: "+ str(parameters['word_dim']) +" |word_lstm_dim: "+ str( parameters['word_lstm_dim'] )+" | Best Dev Score: "+str(best_dev) + " | Best Test Score: "+str(best_test) +"\n")


    return
Example #28
    def train(self,
              n_epochs=100,
              freq_eval=1000,
              verbose=True,
              eval_test_set=False):
        """
        :param n_epochs: number of epochs over the training set
        :param freq_eval: evaluate on dev every freq_eval steps
        :return: Saves the model with the best F1-Score, evaluated on the dev set
        """
        # Initialize model
        model = Model(parameters=self.parameters, models_path=models_path)
        print("Model location: %s" % model.model_path)

        # Data parameters
        lower = self.parameters['lower']
        zeros = self.parameters['zeros']
        tag_scheme = self.parameters['tag_scheme']

        # Load sentences
        train_sentences = loader.load_sentences(self.parameters['train'],
                                                lower, zeros)
        dev_sentences = loader.load_sentences(self.parameters['dev'], lower,
                                              zeros)
        test_sentences = loader.load_sentences(self.parameters['test'], lower,
                                               zeros)

        # Use selected tagging scheme (IOB / IOBES)
        update_tag_scheme(train_sentences, tag_scheme)
        update_tag_scheme(dev_sentences, tag_scheme)
        update_tag_scheme(test_sentences, tag_scheme)

        # Create a dictionary / mapping of words
        # If we use pretrained embeddings, we add them to the dictionary.
        if self.parameters['pre_emb']:
            dico_words_train = word_mapping(train_sentences, lower)[0]
            dico_words, word_to_id, id_to_word = augment_with_pretrained(
                dico_words_train.copy(), self.parameters['pre_emb'],
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in dev_sentences +
                                                   test_sentences]))
                if not self.parameters['all_emb'] else None)
        else:
            dico_words, word_to_id, id_to_word = word_mapping(
                train_sentences, lower)
            dico_words_train = dico_words

        # Create a dictionary and a mapping for words / POS tags / tags
        dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
        dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

        # Index data
        train_data = prepare_dataset(train_sentences, word_to_id, char_to_id,
                                     tag_to_id, lower)
        dev_data = prepare_dataset(dev_sentences, word_to_id, char_to_id,
                                   tag_to_id, lower)
        test_data = prepare_dataset(test_sentences, word_to_id, char_to_id,
                                    tag_to_id, lower)

        print("%i / %i / %i sentences in train / dev / test." %
              (len(train_data), len(dev_data), len(test_data)))

        # Save the mappings to disk
        print('Saving the mappings to disk...')
        model.save_mappings(id_to_word, id_to_char, id_to_tag)

        # Build the model
        f_train, f_eval = model.build(**self.parameters)

        # Reload previous model values
        if self.parameters['reload']:
            print('Reloading previous model...')
            model.reload()

        #
        # Train network
        #
        singletons = set(
            [word_to_id[k] for k, v in dico_words_train.items() if v == 1])
        best_dev = -np.inf
        best_test = -np.inf
        count = 0
        for epoch in range(n_epochs):
            epoch_costs = []
            print("Starting epoch %i at..." % epoch, time.ctime())
            for i, index in enumerate(np.random.permutation(len(train_data))):
                count += 1
                input = create_input(train_data[index], self.parameters, True,
                                     singletons)
                new_cost = f_train(*input)
                epoch_costs.append(new_cost)
                if i % 50 == 0 and i > 0 and verbose:
                    print("%i, cost average: %f" %
                          (i, np.mean(epoch_costs[-50:])))
                if count % freq_eval == 0:
                    dev_score = evaluate(self.parameters,
                                         f_eval,
                                         dev_sentences,
                                         dev_data,
                                         id_to_tag,
                                         verbose=verbose)
                    if eval_test_set:
                        test_score = evaluate(self.parameters,
                                              f_eval,
                                              test_sentences,
                                              test_data,
                                              id_to_tag,
                                              verbose=verbose)
                    print("Score on dev: %.5f" % dev_score)
                    if eval_test_set:
                        print("Score on test: %.5f" % test_score)
                    if dev_score > best_dev:
                        best_dev = dev_score
                        print("New best score on dev.")
                        print("Saving model to disk...")
                        model.save()
                    if eval_test_set:
                        if test_score > best_test:
                            best_test = test_score
                            print("New best score on test.")
            print(
                "Epoch %i done. Average cost: %f. Ended at..." %
                (epoch, np.mean(epoch_costs)), time.ctime())
        return best_dev
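A minimal usage sketch for this method (the surrounding class name and the exact parameter dictionary are assumptions; the keys shown are the ones the method actually reads):

# Hypothetical wiring; 'Trainer' stands in for whatever class owns train().
trainer = Trainer(parameters={
    'train': 'data/train.txt', 'dev': 'data/dev.txt', 'test': 'data/test.txt',
    'lower': True, 'zeros': True, 'tag_scheme': 'iobes',
    'pre_emb': '', 'all_emb': False, 'reload': False,
    # ...plus the hyper-parameters expected by model.build()
})
best_dev_f1 = trainer.train(n_epochs=100, freq_eval=1000,
                            verbose=True, eval_test_set=False)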
Example #29
    for phase in ['train', 'dev', 'test'][:]:
        if phase == 'train':
            optimizer = exp_lr_scheduler(optimizer_ft, epoch,
                                         **lr_method_parameters)
            model.train(True)  # Set model to training mode
            random.shuffle(dataset[phase])
        else:
            model.train(False)  # Set model to evaluate mode

        epoch_loss = []

        # Iterate over data.
        preds = []
        for i in range(0, len(dataset[phase]), batch_size):
            inputs, seq_index_mapping, char_index_mapping, seq_len, char_len = \
                create_input(dataset[phase][i:i+batch_size], parameters)

            # forward
            outputs, loss = model.forward(inputs, seq_len, char_len,
                                          char_index_mapping)
            try:
                epoch_loss.append(loss.data[0])
            except AttributeError:
                pass

            # backward + optimize only if in training phase
            if phase == 'train':
                # zero the parameter gradients
                optimizer.zero_grad()

                loss.backward()
Example #30
#
# Train network
#
singletons = set([word_to_id[k] for k, v
                  in dico_words_train.items() if v == 1])
n_epochs = 100  # number of epochs over the training set
freq_eval = 1000  # evaluate on dev every freq_eval steps
best_dev = -np.inf
best_test = -np.inf
count = 0
for epoch in xrange(n_epochs):
    epoch_costs = []
    print "Starting epoch %i..." % epoch
    for i, index in enumerate(np.random.permutation(len(train_data))):
        count += 1
        input = create_input(train_data[index], parameters, True, singletons)
        new_cost = f_train(*input)
        epoch_costs.append(new_cost)
        if i % 50 == 0 and i > 0:
            print "%i, cost average: %f" % (i, np.mean(epoch_costs[-50:]))
        if count % freq_eval == 0:
            dev_score = evaluate(parameters, f_eval, dev_sentences,
                                 dev_data, id_to_tag, dico_tags)
            test_score = evaluate(parameters, f_eval, test_sentences,
                                  test_data, id_to_tag, dico_tags)
            print "Score on dev: %.5f" % dev_score
            print "Score on test: %.5f" % test_score
            if dev_score > best_dev:
                best_dev = dev_score
                print "New best score on dev."
                print "Saving model to disk..."
Example #31
    if (valid_total_ph_pred != 0):
        valid_precision = valid_true_pos / valid_total_ph_pred
    if (valid_total_phrases_truth != 0):
        valid_recall = valid_true_pos / valid_total_phrases_truth
    if ((valid_precision + valid_recall) != 0):
        valid_F1score = 2 * valid_precision * \
            valid_recall/(valid_precision+valid_recall)
    print("Precision : {} | Recall : {} | F1 Score : {}".format(
        valid_precision, valid_recall, valid_F1score))


# Main Training Loop

with torch.no_grad():

    precheck_sent = utils.create_input(taskB_in[0][0].split(),
                                       tokenizer).to(device)
    precheck_tags = torch.tensor(
        [4] + [biluo_code[t] for t in taskB_label[0][0].split()] + [4],
        dtype=torch.long).to(device)
    print("Checkpoint reached! Starting model training......")

for epoch in range(4):
    start = time.time()
    model.train()

    for i, file in enumerate(taskB_in):
        try:
            if (i % 10 == 0):
                print(f"done with {i} of {len(taskB_in)}")
            for j, sent in enumerate(file):
Example #32
#
# Train network
#

n_epochs = 50  # number of epochs over the training set
freq_eval = 500  # evaluate on dev every freq_eval steps
best_dev = -np.inf
best_test = -np.inf
count = 0
for epoch in xrange(n_epochs):
    epoch_costs = []
    print "Starting epoch %i..." % epoch
    for i, index in enumerate(np.random.permutation(len(train_data))):
        count += 1
        input = create_input(train_data[index], parameters, True, False if pos_tag==0 else True)
        new_cost = f_train(*input)
        epoch_costs.append(new_cost)
        if i % 50 == 0 and i > 0:
            print "%i, cost average: %f" % (i, np.mean(epoch_costs[-50:]))
        if count % freq_eval == 0:
            dev_score, pred_dev = evaluate_scope(parameters, model.model_path, f_eval, dev_data, id_to_y, False if pos_tag==0 else True)
            if dev_score > best_dev:
                best_dev = dev_score
                print "New best score on dev."
                print "Saving model to disk..."
                model.save()
                # Store predictions to disk
                output_predDEV = os.path.join(model.model_path, "best_dev.output")
                with codecs.open(output_predDEV, 'w', 'utf8') as f:
                    f.write("\n".join(pred_dev))
Example #33
def ner():
    global model
    global f_eval
    global parameters
    global word_to_id
    global char_to_id
    global tag_to_id
    model_name = request.json["model"]
    words = request.json["words"]
    begin_end = request.json["begin_end"]
    if model is None:
        ## Model loading
        print "Loading model " + model_name + ".."
        model = Model(model_path="models/" + models[model_name])
        parameters = model.parameters

        # Load reverse mappings
        word_to_id, char_to_id, tag_to_id = [{
            v: k
            for k, v in x.items()
        } for x in [model.id_to_word, model.id_to_char, model.id_to_tag]]

        # Load the model
        _, f_eval = model.build(training=False, **parameters)
        model.reload()
#     else:
#         parameters = model.parameters
#         word_to_id, char_to_id, tag_to_id = [
#             {v: k for k, v in x.items()}
#             for x in [model.id_to_word, model.id_to_char, model.id_to_tag]
#         ]

    # Lowercase sentence
    if parameters['lower']:
        words = [w.lower() for w in words]
    # Replace all digits with zeros
    if parameters['zeros']:
        words = [zero_digits(w) for w in words]
    words = [w if not w.isupper() else w.title() for w in words]

    # Prepare input
    sentence = prepare_sentence(words,
                                word_to_id,
                                char_to_id,
                                lower=parameters['lower'])
    input = create_input(sentence, parameters, False)

    # Decoding
    if parameters['crf']:
        y_preds = np.array(f_eval(*input))[1:-1]
    else:
        y_preds = f_eval(*input).argmax(axis=1)
    y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]

    # Output tags in the IOB2 format
    if parameters['tag_scheme'] == 'iobes':
        y_preds = iobes_iob(y_preds)

    # Write tags
    assert len(y_preds) == len(words)  # TODO:remove assert?

    ents = [{
        "start_char": b,
        "end_char": e,
        "label": label
    } for (b, e), label in zip(begin_end, y_preds) if label != "O"]

    return json.dumps({"ents": ents})
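A hedged client sketch for this route (the /ner path, port, and model key are assumptions; the JSON fields are the ones the handler reads from request.json):

# Hypothetical request; only the "model", "words" and "begin_end" keys
# come from the handler above.
import requests

words = ["John", "lives", "in", "Berlin"]
begin_end = [(0, 4), (5, 10), (11, 13), (14, 20)]  # character offsets per token
resp = requests.post("http://localhost:5000/ner",
                     json={"model": "english",
                           "words": words,
                           "begin_end": begin_end})
print(resp.json())  # {"ents": [...]} with start_char / end_char / label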
Example #34
def test_inference_performance():
    from sklearn.metrics import f1_score
    from torchtext.datasets import SequenceTaggingDataset
    from torchtext.data import Field, NestedField

    WORD = Field(init_token='<bos>', eos_token='<eos>')
    CHAR_NESTING = Field(tokenize=list, init_token='<bos>', eos_token='<eos>')
    CHAR = NestedField(CHAR_NESTING, init_token='<bos>', eos_token='<eos>')
    ENTITY = Field(init_token='<bos>', eos_token='<eos>')

    data_file = tempfile.NamedTemporaryFile(delete=True)

    # TODO Need to be decoded in Python 3
    data_file.write(requests.get(CORA_URL).content)

    fields = [(('text', 'char'),
               (WORD, CHAR))] + [(None, None)] * 22 + [('entity', ENTITY)]

    dataset = SequenceTaggingDataset(data_file.name, fields, separator=" ")

    model = Model(model_path='models/neuralParsCit')
    model.parameters['pre_emb'] = os.path.join(os.getcwd(),
                                               'vectors_with_unk.kv')
    f = model.build(training=False, **model.parameters)

    model.reload()

    word_to_id = {v: i for i, v in model.id_to_word.items()}
    char_to_id = {v: i for i, v in model.id_to_char.items()}
    tag_to_id = {tag: i for i, tag in model.id_to_tag.items()}

    tf = tempfile.NamedTemporaryFile(delete=False)
    tf.write("\n\n".join(
        ["\n".join(example.text) for example in dataset.examples]))
    tf.close()

    train_sentences = load_sentences(tf.name, model.parameters['lower'],
                                     model.parameters['zeros'])

    train_inputs = prepare_dataset(train_sentences, word_to_id, char_to_id,
                                   model.parameters['lower'], True)

    preds = []

    for citation in train_inputs:
        inputs = create_input(citation, model.parameters, False)
        y_pred = np.array(f[1](*inputs))[1:-1]

        preds.append([(w, y_pred[i])
                      for i, w in enumerate(citation['str_words'])])

    assert len(preds) == len(dataset.examples)

    results = []

    for P, T in zip(preds, dataset.examples):
        for p, t in zip(P, zip(T.text, T.entity)):
            results.append((p[1], tag_to_id[t[1]]))

    pred, true = zip(*results)

    eval_metrics = {
        'micro_f1': f1_score(true, pred, average='micro'),
        'macro_f1': f1_score(true, pred, average='macro')
    }

    data_file.close()

    assert eval_metrics == pytest.approx(
        {'macro_f1': 0.984, 'micro_f1': 0.993}, abs=0.001)