Example #1
def load_sentences(path, lower, zeros):
    """
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.
    """
    sentences = []
    sentence = []
    num = 0
    for line in codecs.open(path, 'r', 'utf8'):
        num += 1
        line = zero_digits(line.rstrip()) if zeros else line.rstrip()
        if not line:
            if len(sentence) > 0:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
        else:
            if line[0] == " ":
                line = "$" + line[1:]
            word = line.split()
            assert len(word) >= 2, "missing tag for token: %s" % [word[0]]
            sentence.append(word)
    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)
    return sentences
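
Every example on this page calls a zero_digits helper that the snippets themselves never define. A minimal sketch consistent with how it is used here (a plain digit-to-zero substitution; the projects' actual helpers may differ):

import re

def zero_digits(s):
    """Replace every digit in the string with '0'."""
    return re.sub(r'\d', '0', s)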
Example #2
def load_sentences(path, lower, zeros, plus_tag=False):
    """
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.
    """
    sentences = []
    sentence = []
    for line in codecs.open(path, 'r', 'utf-8'):
        # if zeros is set, replace every digit in the line with 0
        line = zero_digits(line.rstrip()) if zeros else line.rstrip()
        if not line:  # blank line between sentences
            if len(sentence) > 0:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
        else:  # a token line within the sentence
            word = line.split()
            assert len(word) >= 2
            if plus_tag:
                word_tag = word[0] + '/' + word[1]
                word.insert(0, word_tag)
            sentence.append(word)
    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)
    return sentences
Example #3
def load_sentences(path, lower, zeros):
    """
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.
    """

    sentences = []
    sentence = []
    max_sentence_length = 0
    max_word_length = 0

    for line in codecs.open(path, 'r', 'utf8'):
        line = zero_digits(line.rstrip()) if zeros else line.rstrip()
        if not line:
            if len(sentence) > 0:
                if 'DOCSTART' not in sentence[0][0]:
                    # print sentence
                    # sys.exit()
                    sentences.append(sentence)
                    if len(sentence) > max_sentence_length:
                        max_sentence_length = len(sentence)
                sentence = []
        else:
            word = line.split()
            assert len(word) >= 2
            sentence.append(word)
            if len(word[0]) > max_word_length:
                max_word_length = len(word[0])
    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)
            if len(sentence) > max_sentence_length:
                max_sentence_length = len(sentence)
    return sentences, max_sentence_length, max_word_length
Example #4
def load_sentences(path, lower, zeros):
    """
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.
    """
    sentences = []
    sentence = []
    for line in codecs.open(path, 'r', 'utf8'):
        line = zero_digits(line.rstrip()) if zeros else line.rstrip()
        if not line:
            if len(sentence) > 0:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
        else:
            word = line.split()
            if len(word) < 2:
                print(path)
                print(line)
                print(word)
            assert len(word) >= 2
            sentence.append(word)
    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)
    return sentences
Example #5
def load_sentences(path, lower, zeros):
    """
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.
    """
    sentences = []
    sentence = []
    ind = 0
    for line in codecs.open(path, 'r', 'utf8'):
        line = zero_digits(line.rstrip()) if zeros else line.rstrip()
        line = line.replace('creative-work', 'creativework')
        if not line:
            if len(sentence) > 0:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
        else:
            word = line.split()
            if len(word) < 6:
                print(line, ind, path)
            assert len(word) == 6
            sentence.append(word)
        ind += 1
    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)
    return sentences
Example #6
def load_sentences2(path, lower, zeros, line_idx):
    """
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.
    """
    sentences = []
    sentence = []
    start_line_idx = line_idx
    # read at most 10000 lines per call (an earlier variant returned every 300 sentences)
    read_lines = open(path, 'r', encoding='utf-8').readlines()
    leng = get_tot_length(path)
    while line_idx - start_line_idx <= 10000 and line_idx < leng:
        line = read_lines[line_idx]
        line = zero_digits(line.rstrip()) if zeros else line.rstrip()
        
        if not line:
            if len(sentence) > 0:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
        else:
            word = line.split()
            assert len(word) >= 2
            sentence.append(word)
        line_idx += 1

    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)
    
    return sentences, line_idx
Example #7
def load_sentences(path, zeros):
    """
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.
    """
    sentences = []
    sentence = []
    label = []
    labels = []
    for line in codecs.open(path, 'r', 'utf-8'):
        line = zero_digits(line.rstrip()) if zeros else line.rstrip()
        if not line:
            if len(sentence) >= 2:
                sentences.append(sentence)
                labels.append(label)
                sentence = []
                label = []
        else:
            word = line.split()
            assert len(word) >= 2
            sentence.append(word[0])
            label.append(word[3])
    if len(sentence) >= 2:
        sentences.append(sentence)
        labels.append(label)

    return sentences,labels
Example #8
def read_CONLL(path, zeros=True, lower=True, pos=False):
    sentences = []
    sentence = []
    idx = 0
    for line in codecs.open(path, 'r', 'utf8'):
        idx += 1
        line = zero_digits(line.rstrip()) if zeros else line.rstrip()
        if not line:
            if len(sentence) > 0:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
        else:
            word = line.split()
            if len(word) < 2:
                print(idx, line)
            assert len(word) >= 2
            sentence.append(word)
    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)

    words = [[x[0].lower() if lower else x[0] for x in s] for s in sentences]
    if pos:
        tags = [[w[-2] for w in s] for s in sentences]
    else:
        tags = [[w[-1] for w in s] for s in sentences]
    #tags = [[w[1] for w in s] for s in sentences]
    return words, tags
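
A hedged usage sketch for read_CONLL, writing a tiny hypothetical CoNLL fragment to a temporary file (column layout assumed: word, POS, chunk, NER tag):

import os
import tempfile

# word POS chunk NER, one token per line; a blank line ends the sentence
demo = "John NNP B-NP B-PER\nsmiled VBD B-VP O\n\n"
with tempfile.NamedTemporaryFile('w', suffix='.conll', delete=False) as tmp:
    tmp.write(demo)
words, tags = read_CONLL(tmp.name, zeros=False, lower=True)
os.remove(tmp.name)
print(words)  # [['john', 'smiled']]
print(tags)   # [['B-PER', 'O']]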
Example #9
def load_sentences(input_file_path_or_list, zeros, file_format="conll"):
    """
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.
    """

    assert file_format in ["conll", "conllu"]

    sentences = []
    sentence = []
    max_sentence_length = 0
    max_word_length = 0

    if isinstance(input_file_path_or_list, str):
        input_f = codecs.open(input_file_path_or_list, 'r', 'utf8')
    else:
        input_f = input_file_path_or_list

    if file_format == "conllu":
        sep = '\t'
    elif file_format == "conll":
        sep = None

    for line in input_f:
        if file_format == "conllu" and line.startswith("#"):
            continue
        line = zero_digits(line.rstrip()) if zeros else line.rstrip()
        if not line:
            if len(sentence) > 0:
                if 'DOCSTART' not in sentence[0][0]:
                    # print sentence
                    # sys.exit()
                    sentences.append(sentence)
                    if len(sentence) > max_sentence_length:
                        max_sentence_length = len(sentence)
                sentence = []
        else:
            tokens = line.split(sep)
            if file_format == "conll":
                assert len(tokens) >= 2
            elif file_format == "conllu":
                assert len(tokens) == 10, \
                    line + " " + " ".join(tokens) + " CONLL-U format requires exactly 10 columns"
                if "-" in tokens[0]:
                    # skip multi-word token ranges ("n-m" ids); these lines are irrelevant here
                    continue
            sentence.append(tokens)
            if len(tokens[0]) > max_word_length:
                max_word_length = len(tokens[0])
    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)
            if len(sentence) > max_sentence_length:
                max_sentence_length = len(sentence)
    return sentences, max_sentence_length, max_word_length
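
Since this variant also accepts an already-opened iterable of lines, here is a hedged sketch of calling it on a hypothetical two-token CoNLL-U fragment (10 tab-separated columns per token):

conllu_lines = [
    "# sent_id = 1\n",
    "1\tHello\thello\tINTJ\tUH\t_\t0\troot\t_\t_\n",
    "2\tworld\tworld\tNOUN\tNN\t_\t1\tvocative\t_\t_\n",
    "\n",
]
sents, max_len, max_word = load_sentences(conllu_lines, zeros=False,
                                          file_format="conllu")
print(max_len, max_word)  # 2 1 (max_word tracks the first column, the token id)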
Example #10
def tag():
    if request.method == 'POST':
        data = request.get_json()
        text = data['text']
        if data['split_sentences']:
            sentences = split_sentences(text)
        else:
            sentences = text

        if data['tokenize'] or data['split_sentences']:
            tokenized_sentences = [tokenize(s) for s in sentences]
        else:
            tokenized_sentences = text

        count = 0
        output = []
        for words in tokenized_sentences:
            if len(words) == 0:
                continue
            # Lowercase sentence
            if model.parameters['lower']:
                words = [w.lower() for w in words]
            # Replace all digits with zeros
            if model.parameters['zeros']:
                words = [zero_digits(w) for w in words]
            # Prepare input
            sentence = prepare_sentence(words,
                                        word_to_id,
                                        char_to_id,
                                        lower=model.parameters['lower'])
            input = create_input(sentence, model.parameters, False)
            # Decoding
            if model.parameters['crf']:
                y_preds = np.array(f_eval(*input))[1:-1]
            else:
                y_preds = f_eval(*input).argmax(axis=1)
            y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]
            # Output tags in the IOB2 format
            if model.parameters['tag_scheme'] == 'iobes':
                y_preds = iobes_iob(y_preds)
            # Write tags
            assert len(y_preds) == len(
                words
            ), "Predictions have different length than sentence. Something went wrong."
            output.append(list(zip(words, y_preds)))
            count += 1
            if count % 100 == 0:
                logging.info(count)

        return jsonify(output)
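
Several of these snippets map IOBES predictions back to IOB2 through an iobes_iob helper; a minimal sketch of that conversion (assumed behavior: S- becomes B-, E- becomes I-, everything else passes through):

def iobes_iob(tags):
    """Convert a list of IOBES tags to IOB2."""
    out = []
    for tag in tags:
        if tag.startswith('S-'):
            out.append('B-' + tag[2:])
        elif tag.startswith('E-'):
            out.append('I-' + tag[2:])
        else:
            out.append(tag)  # B-, I- and O are already valid IOB2
    return out

print(iobes_iob(['S-PER', 'O', 'B-LOC', 'E-LOC']))
# ['B-PER', 'O', 'B-LOC', 'I-LOC']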
Example #11
def tag(model, line):
    # Load existing model
    print("Loading model...")
    model = Model(model_path=model)
    parameters = model.parameters

    # Load reverse mappings
    word_to_id, char_to_id, tag_to_id = [{
        v: k
        for k, v in x.items()
    } for x in [model.id_to_word, model.id_to_char, model.id_to_tag]]

    # Load the model
    _, f_eval = model.build(training=False, **parameters)
    model.reload()

    start = time.time()

    print('Tagging...')
    words_ini = line.rstrip().split()

    # Replace all digits with zeros
    if parameters['zeros']:
        line = zero_digits(line)
    words = line.rstrip().split()
    # Prepare input
    sentence = prepare_sentence(words,
                                word_to_id,
                                char_to_id,
                                lower=parameters['lower'])
    input = create_input(sentence, parameters, False)
    # Decoding
    if parameters['crf']:
        y_preds = np.array(f_eval(*input))[1:-1]
    else:
        y_preds = f_eval(*input).argmax(axis=1)
    y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]
    # Output tags in the IOB2 format
    if parameters['tag_scheme'] == 'iobes':
        y_preds = iobes_iob(y_preds)
    # Write tags
    assert len(y_preds) == len(words)

    print('---- sentence tagged in %.4fs ----' % (time.time() - start))

    return ' '.join(w + '__' + str(y) for w, y in zip(words_ini, y_preds))
Example #12
def load_sentences(path, zeros):
    """
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.

    Returns a list of sentences, each a list of [word, ..., tag] rows:
    [
        [
            [sent1_word1, ..., sent1_tag1],
            ...,
            [sent1_wordn, ..., sent1_tagn]
        ],
        ...,
        [
            [sentl_word1, ..., sentl_tag1],
            ...,
            [sentl_wordn, ..., sentl_tagn]
        ]
    ]
    """
    sentences = []
    sentence = []
    for line in codecs.open(path, 'r', 'utf8'):
        line = zero_digits(line.rstrip()) if zeros else line.rstrip()
        if not line:
            if len(sentence) > 0:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
        else:
            word = line.split()
            assert len(word) >= 2
            sentence.append(word)
    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)
    return sentences
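
A quick hedged demonstration of that return shape, using a hypothetical two-sentence file on disk (load_sentences reads from a path):

import os
import tempfile

demo = "John NNP B-PER\nsmiled VBD O\n\nShe PRP O\nleft VBD O\n"
with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False) as tmp:
    tmp.write(demo)
print(load_sentences(tmp.name, zeros=False))
# [[['John', 'NNP', 'B-PER'], ['smiled', 'VBD', 'O']],
#  [['She', 'PRP', 'O'], ['left', 'VBD', 'O']]]
os.remove(tmp.name)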
Example #13
def tag_document(doc, parameters, model, f_eval, word_to_id, char_to_id):
    count = 0
    all_ypreds = list()
    all_tokens = list()
    for line in doc.sentences:
        toks_text = [x.orth_ for x in line.tokens]
        # line = ' '.join(toks_text)
        if toks_text:  # WL edit: used to be 'if line', was crashing on '\n' lines
            # Lowercase sentence
            if parameters['lower']:
                toks_text = [line.lower() for line in toks_text]
            # Replace all digits with zeros
            if parameters['zeros']:
                toks_text = [zero_digits(line) for line in toks_text]
            # Prepare input
            sentence = prepare_sentence(toks_text,
                                        word_to_id,
                                        char_to_id,
                                        lower=parameters['lower'])
            input = create_input(sentence, parameters, False)
            # Decoding
            if parameters['crf']:
                y_preds = np.array(f_eval(*input))[1:-1]
            else:
                y_preds = f_eval(*input).argmax(axis=1)
            y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]
            # Output tags in the IOB2 format
            if parameters['tag_scheme'] == 'iobes':
                y_preds = iobes_iob(y_preds)
            # Write tags
            assert len(y_preds) == len(toks_text)

            # strip IOB prefixes
            y_preds = [x.split('-')[-1] for x in y_preds]

            all_ypreds.append(y_preds)
            all_tokens.append(toks_text)

        count += 1
        if count % 100 == 0:
            print(count)

    return (all_ypreds, all_tokens)
Example #14
    def predicts(self, line):
        if line:
            # Save original bigrams
            bigram_sent = self.to_bigram(line, 0).strip().split()

            # Replace all digits with zeros
            line = zero_digits(line)
            input_seq = self.to_bigram(line, 0).strip().split()

            # Prepare input
            sentence = prepare_sentence(input_seq,
                                        self.word_to_id,
                                        self.char_to_id,
                                        lower=self.parameters['lower'])
            input = create_input(sentence, self.parameters, False)
            if self.parameters['crf']:
                y_preds = np.array(self.f_eval(*input))[1:-1]
            else:
                y_preds = self.f_eval(*input).argmax(axis=1)
            tags = [self.id_to_tag[y_pred] for y_pred in y_preds]

            # Output tags in the IOB2 format
            if self.parameters['tag_scheme'] == 'iobes':
                tags = iobes_iob(tags)
            print(tags)
            # Make output form
            out_form = ""
            unigram_sent = self.bigrams_to_unigrams(bigram_sent)

            for i in range(len(tags)):
                if tags[i].startswith('B'):
                    out_form += '<' + unigram_sent[i]
                elif tags[i].startswith('I'):
                    if i == len(tags) - 1:
                        out_form += unigram_sent[i] + ':' + tags[i][2:] + '>'
                    elif tags[i + 1] == 'O':
                        out_form += unigram_sent[i] + ':' + tags[i][2:] + '>'
                    else:
                        out_form += unigram_sent[i]
                else:
                    out_form += unigram_sent[i]
            return out_form
Example #15
def load_sentences(path, zeros, lower):
    """
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.
    """
    sentences = []
    sentence = []
    for line in codecs.open(path, 'r', 'utf8'):
        if not line.rstrip():
            if len(sentence) > 0:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
        else:
            word = line.rstrip().split()
            if zeros:
                word = [zero_digits(w) for w in word]
            assert len(word) >= 2
            # word = ['!' if w in ('-',',',',','。','.','>','?', ':', ':') else w for w in word]
            sentence.append(word)
    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)
    return sentences
Example #16
def load_ner2line_sentences(path, lower=False, zeros = False):
    sentences = []
    tokens = []
    marks = []
    istoken = True
    len_mention = 0
    for line in codecs.open(path, 'r', 'utf-8'):
        line = line.strip()
        if line.startswith('#begin document'):
            logging.info('Skip line %s' % line)
            continue
        if len(line) > 0:
            if istoken:
                line = zero_digits(line) if zeros else line
                for tok in line.split():
                    tokens.append(tok)
                istoken = False
            else:
                for mark in line.split():
                    index = mark.split(',')
                    if len(index) < 4:
                        start, end, label, head = int(index[0]), int(index[1]), 'M', -1
                    else:
                        start, end, label, head = int(index[0]), int(index[1]), index[2],int(index[3])
                    label = mention_type(label)
                    if end <= start or end > len(tokens):
                        logging.info(
                            "WARNING: markable boundary out of sentence, sentence length: %d markable: %d, %d" % (
                            len(tokens), start, end))
                    else:
                        marks.append((start, end - 1, label,head))
                        len_mention += 1
        else:
            if len(tokens) > 0:
                sentences.append({'tokens': tokens, 'marks': marks})
            tokens = []
            marks = []
            istoken = True
    return sentences, len_mention
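
For orientation, a hypothetical fragment in the two-line layout this loader parses: a token line, then a line of start,end[,label,head] markables (end exclusive), then a blank line; mention_type is another project helper these snippets assume:

fragment = (
    "Barack Obama visited Paris\n"   # token line
    "0,2,PER,1 3,4,LOC,3\n"          # markables: start,end,label,head
    "\n"                             # blank line closes the sentence
)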
Example #17
def load_sentences(path, lower, zeros):
    """
    Load sentences. A line must contain only one word of the citation string
    while running the model and the word and corresponding tag while training.
    Sentences are separated by empty lines.
    """
    sentences = []
    sentence = []
    for line in codecs.open(path, 'r', 'cp850'):
        line = zero_digits(line.rstrip()) if zeros else line.rstrip()
        if not line:
            if len(sentence) > 0:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
        else:
            word = line.split()
            sentence.append(word)
    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)
    return sentences
Example #18
File: loader.py  Project: natemccoy/tagger
def load_sentences(path, lower, zeros):
    """
    Load sentences. A line must contain at least a word and its tag.
    Fields within a line are separated by tabs; sentences are separated by empty lines.
    """
    sentences = []
    sentence = []
    for line in codecs.open(path, 'r', 'utf8'):
        line = zero_digits(line.rstrip()) if zeros else line.rstrip()
        if not line:
            if len(sentence) > 0:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
        else:
            word = line.split('\t')
            assert len(word) >= 2
            sentence.append(word)
    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)
    return sentences
Example #19
def load_sentences(path, lower=False, zeros=False):
    """
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.
    """
    sentences = []
    sentence = []
    for line in open(path, 'r'):
        line = line.strip()
        if not line:
            if len(sentence) > 0:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
        else:
            word = line.split()
            word[0] = zero_digits(word[0]) if zeros else word[0]
            # assert len(word) >= 2
            sentence.append(word)
    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)
    return sentences
Example #20
def load_sentences(path, lower, zeros):
    """
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.
    """
    sentences = []
    sentence = []
    for line in codecs.open(path, 'r', 'utf8'):
        if not line.rstrip():
            if len(sentence) > 0:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
        else:
            word = line.rstrip().split()
            if zeros:
                word = [zero_digits(w) for w in word]
            if lower:
                word = [re.sub('[\u0061-\u007a]', 'a', re.sub('[\u0041-\u005a]', 'a', word[0])), word[1], word[2]]
            assert len(word) >= 2
            sentence.append(word)
    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)
    return sentences
Example #21
def load_data(path, zeros):
    """
    Load sentences from path (data set). Sentences are separated by empty lines.
    You can replace all digits to zeros if you want.
    """
    sentences = []
    sentence = []
    for line in open(path, 'r', encoding='UTF-8'):
        line = zero_digits(line.rstrip()) if zeros else line.rstrip()
        if line:
            word = line.split()
            assert len(word) >= 2
            sentence.append(word)
        else:
            if len(sentence) > 0 and 'DOCSTART' not in sentence[0][0]:
                sentences.append(sentence)
            sentence = []
    
    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)
    
    return sentences
Example #22
File: loader.py  Project: ssharoff/cognates
def load_sentences(path, lower, zeros):
    """
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.
    """
    sentences = []
    sentence = []
    for i, line in enumerate(codecs.open(path, 'r', 'utf8')):
        line = zero_digits(line.rstrip()) if zeros else line.rstrip()
        if not line:
            if len(sentence) > 0:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
        else:
            word = line.split()
            assert len(word) >= 2, 'Malformed line %r (line %d in %s)' % (
                line, i, path)
            sentence.append(word)
    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)
    return sentences
Example #23
def evaluate(test_file, models, tmp_folder, result_file):
    """
    :param test_file: conll format contains gold label of all layers,
    :param models: list of model path
    :param tmp_folder: temp folder
    :param result_file: result file
    :return:
    """
    input = test_file

    file_name = os.path.basename(test_file)
    lines = common.get_all_lines(input)
    newlines = []
    nlayer = len(models)
    for line in lines:
        if line.strip() != "":
            newline = "\t".join(line.strip().split("\t")[:-nlayer] +
                                ["O"])  # remove all gold label + append O tag
            newlines.append(newline)
        else:
            newlines.append("")

    input_files = [
        tmp_folder + "/" + file_name + ".temp.layer" + str(i)
        for i in range(nlayer + 1)
    ]
    eval_files = [
        tmp_folder + "/" + file_name + ".eval" + str(i) for i in range(nlayer)
    ]
    result_files = [
        tmp_folder + "/" + file_name + ".result.layer" + str(i) + ".txt"
        for i in range(nlayer)
    ]

    save_all_lines(newlines, input_files[0])  # create input file of layer 0

    eval_lines_all = []
    print("#LAYER:", nlayer)
    for i in range(nlayer):

        predict_a_file(input_files[i], input_files[i + 1], models[i], True)

        lines_i = common.get_all_lines(input_files[i + 1])
        print "#LEN: ", len(lines_i)

        # output conll to conlleval
        lines_eval = []
        for k in range(len(lines)):
            if lines[k].strip() != "":
                tokens1 = lines[k].strip().split("\t")
                tokens2 = lines_i[k].strip().split("\t")
                assert zero_digits(tokens1[0]) == zero_digits(tokens2[0])
                lines_eval.append(" ".join(
                    [tokens1[0], tokens1[i - nlayer], tokens2[-2]]))
            else:
                lines_eval.append("")

        save_all_lines(lines_eval, eval_files[i])
        eval_lines = call_conlleval(eval_files[i], result_files[i])
        eval_lines_all.append("=======================")
        eval_lines_all.append("test file: " + test_file)
        eval_lines_all.append("layer" + str(i))
        eval_lines_all.extend(eval_lines)

    result_file.write("\n".join(eval_lines_all))
Example #24
f_output = codecs.open(opts.output, 'w', 'utf-8')
start = time.time()

print('Tagging...')
with codecs.open(opts.input, 'r', 'utf-8') as f_input:
    count = 0
    for line in f_input:
        words_ini = line.rstrip().split()
        if line:
            # Lowercase sentence
            if parameters['lower']:
                line = line.lower()
            # Replace all digits with zeros
            if parameters['zeros']:
                line = zero_digits(line)
            words = line.rstrip().split()
            # Prepare input
            sentence = prepare_sentence(words,
                                        word_to_id,
                                        char_to_id,
                                        lower=parameters['lower'])
            input = create_input(sentence, parameters, False)
            # Decoding
            if parameters['crf']:
                y_preds = np.array(f_eval(*input))[1:-1]
            else:
                y_preds = f_eval(*input).argmax(axis=1)
            y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]
            # Output tags in the IOB2 format
            if parameters['tag_scheme'] == 'iobes':
Example #25
File: tagger.py  Project: glample/tagger
f_output = codecs.open(opts.output, 'w', 'utf-8')
start = time.time()

print('Tagging...')
with codecs.open(opts.input, 'r', 'utf-8') as f_input:
    count = 0
    for line in f_input:
        words_ini = line.rstrip().split()
        if line:
            # Lowercase sentence
            if parameters['lower']:
                line = line.lower()
            # Replace all digits with zeros
            if parameters['zeros']:
                line = zero_digits(line)
            words = line.rstrip().split()
            # Prepare input
            sentence = prepare_sentence(words, word_to_id, char_to_id,
                                        lower=parameters['lower'])
            input = create_input(sentence, parameters, False)
            # Decoding
            if parameters['crf']:
                y_preds = np.array(f_eval(*input))[1:-1]
            else:
                y_preds = f_eval(*input).argmax(axis=1)
            y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]
            # Output tags in the IOB2 format
            if parameters['tag_scheme'] == 'iobes':
                y_preds = iobes_iob(y_preds)
            # Write tags
Example #26
def run_tagging(model,
                f_eval,
                parameters,
                word_to_id,
                char_to_id,
                tag_to_id,
                opts_input="",
                opts_output="",
                opts_delimiter="__",
                opts_outputFormat=""):
    # Check parameters validity
    assert opts_delimiter
    assert os.path.isfile(opts_input)

    #set environment to use gpu

    f_output = codecs.open(opts_output, 'w', 'utf-8')
    start = time.time()
    logger.info('Tagging...')
    with codecs.open(opts_input, 'r', 'utf-8') as f_input:
        count = 0
        for line in f_input:
            words_ini = line.rstrip().split()
            if line:
                # Lowercase sentence
                if parameters['lower']:
                    line = line.lower()
                # Replace all digits with zeros
                if parameters['zeros']:
                    line = zero_digits(line)
                words = line.rstrip().split()
                # Prepare input
                sentence = prepare_sentence(words,
                                            word_to_id,
                                            char_to_id,
                                            lower=parameters['lower'])
                input = create_input(sentence, parameters, False)
                # Decoding
                if parameters['crf']:
                    y_preds = np.array(f_eval(*input))[1:-1]
                else:
                    y_preds = f_eval(*input).argmax(axis=1)
                y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]
                # Output tags in the IOB2 format
                if parameters['tag_scheme'] == 'iobes':
                    y_preds = iobes_iob(y_preds)
                # Write tags
                assert len(y_preds) == len(words)

                if opts_outputFormat == 'json':
                    f_output.write(
                        json.dumps({
                            "text": ' '.join(words),
                            "ranges": iob_ranges(y_preds)
                        }))
                else:
                    #logger.info( "write out tags..."
                    f_output.write(
                        '%s\n' % ' '.join('%s%s%s' % (w, opts_delimiter, y)
                                          for w, y in zip(words_ini, y_preds)))
            else:
                f_output.write('\n')
            count += 1
            # if count % 100 == 0:
            #     logger.info(count)

    logger.info('---- %i lines tagged in %.4fs ----' %
                (count, time.time() - start))
    f_output.close()
    logger.info(opts_output)
    logger.info("")
    return opts_output + " has been tagged!"


# def main():
#     logger.info("executed")

# if __name__ == '__main__':
#     main()
Example #27
File: ner.py  Project: yyaghoobzadeh/ner
_, f_eval = model.build(training=False, **parameters)
model.reload()

while True:
    sent = input("Type a query (type \"exit\" to exit):\n")
    count = 0
    if sent == 'exit':
        break
    else:
        # Lowercase sentence
        if parameters['lower']:
            sent = sent.lower()
        # Replace all digits with zeros
        if parameters['zeros']:
            sent = zero_digits(sent)
        # Tokenize after normalization so the substitutions take effect
        words = sent.rstrip().split()
        # Prepare input
        sentence = prepare_sentence(words,
                                    word_to_id,
                                    char_to_id,
                                    lower=parameters['lower'])
        model_input = create_input(sentence, parameters, False)
        # Decoding
        y_preds = f_eval(*model_input).argmax(axis=1)
        y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]
        # Output tags in the IOB2 format
        # Write tags
        assert len(y_preds) == len(words)
        print(
            '%s\n\n' % '\n'.join('%s%s%s' % (w, " ", y)
                                 for w, y in zip(words, y_preds)))
Example #28
def ner():
    global model
    global f_eval
    global parameters
    global word_to_id
    global char_to_id
    global tag_to_id
    model_name = request.json["model"]
    words = request.json["words"]
    begin_end = request.json["begin_end"]
    if model is None:
        ## Model loading
        print "Loading model " + model_name + ".."
        model = Model(model_path="models/" + models[model_name])
        parameters = model.parameters

        # Load reverse mappings
        word_to_id, char_to_id, tag_to_id = [{
            v: k
            for k, v in x.items()
        } for x in [model.id_to_word, model.id_to_char, model.id_to_tag]]

        # Load the model
        _, f_eval = model.build(training=False, **parameters)
        model.reload()
#     else:
#         parameters = model.parameters
#         word_to_id, char_to_id, tag_to_id = [
#             {v: k for k, v in x.items()}
#             for x in [model.id_to_word, model.id_to_char, model.id_to_tag]
#         ]

    # Lowercase sentence
    if parameters['lower']:
        words = [w.lower() for w in words]
    # Replace all digits with zeros
    if parameters['zeros']:
        words = [zero_digits(w) for w in words]
    words = [w if not w.isupper() else w.title() for w in words]

    # Prepare input
    sentence = prepare_sentence(words,
                                word_to_id,
                                char_to_id,
                                lower=parameters['lower'])
    input = create_input(sentence, parameters, False)

    # Decoding
    if parameters['crf']:
        y_preds = np.array(f_eval(*input))[1:-1]
    else:
        y_preds = f_eval(*input).argmax(axis=1)
    y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]

    # Output tags in the IOB2 format
    if parameters['tag_scheme'] == 'iobes':
        y_preds = iobes_iob(y_preds)

    # Write tags
    assert len(y_preds) == len(words)  # TODO:remove assert?

    ents = [{
        "start_char": b,
        "end_char": e,
        "label": label
    } for (b, e), label in zip(begin_end, y_preds) if label != "O"]

    return json.dumps({"ents": ents})