Python CRFTagger.tag примеры использования

Язык программирования: Python

Пространство имен/Пакет: nltk.tag.crf

Класс/Тип: CRFTagger

Метод/Функция: tag

Примеров на hotexamples.com: 5

Python CRFTagger.tag - 5 примеров найдено. Это лучшие примеры Python кода для nltk.tag.crf.CRFTagger.tag, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

CRFTagger(12)

train(8)

tag_sents(6)

set_model_file(5)

tag(5)

evaluate(2)

_feature_func(1)

_get_features(1)

Пример #1

Показать файл

Файл: SentenceTokenizer.py Проект: kmounlp/pyko

 def demo(self, test_sents):
     """Tag each test sentence, print the resulting sentences, then the score.

     A ``CRFTagger`` is built with ``self.feature_detector`` as its feature
     function and pointed at the model file ``self.modelpath``.

     Args:
         test_sents: Iterable of tagged sentences; each one is passed through
             ``untag`` before tagging, and the whole collection is handed to
             ``tagger.evaluate`` at the end.
     """
     crf = CRFTagger(feature_func=self.feature_detector)
     crf.set_model_file(self.modelpath)

     for gold_sent in test_sents:
         # Strip the gold tags so the tagger only sees the raw tokens.
         plain_tokens = untag(gold_sent)
         predicted = crf.tag(plain_tokens)
         for sentence in self._to_sentence(predicted):
             print(sentence)

     score = crf.evaluate(test_sents)
     print(score)

Пример #2

Показать файл

Файл: SentenceTokenizer.py Проект: kmounlp/pyko

    def pyt_sent_tokenizer(self, paragraph):
        """Split one paragraph into sentences (variant intended for pytest).

        Args:
            paragraph(str): Paragraph text. It is stripped and then split on
                runs of whitespace, so it must be a string (the previous
                docstring described it as a list, which does not match the
                ``.strip()`` call below).

        Returns:
            The value produced by ``self._to_sentence`` for the tagged tokens;
            the previous docstring described it as the paragraph cut into
            sentence units, ``list(list(str))``.
        """
        tagger = CRFTagger(feature_func=self.feature_detector)
        tagger.set_model_file(self.modelpath)
        # Raw string: '\s' inside a normal literal is an invalid escape
        # sequence and triggers a warning on current Python versions.
        words = re.split(r'\s+', paragraph.strip())
        tagged = tagger.tag(words)
        return self._to_sentence(tagged)

Пример #3

Показать файл

Файл: SentenceTokenizer.py Проект: kmounlp/pyko

    def batch_sent_tokenizer(self, paragraphs):
        """Split several paragraphs into sentences with one shared tagger.

        Args:
            paragraphs(list(str)): Paragraph strings; each is stripped and
                split on runs of whitespace before tagging.

        Returns:
            list: One entry per paragraph, each being the value returned by
            ``self._to_sentence`` for that paragraph's tagged tokens (the
            previous docstring described the result as the paragraphs cut
            into sentence units).
        """
        tagger = CRFTagger(feature_func=self.feature_detector)
        tagger.set_model_file(self.modelpath)
        sentences = []
        for paragraph in paragraphs:
            # Split on whitespace *runs* (raw-string pattern). Splitting on a
            # single '\s' turned consecutive spaces/newlines into empty-string
            # tokens that were then fed to the tagger; pyt_sent_tokenizer
            # already splits on runs, so both entry points now tokenize alike.
            words = re.split(r'\s+', paragraph.strip())
            tagged = tagger.tag(words)
            sentences.append(self._to_sentence(tagged))
        return sentences

Пример #4

Показать файл

Файл: dic_hmm_crf_comparaison.py Проект: VieVie31/TAL_synonymy

# Compare gold labels with the HMM predictions element-wise.
y = np.array(y)
y_hat = np.array(y_hat)

print("hmm acc : ", (y == y_hat).mean())

# named entities recognition
import pickle

# The context manager closes the file handle once the object is loaded.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
with open(
        "/users/Etu0/3770640/M1/Sem2/TAL/TME1/maxent_ne_chunker/PY3/english_ace_multiclass.pickle",
        "rb") as chunker_file:
    a = pickle.load(chunker_file)

from nltk.tag.crf import CRFTagger

tagger = CRFTagger()
# Also pass the file in which the computed features are stored.
tagger.train(alldocs, u'crf.model')

# NOTE(review): the whole sentence is passed as a single token here and the
# result is discarded; presumably a list of words was intended -- confirm.
tagger.tag(['Je suis à la maison'])
print(tagger._get_features([u"Je"], 0))

from nltk.tag.perceptron import PerceptronTagger
tagger = PerceptronTagger(load=False)
tagger.train(alldocs)

# adT_seq: list of lists of words (= list of sentences)
# "smart": tag each sentence as a whole and keep only the predicted tags.
allpred_smart = [[t for _, t in tagger.tag(sent)] for sent in adT_seq]
# "stupid": tag every word in isolation, without its sentence context.
allpred_stupid = [[tagger.tag([w])[0][1] for w in sent] for sent in adT_seq]

Пример #5

Показать файл

Файл: crf.py Проект: cynthiasuxy/Final_Project

def _yes_precision_recall(labels, predictions):
    """Return ``(precision, recall)`` for the "Yes" class.

    A row counts as positive when the substring ``'Yes'`` occurs in its
    label / prediction string. A ratio whose denominator is zero is reported
    as ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    pairs = list(zip(labels, predictions))
    true_pos = sum(1 for gold, pred in pairs if 'Yes' in gold and 'Yes' in pred)
    predicted_pos = sum(1 for _, pred in pairs if 'Yes' in pred)
    actual_pos = sum(1 for gold, _ in pairs if 'Yes' in gold)
    precision = true_pos / predicted_pos if predicted_pos else 0.0
    recall = true_pos / actual_pos if actual_pos else 0.0
    return precision, recall


def main(positive, death):
    """Train CRF taggers on two annotated tweet files and report their scores.

    For each feature function ("base" and "include_neighbors") a ``CRFTagger``
    is trained (model written to ``model.tagger``), the dev split is tagged
    and written to ``<condition>_final_output.tsv``, and accuracy, precision,
    recall and F1 are printed for the "Test Positive" and "Death Case" events.

    Args:
        positive: Path handed to ``jsonlines.open``; every record must provide
            ``obj['text']`` and ``obj['annotation']['part1.Response']``.
        death: Second path with the same record layout.
    """
    ############# Compile the dataset ###############
    ## Load the dataset
    text = []
    response = []

    for path in (positive, death):
        # Context manager so the reader is closed even if a record is bad.
        with jsonlines.open(path) as input_file:
            for obj in input_file:
                text.append(obj['text'])
                response.append(obj['annotation']['part1.Response'])

    ## Tweet Preprocessing
    prep_text = [p.clean(tweet) for tweet in text]

    ## Tag Keywords and Create Labels
    ### Focus on verbs--therefore, try lemmatization first
    wnl = WordNetLemmatizer()
    lemma_pos = ('a', 'n', 'v')
    token_data = []
    for sent in prep_text:
        lemmas = []
        for word, pos in pos_tag(word_tokenize(sent)):
            # Only the first letter of the POS tag, lower-cased, is used as
            # the lemmatizer's part-of-speech hint.
            # NOTE(review): if pos_tag emits Penn Treebank tags, adjectives
            # start with 'J', so the 'a' branch may never fire -- confirm.
            pos_key = pos[0].lower()
            if pos_key in lemma_pos:
                lemmas.append(wnl.lemmatize(word, pos_key))
            else:
                lemmas.append(wnl.lemmatize(word))
        token_data.append(lemmas)

    ### Create labels
    positive_list = ["test", "positive", "result"]
    death_list = ["die", "dead", "death", "pass", "away"]

    # token_data and response are index-aligned (one entry per tweet).
    for sent, answer in zip(token_data, response):
        for idx, token in enumerate(sent):
            lowered = token.lower()
            if lowered in positive_list and answer == ["yes"]:
                label = "P-Yes"
            elif lowered in positive_list and answer == ["no"]:
                label = "P-No"
            elif lowered in death_list and answer == ["yes"]:
                label = "D-Yes"
            elif lowered in death_list and answer == ["no"]:
                label = "D-No"
            else:
                label = "Irr"
            # Replace the bare token in place with a [token, label] pair.
            sent[idx] = [token, label]

    ## Shuffle and split into train data and dev data
    token_data = shuffle(token_data, random_state=6)
    train_data, dev_data = train_test_split(token_data,
                                            test_size=0.3,
                                            random_state=616)
    print(
        f"The number of sentences in training data: {len(train_data)}; The number of sentences in dev data: {len(dev_data)};"
    )

    ############# Fit A CRF Model And Predict ###############
    condition_to_func = {
        "base": my_features,
        "include_neighbors": neighbor_features
    }
    for cond, func in condition_to_func.items():
        # initialize
        crf = CRFTagger(feature_func=func)
        crf.train(train_data, 'model.tagger')
        # Test
        crf._feature_func(prep_text[0].split(), 7)
        crf.tag_sents([['I', 'get', 'covid'], ['he', 'test', 'positive']])

        # Output
        filename = cond + "_final_output.tsv"
        # Explicit encoding: tweets may contain non-ASCII characters and the
        # file is read back below with the same encoding.
        with open(filename, 'w', encoding='utf-8') as pred_file:
            for sent in dev_data:
                sent_words = [item[0] for item in sent]
                gold_tags = [item[1] for item in sent]

                with_tags = crf.tag(sent_words)
                for (original_word, tag_prediction), gold_tag in zip(
                        with_tags, gold_tags):
                    line_as_str = f"{original_word}\t{gold_tag}\t{tag_prediction}\n"
                    pred_file.write(line_as_str)
                # add an empty line after each sentence
                pred_file.write("\n")

    ############# Evaluation ###############
    ## Extract Data with Meaning Labels
    cond_list = ['base', 'include_neighbors']

    for cond in cond_list:
        filename = cond + "_final_output.tsv"

        with open(filename, encoding='utf-8') as fd:
            rd = csv.reader(fd, delimiter="\t", quotechar='"')
            D_data = []
            P_data = []
            for row in rd:
                # Blank separator lines yield rows with fewer than 2 fields.
                if len(row) > 1:
                    if row[1] in ['P-Yes', 'P-No']:
                        P_data.append(row)
                    elif row[1] in ['D-Yes', 'D-No']:
                        D_data.append(row)

        column_name = ['token', 'label', 'prediction']
        P_df = pd.DataFrame(P_data, columns=column_name)
        D_df = pd.DataFrame(D_data, columns=column_name)
        # DataFrame.append was removed in pandas 2.0; concat is the
        # equivalent that works on old and new versions alike.
        Total_df = pd.concat([P_df, D_df])

        # Accuracy
        ## Overall Accuracy
        T_a = accuracy_score(Total_df['label'], Total_df['prediction'])

        ## Accuracy, Precision, and Recall for two events
        accuracy = []
        precision = []
        recall = []
        for df in [P_df, D_df]:
            accuracy.append(accuracy_score(df['label'], df['prediction']))
            # Every row is counted; the previous range(0, len(df) - 1)
            # silently dropped the last row of each frame.
            event_precision, event_recall = _yes_precision_recall(
                df['label'], df['prediction'])
            precision.append(event_precision)
            recall.append(event_recall)

        ## F-1
        f1 = []
        for prec, rec in zip(precision, recall):
            # Report 0.0 rather than dividing by zero when both are 0.
            f1.append((2 * prec * rec) / (prec + rec) if (prec + rec) else 0.0)

        # Report performance
        print("condition: " + cond)
        print(f"Overall Accuracy {T_a:0.03}")
        covid_event = ['Test Positive', 'Death Case']

        for num, event in enumerate(covid_event):
            print(
                f"Scores for {event} : \taccuracy {accuracy[num]:0.03}\tprecision {precision[num]:0.03}\trecall {recall[num]:0.03}\tF1 {f1[num]:0.03}"
            )

    ## Basicline Performance / Confusion Matrix
    # NOTE(review): Total_df here is the frame left over from the last
    # condition in cond_list, so only that condition's matrix is printed.
    print("Confusion Matrix:")
    print(pd.crosstab(Total_df['label'], Total_df['prediction']))
    print("Training data:")
    # Flatten the training labels once instead of rebuilding an array for
    # every label; this also tolerates sentences with no tokens.
    train_labels = [pair[1] for sent in train_data for pair in sent]
    labels = ["P-Yes", "P-No", "D-Yes", "D-No"]
    for label in labels:
        n_label = train_labels.count(label)
        print(f"Number of {label}: {n_label}")