Example #1
    def convert(self, parsed, stanford=False, ispos=False, csv=False, csv_line_id=0, iobtree=False):
        if csv:
            conll_format = 'Sentence: ' + str(csv_line_id)
        elif iobtree:
            conll_format = None
        else:
            conll_format = ''

        conll = []
        for entity, value in parsed.items():
            value_split = value.split()
            value_split_len = len(value_split)

            if value != '':
                if ispos:
                    value_pos = pos_tag(value_split)  # pos = part of speech
                    conll_format += self.__get_conll_pos(value_pos, value_split_len, entity)
                elif csv:
                    value_pos = pos_tag(value_split)
                    conll_format += self.__get_csv(value_pos, value_split_len, entity)
                elif iobtree:
                    value_pos = pos_tag(value_split)
                    iob = self.__get_nltk_tree(value_pos, value_split_len, entity)
                    conll = conll + iob
                else:
                    conll_format += self.__get_conll_format(value_split, value_split_len, entity, stanford)

        if not csv and not iobtree:
            conll_format += '\n'

        if iobtree:
            conll_format = conlltags2tree(conll)

        return conll_format
Example #2
def Ext_Chunks(sents):
    NP_li = []
    # print(sents)
    grammar_exp = r"""
      CHUNK: {<NN><NN.*><NN.*>+}   # chunk sequences of three or more nouns
             }<NNP>+{              # chink (exclude) runs of proper nouns
    """
    # cp = nltk.RegexpParser('CHUNK:  {<NN><NN.*><NN.*>+}}<NNP>{')
    cp = nltk.RegexpParser(grammar_exp)
    # cp = nltk.RegexpParser('CHUNK:  {<DT>?<JJ.*>*<NN.*>+}')

    for sent in sents:
        tree = cp.parse(sent)
        # print(tree.draw())
        for subtree in tree.subtrees():
            if subtree.label() == 'CHUNK':
                print(subtree)
                iob_tags = tree2conlltags(subtree)
                iob_tree = conlltags2tree(iob_tags)
                print(iob_tags)
                print(iob_tree)
                chunk_words = str(subtree)
                for pos_mark in ('/DT', '/JJS', '/JJ', '/NNS', '/NNP', '(CHUNK', ')', '/NN', '\n'):
                    chunk_words = chunk_words.replace(pos_mark, '')
                NP_li.append(chunk_words)
                print(chunk_words, '\n')
    print('----------------------------------------------------------------\n', NP_li)
    return NP_li
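Ext_Chunks expects a list of POS-tagged sentences, and relies on nltk, tree2conlltags and conlltags2tree being imported at module level. A minimal invocation sketch (the sample text is invented; tokenizer and tagger data are assumed downloaded):

import nltk
from nltk import pos_tag, sent_tokenize, word_tokenize
from nltk.chunk import conlltags2tree, tree2conlltags

text = "The chunk grammar keeps plain noun runs such as machine learning model pipelines."
sents = [pos_tag(word_tokenize(s)) for s in sent_tokenize(text)]
noun_phrases = Ext_Chunks(sents)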
Example #3
def read_gmb_ner(corpus_root):
    for root, dirs, files in os.walk(corpus_root):
        for filename in files:
            if filename.endswith(".tags"):
                with open(os.path.join(root, filename), 'rb') as file_handle:
                    file_content = file_handle.read().decode('utf-8').strip()
                    annotated_sentences = file_content.split('\n\n')
                    for annotated_sentence in annotated_sentences:
                        annotated_tokens = [seq for seq in annotated_sentence.split('\n') if seq]

                        standard_form_tokens = []

                        for idx, annotated_token in enumerate(annotated_tokens):
                            annotations = annotated_token.split('\t')
                            word, tag, ner = annotations[0], annotations[1], annotations[3]

                            if ner != 'O':
                                ner = ner.split('-')[0]

                            standard_form_tokens.append((word, tag, ner))

                        conll_tokens = to_conll_iob(standard_form_tokens)
                        yield conlltags2tree(conll_tokens)
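to_conll_iob is called above but not defined in the excerpt. A plausible reconstruction, assuming the (word, tag, ner) triplet order built above: it converts bare categories ('per', 'geo', ...) into proper B-/I- prefixes.

def to_conll_iob(annotated_sentence):
    # annotated_sentence: [(word, tag, ner), ...] with bare NER categories.
    # Prefix the first token of each entity run with 'B-' and every
    # continuation token with 'I-'; 'O' tokens pass through unchanged.
    iob_tokens = []
    for idx, (word, tag, ner) in enumerate(annotated_sentence):
        if ner != 'O':
            if idx == 0 or annotated_sentence[idx - 1][2] != ner:
                ner = 'B-' + ner
            else:
                ner = 'I-' + ner
        iob_tokens.append((word, tag, ner))
    return iob_tokens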
Example #4
    def parse(self, tokens):
        """
        Chunk a tagged sentence.
        :param tokens: list of words [(w1, t1), (w2, t2), ...]
        :return: chunked sentence as an nltk.Tree
        """

        if isinstance(tokens, str):
            tokens = pos_tag(word_tokenize(tokens))

        history = []
        iob_tagged_tokens = []
        for index, (word, tag) in enumerate(tokens):
            iob_tag = self._classifier.predict(
                [self._feature_detector(tokens, index, history)])[0]
            history.append(iob_tag)
            iob_tagged_tokens.append((word, tag, iob_tag))

        results = conlltags2tree(iob_tagged_tokens)
        return_val = []
        for i in results.subtrees():
            if i.label() != 'S':

                word = ''
                for x in i.leaves():
                    word += x[0] + ' '

                word = word.strip()
                return_val.append({"type": i.label(), "value": word})
        return return_val
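Because parse falls back to pos_tag(word_tokenize(...)) when given a plain string, a trained instance can be queried directly. A usage sketch (chunker and the output labels are illustrative):

entities = chunker.parse("Steve Jobs founded Apple in Cupertino.")
# Hypothetical output, depending on the trained classifier:
# [{'type': 'per', 'value': 'Steve Jobs'}, {'type': 'org', 'value': 'Apple'},
#  {'type': 'geo', 'value': 'Cupertino'}]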
Example #5
def read_gmb(corpus_root):
    for root, dirs, files in os.walk(corpus_root):
        for filename in files:
            if filename.endswith(".tags"):
                with open(os.path.join(root, filename), 'rb') as file_handle:
                    # file_handle = zipfile.ZipFile('gmb-2.2.0.zip', 'r')
                    file_content = file_handle.read().decode('utf-8').strip()
                    annotated_sentences = file_content.split('\n\n')
                    for annotated_sentence in annotated_sentences:
                        annotated_tokens = [seq for seq in annotated_sentence.split('\n') if seq]
                        standard_form_tokens = []
                        for idx, annotated_token in enumerate(annotated_tokens):
                            annotations = annotated_token.split('\t')
                            word, tag, ner = annotations[0], annotations[1], annotations[3]
                            ner_tags[ner] += 1
                            # Get only the primary category
                            if ner != 'O':
                                ner = ner.split('-')[0]

                            # if tag in ('LQU', 'RQU'):
                            #     tag = "``"

                            standard_form_tokens.append((word, tag, ner))
                        conll_tokens = to_conll_iob(standard_form_tokens)

                        yield conlltags2tree(conll_tokens)
    print("Done reading data")
Example #6
    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)

        iob_triplets = tagged_pairs2triplets(chunks)

        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets)
Example #7
    def parse(self, tokens):
        history = []
        iob_tagged_tokens = []
        for index, (word, tag) in enumerate(tokens):
            iob_tag = self._classifier.predict([self._feature_detector(tokens, index, history)])[0]
            history.append(iob_tag)
            iob_tagged_tokens.append((word, tag, iob_tag))
        return conlltags2tree(iob_tagged_tokens)
Example #8
def stanford_tree(bio_tagged):
    tokens, ne_tags = zip(*bio_tagged)
    pos_tags = [pos for token, pos in pos_tag(tokens)]

    conlltags = [(token, pos, ne)
                 for token, pos, ne in zip(tokens, pos_tags, ne_tags)]
    ne_tree = conlltags2tree(conlltags)
    return ne_tree
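stanford_tree keeps only the BIO labels from its input and re-tags the tokens with NLTK's pos_tag. A quick sketch:

bio_tagged = [('Barack', 'B-PERSON'), ('Obama', 'I-PERSON'),
              ('visited', 'O'), ('Berlin', 'B-LOCATION'), ('.', 'O')]
print(stanford_tree(bio_tagged))
# Roughly: (S (PERSON Barack/NNP Obama/NNP) visited/VBD (LOCATION Berlin/NNP) ./.)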
Example #9
    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)

        # Transform the result from [((w1, t1), iob1), ...]
        # to the preferred list of triplets format [(w1, t1, iob1), ...]
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]

        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets)
Example #10
def ner_eval(chunker, test_samples):
    """
    Evaluate named-entity recognition accuracy on the held-out samples.
    :return: chunker accuracy over the first 500 test samples
    """
    score = chunker.evaluate([
        nltk.conlltags2tree([(w, t, iob) for (w, t), iob in iobs])
        for iobs in test_samples[:500]
    ])
    return score.accuracy()
Example #11
def read_gmb_ner(corpus_root, start_index=None, end_index=None):
    current_file = -1
    for root, _, files in os.walk(corpus_root):
        for filename in files:
            # Skip other files
            if not filename.endswith(".tags"):
                continue

            current_file += 1
            # Skip files until we get to the start_index
            if start_index is not None and current_file < start_index:
                continue

            # Stop reading after end_index
            if end_index is not None and current_file > end_index:
                return

            with open(os.path.join(root, filename), 'rb') as file_handle:
                # Read the entire file
                file_content = file_handle.read().decode('utf-8').strip()

                # Split into sentences
                annotated_sentences = file_content.split('\n\n')

                for annotated_sentence in annotated_sentences:
                    # Split into annotated tokens
                    rows = [row for row in annotated_sentence.split('\n') if row]

                    ner_triplets = []
                    for row in rows:
                        annotations = row.split('\t')
                        word, tag, ner = annotations[0], annotations[1], annotations[3]

                        # Get only the main tag
                        if ner != 'O':
                            ner = ner.split('-')[0]

                        # Make these tags NLTK compatible
                        if tag in ('LQU', 'RQU'):
                            tag = "``"

                        # Ignore the art, eve, nat categories because they are underrepresented
                        if ner in ('art', 'eve', 'nat'):
                            ner = 'O'

                        ner_triplets.append((word, tag, ner))

                    iob_triplets = ner2conlliob(ner_triplets)

                    # Yield a nltk.Tree
                    yield conlltags2tree(iob_triplets)
    print("Total files=", current_file)
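The start_index/end_index bounds make it easy to carve train/test splits at file granularity. A usage sketch, assuming the GMB corpus is unpacked at a local path:

corpus_root = 'gmb-2.2.0/data'  # assumed location of the unpacked corpus
train_trees = list(read_gmb_ner(corpus_root, start_index=0, end_index=799))
test_trees = list(read_gmb_ner(corpus_root, start_index=800, end_index=999))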
Example #12
def stanfordNE2tree(ne_tagged_sent):
    bio_tagged_sent = stanfordNE2BIO(ne_tagged_sent)
    sent_tokens, sent_ne_tags = zip(*bio_tagged_sent)
    sent_pos_tags = [pos for token, pos in pos_tag(sent_tokens)]

    sent_conlltags = [
        (token, pos, ne)
        for token, pos, ne in zip(sent_tokens, sent_pos_tags, sent_ne_tags)
    ]
    ne_tree = nltk.conlltags2tree(sent_conlltags)
    return ne_tree
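stanfordNE2BIO is not shown; its usual job is to convert the Stanford tagger's flat per-token labels into BIO labels so that conlltags2tree can group them. One plausible version:

def stanfordNE2BIO(ne_tagged_sent):
    # ne_tagged_sent: [('Barack', 'PERSON'), ('Obama', 'PERSON'), ('visited', 'O'), ...]
    bio_tagged = []
    prev_tag = 'O'
    for token, tag in ne_tagged_sent:
        if tag == 'O':
            bio_tagged.append((token, tag))
        elif tag == prev_tag:
            bio_tagged.append((token, 'I-' + tag))  # continuation of the same entity
        else:
            bio_tagged.append((token, 'B-' + tag))  # start of a new entity
        prev_tag = tag
    return bio_tagged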
Example #13
    def parse(self, tokens):
        """
        Chunk a tagged sentence.
        :param tokens: list of words [(w1, t1), ...]
        :return: chunked sentence as an nltk.Tree
        """
        history = []
        iob_tagged_tokens = []
        for index, (word, tag) in enumerate(tokens):
            iob_tag = self._classifier.predict(
                [self._feature_detector(tokens, index, history)])[0]
            history.append(iob_tag)
            iob_tagged_tokens.append((word, tag, iob_tag))

        return conlltags2tree(iob_tagged_tokens)
Example #14
    def parse(self, tokens):
        """
        Chunk a tagged sentence
        :param tokens: List of words [(w1, t1), (w2, t2), ...]
        :return: chunked sentence: nltk.Tree
        """
        history = []
        iob_tagged_tokens = []
        for index, (word, tag) in enumerate(tokens):
            iob_tag = self._classifier.predict(
                [self._feature_detector(tokens, index, history)])[0]
            history.append(iob_tag)
            iob_tagged_tokens.append((word, tag, iob_tag))

        return conlltags2tree(iob_tagged_tokens)
Example #15
def predict_IOB_labels(s):
    # generate query features for sentence s
    query_pos_tags, query_features = process_user_query(s)
    predicted_labels = crf.predict([query_features])[0]

    # convert the predicted labels into standard (token, pos, label) format
    query_tag_list = [
        (token_pos[0], token_pos[1], label)
        for token_pos, label in zip(query_pos_tags, predicted_labels)
    ]

    # convert into tree
    query_tree = conlltags2tree(query_tag_list)

    # walk the tree's children and collect the label and text of each chunk
    labels_dict = {}
    for n in query_tree:
        if isinstance(n, nltk.tree.Tree):
            label = n.label()
            leaves = ' '.join(i[0] for i in n.leaves())
            labels_dict[label] = leaves
    return labels_dict
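A call sketch, assuming a fitted CRF model bound to the module-level crf and the process_user_query feature extractor used above (query text and labels are purely illustrative):

labels = predict_IOB_labels("play some jazz by Miles Davis")
# Hypothetical output, depending on the label scheme the CRF was trained on:
# {'genre': 'jazz', 'artist': 'Miles Davis'}

Note that labels_dict keys on the entity label, so a label occurring twice in one query keeps only its last span.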
Example #16
def mark_entities(tagged_sentence, entity_words, label):
    """
    tagged_sentence: [('Word', 'Tag'), ...]
    entity_words: ['This', 'is', 'an', 'entity']
    label: the entity type

    return a nltk.Tree instance with the entities wrapped in chunks
    """

    iob_tagged = [(w, t, 'O') for w, t in tagged_sentence]

    words = nltk.untag(tagged_sentence)
    start_index = sub_list(words, entity_words)
    if start_index is not None:
        iob_tagged[start_index] = (iob_tagged[start_index][0],
                                   iob_tagged[start_index][1], 'B-' + label)
        for idx in range(1, len(entity_words)):
            iob_tagged[start_index + idx] = (iob_tagged[start_index + idx][0],
                                             iob_tagged[start_index + idx][1],
                                             'I-' + label)

    return nltk.conlltags2tree(iob_tagged)
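sub_list is assumed to return the start index of entity_words as a contiguous run inside words, or None when it is absent. A minimal version consistent with that usage:

def sub_list(haystack, needle):
    # Return the index where `needle` first occurs as a contiguous
    # sublist of `haystack`, or None if it never occurs.
    for i in range(len(haystack) - len(needle) + 1):
        if haystack[i:i + len(needle)] == needle:
            return i
    return None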
Example #17
def convert_sentprocessed_to_tree(sent_processed):
    sent_tree = []
    for sent in sent_processed:
        sent_tree.append(nltk.conlltags2tree(sent))
    return sent_tree
Example #18
    def parse(self, sent):
        tagged_sents = self.tagger.tag(sent)
        iob_sents = [(w, t, c) for ((w, t), c) in tagged_sents]
        return conlltags2tree(iob_sents)
Example #19
        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets)


reader = read_gmb(corpus_root, 1000)
data = list(reader)
training_samples = data[:int(len(data) * 0.9)]
test_samples = data[int(len(data) * 0.9):]

print("#training samples = %s" % len(training_samples))  # training samples = 55809
print("#test samples = %s" % len(test_samples))  # test samples = 6201

chunker = NamedEntityChunker(training_samples[:5000])
ner = chunker.parse(
    pos_tag(
        word_tokenize(
            " Jobs was diagnosed "
            "with a pancreatic neuroendocrine "
            "tumor in 2003 and died on October "
            "5, 2011, of respiratory arrest related to the tumor. ")))
#ner.draw()
#flat_ner=ner.flatten()
#print (flat_ner)
#print (type(flat_ner))
score = chunker.evaluate([
    conlltags2tree([(w, t, iob) for (w, t), iob in iobs])
    for iobs in test_samples[:500]
])
print(score)
Example #20
print(pos_tagger)
   
grammar = "NP: {<DT>?<JJ>*<NN>}"

cp = nltk.RegexpParser(grammar)

result = cp.parse(pos_tagger)

result = result.flatten()

#print(result)

iob_tags = tree2conlltags(pos_tagger)

tree = conlltags2tree(iob_tags)
#print(tree)

''' Information retrieval using spaCy '''

import spacy
import en_core_web_sm
from collections import Counter, defaultdict
from spacy import displacy
from tabulate import tabulate
nlp = spacy.load('en_core_web_sm')

doc = nlp(u'ABILIFY is indicated for the treatment of schizophrenia in adults and in adolescents aged 15 years and older without any history of myocardial infractions.')

displacy.serve(doc, style='dep')
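Counter and tabulate are imported above but unused in this excerpt; presumably they were meant to summarize doc.ents, along these lines (this would need to run before displacy.serve, which blocks):

ent_counts = Counter((ent.text, ent.label_) for ent in doc.ents)
print(tabulate(
    [(text, label, count) for (text, label), count in ent_counts.items()],
    headers=['Entity', 'Label', 'Count']))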