Example #1
def train_and_score(featureset,
                    training_samples,
                    test_samples,
                    other_samples=None):
    """This function takes in training and test data and a featureset and will return
    the scores for testing and training data
    Optional extra set of samples can be sent to be tested against as well
    """
    chunker = NamedEntityChunker(training_samples, featureset)

    testscore = chunker.evaluate([
        conlltags2tree([(w, pos, tag) for (w, pos), tag in sentence])
        for sentence in test_samples
    ]).accuracy()
    trainscore = chunker.evaluate([
        conlltags2tree([(w, pos, tag) for (w, pos), tag in sentence])
        for sentence in training_samples
    ]).accuracy()
    if other_samples:
        mixscore = chunker.evaluate([
            conlltags2tree([(w, pos, tag) for (w, pos), tag in sentence])
            for sentence in other_samples
        ]).accuracy()
        return (testscore, trainscore, mixscore)
    else:
        return (testscore, trainscore)
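NamedEntityChunker is not defined in this snippet; it is presumably a
classifier-based chunker in the style of the well-known NLTK NER tutorials. A
hypothetical usage sketch, assuming each sample is a list of ((word, pos), iob)
pairs and my_featureset is a feature-extraction function:

sample = [(('John', 'NNP'), 'B-PERSON'), (('lives', 'VBZ'), 'O'),
          (('in', 'IN'), 'O'), (('London', 'NNP'), 'B-GPE')]
training_samples = [sample] * 50   # toy data, only to illustrate the shapes
test_samples = [sample]

testscore, trainscore = train_and_score(my_featureset,  # hypothetical helper
                                        training_samples, test_samples)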
Example #2
def processLanguage():
    stop_words = set(stopwords.words('english'))
    try:
        for item in contentArray:
            word_tokens = word_tokenize(item)
            # drop stop words before tagging
            filtered_sentence = [w for w in word_tokens if w not in stop_words]

            #print(word_tokens)
            print(filtered_sentence)

            tagged = nltk.pos_tag(filtered_sentence)
            print(tagged)

            ne_tree = ne_chunk(tagged)
            print(ne_tree)

            iob_tagged = tree2conlltags(ne_tree)
            print(iob_tagged)
            ne_tree = conlltags2tree(iob_tagged)
            print(ne_tree)

            ne_tree.draw()
            #break
            #continue
            #time.sleep(1)

    except Exception as e:
        print(str(e))
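This example depends on names defined elsewhere in its module. A sketch of the
imports and data it presumably relies on (contentArray is hypothetical input):

import nltk
from nltk import ne_chunk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.chunk import tree2conlltags, conlltags2tree

contentArray = ["Apple is looking at buying a U.K. startup."]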
Example #3
def stanford_tree(bio_tagged):
    tokens, ne_tags = zip(*bio_tagged)
    pos_tags = [pos for token, pos in pos_tag(tokens)]

    conlltags = [(token, pos, ne) for token, pos, ne in zip(tokens, pos_tags, ne_tags)]
    ne_tree = conlltags2tree(conlltags)
    return ne_tree
Example #4
def stanfordNE2tree(ne_tagged_sent):
    bio_tagged_sent = stanfordNE2BIO(ne_tagged_sent)
    sent_tokens, sent_ne_tags = zip(*bio_tagged_sent)
    sent_pos_tags = [pos for token, pos in pos_tag(sent_tokens)]
    sent_conlltags = [(token, pos, ne) for token, pos, ne in zip(sent_tokens, sent_pos_tags, sent_ne_tags)]
    ne_tree = conlltags2tree(sent_conlltags)
    return ne_tree
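Several examples on this page call a stanfordNE2BIO helper that is not shown.
Example #28 below inlines the same conversion, so a minimal standalone sketch
of what the helper presumably looks like:

def stanfordNE2BIO(ne_tagged_sent):
    # turn Stanford's flat tags (e.g. 'PERSON') into BIO tags
    # ('B-PERSON' at an entity start, 'I-PERSON' inside it)
    bio_tagged_sent = []
    prev_tag = 'O'
    for token, tag in ne_tagged_sent:
        if tag == 'O':
            bio_tagged_sent.append((token, tag))
        elif prev_tag == tag:                 # continuing the same entity
            bio_tagged_sent.append((token, 'I-' + tag))
        else:                                 # new or adjacent entity
            bio_tagged_sent.append((token, 'B-' + tag))
        prev_tag = tag
    return bio_tagged_sent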
Example #5
def get_result_json(result_list):
    tokens = [result['word'] for result in result_list]
    tags = [result['tag'] for result in result_list]

    re_dict_json = defaultdict(int)

    pos_tags = [pos for token, pos in pos_tag(tokens)]
    conlltags = [(token, pos, tg)
                 for token, pos, tg in zip(tokens, pos_tags, tags)]
    ne_tree = conlltags2tree(conlltags)
    original_text = defaultdict(list)
    for subtree in ne_tree:
        original_string = []
        original_label = None
        if isinstance(subtree, Tree):
            original_label = subtree.label()
            leaves = subtree.leaves()
        else:
            leaves = [subtree]
        for token, pos in leaves:
            token = token.replace('##', '')
            re_dict_json[re_find_which_pattern(token)] += 1
            original_string.append(
                (token, int(re_dict_json[re_find_which_pattern(token)])))
        if original_string and original_label:
            original_text[original_label.lower()].append(original_string)
    return original_text
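re_find_which_pattern is an external helper not shown here; a purely
hypothetical stub so the snippet can run (the real pattern logic is unknown):

import re

def re_find_which_pattern(token):
    # hypothetical: bucket tokens into coarse regex classes
    if re.fullmatch(r'[A-Za-z]+', token):
        return 'alpha'
    if re.fullmatch(r'\d+', token):
        return 'digit'
    return 'other'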
Example #6
File: reader.py Project: vunb/eva
    def read(self):
        self.iob_sents = []
        self.sents = []
        self.feature_set = []
        self.train_set = []
        self.test_set = []
        self.iob_train = []
        self.iob_test = []
        for filename in glob(self.dirname):
            file_feature_set = []
            file_iob_sents = []
            with open(filename, 'rb') as f:
                tags_re = r'\[(.*?)\]'
                tags_sub = r'\[[A-Z]+\s|\]'
                for sentence in f:
                    sentence = sentence.decode('utf8')
                    text = re.sub(tags_sub, '', sentence).strip('\n').strip()
                    self.sents.append(text)
                    file_feature_set.append(
                        (text, splitext(basename(filename))[0]))

                    # (word, pos) pairs; pop_pos() below pops the tag
                    # matching each word from this list
                    pos_tags = pos_tag(word_tokenize(text))
                    tags = []
                    for tag in re.findall(tags_re, sentence):
                        tag, value = tag.split(' ', 1)
                        words = word_tokenize(value)
                        first = [(words[0], self.pop_pos(pos_tags, words[0]),
                                  'B-%s' % tag)]
                        tags.append(first + [(w, self.pop_pos(pos_tags, w),
                                              'I-%s' % tag)
                                             for w in words[1:]])
                    itags = iter(tags)
                    text_list = re.sub(tags_re, '[NE]', sentence).split('[NE]')
                    iob = []
                    for part in text_list:
                        tagged_part = [(w, self.pop_pos(pos_tags, w), 'O')
                                       for w in word_tokenize(part)]
                        try:
                            ne = next(itags)
                            iob += tagged_part + ne
                        except StopIteration:
                            iob += tagged_part
                    file_iob_sents.append(iob)
            self.feature_set.extend(file_feature_set)
            self.iob_sents.extend(file_iob_sents)
            file_iob_train, file_iob_test = train_test_split(
                [[((w, p), i) for w, p, i in s] for s in file_iob_sents],
                test_size=self.test_size,
                random_state=self.random_state)
            file_train_set, file_test_set = train_test_split(
                file_feature_set,
                test_size=self.test_size,
                random_state=self.random_state)
            self.iob_train.extend(file_iob_train)
            self.iob_test.extend(file_iob_test)
            self.train_set.extend(file_train_set)
            self.test_set.extend(file_test_set)

        self.chunked_sents = [conlltags2tree(x) for x in self.iob_sents]
Example #7
def nltk_ner(x, *args):
    from nltk import word_tokenize, pos_tag, ne_chunk
    from nltk.chunk import conlltags2tree, tree2conlltags

    ne_tree = ne_chunk(pos_tag(word_tokenize(x)))
    iob_tagged = tree2conlltags(ne_tree)
    ne_tree = conlltags2tree(iob_tagged)
    return ne_tree
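A quick usage sketch; the round trip through tree2conlltags and conlltags2tree
is lossless, so the returned tree matches plain ne_chunk output:

tree = nltk_ner("Barack Obama visited Paris last week.")
print(tree)  # nltk.Tree with NE subtrees, e.g. (PERSON Barack/NNP Obama/NNP)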
Example #8
    def parse(self, sentence):
        pos_tags = [pos for word, pos in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        conlltags = [(word, pos, chunk_tag)
                     for ((word, pos), (_, chunk_tag))
                     in zip(sentence, tagged_pos_tags)]
        return conlltags2tree(conlltags)
Example #9
    def __create_tree__(self, tokens, key):
        _input = self.__get_folia_doc__(tokens)
        __output = self._frog.process(_input)
        for token in __output:
            token['pos'] = token['pos'].split('(')[0]
            if token['pos'].startswith('SPEC'):
                token['pos'] = 'NNP'
        return conlltags2tree([(token['text'], token['pos'], token[key])
                               for token in __output])
Example #10
def stanfordNE2tree(ne_tagged_sent):
    bio_tagged_sent = stanfordNE2BIO(ne_tagged_sent)
    sent_tokens, sent_ne_tags = zip(*bio_tagged_sent)
    sent_pos_tags = [pos for token, pos in pos_tag(sent_tokens)]

    sent_conlltags = [(token, pos, ne) for token, pos, ne in zip(sent_tokens, sent_pos_tags, sent_ne_tags)]
    ne_tree = conlltags2tree(sent_conlltags)
    return ne_tree
Example #11
def load_iobtags(iobtags):
    if iobtags is None:
        return None
    try:
        iobtags = [make_tuple(i.strip()) if i.endswith(')')
                   else make_tuple(i.strip() + ")")
                   for i in iobtags[1:-1].split("),")]
        return conlltags2tree(iobtags)
    except Exception:
        # malformed serialized tuples; nothing to recover
        return None
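make_tuple is presumably the common "from ast import literal_eval as
make_tuple" alias. A hypothetical input showing the serialized format this
parser expects:

from ast import literal_eval as make_tuple  # assumed origin of make_tuple

s = "[('Mark', 'NNP', 'B-PERSON'), ('works', 'VBZ', 'O'), ('here', 'RB', 'O')]"
tree = load_iobtags(s)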
Example #12
def stanford_ner_to_tree(text): 
    bio_tagged = stanford_ner_to_bio(stanford_ner(text))
    sentence_tokens, sentence_ne_tags = zip(*bio_tagged)
    sentence_pos_tags = [pos for token, pos in pos_tag(sentence_tokens)]

    sentence_conlltags = [(token, pos, ne) 
        for token, pos, ne in zip(sentence_tokens, sentence_pos_tags, sentence_ne_tags)]

    return conlltags2tree(sentence_conlltags)
Example #13
    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)

        # Transform the result from [((w1, t1), iob1), ...]
        # to the normalized format of triplets [(w1, t1, iob1), ...]
        iob_triplets = [(word, token, chunk) for ((word, token), chunk) in chunks]

        # Transform the list of triplets to NLTK tree format
        return conlltags2tree(iob_triplets)
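This parse method (like the near-identical ones in Examples #15, #19, #23 and
#35) typically lives in an nltk.chunk.ChunkParserI subclass whose tagger was
trained on ((word, pos), iob) pairs. A minimal sketch under that assumption,
with a hypothetical class name:

import nltk
from nltk.chunk import ChunkParserI, conlltags2tree

class TaggerChunker(ChunkParserI):  # hypothetical name
    def __init__(self, train_sents):
        # train_sents: [[((word, pos), iob), ...], ...]; the (word, pos)
        # tuples act as opaque tokens for the unigram tagger
        self.tagger = nltk.UnigramTagger(train_sents,
                                         backoff=nltk.DefaultTagger('O'))

    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]
        return conlltags2tree(iob_triplets)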
Example #14
    def parse(self, sentence):
        tagged_sents = self.tagger.tag(sentence)
        tagged_sents = conlltags2tree(tagged_sents)

        # Nested chunked tags for CLAUSING

        # cp = nltk.RegexpParser(grammar)
        # tagged_sents = cp.parse(tagged_sents)
        return tagged_sents
Example #15
    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)

        # Transform the result from [((w1, t1), iob1), ...]
        # to the preferred list of triplets format [(w1, t1, iob1), ...]
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]

        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets)
Example #16
def get_chuncker_accuracy(chunker, test_samples):
    """
    Return the chunker's accuracy against the gold-standard test samples.
    """
    score = chunker.evaluate([
        conlltags2tree([(w, t, iob) for (w, t), iob in iobs])
        for iobs in test_samples
        ])
    return score.accuracy()
Example #17
    def iob_tagged_ner(sentence):
        from nltk import word_tokenize, pos_tag, ne_chunk
        from nltk.chunk import conlltags2tree, tree2conlltags

        ne_tree = ne_chunk(pos_tag(word_tokenize(sentence)))
        iob_tagged = tree2conlltags(ne_tree)
        iob_tagged_ne_tree = conlltags2tree(iob_tagged)
        return iob_tagged_ne_tree
Example #18
def get_tags(tokens):
    # tokenized = nltk.word_tokenize(text)
    tagged = nltk.pos_tag(tokens)

    namedEnt = nltk.ne_chunk(tagged)
    iob_tagged = tree2conlltags(namedEnt)

    ne_tree = conlltags2tree(iob_tagged)

    return ne_tree
Example #19
    def parse(self, tagged_sent, return_tree=True):
        chunks = self.tagger.tag(tagged_sent)

        # Transform the result from [((w1, t1), iob1), ...]
        # to the preferred list of triplets format [(w1, t1, iob1), ...]
        iob_triplets = revert_scheme([(w, t, c) for ((w, t), c) in chunks])

        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets) if return_tree else iob_triplets
Example #20
    def parse(self, tokens):
        _input = self.__get_folia_doc__(tokens)
        __output = self._frog.process(_input)
        for token in __output:
            token['pos'] = token['pos'].split('(')[0]
            if token['pos'].startswith('SPEC'):
                token['pos'] = 'NNP'
            if token['chunker'] != 'O' and token['ner'] == 'O':
                token['ner'] = token['chunker']
        return conlltags2tree([(token['text'], token['pos'], token['ner'])
                               for token in __output])
Example #21
    def transform_stanford_name_entity_to_tree(ne_tagged_sent):
        ne_tree = []
        if ne_tagged_sent:
            bio_tagged_sent = Helper.transform_stanford_name_entity_to_bio(
                ne_tagged_sent)
            sent_tokens, sent_ne_tags = zip(*bio_tagged_sent)
            sent_pos_tags = [pos for token, pos in nltk.pos_tag(sent_tokens)]
            sent_conlltags = [(token, pos, ne) for token, pos, ne in zip(
                sent_tokens, sent_pos_tags, sent_ne_tags)]
            ne_tree = conlltags2tree(sent_conlltags)
        return ne_tree
Example #22
    def make_ne_tree(self, tagged):
        bio_tagged_sent = self.stanford_reformat(tagged)
        sent_tokens, sent_ne_tags = zip(*bio_tagged_sent)
        sent_pos_tags = [pos for token, pos in pos_tag(sent_tokens)]

        sent_conlltags = [
            (token, pos, ne)
            for token, pos, ne in zip(sent_tokens, sent_pos_tags, sent_ne_tags)
        ]
        ne_tree = conlltags2tree(sent_conlltags)
        return ne_tree
Example #23
    def parse(self, tagged_sent):
        """This function is used by evaluate to make guesses and format the guesses
        """
        #make gueess (tag)
        chunks = self.tagger.tag(tagged_sent)

        # Transform the result from [((w1, t1), iob1), ...]
        # to the preferred list of triplets format [(w1, t1, iob1), ...]
        iob_triplets = [(w, p, t) for ((w, p), t) in chunks]

        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets)
Example #24
    def parse(self, sentence):
        pos_tags = [pos for word, pos in sentence]

        # Get the chunk tags
        tagged_pos_tags = self.tagger.tag(pos_tags)

        # Assemble the (word, pos, chunk) triplets
        conlltags = [(word, pos, chunk_tag)
                     for ((word, pos), (_, chunk_tag))
                     in zip(sentence, tagged_pos_tags)]

        # Transform to tree
        return conlltags2tree(conlltags)
Example #25
    def stanfordNE2tree(self, ne_tagged_sent):
        # placeholder for parsed-tree output
        bio_tagged_sent = self.stanfordNE2BIO(ne_tagged_sent)
        sent_tokens, sent_ne_tags = zip(*bio_tagged_sent)
        sent_pos_tags = [pos for token, pos in pos_tag(sent_tokens)]

        sent_conlltags = [
            (token, pos, ne)
            for token, pos, ne in zip(sent_tokens, sent_pos_tags, sent_ne_tags)
        ]
        ne_tree = conlltags2tree(sent_conlltags)
        return ne_tree
Example #26
    def parse(self, orig_tokens):
        if orig_tokens and type(orig_tokens[0]) is tuple:
            tokens = [token for token, _ in orig_tokens]
        else:
            tokens = orig_tokens

        tokenized_ud = list(
            map(lambda x: (x[0], map_tag('ru-rnc', 'universal', x[1])),
                pos_tag(tokens, lang='rus')))
        tokenized_nltk = pos_tag(tokens, lang='rus')
        tokenized_mystem = [(token, self.mystem_tagger.tag_word(token)[0][1])
                            for token in tokens]

        # print(self.chunker_iis.parse(tokenized_ud))

        tags_nltk = self.chunker_nltk.parse(tokenized_nltk, return_tree=False)
        tags_ud = self.chunker_nltk.parse(tokenized_ud, return_tree=False)
        tags_mystem = self.chunker_nltk.parse(tokenized_mystem,
                                              return_tree=False)
        tags_iis = tree2conlltags(self.chunker_iis.parse(tokenized_ud))
        tags_grammar = tree2conlltags(
            self.grammar_chunker.parse(tokenized_mystem))

        result_tags = [tags_nltk, tags_ud, tags_mystem, tags_grammar, tags_iis]

        if tokens is orig_tokens:
            tag_source = tags_ud
        else:
            tag_source = orig_tokens

        tags = [(token, tag_source[ind][1],
                 pick_tag([tags_sp[ind][2]
                           for tags_sp in result_tags], tags_ud[ind][1]))
                for ind, token in enumerate(tokens)]

        # for ind, (token,pos,iob_tag) in enumerate(tags):
        #     if token in set(['таких', 'такие', 'такими', 'как', 'включая', 'и', 'или','другие', 'других', 'другими', 'особенно', 'в', 'частности', ',']):
        #         tags[ind] = (token, pos, 'O')

        for ind, (token, pos, iob_tag) in enumerate(tags):
            if ind == 0:
                continue
            if iob_tag == "B-NP*":
                if tags[ind - 1][2] in {'B-NP', 'I-NP'}:
                    tags[ind] = (token, pos, 'I-NP')
                else:
                    tags[ind] = (token, pos, 'B-NP')
            if iob_tag == "I-NP" and tags[ind - 1][2] not in {'B-NP', 'I-NP'}:
                tags[ind] = (token, pos, 'B-NP')

        return conlltags2tree(tags)
Example #27
    def __generate_tree(self, bio_tagged):
        """
        Transform a list of BIO tags into a tree
        """
        from nltk import pos_tag
        from nltk.chunk import conlltags2tree


        tokens, ne_tags = zip(*bio_tagged)
        pos_tags = [pos for token, pos in pos_tag(tokens)]

        conlltags = [(token, pos, ne) for token, pos, ne in zip(tokens, pos_tags, ne_tags)]
        ne_tree = conlltags2tree(conlltags)
        return ne_tree
Example #28
def stanford_ner(words, args):
    """
    3 class: Location, Person, Organization
    4 class: Location, Person, Organization, Misc
    7 class: Location, Person, Organization, Money, Percent, Date, Time
    """
    start = time.time()
    ner_classifier_path = 'english.all.3class.distsim.crf.ser.gz'  # default 3 class

    if args.ner_class == 7:
        ner_classifier_path = 'english.muc.7class.distsim.crf.ser.gz'
    elif args.ner_class == 4:
        ner_classifier_path = 'english.conll.4class.distsim.crf.ser.gz'

    ner_classifier_full_path = os.path.join(stanford_ner_directory_path,
                                            'classifiers', ner_classifier_path)
    ner_jar_path = os.path.join(stanford_ner_directory_path,
                                'stanford-ner.jar')
    s_ner_tagger = StanfordNERTagger(ner_classifier_full_path,
                                     ner_jar_path,
                                     encoding='UTF-8')
    _tagged = s_ner_tagger.tag(words)

    # NLP BIO tags processing (B-beginning NE, I-inside NE, O-outside NE)
    bio_tagged = []
    prev_tag = "O"
    for token, tag in _tagged:
        if tag == "O":  # O
            bio_tagged.append((token, tag))
            prev_tag = tag
            continue
        if tag != "O" and prev_tag == "O":  # Begin NE
            bio_tagged.append((token, "B-" + tag))
            prev_tag = tag
        elif prev_tag != "O" and prev_tag == tag:  # Inside NE
            bio_tagged.append((token, "I-" + tag))
            prev_tag = tag
        elif prev_tag != "O" and prev_tag != tag:  # Adjacent NE
            bio_tagged.append((token, "B-" + tag))
            prev_tag = tag

    # convert bio_tags to NLTK tree-like format
    tokens, ne_tags = zip(*bio_tagged)
    pos_tags = [pos for token, pos in get_pos_tags(tokens, args)]
    conlltags = [(token, pos, ne)
                 for token, pos, ne in zip(tokens, pos_tags, ne_tags)]
    ne_tree = conlltags2tree(conlltags)

    print('Stanford NER took %.3f sec, NEs are:\n %s\n' % (
        time.time() - start, structure_ne(ne_tree)))
Example #29
    def evaluate_chunker(chunker, test_samples):
        accuracy = 0
        with open(test_samples, 'rb') as fp:

            dataset = pickle.load(fp)
            for i in range(len(dataset)):

                score = chunker.evaluate([
                    conlltags2tree([(w, t, iob)
                                    for ((w, t), iob) in dataset[i]])
                ])
                accuracy = accuracy + score.accuracy()

        return accuracy / len(dataset)
Example #30
def NER_nltk(sentence):

    from nltk import word_tokenize, pos_tag, ne_chunk
    from nltk.chunk import conlltags2tree, tree2conlltags

    ne_tree = ne_chunk(pos_tag(word_tokenize(sentence)))
    print(ne_tree)

    print("-----------------------------")

    iob_tagged = tree2conlltags(ne_tree)
    print(iob_tagged)

    print("-----------------------------")

    ne_tree = conlltags2tree(iob_tagged)
    print(ne_tree)
Example #31
def convertIOBtag(tokens, tags):
    # tag each token with pos
    pos_tags = [pos for token, pos in pos_tag(tokens)]
    # convert the BIO / IOB tags to tree
    conlltags = [(token, pos, tg)
                 for token, pos, tg in zip(tokens, pos_tags, tags)]
    ne_tree = conlltags2tree(conlltags)
    # parse the tree to get our original text
    original_text = []
    for subtree in ne_tree:
        # checking for 'O' tags
        if type(subtree) == Tree:
            original_label = subtree.label()
            original_string = " ".join(
                [token for token, pos in subtree.leaves()])
            original_text.append((original_string, original_label))
    return original_text
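A hypothetical usage sketch showing the input and output shapes (POS tags come
from pos_tag; the labels are whatever the BIO tags carry):

tokens = ['Mark', 'works', 'at', 'Google']
tags = ['B-PERSON', 'O', 'O', 'B-ORGANIZATION']
print(convertIOBtag(tokens, tags))
# -> [('Mark', 'PERSON'), ('Google', 'ORGANIZATION')]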
Example #32
def stanfordNE2tree(ne_tagged_sent):
    """
    Function converts the Named Entity tagged sentence to a tree
    Parameters
    ----------
    ne_tagged_sent : list
        Named-entity-tagged sentence from the Stanford NER tagger

    Returns
    -------
    Tree
        NLTK tree structure of CoNLL IOB

    """
    bio_tagged_sent = stanfordNE2BIO(ne_tagged_sent)
    sent_tokens, sent_ne_tags = zip(*bio_tagged_sent)
    sent_pos_tags = [pos for token, pos in pos_tag(sent_tokens)]

    sent_conlltags = [(token, pos, ne) for token, pos, ne in zip(sent_tokens, sent_pos_tags, sent_ne_tags)]
    ne_tree = conlltags2tree(sent_conlltags)
    return ne_tree
Example #33
    def parse(self, sent):
        posTags = [posTag for (word, posTag) in sent]
        bioTags = [bioTag for (posTag, bioTag) in self.tagger.tag(posTags)]
        chunkedSent = [(word, posTag, bioTag)
                       for ((word, posTag), bioTag) in zip(sent, bioTags)]
        return conlltags2tree(chunkedSent)
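The tagger here maps POS-tag sequences to IOB chunk tags. This snippet does not
show how it was trained; a minimal sketch of the common conll2000 recipe, as an
assumption:

import nltk
from nltk.corpus import conll2000
from nltk.chunk import tree2conlltags

train_sents = conll2000.chunked_sents('train.txt')
# (pos, iob) pairs: the tagger learns chunk tags from POS tags alone
train_chunks = [[(pos, iob) for (word, pos, iob) in tree2conlltags(sent)]
                for sent in train_sents]
tagger = nltk.BigramTagger(train_chunks,
                           backoff=nltk.UnigramTagger(train_chunks))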
Example #34
	def tagged_parse_sents(self, sentences):
		return conlltags2tree(super(Chunker, self).tag_sents(sentences))
Example #35
	def parse(self, tagged_sent):
		if not tagged_sent: return None
		chunks = self.tagger.tag(tagged_sent)
		return conlltags2tree([(w,t,c) for ((w,t),c) in chunks])
Example #36
	def parse_sents(self, sentences):
		for conlltagged in super(Chunker, self).tag_sents(sentences):
			yield conlltags2tree(conlltagged)