Example #1
 def test_senna_tagger(self):
     tagger = SennaTagger(SENNA_EXECUTABLE_PATH)
     result = tagger.tag(
         'What is the airspeed of an unladen swallow ?'.split())
     expected = [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'),
                 ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'),
                 ('unladen', 'NN'), ('swallow', 'NN'), ('?', '.')]
     self.assertEqual(result, expected)
Example #2
 def test_senna_tagger(self):
     tagger = SennaTagger(SENNA_EXECUTABLE_PATH)
     result = tagger.tag("What is the airspeed of an unladen swallow ?".split())
     expected = [
         ("What", "WP"),
         ("is", "VBZ"),
         ("the", "DT"),
         ("airspeed", "NN"),
         ("of", "IN"),
         ("an", "DT"),
         ("unladen", "NN"),
         ("swallow", "NN"),
         ("?", "."),
     ]
     self.assertEqual(result, expected)
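Both variants of this test shell out to a real SENNA binary, so they fail on machines that don't have it installed. A minimal sketch (the SENNA_EXECUTABLE_PATH default below is a hypothetical location) of guarding the same test with unittest.skipUnless:

import os
import unittest

from nltk.tag import SennaTagger

# hypothetical default; point this at your local SENNA installation
SENNA_EXECUTABLE_PATH = os.environ.get("SENNA_PATH", "/usr/share/senna-v3.0")


class SennaTaggerTest(unittest.TestCase):

    @unittest.skipUnless(os.path.isdir(SENNA_EXECUTABLE_PATH),
                         "SENNA installation not found")
    def test_senna_tagger(self):
        tagger = SennaTagger(SENNA_EXECUTABLE_PATH)
        result = tagger.tag("What is the airspeed of an unladen swallow ?".split())
        self.assertEqual(result[0], ("What", "WP"))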
Example #3
    def __init__(self):
        '''
        if phrase_dict_json is not None: extract the phrase features
        if subtype_flag is True: extract the features by sub parse_type
        if bioe_flag is True: use the BIOE tags
        '''
        self.features = [
            'pos', 'chunk', 'promptword', 'stopword', 'tf', 'rank', 'color'
        ]

        if 'pos' in self.features:
            self.pos_tagger = SennaTagger(global_params.sennadir)

        if 'chunk' in self.features:
            self.chunk_tagger = SennaChunkTagger(global_params.sennadir)

        self.sentences = []

        self.porter = PorterStemmer()

        self.token_dict = None
        self.bins = 50
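The constructor above only wires up the taggers; a minimal sketch (hypothetical senna_dir and tokens) of how the 'pos' and 'chunk' features would then be produced for one tokenized sentence:

from nltk.tag import SennaTagger, SennaChunkTagger

senna_dir = '/usr/share/senna-v3.0'  # hypothetical install path
tokens = ['The', 'bank', 'is', 'closed', 'today']

pos_tagger = SennaTagger(senna_dir)
chunk_tagger = SennaChunkTagger(senna_dir)

# each tagger returns one (token, tag) pair per input token
pos_feature = [tag for _, tag in pos_tagger.tag(tokens)]
chunk_feature = [tag for _, tag in chunk_tagger.tag(tokens)]
print(pos_feature, chunk_feature)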
Example #4
 def __init__(self,
             path='senna',
             **kwargs):   
     
     self.__dict__.update(kwargs)
     
     if not os.path.isabs(path):
         current_dir = os.path.dirname(os.path.abspath(__file__))
         path = os.path.join(current_dir, path)
     
     paths = (
             path,
             os.path.join(sys.exec_prefix, r'lib\site-packages', 'senna'),
             os.path.join(MODULEDIR, 'bin', 'senna')
     )
     
     for path in paths:
         if os.path.exists(path):
            break
     else:
         raise FileNotFoundError(paths) 
      
     self.tagger = SennaTagger(path, **kwargs)
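The constructor relies on Python's for/else to probe candidate SENNA locations: the else branch runs only if the loop finishes without hitting break. The same idiom in isolation, with hypothetical paths:

import os

candidates = ('/opt/senna', os.path.expanduser('~/senna'), './senna')  # hypothetical locations

for senna_path in candidates:
    if os.path.exists(senna_path):
        break  # first existing candidate wins
else:
    # reached only when no candidate exists
    raise FileNotFoundError(candidates)

print('using SENNA at', senna_path)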
Example #5
    'put': ["perform", "mark", "evaluate", "update", "set", "change", "edit"],
    'delete': ["delete", "destroy", "kill", "remove", "cancel"]
}

# response code lists
L404 = [
    'not found', 'doesn\'t exist', 'does not exist', 'unable to find',
    'can\'t find'
]
L401 = ['unauthorized', 'not allowed', 'rejected', 'denied']
L400 = ['failed', 'unsuccessful']

# st = StanfordPOSTagger("C:/Users/Tasos/OneDriveThesis/Thesis/src/lib/stanford-postagger-full-2015-12-09/models/english-left3words-distsim.tagger",
#                "C:/Users/Tasos/OneDriveThesis/Thesis/src/lib/stanford-postagger-full-2015-12-09/stanford-postagger.jar")

senna_tagger = SennaTagger(
    "C:/Users/Tasos/OneDriveThesis/Thesis/src/lib/senna")
p = inflect.engine()


def resource_analysis(resources, resource_names):
    model = {}
    hateoas_graph = {}
    for resource, scenarios in resources.items():
        hateoas_graph[resource] = []
        model[resource] = {
            'get': {
                'request_params': [],
                'response': {
                    'params': [],
                    'links': []
                }
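The L404/L401/L400 lists map phrases found in scenario text to HTTP status codes. A hedged sketch of how they might be applied (classify_response is a hypothetical helper, not part of the original module):

def classify_response(description):
    """Map a free-text response description to a status code using the keyword lists above."""
    text = description.lower()
    if any(phrase in text for phrase in L404):
        return 404
    if any(phrase in text for phrase in L401):
        return 401
    if any(phrase in text for phrase in L400):
        return 400
    return 200


# classify_response("the resource does not exist") -> 404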
Example #6
#Imports
import os, nltk
import re
from nltk.tag import SennaTagger, SennaChunkTagger
from nltk.tokenize import sent_tokenize

#Constants
SOURCE_DIR = '../data/annotated/'
SENNA_INPUT_DIR_RESPS = '../data/senna_input_resps/'
SENNA_INPUT_DIR_SENTS = '../data/senna_input_sents/'
SENNA_DEST_DIR = '../data/senna_wordlist/'
SENNA_EXECUTABLE_DIR = '../../tools/senna'
"""
For now these taggers are not used; SENNA tagging is done manually using a shell script.
"""
pos_tagger = SennaTagger(SENNA_EXECUTABLE_DIR)
chunk_tagger = SennaChunkTagger(SENNA_EXECUTABLE_DIR)


def add_space_between_sentences(text):
    """
    Add space between sentences where no space is added after period
    """
    space_added_txt = re.sub(r"(\w+)\.(\w+)", r"\1. \2", text)
    return space_added_txt


def add_space_between_sentence_and_period(text, text_type):
    """
    Add space between sentence and period.
    This is needed for SENNA to tokenize sentences.
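A quick illustration of what the add_space_between_sentences substitution does, including its side effect on decimal-like tokens:

import re

text = "First sentence.Second sentence. Version 3.0 is out."
print(re.sub(r"(\w+)\.(\w+)", r"\1. \2", text))
# -> "First sentence. Second sentence. Version 3. 0 is out."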
Example #7
def filter_task(f, whitelist_dict, foutpath, key_name):

    # pretrain = HunposTagger('hunpos.model', 'hunpos-1.0-linux/hunpos-tag')
    pretrain = SennaTagger('senna')
    """
    Uses namecheck() to verify words that have been tagged as names by either nltk or spacy. namecheck() first
    searches name_set, which is built by checking words at the sentence level and tagging names. If the word is
    not in name_set, namecheck() uses spacy's nlp() to check whether the word is likely to be a name at the word level.

    """
    with open(f, encoding='utf-8', errors='ignore') as fin:
        # define initial variables
        head, tail = os.path.split(f)
        #f_name = re.findall(r'[\w\d]+', tail)[0]  # get the file number
        print(tail)
        start_time_single = time.time()
        total_records = 1
        phi_containing_records = 0
        safe = True
        screened_words = []
        name_set = set()
        phi_reduced = ''
        '''
        address_indictor = ['street', 'avenue', 'road', 'boulevard',
                            'drive', 'trail', 'way', 'lane', 'ave',
                            'blvd', 'st', 'rd', 'trl', 'wy', 'ln',
                            'court', 'ct', 'place', 'plc', 'terrace', 'ter']
                            '''
        address_indictor = [
            'street', 'avenue', 'road', 'boulevard', 'drive', 'trail', 'way',
            'lane', 'ave', 'blvd', 'st', 'rd', 'trl', 'wy', 'ln', 'court',
            'ct', 'place', 'plc', 'terrace', 'ter', 'highway', 'freeway',
            'autoroute', 'autobahn', 'expressway', 'autostrasse', 'autostrada',
            'byway', 'auto-estrada', 'motorway', 'avenue', 'boulevard', 'road',
            'street', 'alley', 'bay', 'drive', 'gardens', 'gate', 'grove',
            'heights', 'highlands', 'lane', 'mews', 'pathway', 'terrace',
            'trail', 'vale', 'view', 'walk', 'way', 'close', 'court', 'place',
            'cove', 'circle', 'crescent', 'square', 'loop', 'hill', 'causeway',
            'canyon', 'parkway', 'esplanade', 'approach', 'parade', 'park',
            'plaza', 'promenade', 'quay', 'bypass'
        ]

        note = fin.read()
        note = re.sub(r'=', ' = ', note)
        # Begin Step 1: salutation check
        re_list = pattern_salutation.findall(note)
        for i in re_list:
            name_set = name_set | set(i[1].split(' '))

        # note_length = len(word_tokenize(note))
        # Begin step 2: split document into sentences
        note = sent_tokenize(note)

        for sent in note:  # Begin Step 3: Pattern checking
            # postal code check
            # print(sent)
            if pattern_postal.findall(sent) != []:
                safe = False
                for item in pattern_postal.findall(sent):
                    screened_words.append(item[0])
            sent = str(pattern_postal.sub('**PHIPostal**', sent))

            if pattern_devid.findall(sent) != []:
                safe = False
                for item in pattern_devid.findall(sent):
                    if (re.search(r'\d', item) is not None
                            and re.search(r'[A-Z]', item) is not None):
                        screened_words.append(item)
                        sent = sent.replace(item, '**PHI**')

            # number check
            if pattern_number.findall(sent) != []:
                safe = False
                for item in pattern_number.findall(sent):
                    # print(item)
                    #if pattern_date.match(item[0]) is None:
                    sent = sent.replace(item[0], '**PHI**')
                    screened_words.append(item[0])
                    #print(item[0])
            #sent = str(pattern_number.sub('**PHI**', sent))
            '''
            if pattern_date.findall(sent) != []:
                safe = False
                for item in pattern_date.findall(sent):
                    if '-' in item[0]:
                        if (len(set(re.findall(r'[^\w\-]',item[0]))) <= 1):
                            screened_words.append(item[0])
                            #print(item[0])
                            sent = sent.replace(item[0], '**PHIDate**')
                    else:
                        if len(set(re.findall(r'[^\w]',item[0]))) == 1:
                            screened_words.append(item[0])
                            #print(item[0])
                            sent = sent.replace(item[0], '**PHIDate**')
            '''
            data_list = []
            if pattern_date.findall(sent) != []:
                safe = False
                for item in pattern_date.findall(sent):
                    if '-' in item[0]:
                        if (len(set(re.findall(r'[^\w\-]', item[0]))) <= 1):
                            #screened_words.append(item[0])
                            #print(item[0])
                            data_list.append(item[0])
                            #sent = sent.replace(item[0], '**PHIDate**')
                    else:
                        if len(set(re.findall(r'[^\w]', item[0]))) == 1:
                            #screened_words.append(item[0])
                            #print(item[0])
                            data_list.append(item[0])
                            #sent = sent.replace(item[0], '**PHIDate**')
            data_list.sort(key=len, reverse=True)
            for item in data_list:
                sent = sent.replace(item, '**PHIDate**')

            #sent = str(pattern_date.sub('**PHI**', sent))
            #print(sent)
            if pattern_4digits.findall(sent) != []:
                safe = False
                for item in pattern_4digits.findall(sent):
                    screened_words.append(item)
            sent = str(pattern_4digits.sub('**PHI**', sent))
            # email check
            if pattern_email.findall(sent) != []:
                safe = False
                for item in pattern_email.findall(sent):
                    screened_words.append(item)
            sent = str(pattern_email.sub('**PHI**', sent))
            # url check
            if pattern_url.findall(sent) != []:
                safe = False
                for item in pattern_url.findall(sent):
                    #print(item[0])
                    if (re.search(r'[a-z]', item[0]) is not None
                            and '.' in item[0]
                            and re.search(r'[A-Z]', item[0]) is None
                            and len(item[0]) > 10):
                        print(item[0])
                        screened_words.append(item[0])
                        sent = sent.replace(item[0], '**PHI**')
                        #print(item[0])
            #sent = str(pattern_url.sub('**PHI**', sent))
            # dob check
            '''
            re_list = pattern_dob.findall(sent)
            i = 0
            while True:
                if i >= len(re_list):
                    break
                else:
                    text = ' '.join(re_list[i][0].split(' ')[-6:])
                    if re.findall(r'\b(birth|dob)\b', text, re.I) != []:
                        safe = False
                        sent = sent.replace(re_list[i][1], '**PHI**')
                        screened_words.append(re_list[i][1])
                    i += 2
            '''

            # Begin Step 4
            # substitute spaces for special characters
            sent = re.sub(r'[\/\-\:\~\_]', ' ', sent)
            # label all words for NER using the sentence level context.
            spcy_sent_output = nlp(sent)
            # split sentences into words
            sent = [word_tokenize(sent)]
            #print(sent)
            # Begin Step 5: context level pattern matching with regex
            for position in range(0, len(sent[0])):
                word = sent[0][position]
                # age check
                if word.isdigit() and int(word) > 90:
                    if position <= 2:  # check the words before age
                        word_previous = ' '.join(sent[0][:position])
                    else:
                        word_previous = ' '.join(sent[0][position - 2:position])
                    if position >= len(sent[0]) - 2:  # check the words after age
                        word_after = ' '.join(sent[0][position + 1:])
                    else:
                        word_after = ' '.join(sent[0][position + 1:position + 3])

                    age_string = str(word_previous) + str(word_after)
                    if pattern_age.findall(age_string) != []:
                        screened_words.append(sent[0][position])
                        sent[0][position] = '**PHI**'
                        safe = False

                # address check
                elif (position >= 1 and position < len(sent[0]) - 1 and
                      (word.lower() in address_indictor or
                       (word.lower() == 'dr' and sent[0][position + 1] != '.'))
                      and (word.istitle() or word.isupper())):

                    if (sent[0][position - 1].istitle()
                            or sent[0][position - 1].isupper()):
                        screened_words.append(sent[0][position - 1])
                        sent[0][position - 1] = '**PHI**'
                        i = position - 1
                        # find the closest number, which should be the street number
                        while True:
                            if re.findall(r'^[\d-]+$', sent[0][i]) != []:
                                begin_position = i
                                break
                            elif i == 0 or position - i > 5:
                                begin_position = position
                                break
                            else:
                                i -= 1
                        i = position + 1
                        # block the info of city, state, apt number, etc.
                        while True:
                            if '**PHIPostal**' in sent[0][i]:
                                end_position = i
                                break
                            elif i == len(sent[0]) - 1:
                                end_position = position
                                break
                            else:
                                i += 1
                        if end_position <= position:
                            end_position = position

                        for i in range(begin_position, end_position):
                            #if sent[0][i] != '**PHIPostal**':
                            screened_words.append(sent[0][i])
                            sent[0][i] = '**PHI**'
                            safe = False

            # Begin Step 6: NLTK POS tagging
            sent_tag = nltk.pos_tag_sents(sent)
            #try:
            # senna cannot handle long sentences.
            #sent_tag = [[]]
            #length_100 = len(sent[0])//100
            #for j in range(0, length_100+1):
            #[sent_tag[0].append(j) for j in pretrain.tag(sent[0][100*j:100*(j+1)])]
            # hunpos needs to change the type from bytes to string
            #print(sent_tag[0])
            #sent_tag = [pretrain.tag(sent[0])]
            #for j in range(len(sent_tag[0])):
            #sent_tag[0][j] = list(sent_tag[0][j])
            #sent_tag[0][j][1] = sent_tag[0][j][1].decode('utf-8')
            #except:
            #print('POS error:', tail, sent[0])
            #sent_tag = nltk.pos_tag_sents(sent)
            # Begin Step 7: Use both NLTK and Spacy to check if the word is a name based on sentence level NER label for the word.
            for ent in spcy_sent_output.ents:  # spcy_sent_output is the spacy Doc for the sentence, with NLP labels for each word
                # spcy_sent_output.ents is a sequence of spans (phrases) that spacy believes are named entities
                # Each ent has 2 properties: text, which is the raw phrase, and label_, which is its NER category
                if ent.label_ == 'PERSON':
                    #print(ent.text)
                    # if word is person, recheck that spacy still thinks word is person at the word level
                    spcy_chunk_output = nlp(ent.text)
                    if (spcy_chunk_output.ents != ()
                            and spcy_chunk_output.ents[0].label_ == 'PERSON'):
                        # Now check to see what labels NLTK provides for the word
                        name_tag = word_tokenize(ent.text)
                        # senna & hunpos
                        #name_tag = pretrain.tag(name_tag)
                        # hunpos needs to change the type from bytes to string
                        #for j in range(len(name_tag)):
                        #name_tag[j] = list(name_tag[j])
                        #name_tag[j][1] = name_tag[j][1].decode('utf-8')
                        #chunked = ne_chunk(name_tag)
                        # default
                        name_tag = pos_tag_sents([name_tag])
                        chunked = ne_chunk(name_tag[0])
                        for i in chunked:
                            # if ne_chunk thinks the chunk is a named entity, it builds a Tree whose leaves
                            # are the words in the chunk (with their POS tags) and whose label is the chunk's
                            # single NER category
                            if isinstance(i, Tree):
                                if i.label() == 'PERSON':
                                    for token, pos in i.leaves():
                                        if pos == 'NNP':
                                            name_set.add(token)

                                else:
                                    for token, pos in i.leaves():
                                        spcy_upper_output = nlp(token.upper())
                                        if spcy_upper_output.ents != ():
                                            name_set.add(token)

            # Begin Step 8: whitelist check
            # sent_tag is the nltk POS tagging for each word at the sentence level.
            for i in range(len(sent_tag[0])):
                # word contains the i-th word and its POS tag
                word = sent_tag[0][i]
                # print(word)
                # word_output is just the raw word itself
                word_output = word[0]

                if word_output not in string.punctuation:
                    word_check = str(pattern_word.sub('', word_output))
                    #if word_check.title() in ['Dr', 'Mr', 'Mrs', 'Ms']:
                    #print(word_check)
                    # remove the special chars
                    try:
                        # word[1] is the pos tag of the word

                        if (((word[1] == 'NN' or word[1] == 'NNP')
                             or ((word[1] == 'NNS' or word[1] == 'NNPS')
                                 and word_check.istitle()))):
                            if word_check.lower() not in whitelist_dict:
                                screened_words.append(word_output)
                                word_output = "**PHI**"
                                safe = False
                            else:
                                # For words that are in whitelist, check to make sure that we have not identified them as names
                                if ((word_output.istitle()
                                     or word_output.isupper())
                                        and pattern_name.findall(word_output)
                                        != []
                                        and re.search(r'\b([A-Z])\b',
                                                      word_check) is None):
                                    word_output, name_set, screened_words, safe = namecheck(
                                        word_output, name_set, screened_words,
                                        safe)

                        # check day/year according to the month name
                        elif word[1] == 'CD':
                            if i > 2:
                                context_before = sent_tag[0][i - 3:i]
                            else:
                                context_before = sent_tag[0][0:i]
                            if i <= len(sent_tag[0]) - 4:
                                context_after = sent_tag[0][i + 1:i + 4]
                            else:
                                context_after = sent_tag[0][i + 1:]
                            #print(word_output, context_before+context_after)
                            for j in (context_before + context_after):
                                if pattern_mname.search(j[0]) is not None:
                                    screened_words.append(word_output)
                                    #print(word_output)
                                    word_output = "**PHI**"
                                    safe = False
                                    break
                        else:
                            word_output, name_set, screened_words, safe = namecheck(
                                word_output, name_set, screened_words, safe)

                    except:
                        print(word_output, sys.exc_info())
                    if word_output.lower().startswith("'s"):
                        if phi_reduced[-7:] != '**PHI**':
                            phi_reduced = phi_reduced + word_output
                        #print(word_output)
                    else:
                        phi_reduced = phi_reduced + ' ' + word_output
                # Format output for later use by eval.py
                else:
                    if (i > 0
                            and sent_tag[0][i - 1][0][-1] in string.punctuation
                            and sent_tag[0][i - 1][0][-1] != '*'):
                        phi_reduced = phi_reduced + word_output
                    elif word_output == '.' and sent_tag[0][i - 1][0] in [
                            'Dr', 'Mr', 'Mrs', 'Ms'
                    ]:
                        phi_reduced = phi_reduced + word_output
                    else:
                        phi_reduced = phi_reduced + ' ' + word_output
            #print(phi_reduced)

            # Begin Step 9: check middle initial and month name
            if pattern_mname.findall(phi_reduced) != []:
                for item in pattern_mname.findall(phi_reduced):
                    screened_words.append(item[0])
            phi_reduced = pattern_mname.sub('**PHI**', phi_reduced)

            if pattern_middle.findall(phi_reduced) != []:
                for item in pattern_middle.findall(phi_reduced):
                    #    print(item[0])
                    screened_words.append(item[0])
            phi_reduced = pattern_middle.sub('**PHI** **PHI** ', phi_reduced)
        # print(phi_reduced)

        if not safe:
            phi_containing_records = 1

        # save phi_reduced file
        filename = '.'.join(tail.split('.')[:-1]) + "_" + key_name + ".txt"
        filepath = os.path.join(foutpath, filename)
        with open(filepath, "w") as phi_reduced_note:
            phi_reduced_note.write(phi_reduced)

        # save filtered words
        #screened_words = list(filter(lambda a: a!= '**PHI**', screened_words))
        filepath = os.path.join(foutpath, 'filter_summary.txt')
        #print(filepath)
        screened_words = list(
            filter(lambda a: '**PHI' not in a, screened_words))
        #screened_words = list(filter(lambda a: a != '**PHI**', screened_words))
        #print(screened_words)
        with open(filepath, 'a') as fout:
            fout.write('.'.join(tail.split('.')[:-1]) + ' ' +
                       str(len(screened_words)) + ' ' +
                       ' '.join(screened_words) + '\n')
            # fout.write(' '.join(screened_words))

        print(total_records, f,
              "--- %s seconds ---" % (time.time() - start_time_single))
        # hunpos needs to close session
        #pretrain.close()
        return total_records, phi_containing_records
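filter_task expects a source file, a whitelist lookup, an output directory, and a key name used in the output file suffix. A minimal invocation sketch (the file names and whitelist contents below are placeholders):

# hypothetical driver; paths and whitelist are placeholders
whitelist = {'the': True, 'patient': True, 'hospital': True}

total, with_phi = filter_task('notes/note_001.txt',
                              whitelist_dict=whitelist,
                              foutpath='deid_output',
                              key_name='whitelist_v1')
print(total, with_phi)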
Example #8
                    EventStructures['Who'] = text
                elif 'A2' == arg:
                    EventStructures['Whom'] = text
                text = labels[i][1][0]
                Args.append(text)
        else:
            text += ' ' + labels[i][1][0]

    print(EventStructures)
    return Args


srltagger = SennaSRLTagger(path)
nertagger = SennaNERTagger(path)
chktagger = SennaChunkTagger(path)
tagger = SennaTagger(path)

#w = s.tag("Are you studying here?".split())
#w = s.tag("""A general interface to the SENNA pipeline that supports any of the operations specified in SUPPORTED OPERATIONS..""".split())

#print(tagger.tag(sents))
#print('\n___________________\n')
#print(chktagger.tag(sents))
#print('\n___________________\n')
#print(nertagger.tag(sents))
#print('\n___________________\n')
#print(srltagger.tag(sents))
#print('\n___________________\n')
#text = sent
NE_Tagger(text)
#print('\n'.join(str(e) for e in NE_Tagger(sents)))
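For reference, the three SENNA wrappers that ship with nltk each return one (token, tag) pair per input token; a minimal sketch (hypothetical path and sentence) comparing their output:

from nltk.tag import SennaTagger, SennaChunkTagger, SennaNERTagger

path = '/usr/share/senna-v3.0'  # hypothetical install location
tokens = 'John works at Google in London .'.split()

print(SennaTagger(path).tag(tokens))       # part-of-speech tags, e.g. ('John', 'NNP')
print(SennaChunkTagger(path).tag(tokens))  # IOB chunk tags, e.g. ('John', 'B-NP')
print(SennaNERTagger(path).tag(tokens))    # named-entity tags for each token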
Example #9
#define var
cnt = 0  #counter
flist = []  #file list
linked_file = ""  #linked file
ifile = ""  #input file data
splited_file = []  #split file
taged_file = []  #tagged file
f = ""  #filename

import os.path
import nltk

#import Senna Tagger
from nltk.tag import SennaTagger

tagger = SennaTagger('/usr/share/senna-v3.0')

#loop
while True:
    #import data
    cnt += 1
    ifile = input("please inputfile" + str(cnt) +
                  "(e to end input / q to quit) : ")

    #escape from loop
    if ifile == "e":
        break

    elif ifile == "q":
        quit()
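The loop above is cut off before the collected filenames are used. One possible continuation (hypothetical, not from the original): append each existing file to flist inside the loop, then split and tag every file after the loop ends.

    # inside the while loop, after the "e"/"q" checks (hypothetical continuation)
    if os.path.isfile(ifile):
        flist.append(ifile)

# after the loop (hypothetical): read, split and tag each collected file
for f in flist:
    with open(f, encoding='utf-8') as fh:
        splited_file = fh.read().split()
    taged_file = tagger.tag(splited_file)
    print(f, taged_file[:5])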
Example #10
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from nltk.tag import SennaTagger
from nltk import ne_chunk, pos_tag, defaultdict
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
'''--------------------------------------Exercise 1------------------------------------------------------------------'''
''' Exercise sheet Lab1_TextProcessing.pdf '''

#nltk.download('averaged_perceptron_tagger')
#nltk.download('maxent_ne_chunker')

vectorizer = CountVectorizer()
'''----------------SENNA Tagger-------------------'''
sena_tagger = SennaTagger(
    '/home/starksultana/Documentos/MEIC/5o_ano/1o semestre/PRI/Labs/lab1/senna-v3.0/senna'
)

print("----------------EXERCISE 1-------------------")

#Exercise 1.1


def partition(A, low, high):
    pivot = A[low]
    leftwall = low
    for i in range(low, high + 1):  # not sure this is right; the pseudocode said low + 1
        if (A[i] < pivot):
            leftwall += 1
            A[leftwall], A[i] = A[i], A[leftwall]
    A[leftwall], A[low] = A[low], A[leftwall]
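The partition shown above is cut off before returning the pivot's final index, and on its own it only rearranges A in place. A self-contained sketch (not part of the lab code) of how such a Lomuto-style partition is usually completed and driven by quicksort, starting the scan at low + 1 as the comment suggests:

def partition(A, low, high):
    pivot = A[low]
    leftwall = low
    for i in range(low + 1, high + 1):
        if A[i] < pivot:
            leftwall += 1
            A[leftwall], A[i] = A[i], A[leftwall]
    A[leftwall], A[low] = A[low], A[leftwall]
    return leftwall  # final position of the pivot


def quicksort(A, low, high):
    if low < high:
        p = partition(A, low, high)
        quicksort(A, low, p - 1)
        quicksort(A, p + 1, high)


data = [5, 3, 8, 1, 9, 2]
quicksort(data, 0, len(data) - 1)
print(data)  # [1, 2, 3, 5, 8, 9]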
Example #11
# -*- coding: utf-8 -*-
"""
Created on Sun May 14 12:37:50 2017

@author: Shanika Ediriweera
"""

from nltk import word_tokenize
from nltk.tag import SennaTagger
senna = SennaTagger('../../tools/senna')
sents = ["All the banks are closed", "Today is Sunday"]

tokenized_sents = [word_tokenize(sent) for sent in sents]
print(senna.tag_sents(tokenized_sents))
Example #12
 def __init__(self):
     self.tagger = SennaTagger('/app/util/senna')
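The snippet shows only the constructor. A minimal sketch (hypothetical class and method names) of how such a wrapper is typically rounded out with a tagging method:

from nltk.tag import SennaTagger


class PosTagger:  # hypothetical wrapper class
    def __init__(self, senna_path='/app/util/senna'):
        self.tagger = SennaTagger(senna_path)

    def tag_tokens(self, tokens):
        """Return (token, POS) pairs for an already tokenized sentence."""
        return self.tagger.tag(tokens)


# usage: PosTagger().tag_tokens("All the banks are closed".split())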