def namedEntityRecognizer():
    echo2("Performing NER on incoming stream")
    content = request.stream.read().decode('utf-8')  # decode the raw request body so it can be tokenized below
    #print(content)

    if Verbose:
        echo2("Incoming content is "+content)
    PICKLE = "averaged_perceptron_tagger.pickle"
    AP_MODEL_LOC = 'file:'+str(find('taggers/averaged_perceptron_tagger/'+PICKLE))
    tagger = PerceptronTagger(load=False)
    tagger.load(AP_MODEL_LOC)
    pos_tag = tagger.tag
    start = time.time()
    #date_time = timex.tag(content)
    tokenized = nltk.word_tokenize(content)
    tagged = pos_tag(tokenized)
    namedEnt = nltk.ne_chunk(tagged, binary=True)
    names = extract_entity_names(namedEnt, 'NE')
    #names.extend(date_time)
    result = {"result" : "success", "names" : names}
    if Units:
        grammar = '''unit: {<CD><NNS>?<NN.*>?}
                     unit: {<CD><JJ>?<NN.*>}
                  '''
        parser = nltk.RegexpParser(grammar)
        units = extract_entity_names(parser.parse(tagged),'unit')
        result['units'] = units
    jsonDoc = json.dumps(result, sort_keys=True, indent=4, separators=(',', ': '))
    end = time.time()
    print "NER took "+str(end - start)+" seconds"
    return jsonDoc
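The handler returns the JSON document as a string, so a client can POST raw text and parse the response. A minimal client sketch, assuming the function above is registered as a /ner route of a Flask app on localhost:5000 (route, host, and port are assumptions, not taken from the snippet):

import requests

# POST plain text to the assumed /ner endpoint and read back the entity list
resp = requests.post('http://localhost:5000/ner',
                     data='Donald Trump visited Paris last year.')
print(resp.json()['names'])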
Example #2
class NLTKTagger:
    '''
    class that supplies part of speech tags using NLTK
    note: avoids the NLTK downloader (see __init__ method)
    '''
    def __init__(self):
        import nltk
        from nltk.tag import PerceptronTagger
        from nltk.tokenize import TreebankWordTokenizer
        #return pkgutil.get_data('scattertext',
        #                        'data/viz/semiotic_new.html').decode('utf-8')
        path = os.path.dirname(sys.modules['scattertext'].__file__) + '/data/'
        tokenizer_fn = path + 'punkt.english.pickle'
        tagger_fn = path + 'averaged_perceptron_tagger.pickle'
        #tokenizer_fn = os.path.abspath(resource_filename('scattertext.data', 'punkt.english.pickle'))
        #tagger_fn = os.path.abspath(resource_filename('scattertext.data', 'averaged_perceptron_tagger.pickle'))
        # Load the tagger
        self.tagger = PerceptronTagger(load=False)
        self.tagger.load(tagger_fn)

        # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
        #       Calling the TreebankWordTokenizer like this allows skipping the downloader.
        #       It seems the TreebankWordTokenizer uses PTB tokenization = regexes. i.e. no downloads
        #       https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
        self.tokenize = TreebankWordTokenizer().tokenize
        self.sent_detector = nltk.data.load(tokenizer_fn)

    # http://www.nltk.org/book/ch05.html
    def tag_text(self, text):
        '''take input text and return tokens w/ part of speech tags using NLTK'''
        # nltk imports happen in __init__ rather than at the top of the file, since not everyone will have nltk installed

        sents = self.sent_detector.tokenize(
            text
        )  # TODO: this will fail on some unicode chars. I think assumes ascii
        word_pos_pairs = []

        all_tokens = []
        for sent in sents:
            tokens = self.tokenize(sent)
            all_tokens.extend(tokens)
            word_pos_pairs.extend(self.tagger.tag(tokens))
        return {
            'tokens': all_tokens,
            'pos': [tag for (w, tag) in word_pos_pairs]
        }

    def tag_tokens(self, tokens):
        word_pos_pairs = self.tagger.tag(tokens)
        return {'tokens': tokens, 'pos': [tag for (w, tag) in word_pos_pairs]}

    def syllable_pos_setup(self):
        """Sets up syllables and POS tagging"""
        en_list = ['en_CA', 'en_PH', 'en_NA', 'en_NZ', 'en_JM', 'en_BS', 'en_US',
                   'en_IE', 'en_MW', 'en_IN', 'en_BZ', 'en_TT', 'en_ZA', 'en_AU',
                   'en_GH', 'en_ZW', 'en_GB']

        for lang in en_list:
            if not dictools.is_installed(lang):
                dictools.install(lang)

        self.cmu_dict = cmudict.dict()

        # sets up POS
        try:
            nltk.pos_tag(['test'])
            self.pos_tag = nltk.pos_tag
        except urllib2.URLError:
            PICKLE = "averaged_perceptron_tagger.pickle"
            AP_MODEL_LOC = 'file:' + str(find('taggers/averaged_perceptron_tagger/' + PICKLE))
            tagger = PerceptronTagger(load=False)
            tagger.load(AP_MODEL_LOC)
            self.pos_tag = tagger.tag

        self.tag_dict = {'NN': 'Noun', 'FW': 'Noun', 'JJ': 'Adjective', 'VB': 'Verb',
                         'IN': 'Preposition', 'CC': 'Conjunction',
                         'RP': 'Connector', 'TO': 'Connector', 'MD': 'Connector',
                         'RB': 'Adverb', 'WR': 'Wh-adverb',
                         'DT': 'DetPro', 'WD': 'DetPro', 'PD': 'DetPro', 'PR': 'DetPro', 'WP': 'DetPro',
                         'CD': 'Cardinal',
                         'EX': 'Existential there'}

        ##        self.tag_dict={'NN':'Noun', 'JJ':'Adjective','RB':'Adverb','VB':'Verb',
        ##          'IN':'Preposition','PR':'Pronoun','CC':'Conjunction',
        ##          'RP':'Particle','WR':'Wh-adverb','DT':'Determiner',
        ##          'TO':'To','MD':'Modal Aux','CD':'Cardinal', 'PD':'Predeterminer',
        ##          'WD':'Wh-determiner', 'WP':'Wh-pronoun','EX':'Existential there'}

        # POS which are allowed to happen twice in a row
        self.pos_double = []  # ['Noun','Adjective']

        # POS which can only occur sequentially,
        # i.e. an Adverb must occur in front of a verb
        self.pos_lead = {'Adverb': ['Verb'], 'Pronoun': ['Noun'], 'Adjective': ['Noun'],
                         'Preposition': ['Noun', 'Pronoun']}

        # POS which cannot occur sequentially
        # i.e. a preposition cannot come before a verb
        self.pos_restrict_lead = {'Preposition': 'Verb',}

        return
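A minimal usage sketch for the tagging methods above (the sample sentence is an assumption):

nltk_tagger = NLTKTagger()
result = nltk_tagger.tag_text('The quick brown fox jumps over the lazy dog.')
print(result['tokens'])  # Treebank-style tokens, produced without the NLTK downloader
print(result['pos'])     # one Penn Treebank tag per token

The pos_double, pos_lead, and pos_restrict_lead tables above encode adjacency constraints on simplified POS tags. A sketch of how they might be consulted; the helper name check_sequence is an assumption, not part of the original class:

def check_sequence(pos_seq, pos_double, pos_lead, pos_restrict_lead):
    """Return False if any adjacent pair of simplified POS tags violates the tables."""
    for prev, cur in zip(pos_seq, pos_seq[1:]):
        if prev == cur and prev not in pos_double:
            return False  # this POS may not repeat
        if prev in pos_lead and cur not in pos_lead[prev]:
            return False  # e.g. an Adverb must lead into a Verb
        if pos_restrict_lead.get(prev) == cur:
            return False  # e.g. a Preposition cannot come before a Verb
    return True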
Example #4
    def get_keyphrases(self, textInput, min_freq=2):

        # setting up tagger
        # (from http://stackoverflow.com/a/35964709)
        PICKLE = "averaged_perceptron_tagger.pickle"
        AP_MODEL_LOC = 'file:' + str(find('taggers/averaged_perceptron_tagger/' + PICKLE))
        tagger = PerceptronTagger(load=False)
        tagger.load(AP_MODEL_LOC)

        lemmatizer = nltk.WordNetLemmatizer()
        stemmer = nltk.stem.porter.PorterStemmer()

        # This grammar is described in the paper by S. N. Kim,
        # T. Baldwin, and M.-Y. Kan.
        # Evaluating n-gram based evaluation metrics for automatic
        # keyphrase extraction.
        # Technical report, University of Melbourne, Melbourne 2010.

        StopWords = stopwords.words('english')

        def leaves(tree):
            """Finds NP (nounphrase) leaf nodes of a chunk tree."""
            for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
                yield subtree.leaves()

        def acceptable_word(word):
            """Checks conditions for acceptable word: length, stopword."""
            accepted = bool(2 < len(word) and word.lower() not in StopWords)
            return accepted

        def normalise(word):
            """Normalises words to lowercase and stems and lemmatizes it."""
            word = word.lower()
            word = stemmer.stem(word)
            word = lemmatizer.lemmatize(word)
            return word

        def get_terms(tree):
            for leaf in leaves(tree):
                # can modify normalise to w.lower() if you don't want to normalize the word
                term = [normalise(w) for w, t in leaf if acceptable_word(w)]
                yield term

        def get_nounPhrases(textInput, minWordLength=2):

            grammar = r"""

            NBAR:
                {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns

            NP:
                {<NBAR>}
                {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
                      """

            chunker = nltk.RegexpParser(grammar)

            toks = nltk.word_tokenize(textInput)
            # print(toks)
            pos_tag = tagger.tag
            postoks = pos_tag(toks)

            tree = chunker.parse(postoks)
            terms = get_terms(tree)

            nounPhraseList = [" ".join(term) for term in terms]

            nounPhraseList = [word for word in nounPhraseList if len(word.split()) >= minWordLength]
            return nounPhraseList

        counter = Counter()
        for nounPhrase in get_nounPhrases(textInput):
            # print(nounPhrase)
            counter.update([nounPhrase])

        keyphraseDF = pandas.DataFrame([[key, value] for key, value in counter.items() if value >= min_freq],
                                       columns=['keyphrase_stemmed', 'frequency'])
        (docsDF, occurrenceDF) = self.get_occurrence(keyphraseDF)
        print("docs", docsDF)
        print("keys", keyphraseDF)
        keyphraseDF = keyphraseDF.join(docsDF["docs"])
        print(occurrenceDF)
        keyphraseDF = keyphraseDF.join(self.get_fullphrases(keyphraseDF=keyphraseDF)["keyphrase_full"])
        keyphraseDF = keyphraseDF.join(self.get_MIs(occurrenceDF=occurrenceDF)["MI"])
        keyphraseDF = keyphraseDF.join(
            self.get_PMIs(occurrenceDF=occurrenceDF, metric="sentiment_class", value="positive")["PMI_pos"])
        keyphraseDF = keyphraseDF.join(
            self.get_PMIs(occurrenceDF=occurrenceDF, metric="sentiment_class", value="negative")["PMI_neg"])
        #keyphraseDF = keyphraseDF.join(self.get_PMIs(keyphraseDF["Keyphrase_stemmed"].tolist(), "neg"))

        return keyphraseDF
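The NBAR/NP grammar above can also be exercised on its own. A minimal sketch (the sample sentence is an assumption, and nltk.pos_tag stands in for the locally loaded tagger):

import nltk

grammar = r"""
    NBAR:
        {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns

    NP:
        {<NBAR>}
        {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
"""
chunker = nltk.RegexpParser(grammar)
tokens = nltk.word_tokenize('evaluation of n-gram based metrics for keyphrase extraction')
tree = chunker.parse(nltk.pos_tag(tokens))
print(tree)  # NP subtrees mark the candidate phrases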
Example #6
from collections import Counter
#count_good_raw = Counter(good_raw)
count_good_actors = Counter(good_actors)
count_good_actions = Counter(good_actions)
#number of statements
nos = len(tokenized_actions)
#number of good actors
noga = len(count_good_actors)
#number of good actions
nogc = len(count_good_actions)

PICKLE = "taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle"
import nltk.data
from nltk.tag import PerceptronTagger
_nltk_pos_tagger = PerceptronTagger(load=False)
_nltk_pos_tagger.load(PICKLE)
print(count_good_actors)
S = np.zeros(shape=(nos, noga + nogc))
for i, sent_pos in enumerate(tokenized_actors):
    for token1 in sent_pos:
        tt1 = _nltk_pos_tagger.tag([token1])
        for j, feature in enumerate(count_good_actors):
            ft = _nltk_pos_tagger.tag([feature])
            S[i][j] += word_sim(tt1[0], ft[0], i)
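word_sim is not defined in this snippet. A plausible stand-in using WordNet path similarity, purely an assumption about what the original helper computed:

from nltk.corpus import wordnet as wn

def word_sim(pair1, pair2, sent_idx):
    """Hypothetical similarity between two (word, POS-tag) pairs; the real
    word_sim is not shown, so this WordNet-based version is only a guess."""
    w1, w2 = pair1[0], pair2[0]
    syns1, syns2 = wn.synsets(w1), wn.synsets(w2)
    if not syns1 or not syns2:
        return 1.0 if w1.lower() == w2.lower() else 0.0
    return syns1[0].path_similarity(syns2[0]) or 0.0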

i = 0
import nltk
from nltk.tag import PerceptronTagger
from nltk.data import find

def extract_entity_names(t, label):
    entity_names = []
    if hasattr(t, 'label') and t.label:
        if t.label() == label:
            entity_names.append(' '.join([child[0] for child in t]))
        else:
            for child in t:
                entity_names.extend(extract_entity_names(child, label))
    return entity_names

PICKLE = "averaged_perceptron_tagger.pickle"
AP_MODEL_LOC = 'file:'+str(find('taggers/averaged_perceptron_tagger/'+PICKLE))
tagger = PerceptronTagger(load=False)
tagger.load(AP_MODEL_LOC)
pos_tag = tagger.tag
tokenized = nltk.word_tokenize('The quick brown fox named Ron and Donald Trump jumps over the lazy dog')
tagged = pos_tag(tokenized)
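#note: binary=True collapses PERSON/ORGANIZATION/etc. into a single 'NE' label, which extract_entity_names filters on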
namedEnt = nltk.ne_chunk(tagged, binary=True)
names = extract_entity_names(namedEnt, 'NE')
#check=pos_tag('The quick brown fox jumps over the lazy dog'.split())
print(names)

Example #8
from nltk.tag import PerceptronTagger
from nltk.data import find
import glob


#code for loading the Perceptron tagger
PICKLE = "averaged_perceptron_tagger.pickle"
AP_MODEL_LOC = 'file:'+str(find('taggers/averaged_perceptron_tagger/'+PICKLE))
tagger = PerceptronTagger(load=False)
tagger.load(AP_MODEL_LOC)
pos_tag = tagger.tag


#list to store POS and NP lists generated from each file
GlobalPOSList = []
GlobalNPList = []

#getting filenames of dataset files
fileList = glob.glob("C:/Users/Vinod Chhapariya/Desktop/TDBMS/Benchmark Dataset/*.txt")

#printing filenames
for filename in fileList:
    print(filename)

#POS tagging using the Perceptron tagger
for filename in fileList:
    POSList = []
    NPList = []
    filePOSTagWrite = open(filename + "_POSTag_Perceptron", 'w')
    for line in open(filename, 'r').readlines():
        tags = pos_tag(line.split())