# NER endpoint: reads raw text from the request stream, POS-tags it with a
# locally loaded perceptron tagger (bypassing the NLTK downloader), chunks
# named entities, and returns them as JSON.
# request, echo2, Verbose, Units and extract_entity_names are assumed to be
# supplied by the surrounding application.
import json
import time

import nltk
from nltk.data import find
from nltk.tag import PerceptronTagger


def namedEntityRecognizer():
    echo2("Performing NER on incoming stream")
    content = request.stream.read().decode('utf-8')  # assumes UTF-8 input
    if Verbose:
        echo2("Incoming content is " + content)

    # load the averaged perceptron tagger directly from its pickle
    PICKLE = "averaged_perceptron_tagger.pickle"
    AP_MODEL_LOC = 'file:' + str(find('taggers/averaged_perceptron_tagger/' + PICKLE))
    tagger = PerceptronTagger(load=False)
    tagger.load(AP_MODEL_LOC)
    pos_tag = tagger.tag

    start = time.time()
    # date_time = timex.tag(content)
    tokenized = nltk.word_tokenize(content)
    tagged = pos_tag(tokenized)
    namedEnt = nltk.ne_chunk(tagged, binary=True)
    names = extract_entity_names(namedEnt, 'NE')
    # names.extend(date_time)
    result = {"result": "success", "names": names}

    if Units:
        grammar = '''unit: {<CD><NNS>?<NN.*>?}
unit: {<CD><JJ>?<NN.*>}'''
        parser = nltk.RegexpParser(grammar)
        units = extract_entity_names(parser.parse(tagged), 'unit')
        result['units'] = units

    jsonDoc = json.dumps(result, sort_keys=True, indent=4, separators=(',', ': '))
    end = time.time()
    print("NER took " + str(end - start) + " seconds")
    return jsonDoc
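# A minimal sketch of wiring the handler into a Flask app. The flags and the
# echo2 helper below are hypothetical stand-ins for the application glue the
# handler assumes; only flask.request is a real dependency.
from flask import Flask, request

app = Flask(__name__)
Verbose = False  # hypothetical config flag assumed by the handler
Units = False    # hypothetical config flag assumed by the handler


def echo2(msg):
    # hypothetical logging helper assumed by the handler
    print(msg)


app.route('/ner', methods=['POST'])(namedEntityRecognizer)
# e.g. curl -X POST --data 'Donald Trump visited Paris.' localhost:5000/ner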
import os
import sys


class NLTKTagger:
    '''
    class that supplies part of speech tags using NLTK
    note: avoids the NLTK downloader (see __init__ method)
    '''

    def __init__(self):
        # imports live here instead of the top of the file because not
        # everyone will have nltk installed
        import nltk
        from nltk.tag import PerceptronTagger
        from nltk.tokenize import TreebankWordTokenizer

        path = os.path.dirname(sys.modules['scattertext'].__file__) + '/data/'
        tokenizer_fn = path + 'punkt.english.pickle'
        tagger_fn = path + 'averaged_perceptron_tagger.pickle'

        # load the tagger from the bundled pickle instead of the downloader
        self.tagger = PerceptronTagger(load=False)
        self.tagger.load(tagger_fn)

        # note: nltk.word_tokenize calls the TreebankWordTokenizer, but goes
        # through the downloader. Instantiating TreebankWordTokenizer directly
        # skips the downloader: it is pure PTB-style regex tokenization, so no
        # data files are needed.
        # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
        self.tokenize = TreebankWordTokenizer().tokenize
        self.sent_detector = nltk.data.load(tokenizer_fn)

    # http://www.nltk.org/book/ch05.html
    def tag_text(self, text):
        '''take input text and return tokens w/ part of speech tags using NLTK'''
        # TODO: this may fail on some unicode chars; it seems to assume ascii
        sents = self.sent_detector.tokenize(text)
        word_pos_pairs = []
        all_tokens = []
        for sent in sents:
            tokens = self.tokenize(sent)
            all_tokens = all_tokens + tokens
            word_pos_pairs = word_pos_pairs + self.tagger.tag(tokens)
        return {'tokens': all_tokens,
                'pos': [tag for (w, tag) in word_pos_pairs]}

    def tag_tokens(self, tokens):
        word_pos_pairs = self.tagger.tag(tokens)
        return {'tokens': tokens,
                'pos': [tag for (w, tag) in word_pos_pairs]}
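# Quick usage sketch (assumes scattertext is installed so its bundled
# pickles resolve; the printed tags are illustrative):
tagger = NLTKTagger()
out = tagger.tag_text("The quick brown fox jumps over the lazy dog.")
print(list(zip(out['tokens'], out['pos'])))
# e.g. [('The', 'DT'), ('quick', 'JJ'), ('brown', 'JJ'), ('fox', 'NN'), ...]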
# requires: import nltk; import urllib.error;
# from nltk.corpus import cmudict; from nltk.data import find;
# from nltk.tag import PerceptronTagger
def syllable_pos_setup(self):
    """Sets up syllable counting and POS tagging."""
    # dictools here is presumably PyHyphen's hyphen.dictools, which exposes
    # is_installed/install for hyphenation dictionaries
    en_list = ['en_CA', 'en_PH', 'en_NA', 'en_NZ', 'en_JM', 'en_BS',
               'en_US', 'en_IE', 'en_MW', 'en_IN', 'en_BZ', 'en_TT',
               'en_ZA', 'en_AU', 'en_GH', 'en_ZW', 'en_GB']
    for lang in en_list:
        if not dictools.is_installed(lang):
            dictools.install(lang)
    self.cmu_dict = cmudict.dict()

    # sets up POS: prefer nltk.pos_tag; if its model cannot be fetched,
    # fall back to loading the perceptron tagger pickle directly
    try:
        nltk.pos_tag(['test'])
        self.pos_tag = nltk.pos_tag
    except urllib.error.URLError:
        PICKLE = "averaged_perceptron_tagger.pickle"
        AP_MODEL_LOC = 'file:' + str(find('taggers/averaged_perceptron_tagger/' + PICKLE))
        tagger = PerceptronTagger(load=False)
        tagger.load(AP_MODEL_LOC)
        self.pos_tag = tagger.tag

    # coarse word classes, keyed by what appear to be the first two
    # characters of Penn Treebank tags
    self.tag_dict = {'NN': 'Noun', 'FW': 'Noun', 'JJ': 'Adjective',
                     'VB': 'Verb', 'IN': 'Preposition', 'CC': 'Conjunction',
                     'RP': 'Connector', 'TO': 'Connector', 'MD': 'Connector',
                     'RB': 'Adverb', 'WR': 'Wh-adverb', 'DT': 'DetPro',
                     'WD': 'DetPro', 'PD': 'DetPro', 'PR': 'DetPro',
                     'WP': 'DetPro', 'CD': 'Cardinal', 'EX': 'Existential there'}

    # POS which are allowed to happen twice in a row
    self.pos_double = []  # e.g. ['Noun', 'Adjective']
    # POS which can only occur sequentially,
    # i.e. an Adverb must occur in front of a Verb
    self.pos_lead = {'Adverb': ['Verb'], 'Pronoun': ['Noun'],
                     'Adjective': ['Noun'],
                     'Preposition': ['Noun', 'Pronoun']}
    # POS which cannot occur sequentially,
    # i.e. a Preposition cannot come before a Verb
    self.pos_restrict_lead = {'Preposition': 'Verb'}
    return
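# The cmu_dict loaded above is typically used to count syllables. A minimal
# sketch, with count_syllables as a hypothetical helper (not part of the
# original class):
from nltk.corpus import cmudict


def count_syllables(word, cmu_dict):
    # CMUdict marks each vowel phone with a stress digit (0/1/2);
    # counting those digits yields the syllable count
    pronunciations = cmu_dict.get(word.lower())
    if not pronunciations:
        return None  # word not in the dictionary
    return min(sum(ph[-1].isdigit() for ph in pron) for pron in pronunciations)

# count_syllables('banana', cmudict.dict()) -> 3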
from collections import Counter

import nltk
import pandas
from nltk.corpus import stopwords
from nltk.data import find
from nltk.tag import PerceptronTagger


def get_keyphrases(self, textInput, min_freq=2):
    # setting up tagger (from http://stackoverflow.com/a/35964709)
    PICKLE = "averaged_perceptron_tagger.pickle"
    AP_MODEL_LOC = 'file:' + str(find('taggers/averaged_perceptron_tagger/' + PICKLE))
    tagger = PerceptronTagger(load=False)
    tagger.load(AP_MODEL_LOC)

    lemmatizer = nltk.WordNetLemmatizer()
    stemmer = nltk.stem.porter.PorterStemmer()

    # The chunk grammar below is described in: S. N. Kim, T. Baldwin, and
    # M.-Y. Kan. Evaluating n-gram based evaluation metrics for automatic
    # keyphrase extraction. Technical report, University of Melbourne,
    # Melbourne 2010.
    StopWords = stopwords.words('english')

    def leaves(tree):
        """Finds NP (noun phrase) leaf nodes of a chunk tree."""
        for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
            yield subtree.leaves()

    def acceptable_word(word):
        """Checks conditions for an acceptable word: length, stopword."""
        return 2 < len(word) and word.lower() not in StopWords

    def normalise(word):
        """Normalises a word to lowercase, then stems and lemmatizes it."""
        word = word.lower()
        word = stemmer.stem(word)
        word = lemmatizer.lemmatize(word)
        return word

    def get_terms(tree):
        for leaf in leaves(tree):
            # replace normalise with w.lower() to skip stemming
            term = [normalise(w) for w, t in leaf if acceptable_word(w)]
            yield term

    def get_nounPhrases(textInput, minWordLength=2):
        grammar = r"""
            NBAR:
                {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns

            NP:
                {<NBAR>}
                {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
        """
        chunker = nltk.RegexpParser(grammar)
        toks = nltk.word_tokenize(textInput)
        postoks = tagger.tag(toks)
        tree = chunker.parse(postoks)
        nounPhraseList = [' '.join(term) for term in get_terms(tree)]
        # keep only phrases with at least minWordLength words
        return [phrase for phrase in nounPhraseList
                if len(phrase.split()) >= minWordLength]

    counter = Counter()
    for nounPhrase in get_nounPhrases(textInput):
        counter.update([nounPhrase])

    keyphraseDF = pandas.DataFrame(
        [[key, value] for key, value in counter.items() if value >= min_freq],
        columns=['keyphrase_stemmed', 'frequency'])
    (docsDF, occurrenceDF) = self.get_occurrence(keyphraseDF)
    print("docs", docsDF)
    print("keys", keyphraseDF)
    keyphraseDF = keyphraseDF.join(docsDF["docs"])
    print(occurrenceDF)
    keyphraseDF = keyphraseDF.join(
        self.get_fullphrases(keyphraseDF=keyphraseDF)["keyphrase_full"])
    keyphraseDF = keyphraseDF.join(self.get_MIs(occurrenceDF=occurrenceDF)["MI"])
    keyphraseDF = keyphraseDF.join(
        self.get_PMIs(occurrenceDF=occurrenceDF, metric="sentiment_class",
                      value="positive")["PMI_pos"])
    keyphraseDF = keyphraseDF.join(
        self.get_PMIs(occurrenceDF=occurrenceDF, metric="sentiment_class",
                      value="negative")["PMI_neg"])
    return keyphraseDF
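# To see what the NBAR/NP chunk grammar matches on its own, here is a
# standalone sketch using the same rules (the sample sentence and printed
# phrases are illustrative, and nltk.pos_tag uses the downloader-backed
# tagger rather than the pickle-loading trick above):
import nltk

grammar = r"""
    NBAR:
        {<NN.*|JJ>*<NN.*>}
    NP:
        {<NBAR>}
        {<NBAR><IN><NBAR>}
"""
chunker = nltk.RegexpParser(grammar)
sentence = "Automatic keyphrase extraction of scientific articles is an open problem."
tree = chunker.parse(nltk.pos_tag(nltk.word_tokenize(sentence)))
for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
    print(' '.join(w for w, t in subtree.leaves()))
# e.g. "Automatic keyphrase extraction of scientific articles", "open problem"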
from collections import Counter

import numpy as np
import nltk.data
from nltk.tag import PerceptronTagger

# good_actors, good_actions, tokenized_actions, tokenized_actors and
# word_sim are assumed to be defined earlier in the script
count_good_actors = Counter(good_actors)
count_good_actions = Counter(good_actions)

# number of statements
nos = len(tokenized_actions)
# number of good actors
noga = len(count_good_actors)
# number of good actions
nogc = len(count_good_actions)

# load the perceptron tagger from the pickle shipped with nltk_data
PICKLE = "taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle"
_nltk_pos_tagger = PerceptronTagger(load=False)
_nltk_pos_tagger.load(PICKLE)

print(count_good_actors)

# similarity matrix: one row per statement, one column per
# good-actor/good-action feature
S = np.zeros(shape=(nos, noga + nogc))
i = 0
for sent_pos in tokenized_actors:
    for token1 in sent_pos:
        j = 0
        tt1 = _nltk_pos_tagger.tag([token1])
        for feature in count_good_actors:
            ft = _nltk_pos_tagger.tag([feature])
            simval = word_sim(tt1[0], ft[0], i)
            S[i][j] = S[i][j] + simval
            j = j + 1
    i = i + 1
i = 0
import nltk
from nltk.tag import PerceptronTagger
from nltk.data import find


def extract_entity_names(t, label):
    """Recursively collect the leaves of all subtrees carrying `label`."""
    entity_names = []
    if hasattr(t, 'label') and t.label:
        if t.label() == label:
            entity_names.append(' '.join([child[0] for child in t]))
        else:
            for child in t:
                entity_names.extend(extract_entity_names(child, label))
    return entity_names


# load the perceptron tagger directly, bypassing the downloader check
PICKLE = "averaged_perceptron_tagger.pickle"
AP_MODEL_LOC = 'file:' + str(find('taggers/averaged_perceptron_tagger/' + PICKLE))
tagger = PerceptronTagger(load=False)
tagger.load(AP_MODEL_LOC)
pos_tag = tagger.tag

tokenized = nltk.word_tokenize(
    'The quick brown fox named Ron and Donald Trump jumps over the lazy dog')
tagged = pos_tag(tokenized)
namedEnt = nltk.ne_chunk(tagged, binary=True)  # binary=True labels every entity 'NE'
names = extract_entity_names(namedEnt, 'NE')
print(names)
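# With binary=False, nltk.ne_chunk assigns typed labels (PERSON, GPE,
# ORGANIZATION, ...) instead of the generic 'NE', so the same helper can
# pull out a single entity type; the printed result is illustrative:
typedEnt = nltk.ne_chunk(tagged, binary=False)
people = extract_entity_names(typedEnt, 'PERSON')
print(people)  # e.g. ['Ron', 'Donald Trump']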
import glob

from nltk.tag import PerceptronTagger
from nltk.data import find

# code for loading perceptron tagger
PICKLE = "averaged_perceptron_tagger.pickle"
AP_MODEL_LOC = 'file:' + str(find('taggers/averaged_perceptron_tagger/' + PICKLE))
tagger = PerceptronTagger(load=False)
tagger.load(AP_MODEL_LOC)
pos_tag = tagger.tag

# lists to store the POS and NP lists generated from each file
GlobalPOSList = []
GlobalNPList = []

# getting filenames of dataset files
fileList = glob.glob("C:/Users/Vinod Chhapariya/Desktop/TDBMS/Benchmark Dataset/*.txt")

# printing filenames
for filename in fileList:
    print(filename)

# POS tagging using perceptron tagger
for filename in fileList:
    POSList = []
    NPList = []
    filePOSTagWrite = open(filename + "_POSTag_Perceptron", 'w')
    for line in open(filename, 'r').readlines():
        tags = pos_tag(line.split())
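        # The original snippet is truncated here; a plausible continuation,
        # assuming each line's tags should be collected and written out:
        POSList.append(tags)
        filePOSTagWrite.write(str(tags) + "\n")
    filePOSTagWrite.close()
    GlobalPOSList.append(POSList)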