def clean_words(tokens, filterStopwords=False, filterPos=None):
    cleanTokens = []
    stopwordList = stopwords.words('spanish')
    if filterPos:
        tagger = StanfordPOSTagger('stanford/models/spanish.tagger',
                                   'stanford/stanford-postagger.jar',
                                   encoding='utf8')
    for token in tokens:
        cleanToken = token
        for char in string.punctuation:
            cleanToken = cleanToken.replace(char, "")
        if filterPos and not filterStopwords:
            res = tagger.tag([cleanToken])
            if len(res) > 0:
                word, pos = res[0]
                if pos[0] in filterPos:
                    cleanTokens.append(cleanToken)
        elif filterStopwords and not filterPos:
            if cleanToken not in stopwordList:
                cleanTokens.append(cleanToken)
        elif filterStopwords and filterPos:
            res = tagger.tag([cleanToken])
            if len(res) > 0:
                word, pos = res[0]
                if cleanToken not in stopwordList and pos[0] in filterPos:
                    cleanTokens.append(cleanToken)
        elif not filterStopwords and not filterPos:
            cleanTokens.append(cleanToken)
    return cleanTokens
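# A minimal sketch of a faster variant, assuming the same model/jar paths as
# above: each StanfordPOSTagger.tag() call launches a fresh Java process, so
# tagging the whole token list in one call (instead of one call per token) is
# far faster and also gives the tagger sentence context.
from nltk.tag import StanfordPOSTagger

def clean_words_batched(tokens, filterPos):
    tagger = StanfordPOSTagger('stanford/models/spanish.tagger',
                               'stanford/stanford-postagger.jar',
                               encoding='utf8')
    tagged = tagger.tag(tokens)  # single Java invocation for all tokens
    return [word for word, pos in tagged if pos[0] in filterPos]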
def tagWordsInSentences(self, studying, entry):
    '''Tags the part of speech for each word.'''
    jar_path = 'stanford-postagger-full/stanford-postagger.jar'
    if studying in self.english:
        words = parseWordsFromEntry(entry)
        tagged_words = tagWords(words)
        return tagged_words
    elif (studying in self.japanese or studying in self.korean
          or studying in self.mandarin):
        #segmenter = TinySegmenter()
        #words = segmenter.tokenize(entry)
        rm = RakutenMA()
        tagged_words = rm.tokenize(entry)
        #mecab = Mecab()
        #tagged_words = mecab.pos(entry)
        return tagged_words
    else:
        if studying in self.spanish:
            model_path = 'stanford-postagger-full/models/spanish.tagger'
            words = parseWordsFromEntry(entry)
        elif studying in self.french:
            model_path = 'stanford-postagger-full/models/french.tagger'
            words = parseWordsFromEntry(entry)
        postagger = StanfordPOSTagger(model_path, jar_path, encoding='utf8')
        tagged_words = postagger.tag(words)
        return tagged_words
def posTagging():
    myNounPhrases = []
    myCompletePOSStructure = []
    a = ['NNP', 'NNPS']  # Avoid NN, NNS. Only NNP, NNPS for purpose of NER.
    print '######## POS'
    english_postagger = StanfordPOSTagger(
        './Masters-Passau/stanford-postagger-full-2016-10-31/models/english-bidirectional-distsim.tagger',
        './Masters-Passau/stanford-postagger-full-2016-10-31/stanford-postagger.jar')
    #abc = english_postagger.tag('Steve Jobs was Founder of Apple. He was born in United States of America'.split())
    abc = english_postagger.tag('Who was the CEO of IBM'.split())
    print abc
    for number in abc:
        #print number[0], number[1]
        someTup = (number[0].encode('utf8'), number[1].encode('utf8'))
        #print someTup
        myCompletePOSStructure.append(someTup)
        #print unicodedata.normalize('NFKD', someTup[0]).encode('ascii', 'ignore')
    print myCompletePOSStructure
    for number in abc:
        if any(x in number for x in a):
            #print number
            #split1 = str(number).split(',')
            #split2 = str(split1[0]).split('u')
            #print split2[1].replace("'", "")
            myNounPhrases.append(number)
class POS_tagger_stanford(object):
    def __init__(self):
        """ Initializes the tagger object """
        self.model = TAGGER_MODEL
        self.jar_file = POS_TAGGER_JAR_FILE
        self.tagger = StanfordPOSTagger(self.model, self.jar_file)
        self.tagger_type = STANFORD_TAGGER_NAME

    def get_tags(self, sentence):
        """ Gets the tags for a tokenized sentence.

        The full list of tags is available online:
        https://nlp.stanford.edu/software/spanish-faq.shtml

        Args:
            sentence (list): the sentence used to obtain the POS tags,
                each word is an element in the list

        Returns:
            tags (list): list containing both the word and its corresponding tag
        """
        #tagger = self.get_tagger()
        tags = self.tagger.tag(sentence)
        return tags
def getUsername(message, *args):
    pos_tagger = StanfordPOSTagger(model, jar, encoding="utf-8")
    words = nltk.word_tokenize(message.lower())
    tagged_words = pos_tagger.tag(words)
    sug_usernames = []
    # Check if a previous username input was passed
    if len(args) > 0:
        previous_username = args[0]
        sug_usernames = [
            word for word, tag in tagged_words
            if tag in ['NN', 'NNP', 'FW', 'NNPS'] and word != previous_username
        ]
    else:
        sug_usernames = [
            word for word, tag in tagged_words
            if tag in ['NN', 'NNP', 'FW', 'NNPS']
        ]
    if len(sug_usernames) > 0:
        # return the last suggested username
        if getSentenceSentiment(message) == 'pos':
            return sug_usernames[-1]
        else:
            return sug_usernames[-1] + 'salt123'
    return 'randomuser567user'
def extractor():
    st = StanfordPOSTagger(
        '../stanford-postagger-full-2015-12-09/models/english-bidirectional-distsim.tagger',
        '../stanford-postagger-full-2015-12-09/stanford-postagger-3.6.0.jar')
    nouns = []
    pnouns = []
    i = 0
    with open('../data/scraped_text_NYT.txt', 'r', encoding='utf-8') as inputFile:
        comment = inputFile.readline()
        while comment != "":
            sentences = sent_tokenize(comment, 'english')
            for sent in sentences:
                if sent.strip() == "":
                    continue
                pos_tags = st.tag(sent.split())
                for pos_tag in pos_tags:
                    if pos_tag[1] == 'NN' or pos_tag[1] == 'NNS':
                        nouns = nouns + [pos_tag[0]]
                    elif pos_tag[1] == 'NNP' or pos_tag[1] == 'NNPS':
                        pnouns = pnouns + [pos_tag[0]]
            i = i + 1
            print(i)
            print(comment)
            comment = inputFile.readline()
    outFile = open('../data/nouns_scraped_text_NYT.txt', 'a')
    outFile.write('NOUNS:\n')
    for noun in nouns:
        outFile.write(noun + "\n")
    outFile.write('\n\nPNOUNS:\n')
    for pnoun in pnouns:
        outFile.write(pnoun + '\n')
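# Sketch of a faster variant of the per-sentence loop above (same st instance
# assumed): every st.tag() call launches a new JVM, so NLTK's tag_sents(),
# which tags a whole batch of tokenized sentences in one Java invocation, is
# much faster on a large file.
sentences = ["The first sentence.", "And a second one."]
tagged_batch = st.tag_sents([sent.split() for sent in sentences])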
def tagged_def():
    java_path = "C:/ProgramData/Oracle/Java/javapath"
    os.environ['JAVAHOME'] = java_path
    # The second constructor argument must be the tagger jar, not the model file.
    tagger = StanfordPOSTagger(
        'F:/eclipse_doctor/KnowledgeGraph/stanford-pos/english-bidirectional-distsim.tagger',
        'F:/eclipse_doctor/KnowledgeGraph/stanford-pos/stanford-postagger-3.7.0.jar'
    )
    path_data = "data" + os.sep + "items_tagged_modified.json"
    data = json.load(codecs.open(path_data, encoding='UTF-8'))
    for item in data:
        pos2definition = item["pos2definition"]
        for pos2def in pos2definition:
            definition = pos2def["definition"]
            # print chardet.detect(definition)
            print definition.encode('gbk')
            definition_pure = re.sub(r'\([\s\S]*?\)', "", definition)
            tokens = nltk.word_tokenize(definition_pure)
            # print tokens
            for token in tokens:
                print chardet.detect(token)
            tagged_tokens = tagger.tag(definition_pure.encode('utf-8').split())
            pos2def['tagged_def'] = tagged_tokens
    path_tagged_output = "items_tagged_auto.json"
    json.dump(data, codecs.open(path_tagged_output, 'w', 'utf-8'),
              ensure_ascii=False, indent=2)
def test_POSSent():
    import nltk
    from nltk.tag.stanford import StanfordPOSTagger
    java_path = "C:/ProgramData/Oracle/Java/javapath"
    os.environ['JAVAHOME'] = java_path
    tagger = StanfordPOSTagger(
        'F:/eclipse_doctor/KnowledgeGraph/stanford-pos/english-bidirectional-distsim.tagger',
        'F:/eclipse_doctor/KnowledgeGraph/stanford-pos/stanford-postagger-3.7.0.jar'
    )
    # Each assignment below overwrites the previous one; only the last
    # sentence is actually tagged.
    sent = 'abutment is a tooth, root, or implant used for support and retention of a fixed or removable prosthesis.'
    sent = 'angulated abutment is an abutment whose body is not parallel to the long axis of the implant. It is utilized when the implant is at a different inclination in relation to the proposed prosthesis.'
    sent = u"substance abuse is the misuse of legal or illegal substances with the intent to alter some aspect of the user's experience. May include medications, illicit drugs, legal substances with potential mood-altering effects, or substances whose primary use may not be for human consumption."
    # print chardet.detect(sent)
    tokens = nltk.word_tokenize(sent)
    # print tagger.tag(sent.split())
    print tagger.tag(tokens)
def test_StanfordAndNLTKPOS():
    import nltk
    from nltk.tag.stanford import StanfordPOSTagger
    # Each assignment below overwrites the previous one; only the last
    # sentence is actually tagged.
    sent = 'a low-calorie sweetener that reduces caries activity and the growth and transmission of S. mutans.'
    sent = 'a wire formed by drawing a cast structure through a die; used in dentistry for partial denture clasps and orthodontic appliances.'
    sent = 'readily stained with acid dyes.'
    print chardet.detect(sent)
    # sent = 'technique metered spray refers to a topical anesthetic dispersal technique that controls the amount and rate at which a drug is administered.'
    # sent = 'older term for a traumatic ulcer of the oral mucosa.'
    # sent = 'one or more vertically parallel surfaces of abutment teeth shaped to direct the path of placement and removal of a remarkable partial denture. Also called guiding plane.'
    # sent = 'agents that bond, seal, or cement particles or objects together.'
    # sent = 'teeth that are at such an angle as to cause them to be out of centric contact with opposing teeth during occlusion.'
    start = datetime.now()
    text = nltk.word_tokenize(sent)
    nltk_pos = nltk.pos_tag(text)
    java_path = "C:/ProgramData/Oracle/Java/javapath"
    os.environ['JAVAHOME'] = java_path
    # The second constructor argument must be the tagger jar, not the model file.
    stanford_tagger = StanfordPOSTagger(
        'F:/eclipse_doctor/KnowledgeGraph/stanford-pos/english-bidirectional-distsim.tagger',
        'F:/eclipse_doctor/KnowledgeGraph/stanford-pos/stanford-postagger-3.7.0.jar'
    )
    stanford_pos = stanford_tagger.tag(text)
    print 'nltk_pos: ' + str(nltk_pos)
    print 'stanford_pos: ' + str(stanford_pos)
class POS(object):
    """Part of Speech tagging using the Stanford POS tagger."""

    STANFORD_POS = os.path.join(PACKAGE_ROOT, 'language', 'stanford-pos')
    STANFORD_POS_JAR = os.path.join(STANFORD_POS, 'stanford-postagger.jar')
    STANFORD_POS_TAGGER = os.path.join(
        STANFORD_POS, 'models/english-bidirectional-distsim.tagger')

    def __init__(self):
        self._tagger = StanfordPOSTagger(POS.STANFORD_POS_TAGGER,
                                         path_to_jar=POS.STANFORD_POS_JAR)

    def tag(self, tokens):
        """
        Tag Part of Speech using the Stanford POS tagger

        Parameters
        ----------
        tokens

        Returns
        -------
        POS: list of tuples of strings
        """
        return self._tagger.tag(tokens)
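# A minimal usage sketch of the POS wrapper above (assumes the stanford-pos
# directory exists under PACKAGE_ROOT as described):
pos = POS()
print(pos.tag(['What', 'is', 'the', 'airspeed', 'of', 'an', 'unladen', 'swallow', '?']))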
def pos_tagging(docs, stanford_path, pos_tagger):
    print("\nGenerating Part-of-Speech tags...")

    # Configuring the Stanford NLP POS tagger
    path_to_model = "{}/models/{}.tagger".format(stanford_path, pos_tagger)
    path_to_jar = "{}/stanford-postagger.jar".format(stanford_path)
    tagger = StanfordPOSTagger(model_filename=path_to_model,
                               path_to_jar=path_to_jar)
    # Setting a higher memory limit for long sentences
    tagger.java_options = '-mx8192m'

    data = []
    for doc in progressbar.progressbar(docs):
        # Obtain the list of tokens in the document
        tokens = [t for t, label in doc]
        try:
            # Perform POS tagging
            tagged = tagger.tag(tokens)
        except Exception:
            # Skip documents the tagger cannot process rather than aborting
            continue
        # Take the word, POS tag, and its label
        data.append([(w, pos, label)
                     for (w, label), (word, pos) in zip(doc, tagged)])
    return data
def part_of_speech_tagging(
        self, words: List[str],
        multi_word_name_entities: Set[str]) -> List[Tuple[str, str]]:
    """
    perform part-of-speech tagging using StanfordPOSTagger
    :param words: a list of words in a sentence
    :param multi_word_name_entities: a set of multi-word name entities
    :return: part-of-speech tags of the sentence
    """
    # define pos tagger
    path_to_model = 'stanford/pos/english-bidirectional-distsim.tagger'
    path_to_jar = 'stanford/pos/stanford-postagger.jar'
    pos_tagger = StanfordPOSTagger(path_to_model, path_to_jar)
    stan_pos_tag = pos_tagger.tag(words[:-1])  # omit the last period
    normal_pos_tag = nltk.pos_tag(words[:-1])  # omit the last period
    # print('Stanford POS tagging:', stan_pos_tag)  # for comparison
    # print('nltk.pos_tag tagging:', normal_pos_tag)  # for comparison

    def post_treatment(stan_pos_tag: List[Tuple[str, str]],
                       norm_pos_tag: List[Tuple[str, str]],
                       multi_word_name_entities: Set[str]) -> None:
        """
        combine the multi-word name entities

        nltk.pos_tag labels a multi-word name entity as a single token, so
        stan_pos_tag is corrected here using norm_pos_tag. The problem with
        norm_pos_tag is that it often mislabels words, which is why
        StanfordPOSTagger is preferred for the tags themselves.
        :param stan_pos_tag: pos-tags of the sentence from the Stanford tagger
        :param norm_pos_tag: pos-tags of the sentence from nltk.pos_tag
        :param multi_word_name_entities: a set of multi-word name entities
        """
        stan_i = 0
        norm_i = 0
        # len() is re-evaluated each iteration because pop() below shrinks
        # stan_pos_tag; caching the length would risk an IndexError.
        while stan_i < len(stan_pos_tag) and norm_i < len(norm_pos_tag):
            stan_word, stan_pos = stan_pos_tag[stan_i]
            norm_word, norm_pos = norm_pos_tag[norm_i]
            # check if the word starts a multi-word name entity
            if (stan_word == norm_word.split(' ')[0]
                    and norm_word in multi_word_name_entities):
                # scan the following words in stan_pos_tag and combine them
                # if they form the multi-word entity
                temp_i = stan_i + 1
                match_idx = 1
                entities = norm_word.split(' ')
                while temp_i < len(stan_pos_tag) and match_idx < len(entities):
                    temp_word, temp_pos = stan_pos_tag[temp_i]
                    if temp_word == entities[match_idx]:
                        _ = stan_pos_tag.pop(temp_i)
                        match_idx += 1
                    else:
                        break
                stan_pos_tag[stan_i] = (norm_word, stan_pos)
            stan_i += 1
            norm_i += 1

    post_treatment(stan_pos_tag, normal_pos_tag, multi_word_name_entities)
    return stan_pos_tag
class POSTagger:
    """POSTagger creates a POS tagger for the German language. Different
    taggers are available to use."""

    STAN = "stanford-hgc-tagger"
    SFT = "stanford-fast-tagger"
    TT = "tree-tagger"
    SPACY = "spacy-tagger"

    # paths to Stanford tagger modules
    __path_to_jar = "C:/Users/din_m/MA/Stanford Tagger/stanford-postagger.jar"
    __model_file_name = "C:/Users/din_m/MA/Stanford Tagger/models/"

    def __init__(self, tagger):
        """Initialize a new POS tagger. Takes the tagger parameter as an
        argument to define the kind of tagger."""
        self.__tokenizer = StanfordTokenizer(path_to_jar=POSTagger.__path_to_jar)
        if tagger == POSTagger.STAN:
            self.tagger_name = POSTagger.STAN
            self.__tagger = StanfordPOSTagger(
                path_to_jar=POSTagger.__path_to_jar,
                model_filename=POSTagger.__model_file_name + "german-hgc.tagger")
        elif tagger == POSTagger.SFT:
            self.tagger_name = POSTagger.SFT
            self.__tagger = StanfordPOSTagger(
                path_to_jar=POSTagger.__path_to_jar,
                model_filename=POSTagger.__model_file_name + "german-fast.tagger")
        elif tagger == POSTagger.TT:
            self.tagger_name = POSTagger.TT
            self.__tagger = treetaggerwrapper.TreeTagger(TAGLANG='de')
        # SpaCy takes really long to initialize (about 5-7 minutes), but
        # performs well and fast afterwards
        elif tagger == POSTagger.SPACY:
            self.tagger_name = POSTagger.SPACY
            self.__tagger = spacy.load('de')
        else:
            raise Exception("Wrong tagger parameter.")

    def tag(self, text):
        """POS tag tokenized text."""
        if self.tagger_name == POSTagger.SFT or self.tagger_name == POSTagger.STAN:
            tokens = self.__tokenizer.tokenize(text)
            return self.__tagger.tag(tokens)
        elif self.tagger_name == POSTagger.TT:
            tags = self.__tagger.tag_text(text)
            tuple_list = []
            tag_list = treetaggerwrapper.make_tags(tags)
            for item in tag_list:
                tuple_list.append((item[0], item[1]))
            return tuple_list
        elif self.tagger_name == POSTagger.SPACY:
            tags = self.__tagger(text)
            tuple_list = []
            for word in tags:
                tuple_list.append((word.orth_, word.tag_))
            return tuple_list
        else:
            pass

#tagger = POSTagger("spacy-tagger")
#doc = tagger.tag(u"Bei mir zu Hause denken sie bestimmt, daß ich noch krank sei.")
#print(tagger.tag("Ich werde morgen in die Schule gehen."))
#print(tagger.tag("Hat Aglaja den Brief etwa der Alten gezeigt?«"))
def getTags(sen_arr):
    tag_arr = []
    st = StanfordPOSTagger('english-left3words-distsim.tagger')
    res = st.tag(sen_arr)
    for i in res:
        tag = i[1].encode("utf-8")
        tag_arr.append(tag)
    return tag_arr
def determine_sentpos_by_nltk(self, sentence):
    '''
    get pos collection for sentence from nltk
    '''
    pos_model_file = "C://python34/ProjectDragonWolf/nlp_res/stanford_pos/models/english-bidirectional-distsim.tagger"
    pos_jar_file = "C://python34/ProjectDragonWolf/nlp_res/stanford_pos/stanford-postagger.jar"
    pos = StanfordPOSTagger(model_filename=pos_model_file,
                            path_to_jar=pos_jar_file)
    return pos.tag(sentence.split(" "))
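# Sketch: splitting on " " (as above) leaves punctuation glued to words
# ("station?"), which degrades tag quality; nltk.word_tokenize separates
# punctuation into its own tokens. Model and jar paths are the same ones
# assumed above.
import nltk
pos = StanfordPOSTagger(model_filename=pos_model_file, path_to_jar=pos_jar_file)
tagged = pos.tag(nltk.word_tokenize("Where is the nearest station?"))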
def pos_tagger(text):
    from nltk.tag.stanford import StanfordPOSTagger
    english_postagger = StanfordPOSTagger(
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/models/english-left3words-distsim.tagger',
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/stanford-postagger.jar'
    )
    english_postagger.java_options = '-mx4096m'
    tags = english_postagger.tag(text)
    return tags
def pos_tag(to_tag, stanford_postagger_path):
    '''Tag the tokens with part of speech. to_tag is the list of tokens to
    tag; stanford_postagger_path is the directory containing the Stanford
    POS tagger model and jar file.'''
    # create a POS tagger object that reads input encoded in UTF-8
    pos_tagger = StanfordPOSTagger(
        stanford_postagger_path + "\\models\\french.tagger",
        stanford_postagger_path + "\\stanford-postagger.jar",
        encoding='utf8')
    # run the tagging algorithm on the tokenized raw text
    tags = pos_tagger.tag(to_tag)
    return tags
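# Equivalent path construction with os.path.join, which avoids hand-escaped
# Windows separators (a sketch, assuming the same directory layout as above):
import os
model_path = os.path.join(stanford_postagger_path, "models", "french.tagger")
jar_path = os.path.join(stanford_postagger_path, "stanford-postagger.jar")
pos_tagger = StanfordPOSTagger(model_path, jar_path, encoding='utf8')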
def posInput(text):
    print("POS")
    path_to_model = "./stanford-postagger/models/english-caseless-left3words-distsim.tagger"
    path_to_jar = "./stanford-postagger/stanford-postagger.jar"
    tagger = StanfordPOSTagger(path_to_model, path_to_jar)
    tagger.java_options = '-mx4096m'  # setting a higher memory limit for long sentences
    # sentence = 'THIS IS TESTING'
    result = tagger.tag(word_tokenize(text))
    # print(result)
    return result
def transform_wnli(premise, hypothesis):
    # transform WNLI examples back into WSC format
    cased_premise = premise
    premise = [w.lower() for w in nltk.word_tokenize(premise)]
    hypothesis = [w.lower() for w in nltk.word_tokenize(hypothesis)]
    best_target = ["", "", "", "", "", ""]  # should get overwritten
    best_masked_s = []
    for l in range(len(hypothesis)):
        for r in range(l + 1, l + 6):
            left_part = hypothesis[:l]
            right_part = hypothesis[r:]
            pattern = left_part + ["_"] + right_part
            for s in range(len(premise)):
                ok = True
                if s + len(pattern) > len(premise):
                    break
                for a, b in zip(pattern, premise[s:s + len(pattern)]):
                    if a == "_":
                        continue
                    if a == b:
                        continue
                    if a in [',', '.', '?', '!'] and b in [',', '.', '?', '!']:
                        # punctuation is ignored
                        continue
                    ok = False
                    break
                if ok and len(hypothesis[l:r]) <= len(best_target):
                    best_target = hypothesis[l:r]
                    best_masked_s = premise[:s] + pattern + premise[s + len(pattern):]
    if len(best_masked_s) == 0:  # We failed
        return None, None
    # We extracted the masked sentence from the premise.
    global POS_tagger
    if POS_tagger is None:
        os.environ['STANFORD_MODELS'] = "stanford-postagger-2018-10-16/models"
        os.environ['CLASSPATH'] = "stanford-postagger-2018-10-16"
        POS_tagger = StanfordPOSTagger(
            "stanford-postagger-2018-10-16/models/english-left3words-distsim.tagger")
    tagged_premise = POS_tagger.tag(nltk.word_tokenize(cased_premise))
    candidates = []
    current = []
    for word, tag in tagged_premise:
        if tag in ["NN", "NNS", "NNP", "NNPS"]:
            current.append(word)
        else:
            if current != []:
                candidates.append(" ".join(current).lower())
                current = []
    if current != []:
        candidates.append(" ".join(current).lower())
    best_target = " ".join(best_target)
    candidates = [c for c in candidates
                  if c.find(best_target) == -1 and best_target.find(c) == -1]
    candidates = [best_target] + candidates
    found_sentence = " ".join(best_masked_s).replace(" n't", "n't").replace(" 's", "'s")  # Sorry nltk
    return found_sentence, candidates
def create_pos(self, tweet):
    self.pos_tweet = None
    tweet = word_tokenize(tweet.lower())
    english_pos = StanfordPOSTagger(
        'postagger/models/english-bidirectional-distsim.tagger',
        'postagger/stanford-postagger.jar')
    self.pos_tweet = english_pos.tag(tweet)
    return self.pos_tweet
def tag(tokens):
    #java_path = "C:/Program Files/Java/jdk1.8.0_31/bin/java.exe"
    #os.environ['JAVAHOME'] = java_path
    special_symbols_array = ["the", "a", "an"]
    english_postagger = StanfordPOSTagger(
        'tagger/english-bidirectional-distsim.tagger',
        'tagger/stanford-postagger.jar')
    token_tag_array = english_postagger.tag(tokens)
    # Filter articles out with a comprehension; removing elements from a list
    # while iterating over it skips entries.
    token_tag_array = [element for element in token_tag_array
                       if element[0].lower() not in special_symbols_array]
    return token_tag_array
class String2POSNGramsList(String2TokenList):

    def __init__(self, n=1, tagger_cls='english-left3words-distsim.tagger'):
        # Other Taggers:
        # 1. 'english-bidirectional-distsim.tagger'
        # 2. 'english-left3words-distsim.tagger'
        super(String2POSNGramsList, self).__init__()
        # N-grams size
        self.n = n
        # Tagger class selection... See details in the Stanford Tagger documentation.
        self.tagger_cls = tagger_cls
        # Getting the Stanford tagger instance.
        self.spt = StanfordPOSTagger(self.tagger_cls)
        # self.spt = CoreNLPPOSTagger(url='http://localhost:9000')
        self.spt.java_options = '-mx10g'

    @property
    def N(self):
        return self.n

    @N.setter
    def N(self, value):
        self.n = value

    @property
    def Tagger_cls(self):
        return self.tagger_cls

    @Tagger_cls.setter
    def Tagger_cls(self, value):
        self.tagger_cls = value

    def terms_lst(self, text):
        # Getting the analysed list of tokens.
        analyzed_terms_lst = self.token_lst(text)
        # Tagging the analyzed terms list and getting the tags list as terms.
        pos_tags = [pos for t, pos in self.spt.tag(analyzed_terms_lst)]
        # Constructing the POS-tag N-grams list.
        analyzed_terms_lst = [
            " ".join(pos_tags[i: i + self.n])
            for i in range(len(pos_tags) - self.n + 1)
        ]
        return analyzed_terms_lst
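# The commented-out CoreNLPPOSTagger line above points at the newer approach:
# run the CoreNLP server once and tag over HTTP instead of launching a JVM per
# tag() call. A minimal sketch with nltk.parse.corenlp, assuming a CoreNLP
# server is already listening on localhost:9000:
from nltk.parse.corenlp import CoreNLPParser

pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
print(list(pos_tagger.tag('What is the airspeed of an unladen swallow ?'.split())))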
class StanfordPOS():
    def __init__(self, model_filename, jarfile):
        self.model_filename = model_filename
        self.path_to_jar = jarfile
        self.tager = StanfordPOSTagger(model_filename=self.model_filename,
                                       path_to_jar=self.path_to_jar)

    def tagger(self, X):
        transformed_X = []
        for doc in X:
            res = self.tager.tag(doc)
            transformed_X.append(np.array(res))
        return transformed_X
def _POS(self, txt, id):
    self.df[['ID', 'pos']].to_csv('pos_ner.csv', sep='\t')
    path_pos = '/home/ise/NLP/stanfordNLP/stanford-postagger-full-2017-06-09/stanford-postagger.jar'
    model_path = '/home/ise/NLP/stanfordNLP/stanford-postagger-full-2017-06-09/models/english-bidirectional-distsim.tagger'
    from nltk.tag.stanford import StanfordPOSTagger
    tagger = StanfordPOSTagger(model_path, path_pos)
    tagger.java_options = '-mx8096m'  # setting a higher memory limit for long sentences
    tokens = nltk.word_tokenize(txt)
    pos_res = tagger.tag(tokens)
    filepath = '/home/ise/NLP/NLP3/pos/pos_{}.txt'.format(id)
    with open(filepath, 'w') as file_handler:
        for item in pos_res:
            file_handler.write("{}\n".format(item))
    return pos_res
def transform_to_pos(text):
    import os
    #os.environ['JAVAHOME'] = java_path
    from nltk.corpus import sentiwordnet as swn
    from nltk.tag.stanford import StanfordPOSTagger
    from nltk import word_tokenize

    path_to_model = "./postagging/english-bidirectional-distsim.tagger"
    path_to_jar = "./postagging/stanford-postagger.jar"
    tagger = StanfordPOSTagger(path_to_model, path_to_jar)
    tagger.java_options = '-mx4096m'  # setting a higher memory limit for long sentences

    tokens = word_tokenize(text)
    size = len(tokens)

    from collections import Counter
    pos = tagger.tag(tokens)
    counts = Counter(tag for word, tag in pos)
    # normalize tag counts and punctuation counts by the token count
    for key in counts:
        counts[key] /= size
    counts["totalWordsCount"] = size
    counts[";"] = tokens.count(";") / size
    counts["questionmarks"] = tokens.count("?") / size
    counts["exclamationmarks"] = tokens.count("!") / size
    counts["Quotes"] = tokens.count("\"") / size
    try:
        counts.pop(".")
    except KeyError:
        pass

    from collections import OrderedDict
    ot = [
        'NNP', 'VBD', 'VBN', 'IN', 'CD', 'VBP', ',', 'DT', 'NN', 'JJ', 'RB',
        'TO', 'SYM', 'PRP', 'NNS', 'CC', 'PRP$', 'POS', 'FW', 'VBG', ':',
        'WRB', 'EX', 'JJR', 'WDT', 'totalWordsCount', ';', 'questionmarks',
        'exclamationmarks', 'Quotes'
    ]
    counts = OrderedDict(counts)
    # make sure every expected feature key is present
    for key in ot:
        if key not in counts:
            counts[key] = 0
    # drop any tags that are not in the expected feature set
    tmp = counts.copy()
    for key in tmp:
        if key not in ot:
            counts.pop(key, None)
    # reorder the features to match ot
    dab = {}
    for i in ot:
        dab[i] = counts[i]
    counts = dab.copy()
    return counts
def extractPOS(inputFile_data, inputFile_tags, inputFile_version, outputFile_pos):
    f = open(inputFile_tags)
    allTags = set(f.read().split(","))  # Load all tags
    f.close()

    f = open(inputFile_version)
    lines = f.readlines()
    f.close()
    tag_version = []  # tags with version number
    for index, row in enumerate(lines):
        items = row.strip().split()
        if items[0] in allTags:
            for tag in items[1].split(","):
                tag_version.append(tag)
    print "The number of tag_version is: ", len(tag_version)
    tag_version = set(tag_version)

    fw_pos = open(outputFile_pos, "w")
    english_postagger = StanfordPOSTagger(
        '/Users/songshuaichen/Downloads/jars/models/english-bidirectional-distsim.tagger'
    )
    f = open(inputFile_data)
    lines = f.readlines()
    f.close()
    for index, row in enumerate(lines):
        if index % 300 == 0:
            print index, " Finish ", float(index) / len(lines)
        items = row.strip().split(" ")
        # if index >= 5000 and index < 6000 and items[0] in tag_version:
        if items[0] in tag_version:
            fw_pos.write(str(index) + " " + items[0] + " \n")
        if items[0] not in tag_version:
            fw_pos.write(str(index) + " " + items[0] + " ")
            if len(items) > 1:
                text = items[1].split(". ")[0].decode('utf-8')
                pos = english_postagger.tag(text.split())
                for p in pos:
                    fw_pos.write(str(p))
                    fw_pos.write(" ")
            fw_pos.write("\n")
    fw_pos.close()
def build_question_set():
    sv_file = 'data/kprestval_pos_tags.json'
    st = StanfordPOSTagger('english-bidirectional-distsim.tagger')
    meta = load_and_process_metadata('val')
    images = split_data_by_seed(meta, 'kprestval')
    num = len(images)
    pos_tags_dict = {}
    for i, info in enumerate(images):
        question_id = info.question_id
        question = info.question.lower()
        _pos_tags = st.tag(word_tokenize(question))
        pos_tags_dict[question_id] = _pos_tags
        print('\nPOS TAGGER: %d/%d' % (i, num))
        print(_pos_tags)
    save_json(sv_file, {'pos_tags': pos_tags_dict})
class PosTaggerTest(object):
    def __init__(self):
        self.eng_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger')

    def tag(self, sentence):
        # Pass the token list itself; joining the tokens back into a single
        # string would hand the tagger one giant "token".
        tknzr = TweetTokenizer()
        res = self.eng_tagger.tag(tknzr.tokenize(sentence))
        return res

    def show(self, sentence):
        res = dict(self.tag(sentence))
        for key in res:
            # print sentence
            print key, "\t", res[key]
def pos_clean(text_list, java_path, stanford_tagger_path, remove_verbs=True):
    import os
    #java_path = "C:/Program Files/Java/jdk1.8.0_261/bin/java.exe"
    os.environ['JAVAHOME'] = java_path
    #stan_path = "C:/Users/אילנה/Dropbox/jupyter_notebooks/data-science/idc-research/mine/stanford-tagger-4.0.0/"
    from nltk.tag.stanford import StanfordPOSTagger as POS_Tag
    arabic_postagger = POS_Tag(stanford_tagger_path + 'models/arabic.tagger',
                               stanford_tagger_path + '/stanford-postagger.jar')
    text_list_pos = [arabic_postagger.tag(inner_word_list)
                     for inner_word_list in text_list]
    if remove_verbs == True:
        pos_to_remove = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]
        text_list_final = []
        for inner_list in text_list_pos:
            final_inner_list = []
            for pos_tuple in inner_list:
                # sometimes the tuple structure is unstable, so find() is
                # used to locate the "word/TAG" element
                if pos_tuple[0].find("/") >= 0:
                    idx = 0
                else:
                    idx = 1
                if pos_tuple[idx].split("/")[1] not in pos_to_remove:
                    final_inner_list.append(pos_tuple[idx].split("/")[0])
            text_list_final.append(final_inner_list)
    else:
        text_list_final = text_list
    return text_list_final
def getPOSTags(sentence):
    """Generate POS tags with the Stanford POS tagger.

    Uses the Stanford POS tagger as the annotation model.
    https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

    Args:
        sentence: a sequence of sentence text
    Returns:
        tags: POS tags
    """
    pos_path = config['POS_MODEL_PATH']
    pos_jar = config['POS_JAR_PATH']
    postagger = StanfordPOSTagger(pos_path, pos_jar)
    word_tags = postagger.tag(sentence.split())
    # transpose the (word, tag) pairs and keep only the tag column
    tags = [list(t) for t in zip(*word_tags)][1]
    return tags
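# The zip(*...) transpose above is compact but cryptic; an equivalent and more
# direct way to pull out the tag column:
word_tags = [('a', 'DT'), ('quick', 'JJ'), ('fox', 'NN')]
tags = [tag for word, tag in word_tags]  # ['DT', 'JJ', 'NN']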
def pos_tag(series):
    import nltk

    def rem_mentions_hasht(tweet):
        words = tweet.split()
        relevant_tokens = [w for w in words if '@' not in w and '#' not in w]
        return " ".join(relevant_tokens)

    series = series.apply(lambda tweet: rem_mentions_hasht(tweet))

    from nltk.tag.stanford import StanfordPOSTagger
    import os
    java_path = "C:/Program Files/Java/jre1.8.0_111/bin/java.exe"
    os.environ['JAVAHOME'] = java_path
    english_postagger = StanfordPOSTagger(
        os.getcwd() + '\\stanford-postagger-full-2014-08-27\\models\\english-bidirectional-distsim.tagger',
        os.getcwd() + '\\stanford-postagger-full-2014-08-27\\stanford-postagger.jar')
    return series.apply(lambda a: english_postagger.tag(nltk.word_tokenize(a)))
def review_tager(tokenized_reviews):
    st_model_path = r'SPOST/models/english-bidirectional-distsim.tagger'
    st = StanfordPOSTagger(st_model_path, r'SPOST/stanford-postagger.jar')
    results = []
    errors = []
    count = 0
    for review in tokenized_reviews:
        try:
            results.append(st.tag(review))
        except Exception:
            # keep the untagged review and remember which index failed
            print(count)
            errors.append(count)
            results.append(review)
        count += 1
    print('errors for the following indexes\n', errors)
    return results
def get_pos_sentence(sentences_spans, pos_vocab):
    """
    Get POS tags for each sentence (needed to build the end2end system).
    :param sentences_spans: (sentence text, span) pairs to tag
    :param pos_vocab: counter of POS tag frequencies, updated in place
    :return: (pos_sentences, pos_vocab)
    """
    #raw_dir_simple = read.read_from_json('test/test_dir_simple')  #### in folder data/
    #raw_dir_simple = read.read_from_json('clinical_data/train_samples1_simples')
    #raw_dir_simple = read.read_from_json('agriculture_data/raw_dir_simple')
    #raw_dir_simple = ["NYT19980206.0466"]
    english_postagger = StanfordPOSTagger(
        StandforParser,       #### in folder data/
        StandforParser_jar)   #### in folder data/
    english_postagger.java_options = '-mx8000m'
    pos_sentences = list()
    for sent_span in sentences_spans:
        print(sent_span[0])
        text = nltk.word_tokenize(sent_span[0])
        # StanfordPOSTagger fails to tag the underscore, see
        # https://github.com/nltk/nltk/issues/1632. If using nltk 3.2.2,
        # change "word_tags = tagged_word.strip().split(self._SEPARATOR)" in
        # the parse_output function of nltk/tag/stanford.py to
        # "word_tags = tagged_word.strip().rsplit(self._SEPARATOR, 1)"
        # to handle the underscore issue.
        text_pos = english_postagger.tag(text)
        index = 0
        for token in text_pos:
            # Deal with double quotes: in nltk.tokenize's treebank.py, double
            # quotes (") are changed to doubled single forward and backward
            # quotes (`` and ''), so map them back when they were not in the
            # original sentence.
            if text[index] == token[0] and token[0] == "``" and text[index] not in sent_span[0]:
                text_pos[index] = ["\"", "``"]
            if text[index] == token[0] and token[0] == "''" and text[index] not in sent_span[0]:
                text_pos[index] = ["\"", "\'\'"]
            if text[index] == token[0] and token[0] in ['{', '(', '[']:
                text_pos[index] = [token[0], "("]
            if text[index] == token[0] and token[0] in ['}', ')', ']']:
                text_pos[index] = [token[0], ")"]
            pos_vocab[token[1]] += 1
            index += 1
        pos_sentences.append(text_pos)
    return pos_sentences, pos_vocab
'''
Created on Mar 11, 2016

@author: zhongzhu
'''
import os

from nltk.parse.stanford import StanfordDependencyParser
from nltk.parse.stanford import StanfordParser
from nltk.tag import StanfordNERTagger
from nltk.tag.stanford import StanfordPOSTagger

st = StanfordPOSTagger('english-bidirectional-distsim.tagger')
st.tag('What is the airspeed of an unladen swallow ?'.split())

st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
st.tag('Rami Eid is studying at Stony Brook University in NY'.split())

parser = StanfordParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
list(parser.raw_parse("the quick brown fox jumps over the lazy dog"))

dep_parser = StanfordDependencyParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
print [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")]
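# The bare model-name constructors above only work when NLTK can find the
# Stanford jars and models on its own. A sketch of the environment setup they
# rely on (directory paths are illustrative; compare the CLASSPATH and
# STANFORD_MODELS usage in other snippets in this collection):
import os
os.environ['CLASSPATH'] = '/opt/stanford-postagger-full-2016-10-31'
os.environ['STANFORD_MODELS'] = '/opt/stanford-postagger-full-2016-10-31/models'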
os.environ['STANFORD_MODELS'] = \
    'C:/stanford_data/stanford-parser-3.5.2-models.jar'

parser = stanford.StanfordParser(
    model_path="C:/stanford_data/englishPCFG.ser.gz")
# materialize the parse generator so it can be iterated more than once below
parsed_sentences = list(parser.raw_parse(my_sentence))

for i in parsed_sentences:
    for k in i:
        print(k)

# GUI
for line in parsed_sentences:
    for sentence in line:
        sentence.draw()
sys.exit()

st = StanfordPOSTagger(
    r'C:/stanford_data/english-bidirectional-distsim.tagger',
    r'C:/stanford_data/stanford-postagger.jar')
bobo = st.tag(my_sentence.split())
print(bobo)
for i in bobo:
    print(i)
class NltkHelper:

    def __init__(self, text):
        reload(sys)
        sys.setdefaultencoding('utf8')
        self.text = text
        root = os.path.dirname(os.path.realpath(__file__))
        os.environ["STANFORD_PARSER"] = root + "/stanford-postagger/stanford-postagger.jar"
        os.environ["STANFORD_MODELS"] = root + "/stanford-postagger/models/"
        _path_to_model = root + '/stanford-postagger/models/english-bidirectional-distsim.tagger'
        _path_to_jar = root + '/stanford-postagger/stanford-postagger.jar'
        self.stanford = StanfordPOSTagger(_path_to_model, _path_to_jar)
        self.sentences = sent_tokenize(text.encode("utf-8"))
        self.words = word_tokenize(text.encode("utf-8"))
        self.tags = self.stringifyTuples(self.stanford.tag(word_tokenize(text.lower())))
        #cleanWords = self.cleanWords()
        #self.tags = self.stringifyTuples(self.stanford.tag(cleanWords))
        #print self.cleanWords()
        self.taggedBigrams = self.ngramsAndTags(2)
        #print self.words
        #print "Bigrams --> ", self.taggedBigrams
        #print "Tags --> ", self.findTags()

    def personal_names(self):
        output = []
        # e.g. (('reports', 'NNS'), ('claim', 'VBP'))
        for gram in self.taggedBigrams:
            tag1 = gram[0][1]
            tag2 = gram[1][1]
            word1 = gram[0][0]
            word2 = gram[1][0]
            if self.isPersonalName(tag1) and self.isPersonalName(tag2):
                output.append("{0} {1}".format(word1, word2))
        return output

    def isPersonalName(self, tag):
        return tag == "NNP" or tag == "FW"

    def preprocessTitle(self):
        output = ''
        for taggedWord in self.tags:
            word = taggedWord[0]
            tag = taggedWord[1]
            if self.isPersonalName(tag):
                output = "{0} {1}".format(output, word.title())
            else:
                output = "{0} {1}".format(output, word.lower())
        return output

    def ngramsAndTags(self, n):
        output = []
        for i in range(len(self.tags) - n + 1):
            gram = (self.tags[i],)
            for j in range(i + 1, i + n):
                gram += (self.tags[j],)
            output.append(gram)
        return output

    def sortFrequencies(self, ngram):
        return sorted(ngram.items(), key=operator.itemgetter(1), reverse=True)

    def findTags(self):
        #pattern = [("AJ", NOUN/S/FWS), (FW, FW), NOUN, NOUN]
        output = []
        for gram in self.taggedBigrams:
            tag1 = gram[0][1]
            tag2 = gram[1][1]
            word1 = gram[0][0]
            word2 = gram[1][0]
            if (self.isAdj(tag1) and self.isNounOrForeignWord(tag2)
                    or self.isNounOrForeignWord(tag1) and self.isNounOrForeignWord(tag2)):
                output.append("{0} {1}".format(word1, word2))
        return output

    def isAdj(self, tag):
        return tag == 'JJ'

    def isNounOrForeignWord(self, tag):
        nouns = ['NN', 'NNS', 'NNP', 'NNPS', 'FW']
        return tag in nouns

    """
    def bigramsList(self):
        pass
    """

    def stringifyList(self, list):
        output = []
        for tag in list:
            output.append(str(tag.encode('utf-8')))
        return output

    def stringifyTuples(self, tuples):
        output = []
        for tag in tuples:
            output.append((str(tag[0].encode('utf-8')), str(tag[1].encode('utf-8'))))
        return output

    def analyze(self):
        """Returns a list of tuples of tagged words in the text."""
        output = []
        for sentence in self.sentences:
            taggedWords = self.stanford.tag(word_tokenize(sentence.lower()))
            output.append(taggedWords)
        return self.stringifyTuples(taggedWords)

    def filterNounsInText(self):
        """Returns a list of nouns and foreign words."""
        output = set()
        nouns = ['NN', 'NNS', 'NNP', 'NNPS', 'FW']
        for sentence in self.sentences:
            taggedWords = self.stanford.tag(word_tokenize(sentence.lower()))
            for item in taggedWords:
                if item[1] in nouns:
                    output.add(item[0])
        return self.stringifyList(list(output))

    def cleanWords(self):
        input = ''
        for item in self.words:
            input = "{0} {1}".format(input, item)
        input = re.sub('\n+', " ", input)
        input = re.sub('\[[0-9]*\]', "", input)
        input = re.sub(' +', " ", input)
        input = bytes(input)
        input = input.decode('ascii', 'ignore')
        input = input.split(" ")
        cleanInput = []
        for item in input:
            item = item.strip(string.punctuation)
            if len(item) > 1 or (item.lower() == 'a' or item.lower() == 'i'):
                cleanInput.append(item)
        return cleanInput

    def bigramNouns(self, text):
        nouns = self.filterNouns(text)

    def isTagNounOrForeignWord(self, word):
        output = False
        nouns = ['NN', 'NNS', 'NNP', 'NNPS', 'FW']
        taggedWords = self.stanford.tag(word.lower())
        for item in taggedWords:
            if item[1] in nouns:
                output = True
                break
        return output

    def filterNouns(self, input):
        output = set()
        nouns = ['NN', 'NNS', 'NNP', 'NNPS', 'FW']
        sentences = sent_tokenize(input)
        for sentence in sentences:
            taggedWords = self.stanford.tag(word_tokenize(sentence.lower()))
            for item in taggedWords:
                if item[1] in nouns:
                    output.add(item[0])
        nList = list(output)
        return self.stringifyTuples(nList)

    def define(self, word):
        definitions = []
        try:
            synsets = wn.synsets(word)
            for synset in synsets:
                definitions.append(synset.definition())
        except ValueError:
            print "Cannot define '{0}'".format(word)
        return definitions

    def sentenceExamples(self, noun):
        output = []
        try:
            synsets = wn.synsets(noun)
            for synset in synsets:
                examples = synset.examples()
                for example in examples:
                    output.append(example)
        except (ValueError, AttributeError):
            print "Cannot find any example for '{0}'".format(noun)
        return output
class NLTKHelper(object):
    """docstring for NLTKHelper"""

    def __init__(self, text):
        reload(sys)
        sys.setdefaultencoding('utf8')
        self.text = text
        root = os.path.dirname(os.path.realpath(__file__))
        os.environ["STANFORD_PARSER"] = root + ''
        os.environ["STANFORD_MODELS"] = root + ''
        _path_to_model = root + ''
        _path_to_jar = root + ''
        self.stanford = StanfordPOSTagger(_path_to_model, _path_to_jar)
        self.sentences = sent_tokenize(text.encode("utf-8"))
        self.words = word_tokenize(text.encode("utf-8"))
        self.tags = self.stringifyTuples(self.stanford.tag(word_tokenize(text.lower())))
        #cleanWords
        self.taggedBigrams = self.ngramsAndTags(2)
        #print self.words

    def personal_names(self):
        output = []
        for gram in self.taggedBigrams:
            tag1 = gram[0][1]
            tag2 = gram[1][1]
            word1 = gram[0][0]
            word2 = gram[1][0]
            if self.isPersonalName(tag1) and self.isPersonalName(tag2):
                output.append("{0} {1}".format(word1, word2))
        return output

    def isPersonalName(self, tag):
        return tag == "NNP" or tag == "FW"

    def preprocessTitle(self):
        output = ''
        for taggedWord in self.tags:
            word = taggedWord[0]
            tag = taggedWord[1]
            if self.isPersonalName(tag):
                output = "{0} {1}".format(output, word.title())
            else:
                output = "{0} {1}".format(output, word.lower())
        return output

    def ngramsAndTags(self, n):
        output = []
        for i in range(len(self.tags) - n + 1):
            gram = (self.tags[i],)
            for j in range(i + 1, i + n):
                gram += (self.tags[j],)
            output.append(gram)
        return output

    def sortFrequencies(self, ngram):
        return sorted(ngram.items(), key=operator.itemgetter(1), reverse=True)

    def findTags(self):
        output = []
        for gram in self.taggedBigrams:
            tag1 = gram[0][1]
            tag2 = gram[1][1]
            word1 = gram[0][0]
            word2 = gram[1][0]
            if (self.isAdj(tag1) and self.isNounOrForeignWord(tag2)
                    or self.isNounOrForeignWord(tag1) and self.isNounOrForeignWord(tag2)):
                output.append("{0} {1}".format(word1, word2))
        return output

    def isAdj(self, tag):
        return tag == 'JJ'

    def isNounOrForeignWord(self, tag):
        nouns = ['NN', 'NNS', 'NNP', 'NNPS', 'FW']
        return tag in nouns

    def stringifyList(self, list):
        output = []
        for tag in list:
            output.append(str(tag.encode('utf-8')))
        return output

    def stringifyTuples(self, tuples):
        output = []
        for tag in tuples:
            output.append((str(tag[0].encode('utf-8')), str(tag[1].encode('utf-8'))))
        return output

    # returns list of tuples of tagged words in text
    def analyze(self):
        output = []
        for sentence in self.sentences:
            taggedWords = self.stanford.tag(word_tokenize(sentence.lower()))
            output.append(taggedWords)
        return self.stringifyTuples(taggedWords)

    def filterNounsInText(self):
        output = set()
        nouns = ['NN', 'NNS', 'NNP', 'NNPS', 'FW']
        for sentence in self.sentences:
            taggedWords = self.stanford.tag(word_tokenize(sentence.lower()))
            for item in taggedWords:
                if item[1] in nouns:
                    output.add(item[0])
        return self.stringifyList(list(output))

    def cleanWords(self):
        input = ''
        for item in self.words:
            input = "{0} {1}".format(input, item)
        input = re.sub('\n+', " ", input)
        input = re.sub('\[[0-9]*\]', "", input)
        input = re.sub(' +', " ", input)
        input = bytes(input)
        input = input.decode('ascii', 'ignore')
        input = input.split(" ")
        cleanInput = []
        for item in input:
            item = item.strip(string.punctuation)
            if len(item) > 1 or (item.lower() == 'a' or item.lower() == 'i'):
                cleanInput.append(item)
        return cleanInput

    def bigramNouns(self, text):
        nouns = self.filterNouns(text)

    def isTagNounOrForeignWord(self, word):
        output = False
        nouns = ['NN', 'NNS', 'NNP', 'NNPS', 'FW']
        taggedWords = self.stanford.tag(word.lower())
        for item in taggedWords:
            if item[1] in nouns:
                output = True
                break
        return output

    def filterNouns(self, input):
        output = set()
        nouns = ['NN', 'NNS', 'NNPS', 'FW']
        sentences = sent_tokenize(input)
        for sentence in sentences:
            taggedWords = self.stanford.tag(word_tokenize(sentence.lower()))
            for item in taggedWords:
                if item[1] in nouns:
                    output.add(item[0])
        nList = list(output)
        return self.stringifyTuples(nList)

    def define(self, word):
        definitions = []
        try:
            synsets = wn.synsets(word)
            for synset in synsets:
                definitions.append(synset.definition())
        except ValueError:
            print "Cannot define '{0}'".format(word)
        return definitions
wordsSplit = sent1.split(" ")

## Feature 1: Sentence Length
length = len(wordsSplit)
class_arrays.append(length)

for f in range(0, length):
    wordsClean = wordsSplit[f]
    if "&_" in wordsClean:
        target1 = wordsClean.translate(string.maketrans("", ""), string.punctuation)
        wordsSplit[f] = wordsClean
        break

## choose tagger and tag sentence
sentClean = str(wordsSplit)
sentTagged = st.tag(sentClean)

## Feature 2: Completeness (capital word initial pos, punct. mark final)
if wordsSplit[0][0].isupper() and (sent1.endswith(".") or sent1.endswith("!") or sent1.endswith("?")):
    comp = 1
    class_arrays.append(comp)

## Feature 5: Complexity (Stanford): how deeply embedded is the sentence?
## parse sentence with the Stanford parser
parse = list(parser.raw_parse(sentClean.decode("utf-8")))
sentParse = str(parse).split(" ")
for i in range(0, len(sentParse)):
    if "Tree('S'" in sentParse[i]:
        complexity = complexity + 1
class SenticParser:

    def __init__(self):
        self.st = StanfordPOSTagger(
            'stanford-postagger/models/english-bidirectional-distsim.tagger',
            'stanford-postagger/stanford-postagger.jar')

    def TaggedSentenceSlashForm(self, sentence):
        #print sentence.split()
        Tagged = self.st.tag(sentence.split())
        TaggedSentence = ""
        for i in Tagged:
            TaggedSentence = TaggedSentence + "/".join(i) + " "
        #print TaggedSentence
        return TaggedSentence

    def TaggedSentence(self, sentence):
        Tagged = self.st.tag(sentence.split())
        return Tagged

    def FindStemmedVerb(self, word):
        st = LancasterStemmer()
        StemmedVerb = st.stem(word)
        dic = enchant.Dict("en_US")
        if dic.check(StemmedVerb):
            return StemmedVerb
        else:
            return StemmedVerb + "e"

    def FindSplit(self, sentence, TaggedSentence):
        TokenizedSentence = nltk.word_tokenize(sentence)
        SplitList = []
        SentAdded = ""
        split = 0
        #print TaggedSentence
        for i in range(len(TaggedSentence)):
            if TaggedSentence[i][1].startswith("VB"):
                SplitList.append(SentAdded)
                try:
                    if TaggedSentence[i + 1][1].startswith("VB"):
                        SentAdded = ""
                    else:
                        SplitList.append(SentAdded)
                        SentAdded = TaggedSentence[i][0] + " "
                        # print "split"
                except IndexError:
                    # the verb was the last token in the sentence
                    SplitList.append(TaggedSentence[i][0])
            else:
                #print SentAdded
                SentAdded = SentAdded + TokenizedSentence[i] + " "
        SplitList.append(SentAdded)
        Str_list = filter(None, SplitList)
        Str_list = list(set(Str_list))
        '''
        for i in range(len(Str_list)):
            Str_list[i] = Str_list[i][:-1].translate(string.maketrans("", ""), string.punctuation)
        '''
        return Str_list
from nltk import pos_tag, word_tokenize
from nltk.tag.stanford import StanfordPOSTagger

#from Utils import getQues
#txt = getQues()
#txt = "benim adim yahya"
txt = "i am dentist"
tgr = StanfordPOSTagger('models/english-bidirectional-distsim.tagger',
                        'stanford-postagger.jar')
print tgr.tag(word_tokenize(txt))
class Parser:

    def __init__(self):
        self.MatchList = []
        self.ConceptMatches = []
        self.st = StanfordPOSTagger(
            'stanford-postagger/models/english-bidirectional-distsim.tagger',
            'stanford-postagger/stanford-postagger.jar')

    def SyntacticMatch(self, concept1, concept2):
        # Checks for syntactic similarity: looks for matching words between
        # two concepts.
        TaggedConcept1 = self.st.tag(nltk.word_tokenize(concept1))
        TaggedConcept2 = self.st.tag(nltk.word_tokenize(concept2))
        print TaggedConcept1
        print TaggedConcept2
        flag = 0
        for i in TaggedConcept1:
            for j in TaggedConcept2:
                if i == j:
                    if i[1].startswith("NN"):
                        flag = 1
        if flag == 1:
            return True
        else:
            return False

    def FindBigrams(self, concept):
        # Finds all bigrams associated with the concept. The given concept is
        # split into bigrams, e.g. "a very special christmas gift" becomes
        # ["a very", "very special", "special christmas", "christmas gift"].
        #sentence = concept.split(" ")
        sentence = self.st.tag(nltk.word_tokenize(concept))
        print sentence
        Bigrams = []
        for i in range(len(sentence) - 1):
            if sentence[i][1] == "JJ" and sentence[i + 1][0] in stopwords.words('english'):
                # If the bigram is [adj + stopword], ignore it;
                # bigrams like "a very" are skipped
                continue
            elif sentence[i][0] in stopwords.words('english') and sentence[i + 1][0] in stopwords.words('english'):
                # If the bigram is [stopword + stopword], ignore it
                continue
            elif sentence[i + 1][1] == "JJ" and sentence[i][0] in stopwords.words('english'):
                # If the bigram is [stopword + adj], ignore it;
                # bigrams like "amazingly a" are skipped
                continue
            elif sentence[i][1] == "JJ" and sentence[i + 1][1].startswith("NN"):
                # If the bigram is [adj + concept], include both [adj + concept]
                # and [concept]; e.g. "special christmas" yields
                # "special christmas" and "christmas"
                Bigrams.append(sentence[i + 1][0])
                Bigrams.append(sentence[i][0] + " " + sentence[i + 1][0])
            elif sentence[i][0] in stopwords.words("english") and sentence[i + 1][1].startswith("NN"):
                # If the bigram is [stopword + concept], include the concept
                # with and without the stopword; e.g. "the christmas" yields
                # "christmas" and "the christmas"
                Bigrams.append(sentence[i + 1][0])
                Bigrams.append(sentence[i][0] + " " + sentence[i + 1][0])
            elif sentence[i][1].startswith("NN") and sentence[i + 1][1] == "JJ":
                # If the bigram ends with an adjective, ignore the adjective;
                # e.g. "present amazing" yields "present"
                Bigrams.append(sentence[i][0])
            elif sentence[i][1].startswith("NN") and sentence[i + 1][0] in stopwords.words("english"):
                # If the bigram ends with a stopword, ignore the stopword;
                # e.g. "christmas the" yields "christmas"
                Bigrams.append(sentence[i][0])
            else:
                Bigrams.append(sentence[i][0] + " " + sentence[i + 1][0])
        print Bigrams
        return Bigrams
##############################
# Tokenization of the text
##############################
tokens = nltk.word_tokenize(text, language='german')
#print(tokens)
sentence_tokens = nltk.sent_tokenize(text, language='german')
#print(sentence_tokens)

# Pick a random practice sentence (randint is inclusive on both ends,
# so the upper bound must be len - 1)
randSentence = sentence_tokens[randint(0, len(sentence_tokens) - 1)]
randSentenceTokens = nltk.word_tokenize(randSentence, language='german')

# Choose the word type to train
pos_sentence = st.tag(randSentence.split())
#print(pos_sentence)
response = input("Welche Wortart wollen Sie trainieren? (Verb, Nomen, Adjektiv, Artikel)")
if response == "Verb":
    picked_wordtype = "VAFIN"
elif response == "Nomen":
    picked_wordtype = "NN"
elif response == "Adjektiv":
    picked_wordtype = "ADJA"
elif response == "Artikel":
    picked_wordtype = "ART"
else:
    print("Nicht zugelassen")  # the chosen word type is not allowed

# Find the chosen word type in the practice sentence (continued below)
temp = list()
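# A plausible continuation, not part of the original snippet: collect every
# token of the chosen word type from the tagged practice sentence.
for word, tag in pos_sentence:
    if tag == picked_wordtype:
        temp.append(word)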
# -*- coding: utf-8 -*-
import nltk
from nltk.tag.stanford import StanfordPOSTagger
from nltk.tokenize import word_tokenize

# the path where you have downloaded and unzipped the full tagger
sp_dir = '/home/sarah/postagger/'
english_model = sp_dir + 'models/english-bidirectional-distsim.tagger'
chinese_model = sp_dir + 'models/chinese-distsim.tagger'
jar_path = sp_dir + 'stanford-postagger.jar'

# testing the English POS tagger
print "For the English model"
st_eng = StanfordPOSTagger(model_filename=english_model, path_to_jar=jar_path)
eng_sent = 'This is Stanford postagger in nltk for Python users.'
print eng_sent
eng_tokens = word_tokenize(eng_sent)
eng_tagged = st_eng.tag(eng_tokens)
for i in eng_tagged:
    print i

# testing the Chinese POS tagger
print "\n\nFor the Chinese model"
st_chi = StanfordPOSTagger(model_filename=chinese_model, path_to_jar=jar_path,
                           encoding='utf-8')
chi_sent = '这 是 在 Python 环境 中 使用 斯坦福 词性 标 器'
print chi_sent
chi_tokens = word_tokenize(chi_sent)
chi_tagged = st_chi.tag(chi_tokens)
for i in chi_tagged:
    print i
#print st_chi.tag('这 是 在 Python 环境 中 使用 斯坦福 词性 标 器'.split())
class Parser(object):
    modeldir = os.path.abspath(BASE_DIR + "/weiss/planner/models/")
    stopword_path = modeldir + "/english.stp"

    def __init__(self):
        self._postagger = StanfordPOSTagger(
            self.modeldir + '/postagger/models/english-bidirectional-distsim.tagger',
            self.modeldir + '/postagger/stanford-postagger.jar')
        self._stemmer = nltk.SnowballStemmer("english")
        self._stopwords = stopword(self.stopword_path)
        self._type_words = self._set_type_words()
        self._sentiment = self._get_sentiment()

    def _get_sentiment(self):
        sentiment = {}
        for line in open(self.modeldir + "/AFINN.txt"):
            word, score = line.split('\t')
            sentiment[word] = int(score)
        return sentiment

    def calculate_sentiment(self, query):
        tokens = nltk.word_tokenize(query)
        score = 0
        for token in tokens:
            if token in self._sentiment:
                score += self._sentiment[token]
        return score

    def entity_recognition(self, query, arguments):
        """Parse the query and extract keywords.

        This function is called in the planner.

        Args:
            query: query that needs to be parsed
            arguments: info that needs to be updated
        """
        tokens = nltk.word_tokenize(query)
        tags = self._postagger.tag(tokens)
        tuples = []
        for tag in tags:
            if tag[0] in self._stopwords:
                continue
            stemmed = self._stemmer.stem(tag[0])
            if stemmed in self._type_words['movie']:
                continue
            if stemmed in self._type_words['article']:
                continue
            if stemmed in self._type_words['restaurant']:
                continue
            if tag[1][:2] == 'NN' or tag[1][:2] == 'JJ':
                tuples.append(tag[0])
        if len(tuples) > 0:
            arguments['keywords'] = tuples
            logger.info("Here are the keywords: %s" % arguments['keywords'])

    def _set_type_words(self):
        """Initialize synonym words for movie, article and restaurant.

        This function is called during initialization.

        Returns:
            A dictionary; keys: movie, article, restaurant; values: their
            synonym words.
        """
        topic = {}
        movie = ['cinema', 'show', 'film', 'picture', 'cinematograph',
                 'videotape', 'flick', 'pic', 'cine', 'cinematics',
                 'photodrama', 'photoplay', 'talkie', 'flicker', 'DVD',
                 'movie']
        article = ['report', 'announcement', 'story', 'account', 'newscast',
                   'headlines', 'press', 'communication', 'talk', 'word',
                   'communique', 'bulletin', 'message', 'dispatch',
                   'broadcast', 'statement', 'intelligence', 'disclosure',
                   'revelation', 'gossip', 'dispatch', 'news', 'article']
        restaurant = ['bar', 'cafeteria', 'diner', 'dining', 'saloon',
                      'coffeehouse', 'canteen', 'chophouse', 'drive-in',
                      'eatery', 'grill', 'lunchroom', 'inn', 'food',
                      'pizzeria', 'hideaway', 'cafe', 'charcuterie', 'deli',
                      'restaurant']
        for m in movie:
            topic.setdefault('movie', set([]))
            topic['movie'].add(self._stemmer.stem(m))
        for a in article:
            topic.setdefault('article', set([]))
            topic['article'].add(self._stemmer.stem(a))
        for r in restaurant:
            topic.setdefault('restaurant', set([]))
            topic['restaurant'].add(self._stemmer.stem(r))
        return topic

    def type_recognition(self, query, arguments):
        """Identify the type of the topic: movie, article or restaurant.

        This is called in the planner.

        Args:
            query: query that needs to be parsed
            arguments: info that needs to be updated
        """
        tokens = nltk.word_tokenize(query)
        first = self._stemmer.stem(tokens[0])
        last = self._stemmer.stem(tokens[-1])
        lastsecond = self._stemmer.stem(tokens[-2]) if len(tokens) > 1 else "toy"
        if (first in self._type_words['article']
                or last in self._type_words['article']
                or lastsecond in self._type_words['article']):
            arguments['tid'] = Type.News
        elif (first in self._type_words['restaurant']
              or last in self._type_words['restaurant']
              or lastsecond in self._type_words['restaurant']):
            arguments['tid'] = Type.Restaurant
        elif (first in self._type_words['movie']
              or last in self._type_words['movie']
              or lastsecond in self._type_words['movie']):
            arguments['tid'] = Type.Movie
        else:
            arguments['tid'] = Type.Unknown

    @staticmethod
    def _string_to_idx(number):
        if number == 'first' or number == 'one':
            return 0
        if number == 'second' or number == 'two':
            return 1
        if number == 'third' or number == 'three':
            return 2
        if number == 'fourth' or number == 'four':
            return 3
        if number == 'fifth' or number == 'five':
            return 4

    @staticmethod
    def keyword_matching(arguments, entities):
        words = arguments['keywords']
        phonics = set([])
        overlap = []
        for w in words:
            phonics.add(fuzzy.nysiis(w))
        for i in xrange(0, len(entities)):
            entity_name = nltk.word_tokenize(entities[i].name)
            entity_phonics = set([])
            for word in entity_name:
                entity_phonics.add(fuzzy.nysiis(word))
            common = len(phonics & entity_phonics) / len(entity_phonics)
            if common == 1:
                arguments['idx'] = i
                return
            overlap.append(common)
        arguments['idx'] = overlap.index(max(overlap))

    def find_number(self, query, arguments, entities):
        tokens = nltk.word_tokenize(query)
        tags = self._postagger.tag(tokens)
        last = query.find('last')
        # Edge case: "first" cannot be tagged correctly
        if len(query.split(" ")) <= 3 and query.find('first') != -1:
            arguments['idx'] = 0
            return
        number = None
        for t in tags:
            if t[1] == 'JJ' and t[0][-2:] in set(['th', 'nd', 'st', 'rd']):
                number = t[0]
                break
            elif t[1] == 'CD' and t[0]:
                number = t[0]
                if number.isdigit() and int(number) < 6:
                    arguments['idx'] = int(number) - 1
                    return
                break
        if number is not None:
            if last == -1:
                arguments['idx'] = self._string_to_idx(number)
            else:
                arguments['idx'] = len(entities) - self._string_to_idx(number) - 1
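# Hypothetical usage sketch of the Parser above (assumes the BASE_DIR layout
# and model files are in place):
parser = Parser()
arguments = {}
parser.type_recognition("find me a cheap italian restaurant", arguments)
parser.entity_recognition("find me a cheap italian restaurant", arguments)
print(arguments)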
from nltk.tag.stanford import StanfordPOSTagger
import nltk
import os

os.environ['CLASSPATH'] = "/home/vishesh/Downloads/stanford-postagger-full-2015-12-09/"

english_postagger = StanfordPOSTagger('models/english-bidirectional-distsim.tagger')
print english_postagger.tag(nltk.word_tokenize('this is stanford postagger in nltk for python users'))

fo = open('europarl-v7.de-en.de', 'r')
data = fo.read()
fo.close()

fw = open('europarl_tags_testing.txt', 'w')
data = data.decode('utf-8')
data = data.split('\n')
#tokens = data.split()
#print len(tokens)
#print 'Tagging...'

german_postagger = StanfordPOSTagger('/home/vishesh/Documents/NLP/postagger/models/german-fast-caseless.tagger')
for i in range(10000, 11500):
    tokens = nltk.word_tokenize(data[i])
    tags = german_postagger.tag(tokens)