def pos_tagging(requirements):
    """
    POS-tag the title and description tokens of each requirement with the
    Stanford POS tagger.

    NOTE: This library creates a Java process in the background.
    Please make sure you have installed Java 1.6 or higher:
        sudo apt-get install default-jre
        sudo apt-get install default-jdk

    Tagset references:
      * English (Penn Treebank):
        http://www.comp.leeds.ac.uk/ccalas/tagsets/upenn.html
      * German (STTS):
        https://www.sketchengine.co.uk/german-stts-part-of-speech-tagset/

    Side effects: stores the (token, tag) pairs on each requirement as
    ``description_tokens_pos_tags`` and ``title_tokens_pos_tags``.
    """
    _logger.info("Pos-tagging for requirements' tokens")

    # Tags considered uninformative; kept for reference -- the filtering step
    # that used this list is currently disabled.
    pos_tags_black_list = ['CC', 'CD', 'DT', 'EX', 'IN', 'LS', 'MD', 'PDT',
                          'POS', 'PRP', 'PRP$', 'RP', 'TO', 'UH', 'VBZ',
                          'WDT', 'WP', 'WP$', 'WRB']

    # Note: "-mx30g" sets java's max memory size to 30 GB RAM
    # Please change when experiencing OS-related problems!
    pos_tagger = StanfordPOSTagger(pos_tagger_data_path, pos_tagger_jar_path,
                                   java_options='-mx30g')

    def restore_german_umlauts(tokens):
        # Undo the ASCII transliteration of German umlauts so the tagger sees
        # the original spelling. NOTE(review): this also rewrites genuine
        # "ue"/"oe"/"ae" sequences (e.g. in non-German words) -- confirm the
        # tokens are German before relying on it.
        return [t.replace("ue", "ü").replace("oe", "ö").replace("ae", "ä")
                for t in tokens]

    for requirement in requirements:
        # Materialize concrete lists: the original code assigned lazy
        # `map(lambda t: t, ...)` identity-map objects, which under Python 3
        # can be consumed only once and then appear empty.
        corrected_words = restore_german_umlauts(requirement.description_tokens)
        requirement.description_tokens_pos_tags = list(
            pos_tagger.tag(corrected_words))

        corrected_words = restore_german_umlauts(requirement.title_tokens)
        requirement.title_tokens_pos_tags = list(
            pos_tagger.tag(corrected_words))
def go_func(self):
    # Tokenize the lowercased contents of the `text1` widget, POS-tag it with
    # the Stanford tagger, bucket words by coarse tag, and dispatch to the
    # first matching code-generation handler (declare / init / print / scan /
    # conditional / arithmetic).
    #
    # NOTE(review): relies on module-level globals not visible in this chunk:
    # `word_tokenize`, `text1`, `declarations`, `initialize`, `printer`,
    # `scanner`, `conditional`, `arithmetic` and `a` -- confirm they exist.
    global c
    c = word_tokenize(str(text1.toPlainText()).lower())
    # NOTE(review): backslashes in this path are not escaped; it only works
    # because '\s', '\m', '\e' are not recognized escape sequences -- confirm.
    st = StanfordPOSTagger('C:\stanford-postagger-2015-12-09\models\english-bidirectional-distsim.tagger')
    b = st.tag(c)
    flag = 0        # set to 1 as soon as one handler has fired
    loopflag = 0    # never read below
    # Bucket tagged words by the first two letters of their Penn tag.
    # NOTE: the name `dict` shadows the builtin.
    dict = {"VB": [], "NN": [], "JJ": [], "DT": [], "CC": [], "PR": [], "CD": [], "IN": [], "RB": []}
    for (w, t) in b:
        if (t[:2] in dict):
            dict[t[:2]].append(w)
    print(dict)
    # checking if the statement is a declaration
    try:
        for i in dict["VB"]:
            for d in declarations:
                if d in i:
                    dict["VB"].remove(i)
                    flag = 1
                    self.declare(dict)
                    break
            if flag == 1:
                break
        # checking if the statement is an initialization
        if (flag == 0):
            for i in dict["VB"]:
                for ini in initialize:
                    if ini in i:
                        flag = 1
                        self.init(dict)
                        break
                if flag == 1:
                    break
        # checking if the statement is a print
        if (flag == 0):
            # NOTE(review): `dict["NN"] or dict["VB"]` is NOT a union -- it
            # iterates dict["NN"] unless that list is empty; confirm intended.
            for i in dict["NN"] or dict["VB"]:
                for p in printer:
                    if p in i:
                        flag = 1
                        self.prin(dict)
                        break
                if flag == 1:
                    break
        # Scanning input from screen
        if (flag == 0):
            for i in dict["VB"] or dict["NN"]:
                for s in scanner:
                    if s in i:
                        flag = 1
                        self.scan(dict)
                        break
                if flag == 1:
                    break
                # NOTE(review): nesting inferred from collapsed source -- this
                # fallback phrase check must sit inside the for-loop for the
                # `break` below to be legal; confirm against the original file.
                if flag == 0:
                    if ("take from user" in a or "take from screen" in a or "take in" in a):
                        self.scan(dict)
                        flag = 1
                        break
        # conditionals
        if (flag == 0):
            for i in dict["IN"] + dict["JJ"] + dict["CC"] + dict["RB"]:
                for cond in conditional:
                    if cond in i:
                        flag = 1
                        self.cond(dict)
                        break
                if flag == 1:
                    break
        # arithmetic operations
        if (flag == 0):
            for i in dict["VB"] + dict["NN"] + dict["CC"]:
                for art in arithmetic:
                    if art in i:
                        flag = 1
                        t = self.arithmo(dict)
                        print(t)
                        break
                if flag == 1:
                    break
    except Exception:
        # Broad catch-all: any failure in the dispatch chain is reported only
        # as a bare message (original behavior kept).
        print("exception")
import nltk
from nltk.tag.stanford import StanfordPOSTagger

# Absolute, machine-specific locations of the Stanford tagger model and jar.
cale_model = (
    "/home/t3rtius/Documents/cs/sla-master/sem1/1-nlp-opt/"
    "stanford-pos-tagger/stanford-postagger-full-2018-10-16/"
    "models/english-bidirectional-distsim.tagger"
)
cale_jar_tagger = (
    "/home/t3rtius/Documents/cs/sla-master/sem1/1-nlp-opt/"
    "stanford-pos-tagger/stanford-postagger-full-2018-10-16/"
    "stanford-postagger.jar"
)

tagger = StanfordPOSTagger(cale_model, cale_jar_tagger)

text = (
    "There once was a prince and he lived in a castle "
    "and his name was Prince Charming."
)

# Tokenize and tag, then collapse the (word, tag) pairs into a dict -- for a
# repeated word only the last tag survives -- and print one tag per line.
cuvInProp = nltk.word_tokenize(text)
morfo = tagger.tag(cuvInProp)

print("Analiza morfologică este:")
morfoDict = dict(morfo)
for parte in morfoDict.values():
    print(parte)
for token, pos in i.leaves()])) elif current_chunk: named_entity = " ".join(current_chunk) if named_entity not in continuous_chunk: continuous_chunk.append(named_entity) current_chunk = [] else: continue return continuous_chunk # Alternatively to setting the CLASSPATH add the jar and model via their path: jar = 'C:/stanford-postagger-2015-04-20/stanford-postagger.jar' model = 'C:/stanford-postagger-2015-04-20/models/english-left3words-distsim.tagger' pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8') jar = 'C:/stanford-ner-2015-04-20/stanford-ner.jar' model = 'C:/stanford-ner-2015-04-20/classifiers/english.all.3class.distsim.crf.ser.gz' ner_tagger = StanfordNERTagger(model, jar, encoding='utf8') posTagger = pos_tagger nerTagger = ner_tagger sentence = "Where can I find some bubble tea??" def queryGenrator(sentence): #find words in quotes quoted = re.findall(r'"([^"]*)"', sentence)
#!usr/bin/env python #-*- coding:utf-8 _*- """ @author:fonttian @file: Stanford04.py @time: 2017/09/26 """ from nltk.tag.stanford import StanfordPOSTagger st = StanfordPOSTagger( '/home/fonttian/NLP/stanford-postagger-full-2015-12-09/models/chinese-distsim.tagger', "/home/fonttian/NLP/stanford-postagger-full-2015-12-09/stanford-postagger-3.6.0.jar" ) print(st)
##export CLASSPATH=$STANFORDTOOLSDIR/stanford-postagger-full-2015-04-20/stanford-postagger.jar:$STANFORDTOOLSDIR/stanford-ner-2015-04-20/stanford-ner.jar:$STANFORDTOOLSDIR/stanford-parser-full-2015-04-20/stanford-parser.jar:$STANFORDTOOLSDIR/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar
##export STANFORD_MODELS=$STANFORDTOOLSDIR/stanford-postagger-full-2015-04-20/models:$STANFORDTOOLSDIR/stanford-ner-2015-04-20/classifiers
from nltk.tag.stanford import StanfordPOSTagger
from nltk.parse.stanford import StanfordParser
from nltk.corpus import stopwords

# Naive sentence segmentation: split on the period character.
print("Sentence segmentation")
tokens = "this is pune.Pune is a great city".split(".")
print(tokens)

# Whitespace tokenization of a fresh sample string.
print("\nTokenizer:")
tokens = "this is pune".split(" ")
print(tokens)

# Drop English stop words from the token list.
print("\nStop Words Removal:")
stop_words = set(stopwords.words('english'))
filtered_words = [w for w in tokens if w not in stop_words]
print(filtered_words)

# Tagger located via the CLASSPATH/STANFORD_MODELS exports shown above.
st = StanfordPOSTagger('english-bidirectional-distsim.tagger')
print("\nPOS tagging:")
print(st.tag('What is the airspeed of an unladen swallow ?'.split()))

parser = StanfordParser(
    model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
print("\nSyntax Parser:")
print(list(parser.raw_parse("rahul daksh fire")))
#word2vec(どのくらい似てるか) import gensim.downloader as api import sys from voice_common_pkg.srv import GgiLearning from voice_common_pkg.srv import GgiLearningResponse from nltk.tag.stanford import StanfordPOSTagger import rospy import random file_path = path.expanduser('~/catkin_ws/src/voice_common_pkg/config') minimum_value = 0.5 #コサイン類似度の最低値 #ベクトル読み込み word_vectors = api.load("glove-twitter-200") #nltkのモデルを読み込む pos_tag = StanfordPOSTagger( model_filename=file_path + "/stanford-postagger/models/english-bidirectional-distsim.tagger", path_to_jar=file_path + "/stanford-postagger/stanford-postagger.jar") class GgiTest(): def __init__(self): #ベクトル読み込み print('Wahing for tts and stt_server') rospy.wait_for_service('/tts') rospy.wait_for_service('/stt_server') print('test_phase is ready') self.stt = rospy.ServiceProxy('/stt_server', SpeechToText) self.tts = rospy.ServiceProxy('/tts', TTS) self.server = rospy.Service('/test_phase', GgiLearning, self.main) def main(self, req):
def main(start, stop):
    # Scan rows [start, stop] of the Quora duplicate-questions TSV and try to
    # re-detect the labelled duplicates with a heuristic: the two questions
    # must share a noun, and share a verb either directly or through the
    # first WordNet synset's lemmas. Prints running match statistics.
    #
    # Python 2 code (print statements). Relies on module-level imports not
    # visible here: time, csv, nltk, StanfordPOSTagger, wordnet as wn.
    start_time = time.time()
    print 'start stop = ', start, stop
    stanford_dir = '.\data\stanford-postagger-full-2015-04-20\\'
    modelfile = stanford_dir + 'models\english-bidirectional-distsim.tagger'
    jarfile = stanford_dir + 'stanford-postagger.jar'
    tagger = StanfordPOSTagger(model_filename=modelfile, path_to_jar=jarfile)
    count = 0            # 1-based row counter
    real_matches = 0     # rows labelled duplicate in the data
    correct_matches = 0  # heuristic agreed with the label
    wrong_matches = 0    # heuristic fired on a non-duplicate
    with open(".\data\quora_duplicate_questions.tsv") as tsvfile:
        tsvreader = csv.reader(tsvfile, delimiter="\t")
        for line in tsvreader:
            count = count + 1
            if count < start:
                continue
            if count > stop:
                break
            print "-------------------------------"
            print "count: ", count
            print "-------------------------------"
            duplicate = line[5]
            print 'duplicate', duplicate
            if (duplicate == '1'):
                real_matches = real_matches + 1
                # Tag both questions (columns 3 and 4 hold the texts).
                q1 = tagger.tag(nltk.word_tokenize(line[3].lower()))
                q2 = tagger.tag(nltk.word_tokenize(line[4].lower()))
                # Collect nouns of each question (dicts used as sets).
                q1_nouns = dict()
                for word, tag in q1:
                    if tag[:2] == 'NN':
                        q1_nouns[word] = 1
                q2_nouns = dict()
                for word, tag in q2:
                    if tag[:2] == 'NN':
                        q2_nouns[word] = 1
                print 'nouns q1: ', q1_nouns, '\tnouns q2: ', q2_nouns
                noun_match = False
                for key, value in q1_nouns.items():
                    if (key in q2_nouns):
                        noun_match = True
                        break
                print 'noun_match: ', noun_match
                if (noun_match):
                    # Collect verbs of each question.
                    q1_verbs = dict()
                    for word, tag in q1:
                        if tag[:2] == 'VB':
                            q1_verbs[word] = 1
                    q2_verbs = dict()
                    for word, tag in q2:
                        if tag[:2] == 'VB':
                            q2_verbs[word] = 1
                    print 'verbs q1: ', q1_verbs, '\tverbs q2: ', q2_verbs
                    verb_match = False
                    if len(q1_verbs) == 0 or len(q2_verbs) == 0:
                        verb_match = True
                        # NOTE(review): this `continue` skips the counting
                        # block below, so verb-less duplicates are never
                        # counted as correct -- looks unintended; confirm.
                        continue
                    # Direct verb overlap, else first-synset synonym overlap.
                    for key, value in q1_verbs.items():
                        if (key in q2_verbs):
                            verb_match = True
                            break
                        if (verb_match == False and len(wn.synsets(key)) > 0):
                            syn = wn.synsets(key)[0]
                            for lemma in syn.lemmas():
                                print 'synonym of ', key, ": ", lemma.name()
                                if (lemma.name() in q2_verbs):
                                    verb_match = True
                                    break
                    # Same check in the other direction.
                    if (verb_match == False):
                        for key, value in q2_verbs.items():
                            if (key in q1_verbs):
                                verb_match = True
                                break
                            if (verb_match == False and len(wn.synsets(key)) > 0):
                                print key
                                syn = wn.synsets(key)[0]
                                for lemma in syn.lemmas():
                                    print 'synonym of ', key, ": ", lemma.name()
                                    if (lemma.name() in q1_verbs):
                                        verb_match = True
                                        break
                    if (verb_match):
                        if (line[5] == '1'):
                            print 'DUPLICATE'
                            correct_matches = correct_matches + 1;
                        else:
                            print 'DUPLICATE BUT WRONG!!!!!'
                            wrong_matches = wrong_matches + 1;
    print 'correct_matches : ', correct_matches
    print 'wrong_matches : ', wrong_matches
    print 'real_matches : ', real_matches
    # Parenthesized form prints a tuple under Python 2 (original behavior).
    print('\n\nrun time : ', time.time() - start_time)
from nltk.tag.stanford import StanfordPOSTagger

# Tagger built from a model path only: the stanford-postagger jar must be
# discoverable via the CLASSPATH environment variable for tag() to work.
st = StanfordPOSTagger(
    'edu\\stanford\\nlp\\models\\pos-tagger\\english-bidirectional\\english-bidirectional-distsim.tagger'
)
from ansWhat import ansWhat
from ansWhere import ansWhere
from ansWhen import ansWhen
from ansHow import ansHow
from ansYesNo import ansYesNo
from ansWhy import ansWhy
from ansYesNo import intersection

# NOTE(review): WordNetLemmatizer, StanfordPOSTagger, StanfordNERTagger and
# StanfordParser are used below but not imported in this chunk -- confirm the
# imports exist elsewhere in the file.
lemmatizer = WordNetLemmatizer()

# Use stanford pos for lemmatization here.
# The POS tagger in NLTK will recognize words like "sits", "walks" as "NNS"
# instead of verb.
stanford_pos = '../stanford/stanford-postagger-full-2015-04-20/'
stanford_pos_model = stanford_pos + 'models/english-left3words-distsim.tagger'
stanford_pos_jar = stanford_pos + 'stanford-postagger.jar'
st_pos = StanfordPOSTagger(model_filename=stanford_pos_model, path_to_jar=stanford_pos_jar)

# NER tagging with the MUC 7-class CRF model.
stanford_ner = '../stanford/stanford-ner-2015-04-20/'
stanford_ner_model = stanford_ner + 'classifiers/english.muc.7class.distsim.crf.ser.gz'
stanford_ner_jar = stanford_ner + 'stanford-ner.jar'
ner = StanfordNERTagger(model_filename=stanford_ner_model, path_to_jar=stanford_ner_jar)

# Set up the stanford PCFG parser.
stanford_parser_dir = '../stanford/stanford-parser-full-2015-04-20/'
eng_model_path = stanford_parser_dir + "englishPCFG.ser.gz"
my_path_to_models_jar = stanford_parser_dir + "stanford-parser-3.5.2-models.jar"
my_path_to_jar = stanford_parser_dir + "stanford-parser.jar"
# NOTE: this call continues beyond the visible chunk.
parser = StanfordParser(model_path=eng_model_path, path_to_models_jar=my_path_to_models_jar,
# Locations of the question data and the pickled SVM model/vectorizer.
questions_features = os.path.join("..", "data", "questions_features.tsv")
pickle_SVM_model = os.path.join("..", "package", "model_SVM.pickle")
pickle_SVM_vector = os.path.join("..", "package", "vector_SVM.pickle")

# Read questions (column 0) and labels (column 1).
# NOTE(review): `questions_file`, `csv`, `os`, `nltk`, `word_tokenize` and
# StanfordPOSTagger are defined/imported outside this chunk -- confirm.
questions_list = []
labels = []
with open(questions_file, 'r') as filename:
    data = csv.reader(filename, delimiter=',')
    for line in data:
        questions_list.append(line[0])
        labels.append(line[1])

tagger_path = r"/Users/avaniarora/Desktop/stanford-postagger-2016-10-31/models/english-bidirectional-distsim.tagger"
pos_jar = r"/Users/avaniarora/Desktop/stanford-postagger-2016-10-31/stanford-postagger.jar"
tagger = StanfordPOSTagger(tagger_path, pos_jar)
tagger.java_options = '-mx4096m'  ### Setting higher memory limit for long sentences

# WH-word tags and verb tags used by the feature extraction below.
wh_tags = ['WP', 'WDT', 'WP$', 'WRB']
important_tags = ['VB', 'VBP', 'VBZ', 'VBD', 'VBG', 'VBN']


def get_wh_words_from_question(question):
    # Tokenize and POS-tag with NLTK's default tagger, then collect the
    # WH-words. NOTE: definition continues beyond the visible chunk.
    sentence = word_tokenize(question)
    pos_tags = nltk.pos_tag(sentence)
    entities = {}
    for pos_tag in pos_tags:
        if (pos_tag[1] in wh_tags):
}[t.lang](tweet_text)) # So it turns out that Python doesn't have switch-cases because, you guessed it, Guido is a total f*****g melon. # The mapping inside the `update` call is an equivalent structure, something I learned from working with old # Javascript code and passing around functions global tweet_counter global num_tweets logging.info(f"Analyzed tweet [{tweet_counter}/{num_tweets}]") tweet_counter += 1 return t.id, row german_tagger = StanfordPOSTagger( model_filename=(tagger_path + 'models/german-hgc.tagger'), path_to_jar=(tagger_path + 'stanford-postagger.jar') ) def _analyze_de(text): tags = german_tagger.tag(word_tokenize(text, language='german')) counter = collections.Counter([x[1] for x in tags]) return { # we need to map the STTS (German) tagset to a subset of the French tagset, so that we can compare them all 'ADJ': counter['ADJA'] + counter['ADJD'], 'ADV': counter['ADV'], 'CC': counter['KON'] + counter['KOKOM'], 'CS': counter['KOUI'] + counter['KOUS'], 'ET': counter['FM'], 'I': counter['ITJ'], 'NC': counter['NN'], 'NP': counter['NE'], 'PREF': counter['APPO'] + counter['APPR'] + counter['APPRART'] + counter['APZR'],
from nltk.tag.stanford import StanfordPOSTagger

englishPOStagger = StanfordPOSTagger('/home/deep/StanfordPosTagger/models/english-bidirectional-distsim.tagger', '/home/deep/StanfordPosTagger/stanford-postagger.jar')

# Point NLTK at the Java executable.
# NOTE(review): JAVAHOME is set *after* the tagger object is constructed;
# Java is only launched at tag() time, so this presumably still works --
# confirm.
java_path = '/usr/bin/java'
os.environ['JAVAHOME'] = java_path


def updatePosDict(tokens, PosDict):
    # Tag `tokens` and increment PosDict[tag] for every tagged token
    # (mutates PosDict in place; returns None).
    tags = englishPOStagger.tag(tokens)
    for token, tag in tags:
        PosDict[tag] += 1


# Per-stance POS-tag frequency accumulators and stance counters.
PosAgrees = defaultdict(float)
PosDisagrees = defaultdict(float)
PosDiscuss = defaultdict(float)
NumDiscuss = 0.0
NumAgreed = 0.0
NumDisagreed = 0.0

# NOTE(review): `miniList`, `bodyDict` and `defaultdict` come from outside
# this chunk; lst[2] is the stance label, lst[1] the body id -- confirm.
for i, lst in enumerate(miniList):
    if lst[2] == 'agree':
        NumAgreed += 1.0
        try:
            updatePosDict(bodyDict[lst[1]], PosAgrees)
        except OSError:
            print (i)
    elif lst[2] == 'disagree':
        NumDisagreed += 1.0
        try:
            updatePosDict(bodyDict[lst[1]], PosDisagrees)
        except OSError:
            print (i)
    # NOTE: branch continues beyond the visible chunk.
    elif lst[2] == 'discuss':
from config import *
import ServerPrint as sp
import numpy as np
"""
This program would load the stanford POS tagger
Moreover, it provide several function to parse the structure of the sentence
"""

# NOTE(review): `jar` and `model` are defined here but NOT passed to
# StanfordPOSTagger below -- the tagger is built from a bare model name and
# therefore relies on CLASSPATH/STANFORD_MODELS being set. Confirm whether
# the call was meant to be StanfordPOSTagger(model, jar).
# The parser .jar path
jar = "/home/sunner/nltk_data/models/stanford-postagger-full-2015-12-09/stanford-postagger.jar"
model = "/home/sunner/nltk_data/models/stanford-postagger-full-2015-12-09/models/english-bidirectional-distsim.tagger"

# Load the parser
tagger = StanfordPOSTagger("english-bidirectional-distsim.tagger")
sp.show("Finish POS tagger loading")

# Module-level state shared by the parsing helpers below.
# NOTE(review): OrderedDict is not imported in this chunk -- confirm.
sentences = ""
parseString = ""
word_2_pos = OrderedDict()
__verb = ""
__subject = ""
__object = ""
__value = ""
wordEmbedded = np.array([[1, 0], [0, 1]])


# NOTE: definition (and its docstring) continues beyond the visible chunk.
def tag(record, string="I love you"):
    """ Main function to do the POS tagging
# ans = item[0] # return ans # return '' # NERTagger ner_dir = '/Users/yifan/Desktop/WSTA_project/stanford-ner-2018-02-27/' ner_jarfile = ner_dir + 'stanford-ner.jar' ner_modelfile = ner_dir + 'classifiers/english.all.3class.distsim.crf.ser.gz' ner_tagger = StanfordNERTagger(model_filename=ner_modelfile, path_to_jar=ner_jarfile) pos_dir = '/Users/yifan/Desktop/WSTA_project/stanford-postagger-2018-02-27/' pos_modelfile = pos_dir + 'models/english-bidirectional-distsim.tagger' pos_jarfile = pos_dir + 'stanford-postagger.jar' pos_tagger = StanfordPOSTagger(model_filename=pos_modelfile, path_to_jar=pos_jarfile) sents = [] # sents.append("Rami Eid is studying at Stony Brook University in NY") # sents.append("Elbert lives in Melbourne.") # sents.append("Banana is $6 per kilo this Monday in Sydney.") # sents.append("Elbert lives 100 kilometers away from Melbourne.") # sents.append("Elbert is enrolled in University of Melbourne .") # sents.append("Elbert in Melbourne.") # sents.append("Elbert's address is Unit 1004 50 Albert Road South Melbourne 3205.") # sents.append("The price of a bottle of water is $1") # sents.append("The event will be on Sunday.") # sents.append("The event will be on 1pm.") # sents.append("Elbert will submit the assignment by email.") # sents.append("Elbert will submit the assignment by email tomorrow.") # sents.append("Elbert will submit the assignment tomorrow.")
import nltk
import os
import re

#java_path = "/usr/bin/java"
#os.environ['JAVAHOME'] = java_path
# NOTE(review): NLTK's Stanford wrappers look for JAVAHOME; confirm that
# setting JAVA_HOME here is sufficient on this system.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-oracle"
#from nltk.tag.stanford import POSTagger
from nltk import pos_tag
from nltk import word_tokenize
from nltk.tag.stanford import StanfordTagger
from nltk.tag.stanford import StanfordPOSTagger
from nltk.tag.stanford import StanfordNERTagger

# Arabic model used with the tagger jar from a separate install directory.
path_to_model = '/home/noura/stanford-postagger-full-2016-10-31/models/arabic.tagger'
path_to_jar = '/home/noura/stanford-postagger/stanford-postagger.jar'
#artagger.java_options='-mx4096m'
artagger = StanfordPOSTagger(path_to_model, path_to_jar, encoding='utf8')
# Overrides a private attribute: the Arabic model separates word and tag
# with '/' rather than the default separator.
artagger._SEPARATOR = '/'

# Tag a short Arabic test sentence and print the (word, tag) pairs.
tagged_sent = artagger.tag(
    word_tokenize(
        u'ممتاز و جيد لطيف أشياء أفعال وهذا نص للتأكد فقط لا غير أنا تسلقت شجرة'
    ))
print(tagged_sent)
import nltk
from collections import Counter
import timeit
from nltk.tag.stanford import StanfordPOSTagger

st = StanfordPOSTagger(
    '/home/saurav/Documents/postagger/models/english-bidirectional-distsim.tagger',
    '/home/saurav/Documents/postagger/stanford-postagger.jar')

# Load "sequence: probability" lines into a lookup table.
# Uses a context manager instead of the original open/close pair so the file
# is closed even if a line fails to parse.
Prob = {}
with open('probabilities.txt', 'r') as infile:
    for sentence in infile.readlines():
        keyValPair = sentence.split(":")
        # [:-1] drops the trailing newline before parsing the float.
        Prob[keyValPair[0]] = float(keyValPair[1][:-1])


def prob(sequence):
    """Return the stored probability of `sequence`, or 0 when unseen.

    dict.get replaces the original `sequence in Prob.keys()` membership test,
    which on Python 2 builds a list and scans it (O(n)) on every call.
    """
    return Prob.get(sequence, 0)


# NOTE: definition continues beyond the visible chunk.
def fairSCP(sequence):
    numerator = prob(sequence) * prob(sequence)
    sequence = sequence.split()
# Persist the lemmatized sentences.
# NOTE(review): `pickle`, `lemmatized_sent`, `eng_sent`, `SenToPhrase` and
# StanfordPOSTagger are defined/imported outside this chunk -- confirm.
with open('english_lemma.pickle', 'wb') as f:
    pickle.dump(lemmatized_sent, f)

english_sentence_structure = {}
# For each sentence build a structure: [word_count, [(word, tag), ...]].
for i in eng_sent:
    sentence = i
    english_sentence_structure[sentence] = []
    # add number of words to sentence structure
    english_sentence_structure[sentence].append(len(sentence.split()))
    # POS-tag each sentence.
    # NOTE(review): the tagger is re-created on every iteration; each tag()
    # call spawns a Java process, so this is very slow -- hoisting the
    # construction out of the loop would be the obvious fix.
    distsim = 'english-bidirectional-distsim.tagger'
    post = 'stanford-postagger.jar'
    english_postagger = StanfordPOSTagger(distsim, post)
    postag = english_postagger.tag(sentence.split())
    # add postag to sentence structure
    english_sentence_structure[sentence].append(list(postag))
    # call function to get phrases (defined elsewhere in the file)
    SenToPhrase(postag)

# store the sentence structure
with open('english_struct.pickle', 'wb') as handle:
    pickle.dump(english_sentence_structure, handle, protocol=pickle.HIGHEST_PROTOCOL)
# coding:utf-8 import nltk import json from nltk.tag.stanford import StanfordPOSTagger from collections import Counter from pycocotools.coco import COCO import matplotlib.pyplot as plt import numpy as np tagger = StanfordPOSTagger( model_filename='/workspace/english-bidirectional-distsim.tagger', path_to_jar='/workspace/stanford-postagger.jar') dataset_path = '../annotations/captions_ucm_total.json' semantic_path = '/data/UCM_captions/stanford_semantic_words.json' # dataset_path = '../annotations/captions_sydney_total.json' # semantic_path = '/data/Sydney_captions/stanford_semantic_words.json' # dataset_path = '../annotations/captions_rsicd_total.json' # semantic_path = '/data/RSICD/stanford_semantic_words.json' semantic_dict = dict() # tag_dict = dict() # count = Counter() # with open(dataset_path, 'r') as f: # data = json.load(f) # print(data['dataset']) # for img in data['images']: # imgid = img['imgid'] # # semantic_list = [] # for sent in img['sentences']: # # print(sent['tokens'])
# Import StanfordPOSTagger and StanfordNERTagger from nltk.tag.stanford from nltk.tag.stanford import StanfordPOSTagger, StanfordNERTagger # Set JAVAHOME variable to the directory containin Java on your computer import os #path = "/usr/bin/java" java_path = '/Library/Internet Plug-Ins/JavaAppletPlugin.plugin/Contents/Home/bin/java' os.environ['JAVAHOME'] = java_path # Using Stanford POS Tagger # set the path for POS tagger: the jar file and the model pos_model_path = '/Users/abhilashakumari/Downloads/textmining/stanford-postagger-full-2017-06-09/models/english-bidirectional-distsim.tagger' pos_jar_path = '/Users/abhilashakumari/Downloads/textmining/stanford-postagger-full-2017-06-09/stanford-postagger.jar' # Initialize the tagger st_pos = StanfordPOSTagger(pos_model_path, pos_jar_path) st_pos.tag(word_tokenize(sent)) # Using Stanford NER Tagger # set the path for NER tagger: the jar file and the model ner_model_path = '/Users/abhilashakumari/Downloads/textmining/stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz' ner_jar_path = '/Users/abhilashakumari/Downloads/textmining/stanford-ner-2017-06-09/stanford-ner.jar' st_ner = StanfordNERTagger(ner_model_path, ner_jar_path) sent_ne = st_ner.tag(word_tokenize(sent)) # Check out the 7 class tagged english.muc.7class.distsim.crf.ser.gz model for more flexibility. # It tags Currency, Location, Percentages along with Persons, Organizations etc. ner7_model_path = '/Users/abhilashakumari/Downloads/textmining/stanford-ner-2017-06-09/classifiers/english.muc.7class.distsim.crf.ser.gz' st_ner7 = StanfordNERTagger(ner7_model_path, ner_jar_path)
from sklearn.cluster import KMeans, DBSCAN
from sklearn.utils.linear_assignment_ import linear_assignment
from utils import common
from tokenize_and_normalize import convert_num, tokenize_and_pos_tagging
#from utils.features import NGramVectorizer, DepNGramVectorizer
import utils.features

# Python 2 ships the fast cPickle module; fall back to plain pickle on Python 3.
try:
    import cPickle as pickle
except:
    import pickle

from nltk.tag.stanford import StanfordPOSTagger

# Left3words English model shipped with the Stanford tagger distribution.
TAGGER_DIR = '/home/shoetsu/downloads/stanford-postagger'
tagger = StanfordPOSTagger(
    TAGGER_DIR + '/models/english-left3words-distsim.tagger',
    TAGGER_DIR + '/stanford-postagger.jar'
)

# Identifiers for the two supported clustering back-ends.
KMEANS_STR = 'kmeans'
DBSCAN_STR = 'dbscan'
MODEL_NAME = 'cluster.model'
CONFIG_NAME = 'config'

# Number-placeholder tokens shared with the rest of the project.
NUM = common.NUM
NUMBER = common.NUMBER
NONE = '-'

stop_words = {'.', ',', '!', '?'}


def VOCAB_CONDITION(x):
    """Return True iff the token sequence contains a number placeholder
    (NUM/NUMBER in either case) and no stop-punctuation token."""
    placeholders = {NUM, NUM.lower(), NUMBER, NUMBER.lower()}
    return bool(placeholders.intersection(x) and not stop_words.intersection(set(x)))


#####################################
## Extraction
from nltk.tag.stanford import StanfordPOSTagger
import os

# NLTK locates the JVM via the JAVAHOME environment variable.
java_path = "C:/Program Files/Java/jdk1.8.0_31/bin/java.exe"
os.environ['JAVAHOME'] = java_path

# English bidirectional-distsim model plus the tagger jar.
english_postagger = StanfordPOSTagger(
    'D:/BtechProject/stanford-postagger/models/english-bidirectional-distsim.tagger'
    , 'D:/BtechProject/stanford-postagger/stanford-postagger.jar')

# Smoke test: tag a whitespace-split sample sentence and print the pairs.
sentence = "NANDAN SUKTHANKAR PRANAY SANKET DESHMUKH"
print(english_postagger.tag(sentence.split()))

#op_file = open("output.txt", "w")
"""
with open('student_corpus.txt') as fp:
    for line in fp:
        sentence = line.strip('\n')
        token_array = english_postagger.tag(sentence.split())
        op_file.write("\n".join((str(elem) for elem in token_array)))
        print(sentence)
"""
#ct = CRFTagger()
#print(ct.tag(text.split()))
from arguments import define_args
from tqdm import tqdm

# Ensure the punkt sentence tokenizer data is present.
nltk.download('punkt')

logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    level=logging.INFO)
logger = logging.getLogger(__name__)

args = define_args()

# Heavy Stanford/benepar tools are imported and instantiated lazily, only when
# the corresponding command-line flag requests them.
if args.return_POS:
    from nltk.tag.stanford import StanfordPOSTagger
    pos_tagger = StanfordPOSTagger(args.pos_tagger_model, args.pos_tagger_jar)
if args.return_NER:
    from nltk.tag.stanford import StanfordNERTagger
    ner_tagger = StanfordNERTagger(args.ner_tagger_model, args.ner_tagger_jar)
if args.return_DEP:
    from nltk.parse.stanford import StanfordDependencyParser
    dep_parser = StanfordDependencyParser(args.dep_parser_model, args.dep_parser_jar)
if args.return_CONST:
    import benepar
    benepar.download('benepar_en2')
    const_parser = benepar.Parser("benepar_en2")

def get_constituency_path_to_root(tree, leaf_index):
    # NOTE(review): function body continues beyond this excerpt; only the
    # first statement is visible here.
    parented_tree = nltk.tree.ParentedTree.convert(tree)
### [4] register the path in your bashrc or whatever. The name must be STANFORD_TAGGER_PATH. ### For example, ### export STANFORD_TAGGER_PATH=/home/sanghoun/tools/stanford-postagger ### [5] either do [ $ source ~/.bashrc ] or ### logout and login again. import sys, os import nltk from nltk.tag.stanford import StanfordPOSTagger if "STANFORD_TAGGER_PATH" not in os.environ: print("STANFORD_TAGGER_PATH is not registered.") sys.exit() tagger = StanfordPOSTagger( os.environ['STANFORD_TAGGER_PATH'] + "/models/chinese-distsim.tagger", os.environ['STANFORD_TAGGER_PATH'] + "/stanford-postagger.jar", encoding='utf-8') while True: try: line = sys.stdin.readline() except (KeyboardInterrupt, IOError): break if not line or len(line.strip()) < 1: break sentence = line output = "" #print words.strip().split() words = sentence.split()
def create_listen_func(self):
    """Listen on the microphone, POS-tag the recognized utterance, and
    dispatch to a code-generation handler (declare / init / print / scan /
    conditional / loop / arithmetic).

    NOTE(review): relies on module-level globals not visible in this excerpt
    (sr, r, text1, text3, code_file, a, c-vs-a usage, declarations,
    initialize, printer, scanner, loop, otherwise, arithmetic,
    word_tokenize, StanfordPOSTagger) -- verify against the rest of the file.
    """
    #btn2.setEnabled(True)
    #btn1.setEnabled(False)
    #text1.setText("")
    #listening
    with sr.Microphone(sample_rate = 16000) as source:
        audio = r.listen(source)
        print("Listening")
    try:
        lines = r.recognize_google(audio)
        print("you said " + lines)
        text1.setText(lines)
    except LookupError:
        print("Couldn't understand audio")
        text1.setText("Couldn't understand audio")
    # NOTE(review): when recognition fails, `lines` is unbound below because
    # the except branch does not return.
    c = word_tokenize(lines.lower())
    # NOTE(review): only the model path is passed; StanfordPOSTagger usually
    # also needs the jar path -- presumably CLASSPATH is set elsewhere; confirm.
    st = StanfordPOSTagger('C:\stanford-postagger-2015-12-09\models\english-bidirectional-distsim.tagger')
    b = st.tag(c)
    #print(b)
    flag = 0
    loopflag = 0
    # decflag=0
    # iniflag=0
    # scanflag=0
    # pflag=0
    # aflag=0
    #print(b)
    # Bucket recognized words by the first two letters of their POS tag.
    # (This name shadows the builtin `dict`; kept as-is in this doc-only pass.)
    dict = {"VB": [], "NN": [], "JJ": [], "DT": [], "CC": [], "PR": [], "CD": [], "IN": [], "RB": []}
    for (w,t) in b:
        if (t[:2] in dict):
            dict[t[:2]].append(w)
    print(dict)
    #checking if the statement is a declaration
    # NOTE(review): the matching `except` for this `try` lies beyond this
    # excerpt; the function is truncated here.
    try:
        for i in dict["VB"]:
            for d in declarations:
                if d in i:
                    dict["VB"].remove(i)
                    #decflag = 1
                    flag = 1
                    self.declare(dict)
                    break
            #if decflag == 1:
            if flag == 1:
                break
        #checking if the statement is an initialization
        if(flag == 0):
            #if(decflag == 0):
            for i in dict["VB"]:
                for ini in initialize:
                    if ini in i:
                        flag = 1
                        #print("hermoine")
                        #iniflag = 1
                        self.init(dict)
                        break
                #if iniflag == 1:
                if flag == 1:
                    break
        #Printing output to screen
        if(flag == 0):
            #if(decflag == 0 and iniflag == 0):
            for i in dict["NN"] or dict["VB"]:
                for p in printer:
                    if p in i:
                        flag =1
                        #pflag = 1
                        self.prin(dict)
                        break
                if flag == 1:
                    #if pflag == 1:
                    break
        #Scanning input from screen
        if(flag == 0):
            #if(decflag == 0 and iniflag == 0 and pflag == 0):
            for i in dict["VB"] or dict["NN"]:
                for s in scanner:
                    if s in i:
                        #dict["VB"].remove(i)
                        flag = 1
                        #scanflag = 1
                        self.scan(dict)
                        break
                #if scanflag == 1:
                if flag == 1:
                    break
        #if scanflag == 0:
        if flag == 0:
            if("take from user" in a or "take from screen" in a or "take in" in a):
                self.scan(dict)
                flag = 1
                #scanflag = 1
        #if statement
        if flag == 0:
            if "if" in c:
                self.cond(c)
                flag = 1
        #else statement
        if flag == 0:
            for i in otherwise:
                if i in c:
                    text3.append("else")
                    code_file.write("else")
                    flag = 1
        if(flag == 0):
            if "continue" in a:
                text3.append("continue;")
                code_file.write("continue;\n")
                flag = 1
        #Looping
        if(flag == 0):
            for i in loop:
                if i in a:
                    flag = 1
                    self.loop(dict)
                    break
        #arithmetic operations
        #if(decflag==0 and iniflag==0 and pflag==0 and scanflag == 0):
        if(flag == 0):
            for i in dict["VB"] + dict["NN"] + dict["CC"]:
                for art in arithmetic:
                    #print(a,i)
                    if art in i:
                        flag = 1
                        #aflag = 1
                        self.arithmo(dict)
                        break
                if flag == 1:
                    #if aflag==1:
                    break
from nltk.tag.stanford import StanfordPOSTagger
import string
#import tensorflow_hub as hub
from bert_embedding import BertEmbedding

# Multilingual cased BERT embedder, loaded once at module import.
bert = BertEmbedding(model='bert_12_768_12', dataset_name='wiki_multilingual_cased')
#print(bert('espanol'))
#sentence = ("autralia espanol").split(' ')
#embed = bert(sentence)
#first_word = embed[0]
#print(first_word[1])

# Assignment 4: NER
# This is just to help you get going. Feel free to
# add to or modify any part of it.

# Spanish Stanford POS tagger (model + jar from the local stanford-models dir).
tagger = StanfordPOSTagger('stanford-models/models/spanish.tagger', 'stanford-models/stanford-postagger.jar')
punctuations = string.punctuation

def get_bert_embeddings(sent):
    # TODO: stub -- currently only prints a marker and returns None.
    print('yep')

def get_pos_tags(sent):
    # TODO: stub -- always returns 0.
    return 0

def getfeats(word, o,tag):
    """Build features for *word* given offset *o* relative to the instance
    word and its POS *tag*."""
    # NOTE(review): function body continues beyond this excerpt.
    #tagger = conll2002.tagged_words()
    #print(spanish_postagger.tag(word))
    #print('the tag for '+word+' is '+tag)
# FIX: `os` was used below (os.environ, os.path) but never imported (NameError).
import os

import pandas as pd
from nltk import word_tokenize
from nltk import sent_tokenize
from nltk.parse.stanford import StanfordParser
from nltk.parse.stanford import StanfordDependencyParser
from nltk.tag.stanford import StanfordPOSTagger
from text_locations import transcripts

# FIX: was "usr/bin/java" -- a relative path; the JVM binary lives at the
# absolute path /usr/bin/java on standard Linux installs.
java_path = "/usr/bin/java"
os.environ['JAVAHOME'] = java_path

# Resolve the tagger distribution relative to this file.
current_dir = os.path.dirname(os.path.abspath(__file__))
stanford_parser_dir = current_dir + '/stanford_NLP/stanford-postagger-full-2015-04-20'
path_to_model = stanford_parser_dir + "/models/english-bidirectional-distsim.tagger"
path_to_jar = stanford_parser_dir + "/stanford-postagger.jar"
tagger = StanfordPOSTagger(path_to_model, path_to_jar)

# Full Penn Treebank tag inventory used downstream.
POS_TAGS = [
    "CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD",
    "NN", "NNS", "NNP", "NNPS", "PDT", "POS", "PRP", "PRP$", "RB", "RBR",
    "RBS", "RP", "SYM", "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP",
    "VBZ", "WDT", "WP", "WP$", "WRB"
]

# Save document from dataframe to CSV
def save_csv_file(document_df, database_location):
    """Persist *document_df* as CSV at *database_location*.

    NOTE(review): body continues beyond this excerpt; only the column
    reordering (moving the last two columns to the front) is visible.
    """
    cols = list(document_df.columns.values)
    cols.insert(0, cols.pop())
    cols.insert(0, cols.pop())
# FIX: `os` was referenced below (os.environ) but never imported (NameError).
import os

import nltk
from nltk import *

# Give the tagger's Java subprocess a 2 GB heap.
nltk.internals.config_java(options='-xmx2G')

from nltk.tag.stanford import StanfordPOSTagger
from nltk.tokenize import word_tokenize

# enter the path to your local Java JDK, under Windows, the path should look very similar to this example
java_path = "/Library/Java/JavaVirtualMachines/jdk-14.jdk/Contents/Home/bin/"
os.environ["JAVAHOME"] = java_path

# enter the paths to the Stanford POS Tagger .jar file as well as to the model to be used
stanford_dir = "/Users/josemedardotapiatellez/Downloads/stanford-tagger-4.0.0"
# NOTE(review): this points at the ".tagger.props" configuration file, not the
# trained ".tagger" model itself -- confirm the intended model path.
modelfile = stanford_dir+"/models/spanish-ud.tagger.props"
jarfile = stanford_dir+"/stanford-postagger.jar"
pos_tagger = StanfordPOSTagger(modelfile, jarfile)

# Tagging this one example sentence as a test:
# this small snippet of text lets you test whether the tagger is running before you attempt to run it on a locally
# stored file (see line 28)
text = "Just a small snippet of text to test the tagger."

# Tagging a locally stored plain text file:
# as soon as the example above is running ok, comment out that line (#) and comment in the next line and
# enter a path to a local file of your choice;
# the assumption made here is that the file is a plain text file with utf-8 encoding
# text = open("C:/Users/Public/projects/python101-2018/data/sample-text.txt").read()

# nltk word_tokenize() is used here to tokenize the text and assign it to a variable 'words'
words = nltk.word_tokenize(text)
# print(words)
def __init__(self, model_filename, jarfile):
    """Wrap an NLTK StanfordPOSTagger built from a model file and jar.

    Args:
        model_filename: path to the trained .tagger model file.
        jarfile: path to stanford-postagger.jar.
    """
    self.model_filename = model_filename
    self.path_to_jar = jarfile
    # FIX: the original stored the tagger only under the misspelled attribute
    # "tager". Expose the correctly spelled "tagger" and keep "tager" as a
    # backward-compatible alias for any existing callers.
    self.tagger = StanfordPOSTagger(model_filename=self.model_filename,
                                    path_to_jar=self.path_to_jar)
    self.tager = self.tagger
import sys
sys.path.append('/data/rumor_detection/rumor_detection')
import json
import logging
from os import listdir
from os.path import isdir, join, isfile
import threading
from nltk.tag.stanford import StanfordPOSTagger

from src.utils import config
from src.utils import text_utils

text_processor = text_utils.create_text_processor()

# Stanford POS tagger; tagging spawns a Java subprocess per call.
stanford_tagger = StanfordPOSTagger(
    model_filename=
    '../../libs/stanford_postagger/models/english-bidirectional-distsim.tagger',
    path_to_jar='../../libs/stanford_postagger/stanford-postagger.jar')

# Output file for the tagged tweets; opened for the lifetime of the module.
writer = open('../../data/interim/tweet_stanford_pos_tag.txt', 'w')


def load_data(data_path):
    # Each topic directory under data_path contains 'rumours' and
    # 'non-rumours' subdirectories; process both.
    # NOTE(review): read_topic_dir is defined elsewhere in this file
    # (not visible in this excerpt).
    for f in listdir(data_path):
        topic_dir = join(data_path, f)
        if isdir(topic_dir):
            rumor_dir = join(topic_dir, 'rumours')
            non_rumor_dir = join(topic_dir, 'non-rumours')
            read_topic_dir(rumor_dir)
            read_topic_dir(non_rumor_dir)