Example #1
def pos_tagging(requirements):
    '''
        POS-Tagging via Stanford POS tagger
        NOTE: This library creates a Java process in the background.
              Please make sure you have installed Java 1.6 or higher.

              sudo apt-get install default-jre
              sudo apt-get install default-jdk
    '''
    _logger.info("Pos-tagging for requirements' tokens")

    '''
        See: http://www.comp.leeds.ac.uk/ccalas/tagsets/upenn.html
        --------------------------------------------------------------------------------------------
        Tag    Description                         Examples
        --------------------------------------------------------------------------------------------
        CC     conjunction, coordinating           & 'n and both but either et for less minus neither nor or plus so therefore times v. versus vs. whether yet
        CD     numeral, cardinal                   mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025 fifteen 271,124 dozen quintillion DM2,000 ...
        DT     determiner                          all an another any both del each either every half la many much nary neither no some such that the them these this those
        EX     existential there                   there
        FW     foreign word                        gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte terram fiche oui corporis ...
        IN     preposition or conjunction, subordinating astride among uppon whether out inside pro despite on by throughout below within for towards near behind atop around if like until below next into if beside ...
        JJ     adjective or numeral, ordinal       third ill-mannered pre-war regrettable oiled calamitous first separable ectoplasmic battery-powered participatory fourth still-to-be-named multilingual multi-disciplinary ...
        JJR    adjective, comparative              bleaker braver breezier briefer brighter brisker broader bumper busier calmer cheaper choosier cleaner clearer closer colder commoner costlier cozier creamier crunchier cuter ...
        JJS    adjective, superlative              calmest cheapest choicest classiest cleanest clearest closest commonest corniest costliest crassest creepiest crudest cutest darkest deadliest dearest deepest densest dinkiest ...
        LS     list item marker                    A A. B B. C C. D E F First G H I J K One SP-44001 SP-44002 SP-44005 SP-44007 Second Third Three Two \* a b c d first five four one six three two
        MD     modal auxiliary                     can cannot could couldn't dare may might must need ought shall should shouldn't will would
        NN     noun, common, singular or mass      common-carrier cabbage knuckle-duster Casino afghan shed thermostat investment slide humour falloff slick wind hyena override subhumanity machinist ...
        NNP    noun, proper, singular              Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA Shannon A.K.C. Meltex Liverpool ...
        NNPS   noun, proper, plural                Americans Americas Amharas Amityvilles Amusements Anarcho-Syndicalists Andalusians Andes Andruses Angels Animals Anthony Antilles Antiques Apache Apaches Apocrypha ...
        NNS    noun, common, plural                undergraduates scotches bric-a-brac products bodyguards facets coasts divestitures storehouses designs clubs fragrances averages subjectivists apprehensions muses factory-jobs ...
        PDT    pre-determiner                      all both half many quite such sure this
        POS    genitive marker                     ' 's
        PRP    pronoun, personal                   hers herself him himself hisself it itself me myself one oneself ours ourselves ownself self she thee theirs them themselves they thou thy us
        PRP$   pronoun, possessive                 her his mine my our ours their thy your
        RB     adverb                              occasionally unabatingly maddeningly adventurously professedly stirringly prominently technologically magisterially predominately swiftly fiscally pitilessly ...
        RBR    adverb, comparative                 further gloomier grander graver greater grimmer harder harsher healthier heavier higher however larger later leaner lengthier less-perfectly lesser lonelier longer louder lower more ...
        RBS    adverb, superlative                 best biggest bluntest earliest farthest first furthest hardest heartiest highest largest least less most nearest second tightest worst
        RP     particle                            aboard about across along apart around aside at away back before behind by crop down ever fast for forth from go high i.e. in into just later low more off on open out over per pie raising start teeth that through under unto up up-pp upon whole with you
        TO     "to" as preposition or infinitive marker    to
        UH     interjection                        Goodbye Goody Gosh Wow Jeepers Jee-sus Hubba Hey Kee-reist Oops amen huh howdy uh dammit whammo shucks heck anyways whodunnit honey golly man baby diddle hush sonuvabitch ...
        VB     verb, base form                     ask assemble assess assign assume atone attention avoid bake balkanize bank begin behold believe bend benefit bevel beware bless boil bomb boost brace break bring broil brush build ...
        VBD    verb, past tense                    dipped pleaded swiped regummed soaked tidied convened halted registered cushioned exacted snubbed strode aimed adopted belied figgered speculated wore appreciated contemplated ...
        VBG    verb, present participle or gerund  telegraphing stirring focusing angering judging stalling lactating hankerin' alleging veering capping approaching traveling besieging encrypting interrupting erasing wincing ...
        VBN    verb, past participle               multihulled dilapidated aerosolized chaired languished panelized used experimented flourished imitated reunifed factored condensed sheared unsettled primed dubbed desired ...
        VBP    verb, present tense, not 3rd person singular    predominate wrap resort sue twist spill cure lengthen brush terminate appear tend stray glisten obtain comprise detest tease attract emphasize mold postpone sever return wag ...
        VBZ    verb, present tense, 3rd person singular  bases reconstructs marks mixes displeases seals carps weaves snatches slumps stretches authorizes smolders pictures emerges stockpiles seduces fizzes uses bolsters slaps speaks pleads ...
        WDT    WH-determiner                       that what whatever which whichever
        WP     WH-pronoun                          that what whatever whatsoever which who whom whosoever
        WP$    WH-pronoun, possessive              whose
        WRB    Wh-adverb                           how however whence whenever where whereby whereever wherein whereof why

        See: https://www.sketchengine.co.uk/german-stts-part-of-speech-tagset/
        --------------------------------------------------------------------------------------------
        Tag	Description	Example
        --------------------------------------------------------------------------------------------
        ADJA	attributive adjective (including participles used adjectivally)	das große Haus die versunkene Glocke
        ADJD	predicate adjective; adjective used adverbially	der Vogel ist blau er fährt schnell
        ADV	adverb (never used as attributive adjective)	sie kommt bald
        APPR	preposition; left hand part of double preposition	auf dem Tisch; an der Straße entlang
        APPRART	preposition with fused article	am Tag
        APPO	postposition	meiner Meinung nach
        APZR	right hand part of double preposition	an der Straße entlang
        ART	article (definite or indefinite)	die Tante; eine Tante
        CARD	cardinal number (words or figures); also declined	zwei; 526; dreier
        FM	foreign words (actual part of speech in original language may be appended, e.g. FMADV/ FM-NN)	semper fidem
        ITJ	interjection	Ach!
        KON	co-ordinating conjunction	oder ich bezahle nicht
        KOKOM	comparative conjunction or particle	er arbeitet als Straßenfeger, so gut wie du
        KOUI	preposition used to introduce infinitive clause	um den König zu töten
        KOUS	subordinating conjunction	weil er sie gesehen hat
        NA	adjective used as noun	der Gesandte
        NE	names and other proper nouns	Moskau
        NN	noun (but not adjectives used as nouns)	der Abend
        PAV [PROAV]	pronominal adverb	sie spielt damit
        PAVREL	pronominal adverb used as relative	die Puppe, damit sie spielt
        PDAT	demonstrative determiner	dieser Mann war schlecht
        PDS	demonstrative pronoun	dieser war schlecht
        PIAT	indefinite determiner (whether occurring on its own or in conjunction with another determiner)	einige Wochen, viele solche Bemerkungen
        PIS	indefinite pronoun	sie hat viele gesehen
        PPER	personal pronoun	sie liebt mich
        PRF	reflexive pronoun	ich wasche mich, sie wäscht sich
        PPOSS	possessive pronoun	das ist meins
        PPOSAT	possessive determiner	mein Buch, das ist der meine/meinige
        PRELAT	relative depending on a noun	der Mann, dessen Lied ich singe […], welchen Begriff ich nicht verstehe
        PRELS	relative pronoun (i.e. forms of der or welcher)	der Herr, der gerade kommt; der Herr, welcher nun kommt
        PTKA	particle with adjective or adverb	am besten, zu schnell, aufs herzlichste
        PTKANT	answer particle	ja, nein
        PTKNEG	negative particle	nicht
        PTKREL	indeclinable relative particle	so
        PTKVZ	separable prefix	sie kommt an
        PTKZU	infinitive particle	zu
        PWS	interrogative pronoun	wer kommt?
        PWAT	interrogative determiner	welche Farbe?
        PWAV	interrogative adverb	wann kommst du?
        PWAVREL	interrogative adverb used as relative	der Zaun, worüber sie springt
        PWREL	interrogative pronoun used as relative	etwas, was er sieht
        TRUNC	truncated form of compound	Vor- und Nachteile
        VAFIN	finite auxiliary verb	sie ist gekommen
        VAIMP	imperative of auxiliary	sei still!
        VAINF	infinitive of auxiliary	er wird es gesehen haben
        VAPP	past participle of auxiliary	sie ist es gewesen
        VMFIN	finite modal verb	sie will kommen
        VMINF	infinitive of modal	er hat es sehen müssen
        VMPP	past participle of auxiliary	sie hat es gekonnt
        VVFIN	finite full verb	sie ist gekommen
        VVIMP	imperative of full verb	bleibt da!
        VVINF	infinitive of full verb	er wird es sehen
        VVIZU	infinitive with incorporated zu	sie versprach aufzuhören
        VVPP	past participle of full verb	sie ist gekommen
    '''
    pos_tags_black_list = ['CC', 'CD', 'DT', 'EX', 'IN', 'LS', 'MD', 'PDT', 'POS', 'PRP', 'PRP$', 'RP', 'TO', 'UH', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB']
    #pos_tags_black_list = ['CC', 'CD', 'DT', 'EX', 'LS', 'MD', 'PDT', 'POS', 'PRP', 'PRP$', 'RP', 'TO', 'UH', 'WDT', 'WP', 'WP$', 'WRB']
    existing_stanford_pos_tags = set()
    removed_stanford_tokens = set()
    # Note: "-mx30g" sets java's max memory size to 30 GB RAM
    #       Please change when experiencing OS-related problems!
    pos_tagger = StanfordPOSTagger(pos_tagger_data_path, pos_tagger_jar_path, java_options='-mx30g')
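    # NOTE: the naive digraph replacement in restore_german_umlauts below also
    # corrupts words that legitimately contain "ue"/"oe"/"ae"
    # (e.g. "Steuer" -> "Steür"); a dictionary-based check would be safer.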

    def restore_german_umlauts(tokens):
        return [t.replace("ue", "ü").replace("oe", "ö").replace("ae", "ä") for t in tokens]

    for requirement in requirements:
        corrected_words = restore_german_umlauts(requirement.description_tokens)
        pos_tagged_requirement_tokens = pos_tagger.tag(corrected_words)
        #tagged_tokens = filter(lambda t: t[1] not in pos_tags_black_list, pos_tagged_requirement_tokens)
        #requirement.description_tokens_pos_tags = map(lambda t: t, tagged_tokens)
        requirement.description_tokens_pos_tags = list(pos_tagged_requirement_tokens)
        #removed_stanford_tokens |= set(filter(lambda t: t[1] in pos_tags_black_list, pos_tagged_requirement_tokens))
        #existing_stanford_pos_tags |= set(map(lambda t: t[1], pos_tagged_requirement_tokens))

        corrected_words = restore_german_umlauts(requirement.title_tokens)
        pos_tagged_title_tokens = pos_tagger.tag(corrected_words)
        #tagged_tokens = filter(lambda t: t[1] not in pos_tags_black_list, pos_tagged_title_tokens)
        #requirement.title_tokens_pos_tags = map(lambda t: t, tagged_tokens)
        requirement.title_tokens_pos_tags = list(pos_tagged_title_tokens)
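
# A minimal sketch of the disabled blacklist filter above (an assumption:
# this mirrors the commented-out code path, dropping tokens whose Penn tag
# appears in pos_tags_black_list):
def filter_blacklisted_tags(tagged_tokens, black_list):
    # each item is a (token, tag) pair as returned by StanfordPOSTagger.tag
    return [(token, tag) for token, tag in tagged_tokens if tag not in black_list]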
Example #2
    def go_func(self):
        global c
        c = word_tokenize(str(text1.toPlainText()).lower())
        st = StanfordPOSTagger(r'C:\stanford-postagger-2015-12-09\models\english-bidirectional-distsim.tagger')  # raw string avoids backslash escapes; the jar is resolved via CLASSPATH
        b = st.tag(c)

        
        #print(b)
        flag = 0
        loopflag = 0

            # decflag=0
            # iniflag=0
            # scanflag=0
            # pflag=0
            # aflag=0
            # print(b)

        dict = {"VB": [], "NN": [], "JJ": [], "DT": [], "CC": [], "PR": [], "CD": [], "IN": [], "RB": []}

        for (w,t) in b:
            if (t[:2] in dict):
                dict[t[:2]].append(w)

        print(dict)

        
        #checking if the statement is a declaration
        try:
            for i in dict["VB"]:
                for d in declarations:
                    if d in i:
                        dict["VB"].remove(i)
                        #decflag = 1
                        flag = 1
                        
                        self.declare(dict)
                        break
                    #if decflag == 1:
                if flag == 1:
                    break
            
            #checking if the statement is an initialization

            if(flag == 0):
            #if(decflag == 0):
                for i in dict["VB"]:
                    for ini in initialize:
                        if ini in i:
                            flag = 1
                            #print("hermoine")
                            #iniflag = 1                          
                            self.init(dict)
                            break
                    #if iniflag == 1:
                    if flag == 1:
                        break


            if(flag == 0):
#if(decflag == 0 and iniflag == 0):
                for i in dict["NN"] or dict["VB"]:
                    for p in printer:
                        if p in i:
                            flag =1
                            #pflag = 1
                            self.prin(dict)
                            break
                    if flag == 1:
                    #if pflag == 1:
                        break


            #Scanning input from screen
            if(flag == 0):
            #if(decflag == 0 and iniflag == 0 and pflag == 0):
                for i in dict["VB"] or dict["NN"]:
                    for s in scanner:
                        if s in i:
                            #dict["VB"].remove(i)
                            flag = 1
                            #scanflag = 1
                            self.scan(dict)
                            break
                    #if scanflag == 1:
                    if flag == 1:
                        break
                    #if scanflag == 0:
                    if flag == 0:
                        if("take from user" in a or "take from screen" in a or "take in" in a):
                            self.scan(dict)
                            flag = 1
                            break
                            #scanflag = 1

            if(flag == 0):
            
                for i in dict["IN"] + dict["JJ"] + dict["CC"] + dict["RB"]:

                    for cond in conditional:
                        if cond in i:
                            flag = 1
                            
                            self.cond(dict)
                            break
                        if flag == 1:
                            break   
         

            #arithmetic operations
            #if(decflag==0 and iniflag==0 and pflag==0 and scanflag == 0):
            if(flag == 0):
            
                for i in dict["VB"] + dict["NN"] + dict["CC"]:
                    
                    for art in arithmetic:
                        #print("here     ")
                        #print(a,i)
                        if art in i:
                            flag = 1
                            #aflag = 1
                            t = self.arithmo(dict)
                            print(t)
                            break
                        if flag == 1:
                        #if aflag==1:
                            break
            #print(flag)
           
           


        except Exception:

            print("exception")
Example #3
import nltk
from nltk.tag.stanford import StanfordPOSTagger

cale_model = "/home/t3rtius/Documents/cs/sla-master/sem1/1-nlp-opt/" + \
    "stanford-pos-tagger/stanford-postagger-full-2018-10-16/" + \
    "models/english-bidirectional-distsim.tagger"

cale_jar_tagger = "/home/t3rtius/Documents/cs/sla-master/sem1/1-nlp-opt/" + \
    "stanford-pos-tagger/stanford-postagger-full-2018-10-16/" + \
    "stanford-postagger.jar"

tagger = StanfordPOSTagger(cale_model, cale_jar_tagger)

text = "There once was a prince and he lived in a castle " +\
    "and his name was Prince Charming."

cuvInProp = nltk.word_tokenize(text)
morfo = tagger.tag(cuvInProp)
print("Analiza morfologică este:")
morfoDict = dict(morfo)
for parte in morfoDict.values():
    print(parte)
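
# Note: dict(morfo) keeps only one tag per distinct word form; to print every
# (word, tag) pair in order, iterate over the tagged list itself:
for cuvant, parte in morfo:
    print(cuvant, "->", parte)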
Example #4
import re
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree
from nltk.tag.stanford import StanfordPOSTagger, StanfordNERTagger

# NOTE: the opening lines of this excerpt were truncated in the original
# listing; the head of get_continuous_chunks below is reconstructed from the
# standard NLTK "continuous chunk" NER pattern that the surviving lines belong to.
def get_continuous_chunks(text):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []
    for i in chunked:
        if type(i) == Tree:
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
                current_chunk = []
        else:
            continue
    return continuous_chunk


# Alternatively to setting the CLASSPATH add the jar and model via their path:
jar = 'C:/stanford-postagger-2015-04-20/stanford-postagger.jar'
model = 'C:/stanford-postagger-2015-04-20/models/english-left3words-distsim.tagger'

pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')

jar = 'C:/stanford-ner-2015-04-20/stanford-ner.jar'
model = 'C:/stanford-ner-2015-04-20/classifiers/english.all.3class.distsim.crf.ser.gz'

ner_tagger = StanfordNERTagger(model, jar, encoding='utf8')

posTagger = pos_tagger
nerTagger = ner_tagger

sentence = "Where can I find some bubble tea??"


def queryGenrator(sentence):
    #find words in quotes
    quoted = re.findall(r'"([^"]*)"', sentence)
Example #5
#!usr/bin/env python
#-*- coding:utf-8 _*-
"""
@author:fonttian
@file: Stanford04.py
@time: 2017/09/26
"""

from nltk.tag.stanford import StanfordPOSTagger

st = StanfordPOSTagger(
    '/home/fonttian/NLP/stanford-postagger-full-2015-12-09/models/chinese-distsim.tagger',
    "/home/fonttian/NLP/stanford-postagger-full-2015-12-09/stanford-postagger-3.6.0.jar"
)

print(st)
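
# Usage sketch (an assumption: the chinese-distsim model expects pre-segmented
# input, so the sentence is whitespace-split into tokens before tagging):
print(st.tag("我 爱 北京 天安门".split()))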
Example #6
##export CLASSPATH=$STANFORDTOOLSDIR/stanford-postagger-full-2015-04-20/stanford-postagger.jar:$STANFORDTOOLSDIR/stanford-ner-2015-04-20/stanford-ner.jar:$STANFORDTOOLSDIR/stanford-parser-full-2015-04-20/stanford-parser.jar:$STANFORDTOOLSDIR/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar
##export STANFORD_MODELS=$STANFORDTOOLSDIR/stanford-postagger-full-2015-04-20/models:$STANFORDTOOLSDIR/stanford-ner-2015-04-20/classifiers

from nltk.tag.stanford import StanfordPOSTagger
from nltk.parse.stanford import StanfordParser
from nltk.corpus import stopwords

print("Sentence segmentation")
tokens = "this is pune.Pune is a great city"
tokens = tokens.split(".")
print(tokens)

print("\nTokenizer:")
tokens = "this is pune"
tokens = tokens.split(" ")
print(tokens)

print("\nStop Words Removal:")
stop_words = set(stopwords.words('english'))
filtered_words = [w for w in tokens if not w in stop_words]
print(filtered_words)

st = StanfordPOSTagger('english-bidirectional-distsim.tagger')
print("\nPOS tagging:")
print(st.tag('What is the airspeed of an unladen swallow ?'.split()))

parser = StanfordParser(
    model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
print("\nSyntax Parser:")
print(list(parser.raw_parse("rahul daksh fire")))
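
# The shell exports in the header can also be set from inside Python before
# the taggers/parsers are constructed (the paths below are placeholders):
import os
os.environ["CLASSPATH"] = "/opt/stanford/stanford-postagger.jar:/opt/stanford/stanford-parser.jar"
os.environ["STANFORD_MODELS"] = "/opt/stanford/models"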
Example #7
# word2vec (how similar are two words?)
import gensim.downloader as api
import sys
from os import path  # needed for path.expanduser below (import was missing from the excerpt)
from voice_common_pkg.srv import GgiLearning
from voice_common_pkg.srv import GgiLearningResponse
from nltk.tag.stanford import StanfordPOSTagger
import rospy
import random

file_path = path.expanduser('~/catkin_ws/src/voice_common_pkg/config')
minimum_value = 0.5  # minimum cosine similarity threshold
# load the word vectors
word_vectors = api.load("glove-twitter-200")
# load the Stanford POS tagger model (via nltk)
pos_tag = StanfordPOSTagger(
    model_filename=file_path +
    "/stanford-postagger/models/english-bidirectional-distsim.tagger",
    path_to_jar=file_path + "/stanford-postagger/stanford-postagger.jar")


class GgiTest():
    def __init__(self):
        # load vectors
        print('Waiting for tts and stt_server')
        rospy.wait_for_service('/tts')
        rospy.wait_for_service('/stt_server')
        print('test_phase is ready')
        # SpeechToText and TTS service classes are assumed to be imported in
        # the original module (the excerpt omits those imports)
        self.stt = rospy.ServiceProxy('/stt_server', SpeechToText)
        self.tts = rospy.ServiceProxy('/tts', TTS)
        self.server = rospy.Service('/test_phase', GgiLearning, self.main)

    def main(self, req):
Example #8
def main(start, stop):
	start_time = time.time()
	print 'start stop = ', start, stop

	stanford_dir = '.\\data\\stanford-postagger-full-2015-04-20\\'
	modelfile = stanford_dir + 'models\\english-bidirectional-distsim.tagger'
	jarfile = stanford_dir + 'stanford-postagger.jar'

	tagger = StanfordPOSTagger(model_filename=modelfile, path_to_jar=jarfile)

	# sentence = 'How can I increase the speed of my internet connection while using a VPN?'
	# print tagger.tag(sentence.split())

	# for word, tag in tagger.tag(nltk.word_tokenize(sentence)):
	# 	print 'word: ',word,'\ttag: ',tag

	count = 0
	real_matches = 0
	correct_matches = 0
	wrong_matches = 0
	with open(".\data\quora_duplicate_questions.tsv") as tsvfile:
	    tsvreader = csv.reader(tsvfile, delimiter="\t")
	    #print 'inside thread call....start = ',start,"\tstop = ",stop
	    #print 'inside run = ', (stop/100)
	    for line in tsvreader:

	    	count = count + 1
	    	#print 'count = ', count 

	    	if count<start:
	    		continue

	    	if count>stop:
	    		break

	    	print "-------------------------------"
	    	print "count: ", count
	    	print "-------------------------------"
	    	duplicate = line[5]
	    	print'duplicate',duplicate
	    	if (duplicate == '1'):
	    		real_matches = real_matches + 1

	    	q1 = tagger.tag(nltk.word_tokenize(line[3].lower()))
	    	q2 = tagger.tag(nltk.word_tokenize(line[4].lower()))

	    	q1_nouns = dict()
	    	for word, tag in q1:
	    		if tag[:2] == 'NN':
	    			q1_nouns[word] = 1

	    	q2_nouns = dict()
	    	for word, tag in q2:
	    		if tag[:2] == 'NN':
	    			q2_nouns[word] = 1

	    	print 'nouns q1: ', q1_nouns,'\tnouns q2: ', q2_nouns

	    	noun_match = False
	       	for key, value in q1_nouns.items():
	       		if (key in q2_nouns):
	       			noun_match = True
	       			break
	       	print 'noun_match: ', noun_match
	       	if (noun_match):
	       		q1_verbs = dict()
	       		for word, tag in q1:
	       			if tag[:2] == 'VB':
	       				q1_verbs[word] = 1

	       		q2_verbs = dict()
	       		for word, tag in q2:
	       			if tag[:2] == 'VB':
	       				q2_verbs[word] = 1

	       		print 'verbs q1: ',q1_verbs,'\tverbs q2: ', q2_verbs

	       		verb_match = False

	       		if len(q1_verbs) == 0 or len(q2_verbs) == 0:
	       			verb_match = True
	       			continue

	       		for key, value in q1_verbs.items():
	       			if (key in q2_verbs):
	       				verb_match = True
	       				break
	       			if (verb_match == False and len(wn.synsets(key))>0):
	       				syn = wn.synsets(key)[0]
	       				for lemma in syn.lemmas():
	       					print 'synonym of ', key, ": ", lemma.name()
	       					if (lemma.name() in q2_verbs):
	       						verb_match = True
	       						break

	       		if (verb_match == False) :
		       		for key, value in q2_verbs.items():
		       			if (key in q1_verbs):
		       				verb_match = True
		       				break
		       			if (verb_match == False and len(wn.synsets(key))>0):
		       				print key
		       				syn = wn.synsets(key)[0]
		       				for lemma in syn.lemmas():
		       					print 'synonym of ', key, ": ", lemma.name()
		       					if (lemma.name() in q1_verbs):
		       						verb_match = True
		       						break

	       		if (verb_match):
	       			#print ' '
	       			if (line[5] == '1'):
	       				print 'DUPLICATE'
	       				correct_matches = correct_matches + 1;
	       			else:
	       				print 'DUPLICATE BUT WRONG!!!!!'
	       				wrong_matches = wrong_matches + 1;


	print 'correct_matches : ', correct_matches
	print 'wrong_matches : ', wrong_matches
	print 'real_matches : ', real_matches

	print('\n\nrun time : ', time.time() - start_time)
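
	# A follow-up sketch: the three counters above give simple precision and
	# recall over the predicted duplicates (zero-division guards omitted):
	precision = correct_matches / float(correct_matches + wrong_matches)
	recall = correct_matches / float(real_matches)
	print 'precision: ', precision, '\trecall: ', recall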
Example #9
from nltk.tag.stanford import StanfordPOSTagger, StanfordNERTagger
from nltk.parse.stanford import StanfordParser
from nltk.stem import WordNetLemmatizer
st = StanfordPOSTagger(
    'edu\\stanford\\nlp\\models\\pos-tagger\\english-bidirectional\\english-bidirectional-distsim.tagger'
)
from ansWhat import ansWhat
from ansWhere import ansWhere
from ansWhen import ansWhen
from ansHow import ansHow
from ansYesNo import ansYesNo
from ansWhy import ansWhy
from ansYesNo import intersection

lemmatizer = WordNetLemmatizer()

# Use stanford pos for lemmatization here
# The Pos Tagger in NLTK will recognize words like "sits", "walks" as "NNS" instead of verb.
stanford_pos = '../stanford/stanford-postagger-full-2015-04-20/'
stanford_pos_model = stanford_pos + 'models/english-left3words-distsim.tagger'
stanford_pos_jar = stanford_pos + 'stanford-postagger.jar'
st_pos = StanfordPOSTagger(model_filename=stanford_pos_model,
                           path_to_jar=stanford_pos_jar)

# # NER Tagging:
stanford_ner = '../stanford/stanford-ner-2015-04-20/'
stanford_ner_model = stanford_ner + 'classifiers/english.muc.7class.distsim.crf.ser.gz'
stanford_ner_jar = stanford_ner + 'stanford-ner.jar'
ner = StanfordNERTagger(model_filename=stanford_ner_model,
                        path_to_jar=stanford_ner_jar)

# Set up the stanford PCFG parser
stanford_parser_dir = '../stanford/stanford-parser-full-2015-04-20/'
eng_model_path = stanford_parser_dir + "englishPCFG.ser.gz"
my_path_to_models_jar = stanford_parser_dir + "stanford-parser-3.5.2-models.jar"
my_path_to_jar = stanford_parser_dir + "stanford-parser.jar"
parser = StanfordParser(model_path=eng_model_path,
                        path_to_models_jar=my_path_to_models_jar,
                        path_to_jar=my_path_to_jar)
questions_features = os.path.join("..", "data", "questions_features.tsv")
pickle_SVM_model = os.path.join("..", "package", "model_SVM.pickle")
pickle_SVM_vector = os.path.join("..", "package", "vector_SVM.pickle")
questions_list = []
labels = []
with open(questions_file, 'r') as filename:
    data = csv.reader(filename, delimiter=',')
    for line in data:
        questions_list.append(line[0])
        labels.append(line[1])

# print type(questions_list), type(labels)
tagger_path = r"/Users/avaniarora/Desktop/stanford-postagger-2016-10-31/models/english-bidirectional-distsim.tagger"
pos_jar = r"/Users/avaniarora/Desktop/stanford-postagger-2016-10-31/stanford-postagger.jar"

tagger = StanfordPOSTagger(tagger_path, pos_jar)
tagger.java_options = '-mx4096m'  ### Setting higher memory limit for long sentences

wh_tags = ['WP', 'WDT', 'WP$', 'WRB']
important_tags = ['VB', 'VBP', 'VBZ', 'VBD', 'VBG', 'VBN']


def get_wh_words_from_question(question):
    sentence = word_tokenize(question)
    pos_tags = nltk.pos_tag(sentence)

    entities = {}

    for pos_tag in pos_tags:
        if (pos_tag[1] in wh_tags):
Example #12
    }[t.lang](tweet_text))

    # So it turns out that Python doesn't have switch-cases because, you guessed it, Guido is a total f*****g melon.
    # The mapping inside the `update` call is an equivalent structure, something I learned from working with old
    # Javascript code and passing around functions

    global tweet_counter
    global num_tweets
    logging.info(f"Analyzed tweet [{tweet_counter}/{num_tweets}]")
    tweet_counter += 1

    return t.id, row


german_tagger = StanfordPOSTagger(
    model_filename=(tagger_path + 'models/german-hgc.tagger'),
    path_to_jar=(tagger_path + 'stanford-postagger.jar')
)
def _analyze_de(text):
    tags = german_tagger.tag(word_tokenize(text, language='german'))
    counter = collections.Counter([x[1] for x in tags])

    return {  # we need to map the STTS (German) tagset to a subset of the French tagset, so that we can compare them all
        'ADJ': counter['ADJA'] + counter['ADJD'],
        'ADV': counter['ADV'],
        'CC': counter['KON'] + counter['KOKOM'],
        'CS': counter['KOUI'] + counter['KOUS'],
        'ET': counter['FM'],
        'I': counter['ITJ'],
        'NC': counter['NN'],
        'NP': counter['NE'],
        'PREF': counter['APPO'] + counter['APPR'] + counter['APPRART'] + counter['APZR'],
Example #13
import os
from collections import defaultdict
from nltk.tag.stanford import StanfordPOSTagger

# point nltk at the Java binary before constructing the tagger
java_path = '/usr/bin/java'
os.environ['JAVAHOME'] = java_path
englishPOStagger = StanfordPOSTagger('/home/deep/StanfordPosTagger/models/english-bidirectional-distsim.tagger', '/home/deep/StanfordPosTagger/stanford-postagger.jar')
def updatePosDict(tokens,PosDict):
	tags = englishPOStagger.tag(tokens)
	for token,tag in tags:
		PosDict[tag] += 1

PosAgrees = defaultdict(float)
PosDisagrees = defaultdict(float)
PosDiscuss = defaultdict(float)
NumDiscuss = 0.0
NumAgreed = 0.0
NumDisagreed = 0.0
for i,lst in enumerate(miniList):
	if lst[2] == 'agree':
		NumAgreed += 1.0
		try:
			updatePosDict(bodyDict[lst[1]] , PosAgrees)
		except OSError:
			print (i) 
			
	elif lst[2] == 'disagree':
		NumDisagreed += 1.0
		try:
			updatePosDict(bodyDict[lst[1]] , PosDisagrees)
		except OSError:
			print (i) 
	elif lst[2] == 'discuss':
Example #14
from config import *

import ServerPrint as sp
import numpy as np

"""
    This program loads the Stanford POS tagger.
    Moreover, it provides several functions to parse the structure of a sentence.
"""

# The parser .jar path
jar = "/home/sunner/nltk_data/models/stanford-postagger-full-2015-12-09/stanford-postagger.jar"
model = "/home/sunner/nltk_data/models/stanford-postagger-full-2015-12-09/models/english-bidirectional-distsim.tagger"

# Load the tagger, using the model and jar paths defined above
tagger = StanfordPOSTagger(model, jar)
sp.show("Finish POS tagger loading")

# Variable
sentences = ""
parseString = ""
word_2_pos = OrderedDict()
__verb = ""
__subject = ""
__object = ""
__value = ""
wordEmbedded = np.array([[1, 0], [0, 1]])

def tag(record, string="I love you"):
    """
        Main function to do the POS tagging
Example #15
#             ans = item[0]
#             return ans
#     return ''

# NERTagger (imports added; the excerpt's original import lines were truncated)
from nltk.tag.stanford import StanfordNERTagger, StanfordPOSTagger

ner_dir = '/Users/yifan/Desktop/WSTA_project/stanford-ner-2018-02-27/'
ner_jarfile = ner_dir + 'stanford-ner.jar'
ner_modelfile = ner_dir + 'classifiers/english.all.3class.distsim.crf.ser.gz'
ner_tagger = StanfordNERTagger(model_filename=ner_modelfile,
                               path_to_jar=ner_jarfile)

pos_dir = '/Users/yifan/Desktop/WSTA_project/stanford-postagger-2018-02-27/'
pos_modelfile = pos_dir + 'models/english-bidirectional-distsim.tagger'
pos_jarfile = pos_dir + 'stanford-postagger.jar'
pos_tagger = StanfordPOSTagger(model_filename=pos_modelfile,
                               path_to_jar=pos_jarfile)

sents = []
# sents.append("Rami Eid is studying at Stony Brook University in NY")
# sents.append("Elbert lives in Melbourne.")
# sents.append("Banana is $6 per kilo this Monday in Sydney.")
# sents.append("Elbert lives 100 kilometers away from Melbourne.")
# sents.append("Elbert is enrolled in University of Melbourne .")
# sents.append("Elbert in Melbourne.")
# sents.append("Elbert's address is Unit 1004 50 Albert Road South Melbourne 3205.")
# sents.append("The price of a bottle of water is $1")
# sents.append("The event will be on Sunday.")
# sents.append("The event will be on 1pm.")
# sents.append("Elbert will submit the assignment by email.")
# sents.append("Elbert will submit the assignment by email tomorrow.")
# sents.append("Elbert will submit the assignment tomorrow.")
Example #16
import nltk
import os
import re
#java_path = "/usr/bin/java"
#os.environ['JAVAHOME'] = java_path
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-oracle"
#from nltk.tag.stanford import POSTagger
from nltk import pos_tag
from nltk import word_tokenize
from nltk.tag.stanford import StanfordTagger
from nltk.tag.stanford import StanfordPOSTagger
from nltk.tag.stanford import StanfordNERTagger
path_to_model = '/home/noura/stanford-postagger-full-2016-10-31/models/arabic.tagger'
path_to_jar = '/home/noura/stanford-postagger/stanford-postagger.jar'
artagger = StanfordPOSTagger(path_to_model, path_to_jar, encoding='utf8')
#artagger.java_options='-mx4096m'
artagger._SEPARATOR = '/'
tagged_sent = artagger.tag(
    word_tokenize(
        u'ممتاز و جيد لطيف أشياء أفعال وهذا نص للتأكد فقط لا غير أنا تسلقت شجرة'
    ))
print(tagged_sent)
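
# The _SEPARATOR override above is a workaround: NLTK's StanfordPOSTagger
# splits each output token on '_' by default, while this Arabic model emits
# "word/tag" pairs, so '/' must be used instead. The pairs can be inspected with:
for word, tag in tagged_sent:
    print(word, tag)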
Example #17
import nltk
from collections import Counter
import timeit

from nltk.tag.stanford import StanfordPOSTagger

st = StanfordPOSTagger(
    '/home/saurav/Documents/postagger/models/english-bidirectional-distsim.tagger',
    '/home/saurav/Documents/postagger/stanford-postagger.jar')

Prob = {}
infile = open('probabilities.txt', 'r')
text = infile.readlines()

for sentence in text:
    keyValPair = sentence.split(":")
    Prob[keyValPair[0]] = float(keyValPair[1][:-1])

infile.close()


def prob(sequence):
    if sequence in Prob:
        return Prob[sequence]
    else:
        return 0
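
# Equivalent, more idiomatic lookup using dict.get (a sketch):
def prob_alt(sequence):
    return Prob.get(sequence, 0)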


def fairSCP(sequence):
    numerator = prob(sequence) * prob(sequence)
    sequence = sequence.split()
Example #18
with open('english_lemma.pickle', 'wb') as f:
	pickle.dump(lemmatized_sent, f)




english_sentence_structure={}

#-------------create the tagger once, outside the loop------------
distsim = 'english-bidirectional-distsim.tagger'
post = 'stanford-postagger.jar'
english_postagger = StanfordPOSTagger(distsim, post)

#-------------for each sentence create a sentence structure-------
for i in eng_sent:
	sentence=i
	english_sentence_structure[sentence]=[]

	#--------------add number of words to sentence structure--------
	english_sentence_structure[sentence].append(len(sentence.split()))

	#--------------postag each sentence---------------------
	postag = english_postagger.tag(sentence.split())

	#-----------add postag to sentence structure-------------
	english_sentence_structure[sentence].append(list(postag))

	#---------call fuction to get phrases---------------
	SenToPhrase(postag)

#---------store the sentence structure------------------------
with open('english_struct.pickle', 'wb') as handle:
    pickle.dump(english_sentence_structure, handle, protocol=pickle.HIGHEST_PROTOCOL)
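
# Round-trip sketch (assumes the pickle above was written in the same directory):
import pickle
with open('english_struct.pickle', 'rb') as handle:
    restored = pickle.load(handle)
print(len(restored), "sentence structures loaded")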
Example #19
# coding:utf-8
import nltk
import json
from nltk.tag.stanford import StanfordPOSTagger
from collections import Counter
from pycocotools.coco import COCO
import matplotlib.pyplot as plt
import numpy as np

tagger = StanfordPOSTagger(
    model_filename='/workspace/english-bidirectional-distsim.tagger',
    path_to_jar='/workspace/stanford-postagger.jar')

dataset_path = '../annotations/captions_ucm_total.json'
semantic_path = '/data/UCM_captions/stanford_semantic_words.json'
# dataset_path = '../annotations/captions_sydney_total.json'
# semantic_path = '/data/Sydney_captions/stanford_semantic_words.json'
# dataset_path = '../annotations/captions_rsicd_total.json'
# semantic_path = '/data/RSICD/stanford_semantic_words.json'

semantic_dict = dict()
# tag_dict = dict()
# count = Counter()
# with open(dataset_path, 'r') as f:
#     data = json.load(f)
#     print(data['dataset'])
#     for img in data['images']:
#         imgid = img['imgid']
#         # semantic_list = []
#         for sent in img['sentences']:
#             # print(sent['tokens'])
Example #20
# Import StanfordPOSTagger and StanfordNERTagger from nltk.tag.stanford
from nltk.tag.stanford import StanfordPOSTagger, StanfordNERTagger
from nltk import word_tokenize  # used below; the import was missing from this excerpt

# Set JAVAHOME variable to the directory containing Java on your computer
import os
#path = "/usr/bin/java"
java_path = '/Library/Internet Plug-Ins/JavaAppletPlugin.plugin/Contents/Home/bin/java'
os.environ['JAVAHOME'] = java_path

# Using Stanford POS Tagger
# set the path for POS tagger: the jar file and the model
pos_model_path = '/Users/abhilashakumari/Downloads/textmining/stanford-postagger-full-2017-06-09/models/english-bidirectional-distsim.tagger'
pos_jar_path = '/Users/abhilashakumari/Downloads/textmining/stanford-postagger-full-2017-06-09/stanford-postagger.jar'

# Initialize the tagger
st_pos = StanfordPOSTagger(pos_model_path, pos_jar_path)
sent = "Rami Eid is studying at Stony Brook University in NY"  # sample sentence (an assumption; `sent` was defined outside this excerpt)
st_pos.tag(word_tokenize(sent))

# Using Stanford NER Tagger
# set the path for NER tagger: the jar file and the model
ner_model_path = '/Users/abhilashakumari/Downloads/textmining/stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz'
ner_jar_path = '/Users/abhilashakumari/Downloads/textmining/stanford-ner-2017-06-09/stanford-ner.jar'

st_ner = StanfordNERTagger(ner_model_path, ner_jar_path)
sent_ne = st_ner.tag(word_tokenize(sent))

# Check out the 7 class tagged english.muc.7class.distsim.crf.ser.gz model for more flexibility.
# It tags Currency, Location, Percentages along with Persons, Organizations etc.
ner7_model_path = '/Users/abhilashakumari/Downloads/textmining/stanford-ner-2017-06-09/classifiers/english.muc.7class.distsim.crf.ser.gz'
st_ner7 = StanfordNERTagger(ner7_model_path, ner_jar_path)
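
# Usage sketch: the 7-class model adds DATE, TIME, MONEY and PERCENT to the
# PERSON/ORGANIZATION/LOCATION classes of the 3-class model:
sent_ne7 = st_ner7.tag(word_tokenize(sent))
print(sent_ne7)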
Example #21
from sklearn.cluster import KMeans, DBSCAN
from sklearn.utils.linear_assignment_ import linear_assignment
from utils import common
from tokenize_and_normalize import convert_num, tokenize_and_pos_tagging
#from utils.features import NGramVectorizer, DepNGramVectorizer
import utils.features
try:
  import cPickle as pickle
except:
  import pickle


from nltk.tag.stanford import StanfordPOSTagger
TAGGER_DIR = '/home/shoetsu/downloads/stanford-postagger'
tagger = StanfordPOSTagger(
  TAGGER_DIR + '/models/english-left3words-distsim.tagger',
  TAGGER_DIR + '/stanford-postagger.jar'
)

KMEANS_STR = 'kmeans'
DBSCAN_STR = 'dbscan'
MODEL_NAME = 'cluster.model'
CONFIG_NAME = 'config'
NUM = common.NUM
NUMBER = common.NUMBER
NONE = '-'
stop_words = set(['.', ',', '!', '?'])
def VOCAB_CONDITION(x):
    # keep token sequences that mention a number token and contain no sentence punctuation
    return bool(set([NUM, NUM.lower(), NUMBER, NUMBER.lower()]).intersection(x)) \
        and not stop_words.intersection(set(x))
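
# Example: VOCAB_CONDITION keeps token lists that mention a number placeholder
# and contain no sentence punctuation (NUM/NUMBER come from utils.common):
print(VOCAB_CONDITION(["it", "costs", NUM]))       # True
print(VOCAB_CONDITION(["it", "costs", NUM, "."]))  # False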


#####################################
##        Extraction
Example #22
from nltk.tag.stanford import StanfordPOSTagger
import os
java_path = "C:/Program Files/Java/jdk1.8.0_31/bin/java.exe"
os.environ['JAVAHOME'] = java_path


english_postagger = StanfordPOSTagger('D:/BtechProject/stanford-postagger/models/english-bidirectional-distsim.tagger' ,
                                   'D:/BtechProject/stanford-postagger/stanford-postagger.jar')

sentence = "NANDAN SUKTHANKAR PRANAY SANKET DESHMUKH"
print(english_postagger.tag(sentence.split()))
#op_file = open("output.txt", "w")
"""
with open('student_corpus.txt') as fp:
    for line in fp:
        sentence = line.strip('\n')
        token_array = english_postagger.tag(sentence.split())
        op_file.write("\n".join((str(elem) for elem in token_array)))
        print(sentence)
"""
#ct = CRFTagger()
#print(ct.tag(text.split()))
Example #23
import logging
import nltk
from arguments import define_args
from tqdm import tqdm

nltk.download('punkt')

logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    level=logging.INFO)
logger = logging.getLogger(__name__)

args = define_args()

if args.return_POS:
    from nltk.tag.stanford import StanfordPOSTagger
    pos_tagger = StanfordPOSTagger(args.pos_tagger_model, args.pos_tagger_jar)
if args.return_NER:
    from nltk.tag.stanford import StanfordNERTagger
    ner_tagger = StanfordNERTagger(args.ner_tagger_model, args.ner_tagger_jar)
if args.return_DEP:
    from nltk.parse.stanford import StanfordDependencyParser
    dep_parser = StanfordDependencyParser(args.dep_parser_model,
                                          args.dep_parser_jar)
if args.return_CONST:
    import benepar
    benepar.download('benepar_en2')
    const_parser = benepar.Parser("benepar_en2")


def get_constituency_path_to_root(tree, leaf_index):
    parented_tree = nltk.tree.ParentedTree.convert(tree)
Example #24
### [4] register the path in your bashrc or whatever. The name must be STANFORD_TAGGER_PATH.
###        For example,
###                export STANFORD_TAGGER_PATH=/home/sanghoun/tools/stanford-postagger
### [5] either do [ $ source ~/.bashrc ] or
###        logout and login again.

import sys, os
import nltk
from nltk.tag.stanford import StanfordPOSTagger

if "STANFORD_TAGGER_PATH" not in os.environ:
    print("STANFORD_TAGGER_PATH is not registered.")
    sys.exit()

tagger = StanfordPOSTagger(
    os.environ['STANFORD_TAGGER_PATH'] + "/models/chinese-distsim.tagger",
    os.environ['STANFORD_TAGGER_PATH'] + "/stanford-postagger.jar",
    encoding='utf-8')

while True:
    try:
        line = sys.stdin.readline()
    except (KeyboardInterrupt, IOError):
        break
    if not line or len(line.strip()) < 1: break

    sentence = line

    output = ""
    #print words.strip().split()
    words = sentence.split()
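    # (The excerpt is truncated here; a plausible continuation, as a sketch —
    # the original's exact output formatting is unknown:)
    output = " ".join(word + "/" + tag for word, tag in tagger.tag(words))
    print(output)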
Example #25
    def create_listen_func(self):
        #btn2.setEnabled(True)
        #btn1.setEnabled(False)
        #text1.setText("")


        #listening
        
        with sr.Microphone(sample_rate = 16000) as source:
            audio = r.listen(source)	

        print("Listening")
 

        try:
            lines = r.recognize_google(audio)
            print("you said " + lines)
            text1.setText(lines)
        except LookupError:
            print("Couldn't understand audio")
            text1.setText("Couldn't understand audio")
            
        c = word_tokenize(lines.lower())
        st = StanfordPOSTagger(r'C:\stanford-postagger-2015-12-09\models\english-bidirectional-distsim.tagger')  # raw string avoids backslash escapes; the jar is resolved via CLASSPATH
        b = st.tag(c)

        
        #print(b)
        flag = 0
        loopflag = 0

            # decflag=0
            # iniflag=0
            # scanflag=0
            # pflag=0
            # aflag=0
            #print(b)

        dict = {"VB": [], "NN": [], "JJ": [], "DT": [], "CC": [], "PR": [], "CD": [], "IN": [], "RB": []}

        for (w,t) in b:
            if (t[:2] in dict):
                dict[t[:2]].append(w)

        print(dict)

        
        #checking if the statement is a declaration
        try:
            for i in dict["VB"]:
                for d in declarations:
                    if d in i:
                        dict["VB"].remove(i)
                        #decflag = 1
                        flag = 1
                        
                        self.declare(dict)
                        break
                    #if decflag == 1:
                if flag == 1:
                    break
            
            #checking if the statement is an initialization

            if(flag == 0):
            #if(decflag == 0):
                for i in dict["VB"]:
                    for ini in initialize:
                        if ini in i:
                            flag = 1
                            #print("hermoine")
                            #iniflag = 1                          
                            self.init(dict)
                            break
                    #if iniflag == 1:
                    if flag == 1:
                        break

            #Printing output to screen
            if(flag == 0):
            #if(decflag == 0 and iniflag == 0):
                for i in dict["NN"] or dict["VB"]:
                    for p in printer:
                        if p in i:
                            flag =1
                            #pflag = 1
                            self.prin(dict)
                            break
                    if flag == 1:
                        #if pflag == 1:
                        break

            #Scanning input from screen
            if(flag == 0):
            #if(decflag == 0 and iniflag == 0 and pflag == 0):
                for i in dict["VB"] or dict["NN"]:
                    for s in scanner:
                        if s in i:
                            #dict["VB"].remove(i)
                            flag = 1
                            #scanflag = 1
                            self.scan(dict)
                            break
                    #if scanflag == 1:
                    if flag == 1:
                        break
                    #if scanflag == 0:
                    if flag == 0:
                        if("take from user" in a or "take from screen" in a or "take in" in a):
                            self.scan(dict)
                            flag = 1
                            #scanflag = 1
							
            #if statement
            if flag == 0:
                if "if" in c:
                    self.cond(c)
                    flag = 1

            #else statement
            if flag == 0:
                for i in otherwise:
                    if i in c:
                        text3.append("else")
                        code_file.write("else")
                        flag = 1

            if(flag == 0):
                if "continue" in a:
                    text3.append("continue;")
                    code_file.write("continue;\n")
                    flag = 1





            #Looping
            if(flag == 0):
                for i in loop:
                    if i in a:
                        flag = 1
                        self.loop(dict)
                        break



            #arithmetic operations
            #if(decflag==0 and iniflag==0 and pflag==0 and scanflag == 0):
            if(flag == 0):
                for i in dict["VB"] + dict["NN"] + dict["CC"]:
                    for art in arithmetic:
                        #print(a,i)
                        if art in i:
                            flag = 1
                            #aflag = 1
                            self.arithmo(dict)
                            break
                        if flag == 1:
                        #if aflag==1:
                            break
Example #26
from nltk.tag.stanford import StanfordPOSTagger 
import string
#import tensorflow_hub as hub
from bert_embedding import BertEmbedding

bert = BertEmbedding(model='bert_12_768_12', dataset_name='wiki_multilingual_cased')
#print(bert('espanol'))
#sentence = ("autralia espanol").split(' ')
#embed = bert(sentence)
#first_word = embed[0]
#print(first_word[1])
# Assignment 4: NER
# This is just to help you get going. Feel free to
# add to or modify any part of it.

tagger = StanfordPOSTagger('stanford-models/models/spanish.tagger', 'stanford-models/stanford-postagger.jar')
punctuations = string.punctuation

def get_bert_embeddings(sent):
    print('yep')

def get_pos_tags(sent):
    return 0

def getfeats(word, o,tag):
    """ This takes the word in question and
    the offset with respect to the instance
    word """
    #tagger = conll2002.tagged_words()
    #print(spanish_postagger.tag(word))
    #print('the tag for '+word+' is '+tag)
Example #27
import os
import pandas as pd
from nltk import word_tokenize
from nltk import sent_tokenize
from nltk.parse.stanford import StanfordParser
from nltk.parse.stanford import StanfordDependencyParser
from nltk.tag.stanford import StanfordPOSTagger
from text_locations import transcripts

java_path = "usr/bin/java"
os.environ['JAVAHOME'] = java_path
current_dir = os.path.dirname(os.path.abspath(__file__))

stanford_parser_dir = current_dir + '/stanford_NLP/stanford-postagger-full-2015-04-20'
path_to_model = stanford_parser_dir + "/models/english-bidirectional-distsim.tagger"
path_to_jar = stanford_parser_dir + "/stanford-postagger.jar"
tagger = StanfordPOSTagger(path_to_model, path_to_jar)

POS_TAGS = [
    "CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "NN",
    "NNS", "NNP", "NNPS", "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS",
    "RP", "SYM", "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT",
    "WP", "WP$", "WRB"
]


# Save document from dataframe to CSV
def save_csv_file(document_df, database_location):
    cols = list(document_df.columns.values)
    cols.insert(0, cols.pop())
    cols.insert(0, cols.pop())
Example #28
import os
import nltk
from nltk import *
nltk.internals.config_java(options='-Xmx2G')  # the Java heap flag is case-sensitive: -Xmx, not -xmx
from nltk.tag.stanford import StanfordPOSTagger
from nltk.tokenize import word_tokenize

# enter the path to your local Java JDK; under macOS, the path should look very similar to this example
java_path = "/Library/Java/JavaVirtualMachines/jdk-14.jdk/Contents/Home/bin/"
os.environ["JAVAHOME"] = java_path

# enter the paths to the Stanford POS Tagger .jar file as well as to the model to be used
stanford_dir = "/Users/josemedardotapiatellez/Downloads/stanford-tagger-4.0.0"
modelfile = stanford_dir+"/models/spanish-ud.tagger"  # the .tagger model itself, not the .props training config
jarfile=stanford_dir+"/stanford-postagger.jar"

pos_tagger = StanfordPOSTagger(modelfile, jarfile)

# Tagging this one example sentence as a test:
# this small snippet of text lets you test whether the tagger is running before you attempt to run it on a locally
# stored file (see the commented-out example below)
text = "Just a small snippet of text to test the tagger."

# Tagging a locally stored plain text file:
# as soon as the example in line 22 is running ok, comment out that line (#) and comment in the next line and
# enter a path to a local file of your choice;
# the assumption made here is that the file is a plain text file with utf-8 encoding
# text = open("C:/Users/Public/projects/python101-2018/data/sample-text.txt").read()

# nltk word_tokenize() is used here to tokenize the text and assign it to a variable 'words'
words = nltk.word_tokenize(text)
# print(words)
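
# Next step (a sketch): run the Stanford tagger over the tokenized words
tagged_text = pos_tagger.tag(words)
print(tagged_text)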
Example #29
 def __init__(self, model_filename, jarfile):
     self.model_filename = model_filename
     self.path_to_jar = jarfile
     self.tagger = StanfordPOSTagger(model_filename=self.model_filename,
                                     path_to_jar=self.path_to_jar)
import sys
sys.path.append('/data/rumor_detection/rumor_detection')
import json
import logging
from os import listdir
from os.path import isdir, join, isfile
import threading
from nltk.tag.stanford import StanfordPOSTagger

from src.utils import config
from src.utils import text_utils

text_processor = text_utils.create_text_processor()
stanford_tagger = StanfordPOSTagger(
    model_filename=
    '../../libs/stanford_postagger/models/english-bidirectional-distsim.tagger',
    path_to_jar='../../libs/stanford_postagger/stanford-postagger.jar')

writer = open('../../data/interim/tweet_stanford_pos_tag.txt', 'w')


def load_data(data_path):

    for f in listdir(data_path):
        topic_dir = join(data_path, f)
        if isdir(topic_dir):
            rumor_dir = join(topic_dir, 'rumours')
            non_rumor_dir = join(topic_dir, 'non-rumours')

            read_topic_dir(rumor_dir)
            read_topic_dir(non_rumor_dir)