def pos_tag(sent):
    """Tokenize and POS-tag a sentence using the simplified tag set.

    The sentence is lowercased and split with ``nltk.wordpunct_tokenize``
    before tagging.

    Returns a list of ``(word, tag)`` tuples, one per token. When
    ``simplify_wsj_tag`` yields an empty/falsy tag the placeholder ``'U'``
    is used instead.
    """
    tagged = nltk.pos_tag(nltk.wordpunct_tokenize(sent.lower()))
    # Exactly one entry per token: the simplified tag, or 'U' as a fallback.
    # (Previously both entries were appended for tagged words and nothing
    # at all for words whose simplified tag was empty.)
    return [(word, simplify_wsj_tag(tag) or 'U') for word, tag in tagged]
def otkloni_nepozeljne(all_words, koliko):
    """Return the first `koliko` words of `all_words` that pass the tag filter.

    Words are POS-tagged and their tags simplified. A word is kept when it is
    not a double quote and either its simplified tag is accepted by ``ok``
    or the word is the dash token ``'--'``.
    """
    tagged_sent = nltk.pos_tag(all_words)
    simplified = [(word, simplify_wsj_tag(tag)) for word, tag in tagged_sent]
    useful_words = [
        word for word, tag in simplified
        if word != '"' and (ok(tag) or word == '--')
    ]
    return useful_words[:koliko]
예제 #3
def pos_tag(text, simple=False):
    """Tokenize a given text and determine the pos-tags.

    The text is lowercased before it is tagged.

    Args:
        text: string to be tokenized
        simple: boolean indicating whether to simplify the pos tags

    Returns:
        list of tuples of form (token, pos-tag)
    """
    pos = TextBlob(text.lower()).tags
    if not simple:
        return pos

    # Simplification removes some tags; an empty tag is not allowed, so the
    # initial tag is kept whenever the simplified one comes back empty.
    return [(word, simplify_wsj_tag(tag) or tag) for word, tag in pos]
예제 #4
def simplify_chunk(chunk):
    """Recursively replace the POS tags inside a chunk with simplified tags.

    A ``Tree`` is rebuilt with the same node label and simplified children;
    a ``(word, tag)`` tuple gets its tag simplified; any other value is
    returned unchanged.
    """
    if isinstance(chunk, Tree):
        return Tree(chunk.node, [simplify_chunk(c) for c in chunk])
    if isinstance(chunk, tuple):
        word, tag = chunk
        return (word, simplify_wsj_tag(tag))
    # Neither a tree nor a tagged word: pass it through untouched.
    return chunk
예제 #5
def simplify_chunk(chunk):
    """Return a copy of `chunk` in which every POS tag is simplified.

    Trees are rebuilt recursively with their node label preserved,
    ``(word, tag)`` tuples get a simplified tag, and everything else is
    handed back as is.
    """
    if isinstance(chunk, Tree):
        children = [simplify_chunk(child) for child in chunk]
        return Tree(chunk.node, children)
    if isinstance(chunk, tuple):
        word, tag = chunk
        return (word, simplify_wsj_tag(tag))
    # Fallback for leaves that are not tagged words.
    return chunk
def pos_tokens(essay):
    """Convert an essay into a string of simplified part-of-speech tags.

    The essay is word-tokenized and tagged; the simplified tags are then
    concatenated, each one preceded by a single space (so a non-empty
    result starts with a space).
    """
    tagged_text = nltk.pos_tag(nltk.word_tokenize(essay))
    simplified_tags = [simplify_wsj_tag(tag) for _word, tag in tagged_text]
    return "".join(" " + tag for tag in simplified_tags)
예제 #7
def average_pos_entropy_storeys(model, filename):
  """Return ``(mean, entropies)`` computed over the lines of `filename`.

  Each line is word-tokenized; for lines with more than one token the list
  of simplified POS tags is built.

  NOTE(review): nothing visible here uses `model` or `simplified`, and
  nothing is ever appended to `entropies` -- the statement that scored each
  tag sequence appears to be missing. As written the final division is by
  ``len(entropies) == 0`` and raises ZeroDivisionError.
  """
  with open(filename, 'r') as f:
    entropies = []
    for l in f:
      word_list = nltk.word_tokenize(l)
      # single-token lines are skipped
      if len(word_list) > 1:
        simplified = [simplify_wsj_tag(t) for _, t in nltk.pos_tag(word_list)]
  # "- 0.0" forces float arithmetic so the mean is not truncated
  return ((sum(entropies) - 0.0) / len(entropies), entropies)
예제 #8
def filter_words(tokens): #filter nouns and verbs
    # POS-tags `tokens`, simplifies each tag and returns the list `filt`.
    # NOTE(review): the bodies of the 'N' and 'V' branches below are missing
    # from this copy, so the block does not parse and `filt` is never filled.
    # Presumably each branch added the word to `filt` -- confirm against the
    # original source before restoring.
    t = nltk.pos_tag(tokens)
    filt = []
    for word, tag in t:
        simptag = simplify_wsj_tag(tag)
        if simptag == 'N':
        elif simptag == 'V':
    return filt
예제 #9
def average_pos_entropy(model, filename):
  '''Get the average entropy of sentences in this file.

  Returns a tuple ``(mean, entropies)``.

  NOTE(review): the right-hand side of ``lines =`` is missing in this copy
  (presumably the file contents -- confirm), and no visible statement uses
  `model` or appends to `entropies`, so the scoring step appears to be lost
  as well. The block does not parse as written.
  '''
  with open(filename) as f:
    entropies = []
    lines =
    for l in nltk.sent_tokenize(lines):
     word_list = nltk.word_tokenize(l)
     # sentences with a single token are skipped
     if len(word_list) > 1:
      simplified = [simplify_wsj_tag(t) for _, t in nltk.pos_tag(word_list)]
  # "- 0.0" forces float arithmetic for the mean
  return ((sum(entropies) - 0.0) / len(entropies), entropies)
예제 #10
 def simplify_tag(tag):
     """Map a WSJ POS tag to its simplified form, never returning ''.

     The possessive tag is kept as ``'POS'``; a tag whose simplified form is
     empty is reported as ``'EMPTY'``; every other tag is returned in its
     simplified form.
     """
     # Keep POS tag as it is useful: many terms contain possessive: "'s"
     if tag.lower() == 'pos':
         return 'POS'
     # Convert '' tags to 'EMPTY' as RegexpParser doesn't like the
     # empty wsj simplified tag.
     return simplify_wsj_tag(tag) or 'EMPTY'
예제 #11
 def simplify_tag(tag):
     """Return the simplified tag for `tag`, with special cases.

     ``'POS'`` (in any letter case) is preserved as ``'POS'``, and an empty
     simplified tag is replaced by ``'EMPTY'``.
     """
     # Keep POS tag as it is useful: many terms contain possessive: "'s"
     if tag.lower() == 'pos':
         return 'POS'
     simple_wsj_tag = simplify_wsj_tag(tag)
     if not simple_wsj_tag:
         # Convert '' tags to 'EMPTY' as RegexpParser doesn't like the
         # empty wsj simplified tag.
         return 'EMPTY'
     return simple_wsj_tag
예제 #12
def removeStopwords(sentence):
    '''Keep and lemmatize only the nouns, numbers and verbs of a sentence.

    Each kept word is lowercased and lemmatized using the first letter of
    its simplified tag (lowercased) as the lemmatizer's POS argument.
    '''
    kept_tags = ['N', 'NP', 'NUM', 'V', 'VD', 'VG', 'VN']
    tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    lemmatizer = WordNetLemmatizer()
    simplified = [(word, simplify_wsj_tag(tag)) for word, tag in tagged]
    lemmas = []
    for word, tag in simplified:
        if tag in kept_tags:
            lemmas.append(lemmatizer.lemmatize(word.lower(), tag[0].lower()))
    return lemmas
예제 #13
def prepare_input(sentence):
    """Turn a text into a ``{feature: True}`` dictionary.

    Features are the lowercased words, the simplified POS tags and the
    word trigrams rendered as ``"w1/w2/w3"``.
    """
    tokens = []
    for sent in nltk.sent_tokenize(sentence):
        tokens.extend(nltk.word_tokenize(sent))

    tags = [simplify_wsj_tag(tag) for _word, tag in nltk.pos_tag(tokens)]
    lowered = [token.lower() for token in tokens]
    trigram_keys = ['%s/%s/%s' % (tri[0], tri[1], tri[2])
                    for tri in nltk.trigrams(lowered)]

    features = {}
    for name in lowered + tags + trigram_keys:
        features[name] = True
    return features
예제 #14
def prepare_input(sentence):
    """Build the feature dictionary for a piece of text.

    Every lowercased word, every simplified POS tag and every
    ``"a/b/c"`` word trigram becomes a key mapped to ``True``.
    """
    all_words = []
    for one_sentence in nltk.sent_tokenize(sentence):
        all_words += nltk.word_tokenize(one_sentence)

    simplified_tags = []
    for _word, tag in nltk.pos_tag(all_words):
        simplified_tags.append(simplify_wsj_tag(tag))

    lower_words = [word.lower() for word in all_words]
    trigram_features = []
    for gram in nltk.trigrams(lower_words):
        trigram_features.append('%s/%s/%s' % (gram[0], gram[1], gram[2]))

    return dict(
        (feature, True)
        for feature in lower_words + simplified_tags + trigram_features
    )
예제 #15
def removeStopwords(sentence):
    '''Remove stop words and lemmatize the remaining words of a sentence.

    The sentence is split into words on non-word characters. Every word
    that is not an English stop word is lowercased and lemmatized as a
    verb. Returns the list of lemmas.
    '''
    # TODO: add the part of speech to each word produced.
    # Load the stop word list once instead of once per word.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    words = re.sub(r"[^\w]", " ", sentence).split()
    return [
        lemmatizer.lemmatize(word.lower(), 'v')
        for word in words
        if word.lower() not in stop_words
    ]
예제 #16
    def process_tips(tips):
        # Tags every tip with TipPosTagger.tag_text and simplifies the tags.
        # NOTE(review): this copy is truncated -- the comprehension below is
        # cut off after "in" and the inner `for` loop has no body, so the
        # block does not parse. Nothing visible fills `tags` or returns a
        # value; confirm the intended behavior against the original source.

        tags = set()
        index = 0
        for tip in tips:
            tagged_tip = TipPosTagger.tag_text(tip)
            simplified = [(word, simplify_wsj_tag(tag)) for word, tag in
            for tagged_word in simplified:
            index += 1

예제 #17
    def process_tips(tips):
        # Tags every tip with TipPosTagger.tag_text and simplifies the tags
        # into (word, simplified_tag) pairs.
        # NOTE(review): the body of the inner `for tagged_word` loop is
        # missing from this copy, so the block does not parse. Nothing
        # visible fills `tags` or returns a value; confirm the intended
        # behavior against the original source.

        tags = set()
        index = 0
        for tip in tips:
            tagged_tip = TipPosTagger.tag_text(tip)
            simplified = [(word, simplify_wsj_tag(tag))
                          for word, tag in tagged_tip]
            for tagged_word in simplified:
            index += 1

예제 #18
def removeStopwords(sentence):
    '''Return the lemmas of the nouns, numbers and verbs in a sentence.

    Words whose simplified POS tag is not a noun, number or verb tag are
    dropped. Each kept word is lowercased and lemmatized, passing the first
    letter of its simplified tag (lowercased) as the POS argument.
    '''
    kept_tags = ('N', 'NP', 'NUM', 'V', 'VD', 'VG', 'VN')
    tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    stmr = WordNetLemmatizer()
    simplified = [(word, simplify_wsj_tag(tag)) for word, tag in tagged]
    sen = [
        stmr.lemmatize(x.lower(), tag[0].lower()) for x, tag in simplified
        if tag in kept_tags
    ]
    return sen
예제 #19
파일: 프로젝트: fa97/cs4740
def read_file(file_object):
    """Print each NLP processing stage for every line of a file.

    For each line read from `file_object` this prints, in order: the raw
    line, its tokens, the tokens left after stop-word removal, the
    simplified POS tags, the stemmed words and the lemmatized words.
    Nothing is returned.
    """
    # None of these depend on the line, so build them once.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    tokenizer = PunktWordTokenizer()
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    lmtzr = WordNetLemmatizer()

    for line in file_object.readlines():
        print("#######LINE#######")
        print(line)

        text = tokenizer.tokenize(line)
        print("#######TEXT#######")
        print(text)

        # STOP WORD
        # Compare the whole token, not just its first character: testing
        # w[0] dropped every word starting with a one-letter stop word.
        content = [w for w in text if w.lower() not in stopwords]
        print("#######STOP WORD#######")
        print(content)

        tagged_sent = [(word, simplify_wsj_tag(tag))
                       for word, tag in nltk.pos_tag(content)]
        print("#######POS#######")
        print(tagged_sent)

        # Each stem/lemma is preceded by one space, as in the printed
        # format used before.
        stem_word = "".join(" " + stemmer.stem(word)
                            for word, _tag in tagged_sent)
        print("#######STEMMING#######")
        print(stem_word)
        print(tagged_sent)

        sent = "".join(" " + lmtzr.lemmatize(word)
                       for word, _tag in tagged_sent)
        print("#######LEMMA")
        print(sent)
예제 #20
def get_pos_dict(z):
    """Count the simplified POS tags occurring in the text stored at z[1].

    The text is split into sentences, each sentence is word-tokenized and
    tagged, and the result maps every simplified tag to its number of
    occurrences (a ``defaultdict(int)``).
    """
    tagged_words = []
    for sentence in sent_tokenize(z[1]):
        tagged_words.extend(pos_tag(word_tokenize(sentence)))

    tag_counts = defaultdict(int)
    for tagged_word in tagged_words:
        tag_counts[simplify_wsj_tag(tagged_word[1])] += 1
    return tag_counts
예제 #21
def read_file(file_object):
    """Trace the text-processing pipeline for each line of `file_object`.

    Prints the raw line, the tokens, the tokens without stop words, the
    simplified POS tags, the stems and the lemmas, each under its own
    banner. Returns nothing.
    """
    # Line-independent resources are created once, outside the loop.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    tokenizer = PunktWordTokenizer()
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    lmtzr = WordNetLemmatizer()

    for line in file_object.readlines():
        print("#######LINE#######")
        print(line)

        text = tokenizer.tokenize(line)
        print("#######TEXT#######")
        print(text)

        # STOP WORD
        # The full token is checked against the stop list; checking only
        # its first character removed unrelated words.
        content = [w for w in text if w.lower() not in stopwords]
        print("#######STOP WORD#######")
        print(content)

        tagged_sent = [(word, simplify_wsj_tag(tag))
                       for word, tag in nltk.pos_tag(content)]
        print("#######POS#######")
        print(tagged_sent)

        stems = [stemmer.stem(word) for word, _tag in tagged_sent]
        # every entry is prefixed with a single space
        stem_word = "".join(" " + stem for stem in stems)
        print("#######STEMMING#######")
        print(stem_word)
        print(tagged_sent)

        lemmas = [lmtzr.lemmatize(word) for word, _tag in tagged_sent]
        sent = "".join(" " + lemma for lemma in lemmas)
        print("#######LEMMA")
        print(sent)
예제 #22
파일: 프로젝트: fa97/cs4740
def process_word_context(entire_context):
    """Return the tagged words before and after a '%%' marker in a context.

    Punctuation is stripped (the deleted-character list leaves '%' in
    place), the remaining words are POS-tagged with simplified tags, and
    stop words are removed. The result is ``(prev_context, next_context)``,
    two lists of ``(word, tag)`` tuples.

    NOTE(review): the loop does not stop at the first '%%', so the values
    from the last matching position win. If no '%%' token is present the
    return statement raises UnboundLocalError, and a marker in the last
    position raises IndexError at ``[i + 1]``. The ``i + 1`` / ``i + 3``
    offsets presumably assume a "%% target %%" layout -- confirm.
    """
    #remove punct
    cont_without_punct = entire_context.translate(string.maketrans('', ''), r'!"#$&\'()*+,-./:;<=>?@[\\]^_`{}~')
    # collapse runs of whitespace to single spaces
    cont_without_punct = " ".join(cont_without_punct.split())
    #pos tagging
    tagged_sent = [(word, simplify_wsj_tag(tag)) for word, tag in nltk.pos_tag(cont_without_punct.split(" "))]

    #stop words removal
    stopwords = nltk.corpus.stopwords.words('english')
    pos_tag_without_stopwords = [wrd for wrd in tagged_sent if wrd[0].lower() not in stopwords]

    for i, pos_tag_tuple in enumerate(pos_tag_without_stopwords):
        if pos_tag_tuple[0] == '%%':
            # target_word is extracted but not returned
            prev_context, target_word, next_context = pos_tag_without_stopwords[:i], pos_tag_without_stopwords[i + 1], \
                                                      pos_tag_without_stopwords[i + 3:]

    return prev_context, next_context
예제 #23
 def analyzePosAndSyllables(self, text):
     """Update this object's POS and syllable counters from `text`.

     For every token: ``nadjadv``, ``nverbs`` and ``nnouns`` are incremented
     according to the simplified tag; ``nsyllables`` is incremented when the
     token's syllable count (first value of ``self.nsyl``) is positive and
     ``npolysyllables`` when it is at least three.
     """
     adjective_adverb_tags = ('ADJ', 'ADV')
     verb_tags = ('V', 'VD', 'VG', 'VN')  # leaving out modal verbs MOD
     noun_tags = ('N', 'NP')  # leaving out pronouns PRO

     tagged = nltk.pos_tag(nltk.word_tokenize(text))
     simplified = [(token, simplify_wsj_tag(wsj_tag)) for token, wsj_tag in tagged]

     for token, simple_tag in simplified:
         syllable_count = self.nsyl(token)[0]
         if simple_tag in adjective_adverb_tags:
             self.nadjadv += 1
         if simple_tag in verb_tags:
             self.nverbs += 1
         if simple_tag in noun_tags:
             self.nnouns += 1
         if syllable_count > 0:
             self.nsyllables += 1
         if syllable_count >= 3:
             self.npolysyllables += 1
예제 #24
    def __get_features(self, text):
        """
        Given a string, tokenize, tag, and return a normalized set of features.

        Features are the lowercased words, their simplified POS tags and the
        word trigrams rendered as "w1/w2/w3".

        Returns { feature: <True>, ... }
        """
        words = []
        for sentence in nltk.sent_tokenize(text):
            words = words + nltk.word_tokenize(sentence)

        # Tag and build features once, after every sentence has been
        # tokenized; doing this inside the loop returned after the first
        # sentence and returned None for empty text.
        pos = nltk.pos_tag(words)
        # TODO verify simplify_wsj_tag increases accuracy
        pos = [simplify_wsj_tag(tag) for word, tag in pos]
        words = [i.lower() for i in words]
        trigrams = nltk.trigrams(words)
        trigrams = ["%s/%s/%s" % (i[0], i[1], i[2]) for i in trigrams]
        features = words + pos + trigrams
        return dict((i, True) for i in features)
예제 #25
def make_wordlist():
    # Builds a list of (word, simplified_tag) pairs from the dictionary file
    # at `dict_location`.
    # NOTE(review): this copy is truncated -- the right-hand side of
    # `dictionary =` and the body of the `for i in xrange(...)` loop are
    # missing, so the block does not parse and `short_dict` is never filled.
    # Presumably the loop sampled every tenth word -- confirm against the
    # original. The file handle `f` is also never closed in the visible code.
    f = open(dict_location, 'r')
    dictionary =
    # Shortening the dictionary, starting around the start of the
    # lowercase letters in my unix dictionary. Otherwise the tokenize
    # takes too long.
    # The "'s" and "n't" tokens are filtered out.
    dictwords = [w for w in nltk.word_tokenize(dictionary) if w != "'s" and w != "n't"]
    dictwords = dictwords[18300:]
    short_dict = []
    for i in xrange(len(dictwords)/10):
    words = []
    words = nltk.pos_tag(short_dict)
    # simplifying the part of speech of the words
    simple_words = [(word, simplify_wsj_tag(tag)) for word, tag in words]
    return simple_words
 def preprocess(self, text=''):
     """
     Given some text, return a list of words that correspond to desired parts of speech.

     Only want to consider: nouns, proper nouns, and verbs
     (present tense, past tense verb, present participle, past participle).
     The text is lowercased before it is tokenized and tagged.
     """
     whitelist = ('N', 'NP', 'V', 'VD', 'VG', 'VN')
     # assign part of speech to each word in text
     pos_tagged_words = nltk.pos_tag(nltk.word_tokenize(text.lower()))
     # use simplified tagging for fewer parts of speech
     simplified_tagged_text = [(word, simplify_wsj_tag(tag)) for word, tag in pos_tagged_words]
     # keep only the words whose simplified tag is whitelisted
     return [word for word, part_of_speech in simplified_tagged_text
             if part_of_speech in whitelist]
예제 #27
    def preprocess(self, text='',method=0):
        """
        Given some text, return a list of words/tokens, after removing punctuations,
        stop words, and digits. There are various methods that may be used, depending on the
        method flag, including part of speech tagging, and checking against various stop word
        lists.

        method 0: keep words starting with a lowercase ASCII letter that are
                  not in self.stopdict (indexed by first character).
        method 1: keep words that are not in the NLTK English stop word list.
        method 2: keep nouns, proper nouns and verbs, based on simplified
                  POS tags of the original (unstripped) lowercased text.
        """
        #remove punctuation and digits
        out = text.translate(string.maketrans("",""), string.punctuation + string.digits)
        #return word tokens in the line
        tokenized_words = nltk.word_tokenize(out.lower())

        if method == 0:
            #indexes stopdict by first char of word if first char = letter
            keywords = [word for word in tokenized_words
                        if 'a' <= word[0] <= 'z' and word not in self.stopdict[word[0]]]

        elif method == 1:
            #use NLTK stopword list, loaded once rather than once per word
            english_stopwords = set(stopwords.words('english'))
            keywords = [word for word in tokenized_words
                        if word not in english_stopwords]

        elif method == 2:
            #only keep words that correspond to desired parts of speech.
            #only want to consider: nouns, proper nouns, and verbs
            #(present tense, past tense verb, present participle, past participle)
            whitelist = ('N', 'NP', 'V', 'VD', 'VG', 'VN')
            #tagging works on the original text, punctuation included
            pos_tagged_words = nltk.pos_tag(nltk.word_tokenize(text.lower()))
            #use simplified tagging for fewer parts of speech
            simplified_tagged_text = [(word, simplify_wsj_tag(tag)) for word, tag in pos_tagged_words]
            keywords = [word for word, part_of_speech in simplified_tagged_text
                        if part_of_speech in whitelist]
        return keywords
예제 #28
def preProcess(tweet, dicoSlang):
    # Splits `tweet` on single spaces, skips empty tokens and tokens starting
    # with "@", strips "#" from hashtags, looks tokens up in `dicoSlang`, then
    # POS-tags the collected tokens and returns (word, simplified_tag) pairs.
    # NOTE(review): this copy is truncated -- the body of the innermost
    # `for newToken` loop and any branch for tokens not found in `dicoSlang`
    # are missing, so the block does not parse and nothing visible ever adds
    # to `preProcessedTokens`. Confirm against the original source.

    preProcessedTokens = []
    tokens = tweet.split(" ")
    for token in tokens:
        if len(token) > 0:
            if token[0] != "@":
                if token[0] == "#":
                    token = token.replace("#", "")
                if token in dicoSlang:
                    # a slang entry may expand to several space-separated words
                    newTokens = dicoSlang[token]
                    newTokensBis = newTokens.split(" ")
                    for newToken in newTokensBis:

    taggedData = nltk.pos_tag(preProcessedTokens)
    simplifiedData = [(word, simplify_wsj_tag(tag)) for word, tag in taggedData]

    return simplifiedData
예제 #29
 def stem_sentences(self, content):
     # Splits `content` into sentences, tags each one with simplified POS
     # tags and lemmatizes the words collected in
     # `relevant_words_in_sentence`.
     # NOTE(review): this copy is truncated -- the body of the
     # `if toople[1] in [...]` test is missing (so the block does not parse
     # and no word is ever collected), and nothing visible appends to
     # `stemmed_text_dict` before `stemmed_dict` is reset, so the returned
     # list would stay empty. Confirm against the original source.
     stemmed_dict = dict()
     stemmed_text_dict = []
     sentences = self.split_content_to_sentences(content)
     for sentence in sentences:
         tokenized_sentence = self.format_sentence(sentence)
         tagged_sent = nltk.pos_tag(tokenized_sentence)
         relevant_words_in_sentence = []
         # Use built-in simplified tags.
         simplified = [(word, simplify_wsj_tag(tag)) for word, tag in tagged_sent]
         for toople in simplified:
             if toople[1]  in ['V', 'VD', 'VG', 'VN', 'ADJ', 'NP', 'N']:
         # Get the stems of each sentence
         wnl = nltk.WordNetLemmatizer()
         stemmed_sent = [wnl.lemmatize(word) for word in relevant_words_in_sentence]
         stemmed_dict[sentence] = stemmed_sent
         relevant_words_in_sentence = []
         stemmed_dict = dict()
     return stemmed_text_dict
예제 #30
    def preprocess(self, text=''):
        """
        Given some text, return a list of words that correspond to desired parts of speech.

        Only want to consider: nouns, proper nouns, and verbs
        (present tense, past tense verb, present participle, past participle).
        The text is lowercased before it is tokenized and tagged.
        """
        whitelist = ('N', 'NP', 'V', 'VD', 'VG', 'VN')

        # return only the words in the line
        tokenized_words = nltk.word_tokenize(text.lower())
        # assign part of speech to each word in text
        pos_tagged_words = nltk.pos_tag(tokenized_words)

        keywords = []
        for word, tag in pos_tagged_words:
            # use simplified tagging for fewer parts of speech
            if simplify_wsj_tag(tag) in whitelist:
                keywords.append(word)
        return keywords
예제 #31
def getFreqDistOfUsefulWords(sentences):
    """Count the useful words of `sentences` and print the distribution.

    Every sentence is reduced to alphanumeric characters and spaces,
    tokenized and POS-tagged. Words whose simplified tag is listed in
    ``noneed`` are ignored; all others are counted.

    Returns an ``OrderedDict`` mapping word -> count, most frequent first.
    Each entry is also printed as ``"word : count"``.
    """
    wordFreq = {}
    for sent in sentences:
        sanitized_sent = ''.join(e for e in sent if e.isalnum() or e == ' ')
        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sanitized_sent))
        for word, tag in tagged_tokens:
            if simplify_wsj_tag(tag) in noneed:
                continue
            # First sighting starts at 1, later ones increment.
            wordFreq[word] = wordFreq.get(word, 0) + 1

    wordFreqOD = OrderedDict(sorted(wordFreq.items(), key=lambda t: t[1], reverse=True))

    for key, value in wordFreqOD.items():
        print(key + " : " + str(value))

    return wordFreqOD
예제 #32
def process_word_context(entire_context):
    """Return the tagged words before and after a '%%' marker in a context.

    Punctuation is stripped (the deleted-character list leaves '%' in
    place), the remaining words are POS-tagged with simplified tags, and
    stop words are removed. The result is ``(prev_context, next_context)``,
    two lists of ``(word, tag)`` tuples.

    NOTE(review): the loop does not stop at the first '%%', so the values
    from the last matching position win. If no '%%' token is present the
    return statement raises UnboundLocalError, and a marker in the last
    position raises IndexError at ``[i + 1]``. The ``i + 1`` / ``i + 3``
    offsets presumably assume a "%% target %%" layout -- confirm.
    """
    #remove punct
    cont_without_punct = entire_context.translate(
        string.maketrans('', ''), r'!"#$&\'()*+,-./:;<=>?@[\\]^_`{}~')
    # collapse runs of whitespace to single spaces
    cont_without_punct = " ".join(cont_without_punct.split())
    #pos tagging
    tagged_sent = [(word, simplify_wsj_tag(tag))
                   for word, tag in nltk.pos_tag(cont_without_punct.split(" "))]

    #stop words removal
    stopwords = nltk.corpus.stopwords.words('english')
    pos_tag_without_stopwords = [
        wrd for wrd in tagged_sent if wrd[0].lower() not in stopwords
    ]

    for i, pos_tag_tuple in enumerate(pos_tag_without_stopwords):
        if pos_tag_tuple[0] == '%%':
            # target_word is extracted but not returned
            prev_context, target_word, next_context = pos_tag_without_stopwords[:i], pos_tag_without_stopwords[i + 1], \
                                                      pos_tag_without_stopwords[i + 3:]

    return prev_context, next_context
예제 #33
# business_records = ETLUtils.filter_records(my_records, 'business_id', ['-sC66z4SO3tR7nFCjfQwuQ'])
# business_records = ETLUtils.filter_records(my_records, 'business_id', ['QnAzW6KMSciUcuJ20oI3Bw'])
# business_records = ETLUtils.filter_records(my_records, 'business_id', ['uKSX1n1RoAzGq4bV8GPHVg'])
# business_records = ETLUtils.filter_records(my_records, 'business_id', ['YKOvlBNkF4KpUP9q7x862w'])
# business_records = ETLUtils.filter_records(my_records, 'business_id', ['aRkYtXfmEKYG-eTDf_qUsw'])
# business_records = ETLUtils.filter_records(my_records, 'business_id', ['pwpl-rxwNRQdgqFz_-qMPg'])
# business_records = ETLUtils.filter_records(my_records, 'business_id', ['3oZcTGb_oDHGwZFiP-7kxQ'])

# Collect the review text of every record for the selected business.
# NOTE(review): `business_records` is only assigned in commented-out lines
# in the visible code -- confirm where it is defined.
my_tips = [my_record['text'] for my_record in business_records]
# TipPosTagger.process_tips(my_tips[:1000])

# Small hand-written sample used to exercise the tagger.
my_text = "The burgers are very good. The service is bad." + \
          "It is a great place to go with friends. I went there with my wife."
my_tags = TipPosTagger.tag_text(my_text)
simp = [(my_word, simplify_wsj_tag(my_tag)) for my_word, my_tag in my_tags]

# NOTE(review): the call on the next line is truncated in this copy (the
# function name before the opening parenthesis is missing), so it does not
# parse. Presumably it loaded the punkt sentence tokenizer -- confirm.
tokenizer ='tokenizers/punkt/english.pickle')
my_sentences = tokenizer.tokenize(my_text)

# print sent_words

tip_pos_tagger = TipPosTagger()

# NOTE(review): the sorted(...) call below is cut off after its first
# argument; the sort key/order is not visible here.
sorted_x = sorted(tip_pos_tagger.noun_dictionary.iteritems(),

예제 #34
def pos_tag_sentence(sent, simplify_tags=False):
    """POS-tag `sent`, optionally mapping the tags to the simplified set.

    Returns the output of ``pos_tag`` unchanged unless `simplify_tags` is
    true, in which case a list of ``(word, simplified_tag)`` tuples is
    returned.
    """
    tagged = pos_tag(sent)
    if not simplify_tags:
        return tagged

    simplified = []
    for word, tag in tagged:
        simplified.append((word, simplify_wsj_tag(tag)))
    return simplified
예제 #35
def __part_of_speech__(word):
    print "Doing POS lookup for", word
    tagged_sent = nltk.pos_tag([word])
    return simplify_wsj_tag(tagged_sent[0][1])
예제 #36
def tagging(x):
    """Render `x` as a string of ``word/TAG`` items separated by spaces.

    Tokens come from ``get_tokens``; the tags are simplified POS tags.
    """
    tagged = nltk.pos_tag(get_tokens(x))
    pieces = []
    for word, wsj_tag in tagged:
        pieces.append("/".join((word, simplify_wsj_tag(wsj_tag))))
    return " ".join(pieces)
예제 #37
	# NOTE(review): this fragment is indented, but the statement that
	# encloses it is not visible in this copy. `tagged_sents` is assigned
	# from tagged_posts() here and unconditionally reassigned from
	# tagged_sents() at the end of the fragment, which suggests a lost
	# if/else around these lines -- confirm against the original script.
	tagged_sents = tagged_corpus.tagged_posts(**kwargs)
	# the Indian corpus gets a default file id when none was requested
	if isinstance(tagged_corpus, IndianCorpusReader) and not fileids:
		fileids = 'hindi.pos'
	# restrict loading to the requested file when the corpus provides it
	if fileids and fileids in tagged_corpus.fileids():
		kwargs['fileids'] = [fileids]
		if args.trace:
			print 'using tagged sentences from %s' % fileids
	tagged_sents = tagged_corpus.tagged_sents(**kwargs)

# manual simplification is needed for these corpora
if args.simplify_tags and args.corpus in ['conll2000', 'switchboard']:
	tagged_sents = [[(word, simplify_wsj_tag(tag)) for (word, tag) in sent] for sent in tagged_sents]

## tagged sents ##

# can't trust corpus to provide valid list of sents (indian)
tagged_sents = [sent for sent in tagged_sents if sent]
nsents = len(tagged_sents)

if args.fraction == 1.0:
	# train and test on the full set of sentences
	train_sents = test_sents = tagged_sents
else:
	# the first `fraction` of the sentences trains, the remainder tests
	cutoff = int(math.ceil(nsents * args.fraction))
	train_sents = tagged_sents[:cutoff]
	test_sents = tagged_sents[cutoff:]
# Running totals for the corpus summary printed below.
wc = 0
tag_counts = FreqDist()
taglen = 7  # widest tag seen so far, starting from a minimum of 7
word_set = set()

# The corpus reader simplifies tags itself, except for the corpora that
# need manual simplification (handled inside the loop).
if args.simplify_tags and args.corpus not in ['conll2000', 'switchboard']:
    kwargs = {'simplify_tags': True}
else:
    kwargs = {}

for word, tag in tagged_corpus.tagged_words(fileids=args.fileids, **kwargs):
    if len(tag) > taglen:
        taglen = len(tag)

    if args.corpus in ['conll2000', 'switchboard'] and args.simplify_tags:
        tag = simplify_wsj_tag(tag)

    wc += 1
    # loading corpora/treebank/tagged with ChunkedCorpusReader produces None tags
    if not isinstance(tag, basestring): tag = str(tag)
    # NOTE(review): nothing visible here updates `word_set` or `tag_counts`;
    # the statements that recorded each word and tag appear to be missing
    # from this copy -- confirm against the original script.

## output ##

print('%d total words\n%d unique words\n%d tags\n' % (wc, len(word_set),
                                                      len(tag_counts)))

if args.sort == 'tag':
예제 #39
def tag(tokens):
    """Return the simplified POS tag of every token, in order.

    Only the tags are returned; the words themselves are dropped.
    """
    simplified = []
    for _word, wsj_tag in nltk.pos_tag(tokens):
        simplified.append(simplify_wsj_tag(wsj_tag))
    return simplified
예제 #40
	def tag_pos(self,tweet):
		"""Pick a topic for `tweet` from weighted candidate phrases.

		Candidates are stored in `entity` (phrase -> weight): hashtags add 6,
		discounts add 1, noun phrases found before 'sale'/'sales'/'deal'/'deals'
		add 4, and noun phrases found after 'off'/'on'/'at'/'with'/'deal' add 3
		(6 after 'at'). Candidates present in self.global_topics get +10,
		those in self.global_topics_not are set to 0. New topics are recorded
		in self.global_topics. Returns the last topic handled by the final
		loop, or "" when there are no candidates.

		NOTE(review): this copy is missing lines -- the right-hand side of
		`discounts =`, the body of `if word =='':`, and what look like several
		`else:` headers (e.g. before the `scan = False` and second
		`topic = ...` assignments) -- so the block does not parse as written.
		Confirm against the original source before changing any logic.
		"""
		entity = {}					#entity for the current tweet
		# NOTE(review): the attribute is assigned without being called, and
		# `topic` is overwritten with "" further down.
		topic = self.search_for_location_ref_tweet	#search for location references by regex match
		#weigh the hashtags, WEIGHTAGE = 6
		hashtags = self.hashtag.findall(tweet)
		for tag in hashtags:
			entity[tag] = 6 if tag not in entity else entity[tag] + 6
		#weigh the discounts, WEIGHTAGE = 1
		discounts =
		for discount in discounts:
			entity[discount] = 1 if discount not in entity else entity[discount] + 1
		#replace placeholders for tweets
		tweet = self.sub_placeholders(tweet)

		text = nltk.wordpunct_tokenize(tweet)
		tokens = nltk.pos_tag(text)
		simplified_tokens = [(word, simplify_wsj_tag(tag)) for word, tag in tokens]
		topic = ""
		for i in range(0,len(simplified_tokens)):
			#backward lookup on the basis of certain keywords
			if simplified_tokens[i][0].lower() == 'sale' or simplified_tokens[i][0].lower() == 'sales' or simplified_tokens[i][0].lower() == 'deal' or simplified_tokens[i][0].lower() == 'deals':
				j = i-1
				found = False
				while (j >= 0 and not(found)):
					#look for nouns. Once a noun is found, scan for all the immediately preceding nouns, else stop. these words get a weightage of 4 (higher)
					if (simplified_tokens[j][1]=='NP' or simplified_tokens[j][1]=='N') and simplified_tokens[j][0].lower() not in ('url123','hashtag123','discount123','person123','rt'):
						word = simplified_tokens[j][0].lower()
						if  j-1 >= 0:
							scan = True
							while j >= 0 and (scan):
								if (simplified_tokens[j-1][1] == 'NP' or simplified_tokens[j-1][1] == 'N' or simplified_tokens[j-1][1] == 'NUM' or simplified_tokens[j-1][0] == "'" or simplified_tokens[j-1][0] == "-") and simplified_tokens[j-1][0].lower() not in ('url123','hashtag123','discount123','person123','rt'):
									word = simplified_tokens[j-1][0].lower() + " " + word
									j = j-1
									scan = False
						if word!="":
							entity[word] = 4 if word not in entity else entity[word] + 4
							found = True
					j = j-1
			#forward lookup on the basis of certain obvious prepositions/noun/conjunction
			if simplified_tokens[i][0].lower() == 'off' or simplified_tokens[i][0].lower() == 'on' or simplified_tokens[i][0].lower() == 'at' or simplified_tokens[i][0].lower() == 'with' or simplified_tokens[i][0].lower() == 'deal':
				#'at' (location reference) gets a weightage of 6, the other keywords 3
				weightage = 6 if simplified_tokens[i][0].lower() == 'at' else 3
				found = False
				word =""
				# NOTE(review): `j` is not initialised in this branch; it keeps
				# whatever value an earlier lookup left behind -- confirm.
				while(j < len(simplified_tokens) and not(found)):
					if (simplified_tokens[j][1]=='NP' or simplified_tokens[j][1]=='N' ) and simplified_tokens[j][0].lower() not in ('url123','hashtag123','discount123','person123','on','rt','sale','.'):
						word = simplified_tokens[j][0].lower()
						j = j + 1
						if  j < len(simplified_tokens):
							scan = True
							while j < len(simplified_tokens) and (scan) :
								if (simplified_tokens[j][1] == 'NP' or  simplified_tokens[j][1] == 'N' or  simplified_tokens[j][1] == 'NUM' or simplified_tokens[j][0] == "-" or simplified_tokens[j][0] == ".") and simplified_tokens[j][0].lower() not in ('url123','hashtag123','discount123','person123','rt','sale') :
									#if a . is found, look if the previous word is one lettered. Can be an abbreviation like j.crew
									if simplified_tokens[j][1] == '.':
										if len(simplified_tokens[j-1][0])==1 and j+1<len(simplified_tokens):
											word = word + "." +simplified_tokens[j+1][0]
											scan = False
									word = word + " " + simplified_tokens[j][0].lower()
									j = j + 1
									if word =='':	#scan till a word is found
						if word!="":
							entity[word] = weightage if word not in entity else entity[word] + weightage
							found = True
					j = j+1
		#check if the word identified as topic is in the global list of topics. If it is boost the scores to cluster them into one category
		for key in entity.keys():
			if key in self.global_topics:
				entity[key] = entity[key] + 10
			if key in self.global_topics_not:
				entity[key] = 0
		entity = sorted(entity.iteritems(), key=operator.itemgetter(1), reverse=True)	#sort the topics in descending order of their weights
		for i in range(0,len(entity)):
			topic = entity[i][0]
			if topic not in self.global_topics_not:
				#adjust the ' in the topic list like gaurav's or can't ..etc
				if (len(topic.split(" "))>3):
					words = topic.split(" ")
					if words[len(words)-2] == "'":
						topic = words[len(words)-3] + " " + words[len(words)-2] + words[len(words)-1]
					elif words[len(words)-1] == "'":
						topic = words[len(words)-3] + " " + words[len(words)-2]
						topic = words[len(words)-2] + " " + words[len(words)-1]
				#had to adjust the topic for 2 words specifically.
				elif (len(topic.split(" "))>2):
					words = topic.split(" ")
					# NOTE(review): this inner loop reuses the outer loop variable `i`
					for i in range(0,len(words)):
						if i == 0:
							topic = words[i]
						elif words[i-1] == "'":
							topic = topic + words[i]
						elif words[i] == "'":
							topic = topic + words[i]
							topic = topic + " " +words[i]
				if topic not in self.global_topics_not and topic!='' and topic!=' ':
					if topic not in self.global_topics:
						self.global_topics[topic] = 1
		return topic
예제 #41
	def tag_pos(self,tweet):
		"""Pick a topic for `tweet` from weighted candidate phrases.

		Candidates are stored in `entity` (phrase -> weight): hashtags add 2,
		discounts add 1, noun phrases found before 'sale' add 4 and proper-noun
		phrases found after 'off'/'on'/'at' add 3. Hashtags, URLs and person
		mentions are replaced by placeholder tokens before tagging. Candidates
		already in self.global_topics get +10. The highest-weighted candidate
		is chosen; if it has more than three words it is replaced by a later
		candidate with fewer than three words when one exists. The chosen
		topic is recorded in self.global_topics and returned; with no
		candidates the method falls through and returns None.

		NOTE(review): this copy is missing text -- the right-hand side of
		`discounts =`, the start of the `tweet ='discount123 ',tweet)` call, and
		what looks like an `else:` before the `scan = False` in the backward
		scan -- so the block does not parse as written. Confirm against the
		original source before changing any logic.
		"""
		entity = {}
		hashtags = self.hashtag.findall(tweet)
		for tag in hashtags:
			entity[tag] = 2 if tag not in entity else entity[tag] + 2
		tweet = self.hashtag.sub('hashtag123 ',tweet)
		tweet = self.url.sub('url123 ', tweet)
		discounts =
		for discount in discounts:
			entity[discount] = 1 if discount not in entity else entity[discount] + 1
		tweet ='discount123 ',tweet)
		tweet = self.person.sub('person123 ',tweet)
		text = nltk.wordpunct_tokenize(tweet)
		tokens = nltk.pos_tag(text)
		simplified_tokens = [(word, simplify_wsj_tag(tag)) for word, tag in tokens]
		topic = ""
		for i in range(0,len(simplified_tokens)):
			# backward lookup: nouns preceding the keyword 'sale' (weight 4)
			if simplified_tokens[i][0].lower() == 'sale':
				j = i-1
				found = False
				while (j > 0 and not(found)):
					if (simplified_tokens[j][1]=='NP' or simplified_tokens[j][1]=='N') and simplified_tokens[j][0].lower() not in ('url123','hashtag123','discount123','person123','rt'):
						word = simplified_tokens[j][0].lower()
						if  j-1 > 0:
							scan = True
							while j > 0 and (scan):
								if (simplified_tokens[j-1][1] == 'NP' or simplified_tokens[j-1][1] == 'N' or simplified_tokens[j-1][1] == 'NUM' or simplified_tokens[j-1][0] == "'" or simplified_tokens[j-1][0] == "-") and simplified_tokens[j-1][0].lower() not in ('url123','hashtag123','discount123','person123','rt'):
									word = simplified_tokens[j-1][0].lower() + " " + word
									j = j-1
									scan = False
						if word!="":
							entity[word] = 4 if word not in entity else entity[word] + 4
							found = True
					j = j-1
			# forward lookup: proper nouns following 'off'/'on'/'at' (weight 3)
			if simplified_tokens[i][0].lower() == 'off' or simplified_tokens[i][0].lower() == 'on' or simplified_tokens[i][0].lower() == 'at':
				found = False
				# NOTE(review): `j` is not initialised in this branch; it keeps
				# whatever value an earlier lookup left behind -- confirm.
				while(j < len(simplified_tokens) and not(found)):
					if simplified_tokens[j][1]=='NP' and simplified_tokens[j][0].lower() not in ('url123','hashtag123','discount123','person123','on','.','rt'):
						word = simplified_tokens[j][0].lower()
						j = j+1
						if  j < len(simplified_tokens):
							scan = True
							while j < len(simplified_tokens) and (scan) :
								if (simplified_tokens[j][1] == 'NP' or  simplified_tokens[j][1] == 'N' or  simplified_tokens[j][1] == 'NUM' or simplified_tokens[j][0] == "-") and simplified_tokens[j][0].lower() not in ('url123','hashtag123','discount123','person123','.','rt') :
									word = word + " " + simplified_tokens[j][0].lower()
									j = j + 1
									if word == "":
										scan = False
						if word!="":
							entity[word] = 3 if word not in entity else entity[word] + 3
							found = True
					j = j+1
		# boost candidates that are already known topics
		for key in entity.keys():
			if key in self.global_topics:
				entity[key] = entity[key] + 10
		# list of (phrase, weight) pairs, heaviest first
		entity = sorted(entity.iteritems(), key=operator.itemgetter(1), reverse=True)
		#print entity
		if len(entity) > 0:
			topic = entity[0][0]
			if (len(topic.split(" "))>3):
				# no break: the last candidate with fewer than three words wins
				for i in range(1,len(entity)):
					if len(entity[i][0].split(" "))<3:
						topic = entity[i][0]
			if topic not in self.global_topics:
				self.global_topics[topic] = 1
			return topic
예제 #42

# Running totals for the corpus summary printed below.
wc = 0
tag_counts = FreqDist()
word_set = set()

# conll2000 and switchboard are simplified manually inside the loop; other
# corpora let the reader simplify when requested.
if args.corpus in ['conll2000', 'switchboard']:
	kwargs = {}
elif args.simplify_tags:
	kwargs = {'simplify_tags': True}
else:
	kwargs = {}

for word, tag in tagged_corpus.tagged_words(fileids=args.fileids, **kwargs):
	if args.corpus in ['conll2000', 'switchboard'] and args.simplify_tags:
		tag = simplify_wsj_tag(tag)
	wc += 1
	# loading corpora/treebank/tagged with ChunkedCorpusReader produces None tags
	if not isinstance(tag, basestring): tag = str(tag)
	# NOTE(review): nothing visible here updates `word_set` or `tag_counts`;
	# the statements that recorded each word and tag appear to be missing
	# from this copy -- confirm against the original script.

## output ##

print('%d total words\n%d unique words\n%d tags\n' % (wc, len(word_set), len(tag_counts)))

if args.sort == 'tag':
	# sort (tag, count) pairs by tag
	sort_key = lambda item: item[0]
예제 #43
def get_words_tags(str):
	"""Tokenize and POS-tag a string, returning (word, simplified tag) pairs.

	The text is split with nltk.word_tokenize, tagged with nltk.pos_tag, and
	every tag is passed through simplify_wsj_tag.
	"""
	simplified_pairs = []
	for token, wsj_tag in nltk.pos_tag(nltk.word_tokenize(str)):
		simplified_pairs.append((token, simplify_wsj_tag(wsj_tag)))
	return simplified_pairs
예제 #44
def getSentimentFromTweet7(tweet, dico, listNeg, listBoost, listPosEmoticons, listNegEmoticons, swn, dicoSlang):
    """Score a tweet from emoticons and SentiWordNet, keeping only the best scores.

    Only the best negative and the best positive contribution are taken into
    account: each scored token can raise posScore / negScore but the
    contributions are never summed.

    input:
        tweet: string, split on single spaces
        dico: dictionary which links PoS of nltk (simplified tags) to PoS of synsets
        listNeg: words that, when found one (or two) tokens before a scored
            word, swap that word's positive and negative scores
        listBoost: words that, when found directly before a scored word,
            add 1 to both of its scores
        listPosEmoticons, listNegEmoticons: tokens that force the respective
            score to at least 1
        swn: sentiwordnet corpus reader
        dicoSlang: handed to preProcess together with the token list
    return:
        [posScore, negScore, sentiment] where sentiment is an int equal to
        0 (negative), 2 (neutral) or 4 (positive)
    """
    tokenizedData = tweet.split(" ")
    PreProcessedTokenizedData = preProcess(tokenizedData, dicoSlang)
    taggedData = nltk.pos_tag(PreProcessedTokenizedData)
    simplifiedData = [(word, simplify_wsj_tag(tag)) for word, tag in taggedData]

    posScore = 0
    negScore = 0
    # Set when a word with equal pos/neg scores is seen; it makes the next
    # scored word count as boosted.
    boosterWord = False

    for index, couple in enumerate(simplifiedData):
        synset = None
        if couple[0] in listPosEmoticons:
            posScore = max(posScore, 1)
        elif couple[0] in listNegEmoticons:
            negScore = max(negScore, 1)
        elif couple[1] in dico:
            # Keep the last sense whose part of speech matches the tag.
            for elem in swn.senti_synsets(couple[0]):
                if elem.synset.pos == dico[couple[1]]:
                    synset = elem
            if synset is not None:
                if synset.neg_score == synset.pos_score:
                    boosterWord = True
                else:
                    negated = False
                    bonus = 0
                    if index > 0:
                        wordBefore = simplifiedData[index - 1][0]
                        if wordBefore in listNeg:
                            # Direct negation: swap scores, never boosted.
                            negated = True
                        else:
                            if (wordBefore in listBoost) or boosterWord:
                                bonus = 1
                            # A negation two tokens back still swaps the scores.
                            negated = index > 1 and simplifiedData[index - 2][0] in listNeg
                        boosterWord = False
                    if negated:
                        gainPos, gainNeg = synset.neg_score, synset.pos_score
                    else:
                        gainPos, gainNeg = synset.pos_score, synset.neg_score
                    posScore = max(posScore, gainPos + bonus)
                    negScore = max(negScore, gainNeg + bonus)

    if posScore > negScore:
        sentiment = 4
    elif posScore == negScore:
        sentiment = 2
    else:
        sentiment = 0

    return [posScore, negScore, sentiment]
예제 #45
 def tag(self, sentence):
     """Return (word, simplified tag) pairs for a sentence.

     The sentence is tokenized with nltk.word_tokenize, tagged with
     nltk.pos_tag, and each tag is reduced with simplify_wsj_tag.
     """
     wsj_pairs = nltk.pos_tag(nltk.word_tokenize(sentence))
     result = []
     for token, wsj in wsj_pairs:
         result.append((token, simplify_wsj_tag(wsj)))
     return result
예제 #46
# Module-level result lists (noun, used further down, is defined above this fragment).
verb = []
adjective = []

# Process the input one line at a time: tokenize, POS-tag, simplify the tags,
# then hand every non-ignored (word, tag) pair to check().
for line in lines:
	c = []	 # Fresh per-line list; presumably appended to a main 2D list by code not shown here -- verify.
	d = []
	e = []

	tokens = nltk.word_tokenize(line)  # Split the line into word tokens.
	tagged = nltk.pos_tag(tokens)

	simplified = [(word, simplify_wsj_tag(tag)) for word, tag in tagged] # Replace each tag with its simplified form.
	print simplified
	num = len(simplified)

	for i in xrange(num):
		# print simplified[i][0] ,   # Debug output, disabled.
		# print simplified[i][1]
		# Skip words listed in `useless` (defined elsewhere in the file).
		if simplified[i][0] not in useless:
			check(count,simplified[i]) # check() is defined elsewhere; it receives the line counter and the (word, tag) pair.
	# count is advanced once per input line.
	count = count + 1	

#For sentences without any specific POS, we are appending NA to the list corresponding to the sentence. 
for li in noun:
	if (len(li) == 0):
예제 #47
파일: 프로젝트: anoukv/coconut
def postag(word):
	"""Return the simplified POS tag for a single word, or "UNK" if tagging fails.

	The word is tagged on its own via pos_tag([word]) and the resulting tag
	is reduced with simplify_wsj_tag.
	"""
	try:
		return simplify_wsj_tag(pos_tag([word])[0][1])
	except LookupError:
		# LookupError also covers IndexError (empty tagger output); presumably
		# it is what NLTK raises for a missing tagger resource too -- verify.
		# Without the try/except this fallback was unreachable.
		return "UNK"
예제 #48
파일: 프로젝트: rsteckel/EDA
# Print every entry of the frame's lexUnit collection (f is bound outside this fragment).
for u in f.lexUnit:
    print u


from pattern.en import wordnet

# Bare expression: builds a list from f.FE and discards it (only useful in an interactive session).
[x for x in f.FE]

# Walk all frames and, for each one, take the second dot-separated field of
# every lexUnit entry.
all_lu = set()
for f in fn.frames():    
    lus = [ lu.split('.')[1] for lu in fn.frame(f.ID).lexUnit ]
    # NOTE(review): the body of this loop is missing, and nothing visible ever
    # adds to all_lu -- presumably each lu was meant to be added; confirm.
    for lu in lus:

import nltk
from nltk.tag.simplify import simplify_wsj_tag

from nltk import simple

# POS-tag the token list (tokens is defined outside this fragment), then
# replace each tag with its simplified form.
tagged_sent = nltk.pos_tag(tokens)
simplified = [(word, simplify_wsj_tag(tag)) for word, tag in tagged_sent]