Example #1
def pos_tag(pos_type, tokenized_sent):
	if pos_type == 'unigram':
		brown_train = pickle.load(open('res/brown_train.pkl', 'rb'))
		unigram_tagger = UnigramTagger(brown_train)
		return unigram_tagger.tag(tokenized_sent)
	elif pos_type == 'max_pos':
		return nltk.pos_tag(tokenized_sent)		
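# A sketch of how the pickled training data used above might be produced, and how
# the helper could then be called. The choice of the Brown 'news' category is an
# assumption; the snippet only tells us the pickle lives at res/brown_train.pkl
# (assumes nltk with the brown corpus downloaded and an existing res/ directory).
import pickle
import nltk
from nltk.corpus import brown
from nltk.tag import UnigramTagger

with open('res/brown_train.pkl', 'wb') as f:
    pickle.dump([list(sent) for sent in brown.tagged_sents(categories='news')], f)

print(pos_tag('unigram', nltk.word_tokenize('The quick brown fox jumps over the lazy dog.')))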
Example #2
def tag_essay(path_to_file, simplify="F"):

    with open(path_to_file, "r") as fin:
        tokenized_essay = fin.read().split()
        # tokenized_essay = nltk.word_tokenize(fin.read())

    tokenized_essay = [
        word.lower().translate(str.maketrans("", "", ',.?!;:"'))
        for word in tokenized_essay
    ]

    tagger = UnigramTagger(brown.tagged_sents())

    with open(path_to_file + ".pos", "w") as fout:

        tagged_words = tagger.tag(tokenized_essay)

        if simplify == "T": tagged_words = simplify_tagset(tagged_words)

        nword = 0
        typos = 0
        for word, tag in tagged_words:
            if word != "":
                nword += 1
                if tag is None: typos += 1
                fout.write(word + ' , ' + str(tag) + "\n")

    with open(path_to_file + ".typos", "w") as fout:
        fout.write("{} , {}".format(typos, typos / nword))

    print("finish {}".format(path_to_file))
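# simplify_tagset() is referenced above but not shown in this snippet. A minimal
# sketch of what such a helper might look like (purely illustrative, not the
# original implementation): collapse detailed Brown tags such as 'NN-TL' or 'VBD'
# down to a coarse two-letter prefix.
def simplify_tagset(tagged_words):
    simplified = []
    for word, tag in tagged_words:
        if tag is None:
            simplified.append((word, None))
        else:
            simplified.append((word, tag.split('-')[0][:2]))  # e.g. 'NN-TL' -> 'NN'
    return simplified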
Example #3
	def tag_words(self, words, sents):
		train_sents = treebank.tagged_sents()
		tagger = UnigramTagger(train_sents)
		test_sents = tagger.tag(sents[0])
		# test_sents = treebank.tagged_sents()[3000:]
		# print treebank.tagged_sents()[1:]
		# print "accuracy: " + str(self._tagger.evaluate(test_sents))
		# print self._tagger.tag(words)
		# print test_sents
		print(tagger.evaluate(test_sents))
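		# Note: the line above evaluates the tagger against its own output, which by
		# construction scores 1.0. A meaningful check holds out part of the corpus,
		# e.g. (a sketch, assuming the treebank corpus is available):
		#   tagger = UnigramTagger(treebank.tagged_sents()[:3000])
		#   print(tagger.evaluate(treebank.tagged_sents()[3000:]))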
Example #4
class FeaturesetExtractor():

    def __init__(self):
        self.neg_words = [line.rstrip('\n') for line in open(NEG_WORD)]
        self.pos_words = [line.rstrip('\n') for line in open(POS_WORD)]
        self.anger_words = [line.rstrip('\n') for line in open(ANGER_WORD)]
        self.fear_words = [line.rstrip('\n') for line in open(FEAR_WORD)]
        self.happy_words = [line.rstrip('\n') for line in open(HAPPY_WORD)]
        self.sad_words = [line.rstrip('\n') for line in open(SAD_WORD)]
        self.tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500])
        
    def get_featureset(self, data_element):
        mapFeatureset = {}
        size = len(data_element.clean_text)
        word = data_element.clean_text
        list_word = word.split(" ")
        raw = data_element.raw_text
        list_word_raw = raw.split(" ")
        
        tot_pos_words = len(set(list_word) & set(self.pos_words))
        tot_neg_words = len(set(list_word) & set(self.neg_words))
        
        list_anger = tuple(set(list_word) & set(self.anger_words))
        list_fear = tuple(set(list_word) & set(self.fear_words))
        list_happy = tuple(set(list_word) & set(self.happy_words))
        list_sad = tuple(set(list_word) & set(self.sad_words))

        exclamation_count = raw.count("!")
        question_count = raw.count("?")
        uppercase_count = sum(1 for c in raw if c.isupper())

        mapFeatureset["bias"] = 1
        mapFeatureset["word"] = tuple(list_word)
        mapFeatureset["neg_words"] = tot_neg_words
        mapFeatureset["pos_words"] = tot_pos_words
        mapFeatureset["exclamation_count"] = exclamation_count
        mapFeatureset["question_count"] = question_count
        mapFeatureset["list_happy"] = list_happy
        mapFeatureset["list_sad"] = list_sad
        mapFeatureset["list_fear"] = list_fear
        mapFeatureset["list_anger"] = list_anger
        
        pos_tag_temp = self.tagger.tag((word).split(" "))
        list_pos_tag = []
        for element in pos_tag_temp:
            list_pos_tag.append(element[1])
        mapFeatureset["pos_tag"] = tuple(list_pos_tag)
        
        return mapFeatureset   
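# Hypothetical usage sketch: the word-list path constants and the data_element
# type are not part of the snippet above, so small stand-ins are created here
# purely to make the example self-contained (assumes the brown corpus is
# available for the tagger built in __init__).
from collections import namedtuple

NEG_WORD, POS_WORD = 'neg.txt', 'pos.txt'
ANGER_WORD, FEAR_WORD = 'anger.txt', 'fear.txt'
HAPPY_WORD, SAD_WORD = 'happy.txt', 'sad.txt'
for path, words in [(NEG_WORD, 'bad awful'), (POS_WORD, 'good happy'),
                    (ANGER_WORD, 'furious'), (FEAR_WORD, 'afraid'),
                    (HAPPY_WORD, 'happy glad'), (SAD_WORD, 'gloomy')]:
    with open(path, 'w') as f:
        f.write(words.replace(' ', '\n'))

DataElement = namedtuple('DataElement', ['clean_text', 'raw_text'])

extractor = FeaturesetExtractor()
features = extractor.get_featureset(DataElement(clean_text='i am so happy today',
                                                raw_text='I am SO happy today!'))
print(features['pos_words'], features['exclamation_count'])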
Example #6
def tag_penn(words):
    """
    Tags a list of words with a unigram tagger trained on the Penn Treebank corpus.

    Parameters
    ----------
    words: A list of strings.

    Returns
    -------
    A list of tuples of (str, str)
    """

    pt_tagger = UnigramTagger(treebank.tagged_sents())
    tags = pt_tagger.tag(words)

    return tags
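# Usage sketch (assumes nltk plus the treebank and punkt data are installed). Note
# that tag_penn retrains the tagger on every call, so repeated calls are slow.
import nltk

print(tag_penn(nltk.word_tokenize('The quick brown fox jumps over the lazy dog.')))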
Example #7
def tag_linked(words, default_tag='INFO'):
    """
    Tokenizes text by using a Penn Treebank tagged sentence and word tokenizers.
    Uses DefaultTagger to assign "default_tag" to any element missed by Penn Treebank tagger.

    Parameters
    ----------
    words: A list of strings.

    Returns
    -------
    A list of tuples of (str, str)
    :param default_tag:
    """

    default_tagger = DefaultTagger(default_tag)
    pt_tagger = UnigramTagger(treebank.tagged_sents())

    pt_tagger._taggers = [pt_tagger, default_tagger]

    tags = pt_tagger.tag(words)

    return tags
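# The same behaviour is available through the public API by passing the default
# tagger as a backoff at construction time, instead of poking the private
# _taggers attribute (a minimal sketch, assuming the treebank corpus is available):
from nltk.corpus import treebank
from nltk.tag import DefaultTagger, UnigramTagger

pt_tagger = UnigramTagger(treebank.tagged_sents(), backoff=DefaultTagger('INFO'))
print(pt_tagger.tag(['Pierre', 'Vinken', 'foobarbaz']))  # unseen words fall back to 'INFO'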
Example #9
def get_words_simple(text_string):
    """
    Gets a list of tagged words from an input string
    using whitespace-based tokenisation and a unigram PoS tagger
    """
    # get trained Unigram tagger
    print('Loading unigram tagger...')
    train_sents = treebank.tagged_sents()
    unigram_tagger = UnigramTagger(train_sents)
    # stripping punctuation
    # string.translate() takes a dictionary as input.
    # The dictionary mapping ordinal chars to None is created in place:
    text_string = text_string.translate(
        {ord(c): None
         for c in CHARS_TO_DELETE})
    words = text_string.split()  # crude tokenisation, keeps contractions
    english_stops = stopwords.words('english')
    stops_set = set(english_stops + ADDITIONAL_STOPS)
    cleaned_words = []
    for w in words:
        if w not in stops_set and w not in string.punctuation:
            cleaned_words.append(w)
    return unigram_tagger.tag(cleaned_words)
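# Usage sketch: CHARS_TO_DELETE and ADDITIONAL_STOPS are assumed stand-ins here,
# not values taken from the original project (assumes nltk plus the treebank and
# stopwords data are installed).
import string
from nltk.corpus import stopwords, treebank
from nltk.tag import UnigramTagger

CHARS_TO_DELETE = '.,;:!?"()'
ADDITIONAL_STOPS = ['also', 'via']

print(get_words_simple('Cats, unlike dogs, sleep on sunny windowsills all day.'))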
Example #10
class PyTenseShift(object):

    """Initialization of PyTenseShift objects.
    
    The important part when you use the PlPyTenseShift is that
    we allow you to implmenent your own Tagger to optimize your
    results in translating from present to past tense. So, you need
    to implement the taggerinterface and change the second line of
    this code
    """
    def __init__(self, corpus, isPl):
        if isPl:
            self.tagger = FirstTagger(corpus)
        else:
            dtag = DefaultTagger("NN")
            self.__utag = UnigramTagger(corpus.tagged_sents(), backoff = dtag)

    """ Tokenize the input sentence into words.
    This kind of representation is better to evaluate.
    
    """
    def _tokenize(self, tense, isPl):
        if isPl:
            return self.tagger.tag(tense)
        else:
            return self.__utag.tag(tokenize(tense))

    def getPastTense(self, tense):
        """Translates sentence given in present tense into past tense 
        
        Args:
            sentence (str): Sentence to translate
        Returns:
            str. Sentence in past tense
        """
        raise NotImplementedError("abstract method")
Example #11
import nltk
from nltk.corpus import brown
from nltk.tag import UnigramTagger

tagger = UnigramTagger(brown.tagged_sents(categories='news')[:700])
sentence = ['John', 'and', 'Smith', 'went', 'to', 'NY', 'and', 'Germany']
for word, tag in tagger.tag(sentence):
    print(word, '->', tag)
Example #12
class FirstTagger(TaggerInterface):
    
    def __init__(self, corpus):
        dtag = DefaultTagger("NN")
        self.__utag = UnigramTagger(corpus.tagged_sents(), backoff = dtag)
        
    def tag(self, tense):
        """Does translation from tag generated by tagger into unified format
        
            Args:
                sentence: list of touple (word and its form) which are after verb
            Returns:
                list of touple (word and its form in unified format)
        """
        words = self.__utag.tag(tokenize(tense))
        
        for i, (word, form) in enumerate(words):
            word_info = {}
            
            if form[0] == 'V': word_info['klasa'] = 'czasownik'
            elif form[0] == 'S': word_info['klasa'] = 'rzeczownik'
            elif form[0] == 'A': word_info['klasa'] = 'przymiotnik'
            elif form[0] == 'N': word_info['klasa'] = 'liczebnik'
            elif form[0] == 'Z': word_info['klasa'] = 'zaimek'
            elif form[0] == 'D': word_info['klasa'] = 'przysłówek'
            elif form[0] == 'P': word_info['klasa'] = 'przyimek'
            elif form[0] == 'C': word_info['klasa'] = 'spójnik'
            elif form[0] == 'I': word_info['klasa'] = 'wykrzyknik'
            elif form[0] == 'T': word_info['klasa'] = 'partykuła'
            else: word_info['klasa'] = 'nieznany'
            
            if form[1] == 'S': word_info['liczba'] = 'pojedyńcza'
            elif form[1] == 'P': word_info['liczba'] = 'mnoga'
            
            if(len(form) >= 3):
                if form[2] == 'N': word_info['przypadek'] = 'mianownik'
                elif form[2] == 'G': word_info['przypadek'] = 'dopełniacz'
                elif form[2] == 'D': word_info['przypadek'] = 'celownik'
                elif form[2] == 'A': word_info['przypadek'] = 'biernik'
                elif form[2] == 'I': word_info['przypadek'] = 'narzędnik'
                elif form[2] == 'L': word_info['przypadek'] = 'miejscownik'
                elif form[2] == 'V': word_info['przypadek'] = 'wołacz'
            
            if(len(form) >= 4):
                if form[3] == 'M': word_info['rodzaj'] = 'm'
                elif form[3] == 'P': word_info['rodzaj'] = 'm'
                elif form[3] == 'A': word_info['rodzaj'] = 'm'
                elif form[3] == 'I': word_info['rodzaj'] = 'm'
                elif form[3] == 'F': word_info['rodzaj'] = 'ż'
                elif form[3] == 'N': word_info['rodzaj'] = 'n'
                elif form[3] == 'O': word_info['rodzaj'] = 'm'
                elif form[3] == 'R': word_info['rodzaj'] = 'ż'
                elif form[3] == 'T': word_info['rodzaj'] = 'ż'
            if(len(form) >= 6):
                if form[5] == '1': word_info['osoba'] = 'pierwsza'
                elif form[5] == '2': word_info['osoba'] = 'druga'
                elif form[5] == '3': word_info['osoba'] = 'trzecia'
                elif form[5] == 'I': word_info['osoba'] = 'bezokolicznik'
                elif form[5] == 'B': word_info['osoba'] = 'bezosobnik'
                elif form[5] == 'U': word_info['osoba'] = 'imiesłów'
                elif form[5] == 'W': word_info['osoba'] = 'imiesłów'
            if(len(form) >= 7):
                if form[6] == 'T': word_info['czas'] = 'teraźniejszy'
                elif form[6] == 'P': word_info['czas'] = 'przeszły'
                elif form[6] == 'F': word_info['czas'] = 'przyszły'
            if(len(form) >= 8):
                if form[7] == 'O': word_info['tryb'] = 'oznajmujący'
                elif form[7] == 'P': word_info['tryb'] = 'przypuszczający'
                elif form[7] == 'R': word_info['tryb'] = 'rozkazujący'
            if(len(form) >= 9):
                if form[8] == 'D': word_info['aspekt'] = 'dokonane'
                elif form[8] == 'N': word_info['aspekt'] = 'niedokonane'
            
            words[i] = (words[i][0], word_info)
        
        return words
Example #13
File: nltk.py Project: byouloh/sourcenet
#tagger.batch_tag([['Hello', 'world', '.'], ['How', 'are', 'you', '?']])

#--------------------------------------------------------------------------------
# Training taggers
#--------------------------------------------------------------------------------

# so far so good.  Next have to train taggers.

# Unigram, training on Treebank corpus
from nltk.tag import UnigramTagger
from nltk.corpus import treebank
train_sents = treebank.tagged_sents()[:3000]
unigram_tagger = UnigramTagger(train_sents)

# try it on our word_list.
unigram_tagger.tag( word_list )

# Backoff taggers - hierarchy of taggers, first tags all it can, then next takes
#    a stab at all with tag of None, then next, etc.

# Unigram with Default as backoff:
train_sents = treebank.tagged_sents()
unigram_tagger = UnigramTagger( train_sents, backoff = default_tagger )

# Add in contextual taggers:
# - bigram - current word plus previous token.
# - trigram - current word plus previous two tokens.
from nltk.tag import BigramTagger, TrigramTagger
bitagger = BigramTagger( train_sents )
tritagger = TrigramTagger( train_sents )
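# A minimal backoff chain assembled from the pieces above (a sketch; assumes the
# treebank corpus is available): each tagger falls back to the previous one, and
# the DefaultTagger guarantees that no token is left tagged as None.
from nltk.corpus import treebank
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger

train_sents = treebank.tagged_sents()[:3000]
default_tagger = DefaultTagger('NN')
unigram_tagger = UnigramTagger(train_sents, backoff=default_tagger)
bitagger = BigramTagger(train_sents, backoff=unigram_tagger)
tritagger = TrigramTagger(train_sents, backoff=bitagger)
print(tritagger.tag(['Pierre', 'Vinken', 'will', 'join', 'the', 'board']))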
Example #14
def TaggerOnline(tokens):
    etiq1 = DefaultTagger('N')
    sentencas_treinadoras = mac_morpho.tagged_sents()[::]
    etiq2 = UnigramTagger(sentencas_treinadoras, backoff=etiq1)
    tagsTokens = etiq2.tag(tokens)
    return tagsTokens
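# Hypothetical usage (assumes nltk plus the mac_morpho corpus and Portuguese punkt
# data are installed). Note that TaggerOnline retrains the tagger on every call.
import nltk
from nltk.corpus import mac_morpho
from nltk.tag import DefaultTagger, UnigramTagger

tokens = nltk.word_tokenize('A menina comprou um livro novo', language='portuguese')
print(TaggerOnline(tokens))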
Example #15
rt.tag(nltk.word_tokenize(sentence))

#%%
## N gram taggers
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

# testing performance of unigram tagger
print('unigram tagger: ')
print(ut.evaluate(test_data))
print(ut.tag(nltk.word_tokenize(sentence)))

# testing performance of bigram tagger
print('\nbigram tagger:')
print(bt.evaluate(test_data))
print(bt.tag(nltk.word_tokenize(sentence)))

# testing performance of trigram tagger
print('\ntrigram tagger:')
print(tt.evaluate(test_data))
print(tt.tag(nltk.word_tokenize(sentence)))


#%%
# combined tagger with a list of taggers and use a backoff tagger
def combined_tagger(train_data, taggers, backoff=None):
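    # The body of this helper (shown in full under Example #31 below): each tagger
    # class is trained with the previously built tagger as its backoff, and the
    # last one constructed is returned.
    for tagger in taggers:
        backoff = tagger(train_data, backoff=backoff)
    return backoff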
Example #16
import nltk
from nltk.corpus import treebank
from nltk.tag import UnigramTagger
unitag = UnigramTagger(model={'Vinken': 'NN'})
print(unitag.tag(treebank.sents()[0]))
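# Only 'Vinken' appears in the one-entry model above, so it is the only token that
# receives a tag ('NN'); every other word in the sentence comes back tagged None.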
Example #17
for word in cleaned_bow.columns:
    pos = pos_tag([word])[0][1]
    if pos != 'NN':
        nouns_bow = nouns_bow.drop(word, axis=1)

topic_words(NMF_vars(10, nouns_bow)[0], nouns_bow)

topic_words(
    NMF_vars(5, nouns_bow.drop('girl', axis=1))[0],
    nouns_bow.drop('girl', axis=1))

# try different tagger
nouns_bow_2 = cleaned_bow
tagger = UnigramTagger(brown.tagged_sents())
for word in cleaned_bow.columns:
    pos = tagger.tag([word])[0][1]
    if pos != 'NN':
        nouns_bow_2 = nouns_bow_2.drop(word, axis=1)

for num in range(2, 6):
    topic_words(NMF_vars(num, nouns_bow_2)[0], nouns_bow_2)

topic_words(NMF_vars(10, nouns_bow_2)[0], nouns_bow_2)

# remove the word 'total'
topic_words(
    NMF_vars(10, nouns_bow_2.drop('total', axis=1))[0],
    nouns_bow_2.drop('total', axis=1))

for num in range(12, 16):
    topic_words(NMF_vars(num, nouns_bow_2)[0], nouns_bow_2)
Example #18
def main():
	brownDict={'.': 'sentence closer (. ; ? *)', '(': 'left paren', ')': 'right paren', '*': 'not, nt', '--': 'dash', ',': 'comma', ':': 'colon', 'ABL': 'pre-qualifier (quite, rather)', 'ABN': 'pre-quantifier (half, all)', 'ABX': 'pre-quantifier (both)', 'AP': 'post-determiner (many, several, next)', 'AT': 'article (a, the, no)', 'BE': 'be', 'BED': 'were', 'BEDZ': 'was', 'BEG': 'being', 'BEM': 'am', 'BEN': 'been', 'BER': 'are, art', 'BEZ': 'is', 'CC': 'coordinating conjunction (and, or)', 'CD': 'cardinal numeral (one, two, 2, etc.)', 'CS': 'subordinating conjunction (if, although)', 'DO': 'do', 'DOD': 'did', 'DOZ': 'does', 'DT': 'singular determiner/quantifier (this, that)', 'DTI': 'singular or plural determiner/quantifier (some, any)', 'DTS': 'plural determiner (these, those)', 'DTX': 'determiner/double conjunction (either)', 'EX': 'existential there', 'FW': 'foreign word (hyphenated before regular tag)', 'HV': 'have', 'HVD': 'had (past tense)', 'HVG': 'having', 'HVN': 'had (past participle)', 'IN': 'preposition', 'JJ': 'adjective', 'JJR': 'comparative adjective', 'JJS': 'semantically superlative adjective (chief, top)', 'JJT': 'morphologically superlative adjective (biggest)', 'MD': 'modal auxiliary (can, should, will)', 'NC': 'cited word (hyphenated after regular tag)', 'NN': 'singular or mass noun', 'NN$': 'possessive singular noun', 'NNS': 'plural noun', 'NNS$': 'possessive plural noun', 'NP': 'proper noun or part of name phrase', 'NP$': 'possessive proper noun', 'NPS': 'plural proper noun', 'NPS$': 'possessive plural proper noun', 'NR': 'adverbial noun (home, today, west)', 'OD': 'ordinal numeral (first, 2nd)', 'PN': 'nominal pronoun (everybody, nothing)', 'PN$': 'possessive nominal pronoun', 'PP$': 'possessive personal pronoun (my, our)', 'PP$$': 'second (nominal) possessive pronoun (mine, ours)', 'PPL': 'singular reflexive/intensive personal pronoun (myself)', 'PPLS': 'plural reflexive/intensive personal pronoun (ourselves)', 'PPO': 'objective personal pronoun (me, him, it, them)', 'PPS': '3rd. singular nominative pronoun (he, she, it, one)', 'PPSS': 'other nominative personal pronoun (I, we, they, you)', 'PRP': 'Personal pronoun', 'PRP$': 'Possessive pronoun', 'QL': 'qualifier (very, fairly)', 'QLP': 'post-qualifier (enough, indeed)', 'RB': 'adverb', 'RBR': 'comparative adverb', 'RBT': 'superlative adverb', 'RN': 'nominal adverb (here, then, indoors)', 'RP': 'adverb/particle (about, off, up)', 'TO': 'infinitive marker to', 'UH': 'interjection, exclamation', 'VB': 'verb, base form', 'VBD': 'verb, past tense', 'VBG': 'verb, present participle/gerund', 'VBN': 'verb, past participle', 'VBP': 'verb, non 3rd person, singular, present', 'VBZ': 'verb, 3rd. singular present', 'WDT': 'wh- determiner (what, which)', 'WP$': 'possessive wh- pronoun (whose)', 'WPO': 'objective wh- pronoun (whom, which, that)', 'WPS': 'nominative wh- pronoun (who, which, that)', 'WQL': 'wh- qualifier (how)', 'WRB': 'wh- adverb (how, where, when)'}

	sent = ['Marley', 'was', 'dead', ':', 'to', 'begin', 'with', '.', 'There', 'is', 'no', 'doubt', 'whatever', 'about', 'that', '.']
	
	# Part 1
	
	print("Brown tagger:")
	brownTagger = UnigramTagger(brown.tagged_sents())
	for word, tag in brownTagger.tag(sent):
		print(word,'->',tag)

	print("\nPENN Treebank Tagger:")
	pennTagger = UnigramTagger(treebank.tagged_sents())
	for word, tag in pennTagger.tag(sent):
		print(word, '->', tag)

	print("\nNLTK tagger:")
	nltkTagger = pos_tag(sent)
	for word, tag in nltkTagger:
		print(word, '->', tag)
		
	# Part 2

	br_tw = brown.tagged_words(categories='mystery')
	br_ts = brown.tagged_sents(categories='mystery')

	print("\nAnswer to 2A: \nWords: {} \nSentences: {}".format(len(br_tw), len(br_ts)))
	print("\nAnswer to 2B: \n100th word: {}, type is: {} \n101st word: {}, type is: {}".format(br_tw[99][0], brownDict.get(br_tw[99][1]), br_tw[100][0], brownDict.get(br_tw[100][1])))


	tagList=[]
	wordList=[]
	sentDict={}

	for sent in br_ts:
		for wordtag in sent:
			wordList.append(wordtag[0])
			tagList.append(brownDict.get(wordtag[1]))
			sentDict.setdefault(wordtag[1], [])
			sentDict[wordtag[1]].append(wordtag[0])


	print("\nAnswer to 2C: There are {} different tags being used.\n2D: 10 most common words are: \n{} \n2E: 10 most common tags are: \n {}".format(len(Counter(tagList)),Counter(wordList).most_common(10),Counter(tagList).most_common(10)))
	print("\nAnswer to 2F: Most common adverb (RB)= {} \n2G: Most common adjective (JJ)= {}".format(Counter(sentDict["RB"]).most_common(1), Counter(sentDict["JJ"]).most_common(1)))

	concDict={}
	tagTypes = []
	i=0
	tagTypesMeaning = []
	for word, tag in br_tw:
		concDict[i]=[i,str(tag),str(word)]
		i+=1
		if word == 'so':
			tagTypes.append(tag)
			tagTypesMeaning.append(brownDict.get(tag))
	tagTypesFreq = nltk.FreqDist(tagTypesMeaning)
	print("\nAnswer to 2H and 2I:\n{}".format(tagTypesFreq.most_common()))

	# 2K: from here on, using CS, QL and RB:
	csplusList=[]
	csminList=[]
	qlplusList=[]
	qlminList=[]
	rbplusList=[]
	rbminList=[]
	csneighbourrightList=[]
	csneighbourleftList=[]
	qlneighbourrightList=[]
	qlneighbourleftList=[]
	rbneighbourrightList=[]
	rbneighbourleftList=[]
	for value in concDict.values():
		if value[2] == 'so' and value[1] == 'CS':
			csplusList.append(concDict.get(value[0]+1))
			csminList.append(concDict.get(value[0]-1))
		elif value[2] == 'so' and value[1] == 'QL':
			qlplusList.append(concDict.get(value[0]+1))
			qlminList.append(concDict.get(value[0]-1))
		elif value[2] == 'so' and value[1] == 'RB':
			rbplusList.append(concDict.get(value[0]+1))
			rbminList.append(concDict.get(value[0]-1))
	for item in csminList:
		csneighbourleftList.append(item[1])

	for item in csplusList:
		csneighbourrightList.append(item[1])

	for item in qlminList:
		qlneighbourleftList.append(item[1])

	for item in qlplusList:
		qlneighbourrightList.append(item[1])

	for item in rbminList:
		rbneighbourleftList.append(item[1])

	for item in rbplusList:
		rbneighbourrightList.append(item[1])

	uniqueList = []
	for tag in tagTypes:
		if tag not in uniqueList:
			uniqueList.append(tag)
	
	exampleList = []
	for sentence in br_ts:
		for word, tag in sentence:
			if word == 'so' and tag in uniqueList:
				sentenceStr = " ".join([w for w, t in sentence]) + "(" + brownDict.get(tag) + ")"
				exampleList.append(sentenceStr)
				uniqueList.remove(tag)

	print("\nAnswer to 2J:\n{}".format(exampleList))

	print("\nAnswer to 2K: \n Preceder of CS:{} \n Follower of CS: {}\n Preceder of QL: {}\n Follower of QL: {}\n Preceder of RB: {}\n Follower of RB: {}".format(brownDict.get(Counter(csneighbourleftList).most_common(1)[0][0]),brownDict.get(Counter(csneighbourrightList).most_common(1)[0][0]), brownDict.get(Counter(qlneighbourleftList).most_common(1)[0][0]), brownDict.get(Counter(qlneighbourrightList).most_common(1)[0][0]), brownDict.get(Counter(rbneighbourleftList).most_common(1)[0][0]), brownDict.get(Counter(rbneighbourrightList).most_common(1)[0][0])))

	
	# Part 3

	text = open('holmes.txt').read()[:500]
	tokens = nltk.wordpunct_tokenize(text)
	textTagged = pos_tag(tokens)
	print("\nPart 3, holmes.txt tokenized and POS-tagged:\n{}".format(textTagged))
Example #19
    text += wikipedia.page(door).content

sents = text.split('\n')
    
# Each sentence is a list of words 
def preprocess(sents):
    sents = [sent.translate(str.maketrans('','', string.punctuation)).strip(string.digits).lower() for sent in sents]
    sents = [word_tokenize(sent) for sent in sents]
    return [[word for word in sent if word not in set(stopwords.words('english'))] for sent in sents]

# We will tag all the words with unique POS tags and later use these as labels for our classification task

tagger = UnigramTagger(brown.tagged_sents(categories='news'))
sents = preprocess(sents)
words = list(set([word for sent in sents for word in sent]))
pos_tags = dict(tagger.tag(words))
maxlengths=max([len(s) for s in sents])


# In[3]:

words_dic={word:i+1 for (i,word) in enumerate(words)}
words_tags_pairs=[tagger.tag(s) for s in sents]

y=[[w[1] for w in wp] for wp in words_tags_pairs]
X=[[words_dic[w[0]] for w in wp] for wp in words_tags_pairs]
X=[x+[0]*(maxlengths-len(x)) for x in X]

all_tags=set([x for s in y for x in s])
all_tags_dic={t:i for (t,i) in zip (all_tags,range(1,len(all_tags)+1))}
all_tags_dic["eos"]=0
Example #20
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, division
from nltk.tag import UnigramTagger


if __name__ == '__main__':
    model = {u'Péter': 'N', 'Enikő': 'N', 'szeret': 'V', 'Marit': 'Nacc'}
    tagger = UnigramTagger(model=model)

    print(tagger.tag(['Péter', 'Enikő', 'szeret', 'Marit']))
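    # Every word is present in the hand-written model, so the expected output is:
    # [('Péter', 'N'), ('Enikő', 'N'), ('szeret', 'V'), ('Marit', 'Nacc')]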


Example #21
from nltk.tag import UnigramTagger
from nltk.corpus import treebank

# train
train_sents = treebank.tagged_sents()[:3000]
tagger = UnigramTagger(train_sents)

print(treebank.sents()[0])
print(tagger.tag(treebank.sents()[0]))

# test
test_sents = treebank.tagged_sents()[3000:]
print(tagger.evaluate(test_sents))
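# In recent NLTK releases evaluate() is deprecated in favour of accuracy(); both
# take a list of tagged sentences and return the fraction of tokens tagged correctly.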

Example #22
    poslist.append('positive')

#Creates a list of tuples, with sentiment tagged.
postagged = list(zip(postxt, poslist))
negtagged = list(zip(negtxt, neglist))

#Combines all of the tagged tweets to one large list.
taggedtweets = postagged + negtagged
shuffle(taggedtweets)
tweets = []

#Create a list of words in the tweet, within a tuple.
for (word, sentiment) in taggedtweets:
    word_filter=tokenizer.tokenize(word)
    word_filter=AntonymReplacer().replace_negations(word_filter)
    dic=dict(tagger.tag(word_filter))
    
    word_lemma=[]
    
    for i in word_filter:
        if dic[i] is None:
            pass
        elif dic[i][0]=="V":
            word_lemma.append(lemmatizer.lemmatize(i, "v").lower())
        elif dic[i][0]=="N" or dic[i][:3] in ("ADJ", "ADV"):
            word_lemma.append(lemmatizer.lemmatize(i).lower()) 
    tweets.append((word_lemma, sentiment))


#Pull out all of the words in a list of tagged tweets, formatted in tuples.
def getwords(tweets):
Example #23
# print [word for word in words if word not in english_stops]

#look up words and print synset
from nltk.corpus import wordnet
syn = wordnet.synsets('cookbook')[0]
print(syn.name())
print(syn.definition())
print(syn.hypernyms())
print(syn.hypernyms()[0].hyponyms())
print(syn.root_hypernyms())
print(syn.hypernym_paths())

#
# for w in words:
#     print w
#     syn = wordnet.synsets(w)
#     if (type(syn) == 'list'):
#         syn = syn[0]
#     # print syn
#     if (len(syn) != 0):
#         for i in syn:
#             # print i
#             # print '\t[', i.name(),']'
#             print '\t--', i.definition()

from nltk.tag import UnigramTagger
from nltk.corpus import treebank
train_sents = treebank.tagged_sents()[:3000]
tagger = UnigramTagger(train_sents)
print(tagger.tag(treebank.sents()[0]))
Example #24
######## UNIGRAM TAGGER ##########

from nltk.tag import UnigramTagger
from nltk.corpus import treebank

#We use the first 3000 sentences of the treebank corpus as the training set to initialize
#the UnigramTagger class
#Unigram tagger can be trained by giving it a list of tagged sentences at initialization.
train_sents=treebank.tagged_sents()[:3000]
tagger=UnigramTagger(train_sents)
print(treebank.sents()[0])
print(tagger.tag(treebank.sents()[0]))

test_sents=treebank.tagged_sents()[3000:]
print(tagger.evaluate(test_sents))



tagger=UnigramTagger(model={'Pierre':'NN'})
tagger.tag(treebank.sents()[0])
Example #25
from nltk.corpus import brown
from nltk.tag import UnigramTagger
import cPickle as pickle

INPUT_FILE = "/dfs/scratch0/googlengrams/2012-eng-fic/info/commonnonstop-1900-2000-8-6.pkl"

def write_word_list(filename, word_list):
    out_fp = open(filename, "w")
    print >> out_fp, "\n".join(word_list)

if __name__ == '__main__':
    in_fp = open(INPUT_FILE, "rb") 
    words = pickle.load(in_fp)
    tagger = UnigramTagger(brown.tagged_sents())
    good_words = []
    for word in words:
        tag = tagger.tag([word])[0][1]
        if tag == None:
            continue
        if "NP" in tag:
            continue
        good_words.append(word)
    write_word_list("brown.txt", good_words)
Example #26
import nltk
from nltk.corpus import brown
from nltk.tag import UnigramTagger
tagger = UnigramTagger(brown.tagged_sents(categories='news')[:700])
sentence = ['John','and','Smith','went','to','NY','and','Germany']
for word, tag in tagger.tag(sentence):
    print(word,'->',tag)
Example #27
    while not redisInterface.hasPending():
        sleep(1)

    page = redisInterface.popPending()
    print('Reading ' + page + ' STARTED')

    # Read the html page
    with open(page, 'r') as htmlPage:
        data = htmlPage.read().replace('\n', '');

    # Parse html
    soup = BeautifulSoup(data)
    articleTitle = titleFromArticleSoup(soup)
    articleBodyWithTags = soup.find_all('p', class_ = 'story-body-text')
    articleBody = [stringFromHTMLParagraph(p)
            for p in articleBodyWithTags]
    parasToProcess = [articleTitle] + articleBody

    print('Title: ' + articleTitle)

    # Tokenize and tag
    tokens = [tokenizer.tokenize(s) for s in parasToProcess]
    taggedArticleBody = [tagger.tag(t) for t in tokens]

    # Save to redis
    redisInterface.saveArticleData(
            TaggedArticle(articleTitle, taggedArticleBody,'article_data'))

    print('Reading ' + page + ' FINISHED')
Example #28
from nltk.corpus import brown
from nltk.tag import UnigramTagger
import cPickle as pickle

INPUT_FILE = "/dfs/scratch0/googlengrams/2012-eng-fic/info/commonnonstop-1900-2000-8-6.pkl"


def write_word_list(filename, word_list):
    out_fp = open(filename, "w")
    print >> out_fp, "\n".join(word_list)


if __name__ == '__main__':
    in_fp = open(INPUT_FILE, "rb")
    words = pickle.load(in_fp)
    tagger = UnigramTagger(brown.tagged_sents())
    good_words = []
    for word in words:
        tag = tagger.tag([word])[0][1]
        if tag == None:
            continue
        if "NP" in tag:
            continue
        good_words.append(word)
    write_word_list("brown.txt", good_words)
Example #29
        return myPhrases

#Create the default tagger so that unknown words are treated as nouns (N)
etiqPadrao = DefaultTagger('N')
#Take the training set from the tagged_sents() of mac_morpho
sentencas_treinadoras = mac_morpho.tagged_sents()[0:15000]
#Create the UnigramTagger with the default tagger as backoff and train it on the mac_morpho tagged sentences
etiq = UnigramTagger(sentencas_treinadoras, backoff=etiqPadrao)

coment = str(input("Entre com o texto: "))
if coment == "default":
        coment = open("default.txt", "r").read().replace("\n", " ")
#The text is converted into tokens
tokens=nltk.word_tokenize(coment.lower())
#Each token of the text is tagged
tags = etiq.tag(tokens)

#Create the regular-expression parser containing the patterns being searched for
analiseGramatical = RegexpParser(r"""
		PADRAO7: {<N><ADJ>}
        PADRAO1: {<ADJ><N>(<PREP>?<N>)*}
        PADRAO2: {<ADV><ADV>?<ADJ>(<N>(<PREP>?<N>)*)?}
        PADRAO3: {<N>(<PREP>?<N>)*(<ADJ>)<ADV><ADV>?}
        PADRAO4: {<N>(<PREP>?<N>)*<ADV>?<ADJ>+}
        PADRAO5: {<ADV><V>}
        PADRAO6: {<V><ADV>}
		""")
#The parser is then used to generate the tree of patterns
arvore = analiseGramatical.parse(tags)
x = [ExtractPhrases(arvore, "PADRAO1"), ExtractPhrases(arvore, "PADRAO2"),
     ExtractPhrases(arvore, "PADRAO3"), ExtractPhrases(arvore, "PADRAO4"),
Example #30
import nltk
import json

from nltk.corpus import brown
from nltk.tag import UnigramTagger
tagger = UnigramTagger(brown.tagged_sents(tagset='universal'))
sent = ['Mitchell', 'decried', 'the', 'high', 'rate', 'of', 'unemployment']
for word, tag in tagger.tag(sent):
	if tag == "VERB":
		print(word, '->', tag)


verbs_tagged = open("../assets/inputText/verbs_tagged_questions.txt", 'w+')
with open("../assets/inputText/all_questions.txt", 'r') as all_lines:
	for line in all_lines:
		splitLine = line.split(' ')
		for word, tag in tagger.tag(splitLine):
			if tag == "VERB":
				verbs_tagged.write(word + "\n")
				#verbs_tagged.write(word + " \"" + line[:-1] + "\"\n")

verbs_tagged.close()



Example #31
print(rt.evaluate(test_data))
print(rt.tag(tokens))


## N gram taggers
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

print(ut.evaluate(test_data))
print(ut.tag(tokens))

print(bt.evaluate(test_data))
print(bt.tag(tokens))

print(tt.evaluate(test_data))
print(tt.tag(tokens))

def combined_tagger(train_data, taggers, backoff=None):
    for tagger in taggers:
        backoff = tagger(train_data, backoff=backoff)
    return backoff

ct = combined_tagger(train_data=train_data, 
                     taggers=[UnigramTagger, BigramTagger, TrigramTagger],
                     backoff=rt)
Example #32
import nltk
from nltk.tag import UnigramTagger
from nltk.corpus import treebank
training= treebank.tagged_sents()[:7000]
unitagger=UnigramTagger(training)
print(treebank.sents()[0])
print(unitagger.tag(treebank.sents()[0]))
Example #33
def TaggerOnline(tokens):
	etiq1 = DefaultTagger('N')
	sentencas_treinadoras = mac_morpho.tagged_sents()[::]
	etiq2 = UnigramTagger(sentencas_treinadoras, backoff=etiq1)
	tagsTokens = etiq2.tag(tokens)
	return tagsTokens
Example #34
#    Contiguous sequences of n items from a sequence of text or speech. Items can be words, phonemes,
#    letters, characters or syllables. Shingles: n-grams where items are just words.
#    UnigramTagger -> NGramTagger -> ContextTagger -> SequentialBackoffTagger

# Train the N-Gram taggers using the training_data (pre-tagged tokens, i.e. labeled observations)
ut = UnigramTagger(train=train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

# Test the performance of each N-Gram tagger
print("1-Gram Tagger Accuracy: {}".format(ut.evaluate(test_data)))
print("2-Gram Tagger Accuracy: {}".format(bt.evaluate(test_data)))
print("3-Gram Tagger Accuracy: {}".format(tt.evaluate(test_data)))

print("\n1-Gram tags:")
print(ut.tag(tokens))

print("\n2-Gram tags:")
print(bt.tag(tokens))

print("\n3-Gram tags:")
print(tt.tag(tokens))

# Note that the best accuracy comes from the 1-gram tagger: the exact bigrams and
# trigrams seen in the training data often do not reappear in the test data, so the
# higher-order taggers frequently find no matching context.

# 4. TAGGER CHAINING WITH BACKOFF TAGGERS:


# Function to chain a set of taggers, with a backoff tagger as the last resort
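# A sketch of such a chaining helper (the same pattern as combined_tagger in
# Examples #15 and #31): each tagger class is trained with the previously built
# tagger as its backoff, and the last one constructed is returned.
def make_backoff_chain(train_data, tagger_classes, backoff=None):
    for tagger_class in tagger_classes:
        backoff = tagger_class(train_data, backoff=backoff)
    return backoff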
Example #35
'''
Created on Feb 24, 2012

@author: 100457636
'''
from nltk.corpus import brown
from nltk.tag import UnigramTagger

train_sents = brown.tagged_sents()

tagger  = UnigramTagger(train_sents)

print(tagger.tag(["doing"]))
Example #36
from samples import sample

# Test and training variables
test_sents = treebank.tagged_sents()[3000:]
train_sents = treebank.tagged_sents()[:3000]
tk_sample = word_tokenize(sample)

# Default tagger - Nouns
df_tagger = DefaultTagger('NN')
tagged = df_tagger.tag(tk_sample)
accuracy = df_tagger.evaluate(test_sents)
print(f"Tagged text: {tagged}; acc = {accuracy}\n")

# Unigram tagger
ug_tagger = UnigramTagger(train_sents)
tagged = ug_tagger.tag(tk_sample)
accuracy = ug_tagger.evaluate(test_sents)
print(f"Tagged text: {tagged}; acc = {accuracy}\n")

# Backoff tagger: fall back to another tagger (the backoff) when the current one cannot tag a token
ugb_tagger = UnigramTagger(train_sents, backoff=df_tagger)
accuracy = ugb_tagger.evaluate(test_sents)
print(f"Accuracy of backoff: {accuracy}\n")

# Saving pickle and testing it.
with open('pickles/pos-taggers/unigram_backoff_tagger.pickle', 'wb') as file:
    pickle.dump(ugb_tagger, file)

with open('pickles/pos-taggers/unigram_backoff_tagger.pickle', 'rb') as file:
    pk_tagger = pickle.load(file)
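
# A quick sanity check (sketch): the unpickled tagger should score exactly the
# same as the tagger that was saved.
accuracy = pk_tagger.evaluate(test_sents)
print(f"Accuracy of unpickled tagger: {accuracy}\n")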
Example #37
tnt_tagger = tnt.TnT(N=100)
tnt_tagger.train(brown_train_sents)

bigramTagger = BigramTagger(brown_train_sents)
trigramTagger = TrigramTagger(brown_train_sents)

print("------------Recommended Tagger------------")
print(nltk.pos_tag(sent))

print("------------Default Tagger------------")
print(defaultTagger.tag(sent))

print("------------Unigram Tagger Overrode------------")
unigramTagger = UnigramTagger(model={'Pierre': 'NN'})
print(unigramTagger.tag(sent))

print("------------Unigram Tagger Trained------------")
unigramTagger = UnigramTagger(brown_train_sents)
print(unigramTagger.tag(sent))

#cutoff: The number of instances of training data the tagger must see in order not to use the backoff tagger
print("------------Unigram Tagger Trained with cutoff=3------------")
unigramTagger = UnigramTagger(brown_train_sents, cutoff=3)
print(unigramTagger.tag(sent))

print("------------Bigram Tagger------------")
print(bigramTagger.tag(sent))

print("------------Trigram Tagger------------")
print(trigramTagger.tag(sent))