from nltk.corpus import brown, treebank, nps_chat, conll2000

def getData(corpus="brown", categories=""):
    if corpus == "brown":
        if categories != "":
            return brown.tagged_sents(tagset='universal', categories=categories)
        return brown.tagged_sents(tagset='universal')
    elif corpus == "treebank":
        return treebank.tagged_sents(tagset='universal')
    elif corpus == "nps_chat":  # dialogue dataset: regroup tagged words into posts
        data = []
        posts = nps_chat.posts()
        words = nps_chat.tagged_words(tagset='universal')
        index = 0
        for sent in posts:
            data.append(words[index:index + len(sent)])
            index += len(sent)
        return data
    elif corpus == "conll2000":
        return conll2000.tagged_sents(tagset='universal')
    return brown.tagged_sents(tagset='universal')  # fall back to Brown
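# Usage sketch (an assumption, not part of the original file): fetch tagged
# sentences with getData and carve off a 90/10 train/test split.
tagged_sents = getData(corpus="brown", categories="news")
cut = int(len(tagged_sents) * 0.9)
train_sents, test_sents = tagged_sents[:cut], tagged_sents[cut:]
print(len(train_sents), "training sentences,", len(test_sents), "test sentences")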
import nltk
import pandas as pd
from nltk.corpus import nps_chat
from nltk.stem import WordNetLemmatizer

def part_of_speech(sentence):
    chat_tagged = nps_chat.tagged_words(tagset="universal")
    word = [w for (w, c) in chat_tagged]
    category = [c for (w, c) in chat_tagged]
    df = pd.DataFrame(zip(word, category), columns=['word', 'category'])
    # subset=['word'] keeps the first category observed for each word
    df_unique = df.drop_duplicates(subset=['word'])
    lemmatizer = WordNetLemmatizer()
    aux = [lemmatizer.lemmatize(w, get_wordnet_pos(w))
           for w in nltk.word_tokenize(sentence.lower())]
    aux_1 = pd.DataFrame(aux, columns=['word'])
    aux_2 = pd.DataFrame(aux_1.word.value_counts().index, columns=['word'])
    return pd.merge(df_unique, aux_2, on=['word'], how='inner', validate='1:1')
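# part_of_speech depends on a get_wordnet_pos helper that does not appear in
# this excerpt. A minimal sketch of the usual idiom (an assumption about the
# original): map the first letter of NLTK's Penn Treebank tag onto the WordNet
# POS constant that WordNetLemmatizer expects, defaulting to noun.
from nltk import pos_tag
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    tag = pos_tag([word])[0][1][0].upper()
    return {"J": wordnet.ADJ, "N": wordnet.NOUN,
            "V": wordnet.VERB, "R": wordnet.ADV}.get(tag, wordnet.NOUN)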
from nltk.corpus import brown
from nltk.corpus import nps_chat as chat
from nltk import RegexpTagger, UnigramTagger, BigramTagger, TrigramTagger

sizeB = len(brown.tagged_sents())  # number of tagged sentences in the Brown corpus
sizeC = len(chat.tagged_posts())   # number of tagged posts in the NPS Chat corpus
brownTS = brown.tagged_sents()     # sentences as lists of (word, tag) pairs
brownTW = brown.tagged_words()     # flat list of (word, tag) pairs
chatTP = chat.tagged_posts()       # posts as lists of (word, tag) pairs
chatTW = chat.tagged_words()       # flat list of (word, tag) pairs

def splitSen(c, p):  # partition a corpus into train/test at proportion p
    if c == "brown":
        t1 = brownTS[:int(sizeB * p)]
        t2 = brownTS[int(sizeB * p):]
        return t1, t2
    if c == "chat":
        t1 = chatTP[:int(sizeC * p)]
        t2 = chatTP[int(sizeC * p):]
        return t1, t2

brownTrain50, brownTest50 = splitSen("brown", 0.5)
brownTrain90, brownTest90 = splitSen("brown", 0.9)
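# The tagger classes imported above are unused in this excerpt. A minimal
# sketch (an assumption, not the original code) of how the splits typically
# feed a regexp -> unigram -> bigram -> trigram backoff chain:
patterns = [(r'.*ing$', 'VBG'), (r'.*ed$', 'VBD'),
            (r'.*es$', 'VBZ'), (r'.*s$', 'NNS'), (r'.*', 'NN')]
t0 = RegexpTagger(patterns)
t1 = UnigramTagger(brownTrain90, backoff=t0)
t2 = BigramTagger(brownTrain90, backoff=t1)
t3 = TrigramTagger(brownTrain90, backoff=t2)
print("Trigram backoff accuracy:", t3.evaluate(brownTest90))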
from nltk.corpus import treebank, conll2000, brown, nps_chat

# Return the POS of a rule (used for list sorting)
def get_key(rule):
    return rule.split()[1]

if __name__ == '__main__':
    # Get allowed words
    with open('../../allowed_words.txt', 'r') as allowed_words_file:
        allowed_words = allowed_words_file.read().split('\n')

    # Deduplicated (word, tag) pairs from each corpus
    treebank_tagged_words = list(set(treebank.tagged_words()))
    conll2000_tagged_words = list(set(conll2000.tagged_words()))
    brown_tagged_words = list(set(brown.tagged_words()))
    nps_tagged_words = list(set(nps_chat.tagged_words()))

    vocab_rules = []
    unvocabbed_words = []

    # Find tags that occur with allowed words, falling back through the
    # corpora in order (get_tags_linear is defined elsewhere; see the
    # sketch below)
    for word in allowed_words:
        curr_tags = get_tags_linear(word, treebank_tagged_words)
        if not curr_tags:
            curr_tags = get_tags_linear(word, conll2000_tagged_words)
        if not curr_tags:
            curr_tags = get_tags_linear(word, brown_tagged_words)
        if not curr_tags:
            curr_tags = get_tags_linear(word, nps_tagged_words)
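# get_tags_linear is defined elsewhere in the repository. A plausible minimal
# sketch consistent with how it is called above (an assumption, not the
# original definition): linearly scan the (word, tag) pairs for a word.
def get_tags_linear(word, tagged_words):
    return [tag for (w, tag) in tagged_words if w == word]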
        loc = sent.find(spelling, current_loc)
        sent = sent[:loc + len(spelling)] + info + sent[loc + len(spelling):]
        current_loc = loc + len(spelling) + len(info)
        format_output['sentence'][row_idx] = sent
        print(sent, '--- Source: ', row['citation'])
    format_output.to_csv(filename, index=False, header=0)
    return

## Set up basic corpora
pron_dict = cmudict.dict()
brown_words = brown.tagged_words(tagset='universal')
treebank_words = treebank.tagged_words(tagset='universal')
nps_words = nps_chat.tagged_words(tagset='universal')
corpus = brown_words + treebank_words + nps_words
corpus = [(word.lower(), tag) for (word, tag) in corpus]
stopset = set(stopwords.words('english'))

## Set up spaCy's pretrained word vectors
nlp = spacy.load('en_core_web_lg')

## Collect potential heteronyms
data = get_het_from_corpus(corpus)

## Assign Wiktionary data to the potential heteronyms
parser = init_wikparser()
data = get_pronunciation(parser, data)
fine_data = fine_graining(data)

## Create reference dictionary for heteronyms
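# A quick illustration (an assumption, not in the original) of the role of
# pron_dict: cmudict lists every known pronunciation of a word, and words with
# more than one entry are the raw candidates for heteronym detection.
prons = pron_dict.get('lead', [])
if len(prons) > 1:
    print('lead has', len(prons), 'pronunciations:', prons)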
size_nps_chat_09 = int(len(tagged_posts_nps_chat) * 0.9)
size_nps_chat_05 = int(len(tagged_posts_nps_chat) * 0.5)

train_sents_brown_09 = tagged_sents_brown[:size_brown_09]
test_sents_brown_09 = tagged_sents_brown[size_brown_09:]
train_sents_brown_05 = tagged_sents_brown[:size_brown_05]
test_sents_brown_05 = tagged_sents_brown[size_brown_05:]
train_posts_nps_chat_09 = tagged_posts_nps_chat[:size_nps_chat_09]
test_posts_nps_chat_09 = tagged_posts_nps_chat[size_nps_chat_09:]
train_posts_nps_chat_05 = tagged_posts_nps_chat[:size_nps_chat_05]
test_posts_nps_chat_05 = tagged_posts_nps_chat[size_nps_chat_05:]

# Task a)
print("Task a)")
tags_brown = [tag for word, tag in brown.tagged_words()]
tags_nps_chat = [tag for word, tag in nps_chat.tagged_words()]

# Find the most common tag in each corpus
max_brown = nltk.FreqDist(tags_brown).max()        # NN
max_nps_chat = nltk.FreqDist(tags_nps_chat).max()  # UH

# Create default taggers that always assign the most common tag
default_tagger_brown = nltk.DefaultTagger(max_brown)
default_tagger_nps_chat = nltk.DefaultTagger(max_nps_chat)

# Evaluate the default taggers
print("Accuracy Brown default tagger 90/10: ",
      default_tagger_brown.evaluate(test_sents_brown_09))
print("Accuracy Brown default tagger 50/50: ",
      default_tagger_brown.evaluate(test_sents_brown_05))
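# Likely continuation (an assumption; the excerpt cuts off here): evaluate the
# NPS Chat default tagger on its own held-out posts, mirroring the Brown runs.
print("Accuracy NPS Chat default tagger 90/10: ",
      default_tagger_nps_chat.evaluate(test_posts_nps_chat_09))
print("Accuracy NPS Chat default tagger 50/50: ",
      default_tagger_nps_chat.evaluate(test_posts_nps_chat_05))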
import colorama
from nltk.corpus import brown, nps_chat, conll2000, treebank

# init colorama
colorama.init()

# global constants
CONST_tagset = 'universal'

# global list of gold corpora
# C:\Users\admin\AppData\Roaming\nltk_data\corpora\
# (only the first four names map to the NLTK corpora loaded below; the
# twitter and nhtsa_* corpora are presumably loaded elsewhere)
corp_names = [
    "brown", "nps_chat", "conll2000", "treebank", "twitter",
    "nhtsa_0", "nhtsa_1", "nhtsa_2", "nhtsa_3",
    "nhtsa_4", "nhtsa_5", "nhtsa_6"
]
corp_words_tagged = [
    brown.tagged_words(tagset=CONST_tagset),
    nps_chat.tagged_words(tagset=CONST_tagset),
    conll2000.tagged_words(tagset=CONST_tagset),
    treebank.tagged_words(tagset=CONST_tagset)
]
corp_words_untagged = [
    brown.words(),
    nps_chat.words(),
    conll2000.words(),
    treebank.words()
]
corp_sents_tagged = [
    brown.tagged_sents(tagset=CONST_tagset),
    nps_chat.tagged_posts(tagset=CONST_tagset),
    conll2000.tagged_sents(tagset=CONST_tagset),
    treebank.tagged_sents(tagset=CONST_tagset)
]
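# Minimal usage sketch (an assumption, not part of the original file): zip
# pairs each loaded corpus with its name and stops after the four NLTK
# corpora, so the extra names in corp_names are simply skipped here.
for name, sents in zip(corp_names, corp_sents_tagged):
    print(name, "-", len(sents), "tagged sentences")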