Exemplo n.º 1
0
def createNames():
    """Return every name of the NLTK ``names`` corpus, lower-cased and sorted.

    All file ids of the corpus are read and concatenated, so a name that is
    listed in more than one file appears more than once in the result.

    Returns:
        list: the lower-cased names in ascending order.
    """
    # Function-scope import, kept from the original implementation.
    from nltk.corpus import names as nm

    # Lower-case *before* sorting. Sorting the mixed-case names first and
    # lower-casing afterwards does not guarantee an ordered result, because
    # upper-case letters compare lower than lower-case ones.
    return sorted(
        name.lower()
        for fileid in nm.fileids()
        for name in nm.words(fileid)
    )
Exemplo n.º 2
0
def load_data():
    """Load the labelled name data set.

    The two file ids of the names corpus are unpacked as (female, male);
    this assumes ``names.fileids()`` returns them in that order - TODO confirm.

    :return: shuffled list[(name, 0), (name, 1)...] with lower-cased names,
        label 0 for the female file and label 1 for the male file
    """
    female_id, male_id = names.fileids()

    labelled_female = [(entry.lower(), 0) for entry in names.words(female_id)]
    labelled_male = [(entry.lower(), 1) for entry in names.words(male_id)]

    data_set = labelled_female + labelled_male
    random.shuffle(data_set)
    # Show a small sample of the shuffled data.
    print('10 names:', data_set[:10])
    return data_set
Exemplo n.º 3
0
    content = [w for w in text if w.lower() not in stopwords_list]
    return len(content) / len(text) * 100


content_fraction(reuters.words())

# Solve a word puzzle: words of at least six letters that contain the
# obligatory letter and use no letter more often than the puzzle provides.
puzzle_letters = nltk.FreqDist('egivrvonl')
obligatory = 'r'
wordlist = words.words()
result = [
    w for w in wordlist
    if len(w) >= 6 and obligatory in w and nltk.FreqDist(w) <= puzzle_letters
]
print(result)

# Find names common to both genders.
print(names.fileids())
male_names = names.words('male.txt')
female_names = names.words('female.txt')
# Build the lookup set once: testing membership directly on the corpus word
# list scans it for every male name.
female_name_set = set(female_names)
common_names = [w for w in male_names if w in female_name_set]
print(common_names)

# Conditional frequency distribution of the last letter of every name, to
# check the well-known fact that names ending in "a" are almost always female.
cfd = nltk.ConditionalFreqDist((fileid, name[-1])
                               for fileid in names.fileids()
                               for name in names.words(fileid))
cfd.plot()

# Pronouncing dictionary for speech synthesizers: the CMU Pronouncing
# Dictionary corpus.
entries = cmudict.entries()
print(len(entries))
# Entries can be iterated whole (`for entry in entries`) or unpacked into
# word and pronunciation, as the loop that follows does.
for word, pron in entries:
Exemplo n.º 4
0
import nltk
from nltk.corpus import names

# Letter classes and digraphs used when splitting names into sound groups.
vowels = 'aeiouy'
consonants = 'bcdfghjklmnpqrstvwxz'
singleProns = ['ph', 'th']

# Conditional frequency distribution: corpus file id -> last letter of each name.
last_letter_cfd = nltk.ConditionalFreqDist(
    (corpus_file, entry[-1])
    for corpus_file in names.fileids()
    for entry in names.words(corpus_file)
)

def find_letters(index, text, letters):
    """Skip the run of characters belonging to *letters* that starts at *index*.

    Args:
        index: position in *text* where scanning begins.
        text: the string being scanned.
        letters: container of the characters that may be skipped.

    Returns:
        The first position at or after *index* whose character is not in
        *letters*, or ``len(text)`` when the run reaches the end of *text*.
        *index* is returned unchanged when it is already past the end.
    """
    pos = index
    end = len(text)
    while pos < end:
        if text[pos] not in letters:
            break
        pos += 1
    return pos

def combindSeq(index, seq):
    if index >= len(seq):
        return []

    res = ''
    if seq[index][0] in consonants:
        res = seq[index]
        index = index + 1

    if index >= len(seq):
        return [res]

    res = res + seq[index]
Exemplo n.º 5
0
import nltk.data
from nltk.corpus.reader import WordListCorpusReader
from nltk.corpus import names
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
from nltk.corpus import treebank

# Plain word-list corpus read from a local NLTK data directory.
wordlist = WordListCorpusReader("C:/nltk_data/corpora/cookbook", ['wordlist'])
print(wordlist.words())
print(wordlist.fileids())

# The bundled names corpus: its file ids and the size of the male list.
print(names.fileids())
male_word_count = len(names.words('male.txt'))
print(male_word_count)

# Tagged corpus reader over every *.pos file of the local treebank sample.
reader = TaggedCorpusReader(
    "C:/nltk_data/corpora/treebank/tagged",
    r'.*\.pos',
    word_tokenizer=SpaceTokenizer(),
    tagset='en-brown',
)
sample = 'wsj_0001.pos'
# Same file at increasing granularity: words, tagged words, sentences, paragraphs.
for view in (reader.words, reader.tagged_words,
             reader.tagged_sents, reader.tagged_paras):
    print(view(sample))
print(reader.fileids())

print("\n")
# Same tagged words, mapped onto the universal tagset.
print(reader.tagged_words(sample, tagset='universal'))

print(treebank.tagged_words())
Exemplo n.º 6
0
import nltk
from nltk.corpus import names

# Pair every name's first letter with the corpus file it comes from, then
# count the letters per file.
initial_pairs = (
    (fileid, name[0])
    for fileid in names.fileids()
    for name in names.words(fileid)
)
cfd = nltk.ConditionalFreqDist(initial_pairs)

cfd.plot()
Exemplo n.º 7
0
Created on Mon Jan 15 21:26:30 2018

@author: Mohnish_Devadiga
"""

import nltk
import matplotlib
import matplotlib.pyplot as plt
import random
from nltk.corpus import names
from PIL import Image   

#print(names.fileids())
matplotlib.style.use("ggplot")

# Conditional frequency distribution: corpus file id -> last two letters of
# each name.
names_cfd = nltk.ConditionalFreqDist(
    (fileid, name[-2:])
    for fileid in names.fileids()
    for name in names.words(fileid)
)

# Plotting is disabled; the original kept these lines inside a string literal.
# plt.figure(figsize=(50,10))
# image = names_cfd.plot()

def name_features(name):
    """Return the feature dict for *name*: its last two characters under "pair".

    A name shorter than two characters contributes the whole name.
    """
    suffix = name[-2:]
    return {"pair": suffix}
    
print(name_features("katy"))

# Label every name with its gender: all male names first, then all female names.
male_labelled = [(name, 'male') for name in names.words('male.txt')]
female_labelled = [(name, "female") for name in names.words('female.txt')]
name_list = male_labelled + female_labelled

# Peek at both ends of the (unshuffled) list.
print(name_list[:10])
print(name_list[-10:])
Exemplo n.º 8
0
# ☼ Save some text into a file corpus.txt. Define a function load(f) that reads from the file named in its sole argument, and returns a string containing the text of the file.

# Use nltk.regexp_tokenize() to create a tokenizer that tokenizes the various kinds of punctuation in this text. Use one multi-line regular expression, with inline comments, using the verbose flag (?x).
# Use nltk.regexp_tokenize() to create a tokenizer that tokenizes the following kinds of expression: monetary amounts; dates; names of people and organizations.

# to do: monetary amounts

import nltk
from nltk.corpus import names

# Load a flat list with every name of every file in the names corpus.
options = names.fileids()
name_options = [name for fileid in options for name in names.words(fileid)]

def load(f):
    """Read the file named by *f* and return its whole text as a string.

    Args:
        f: path of the file to read (anything ``open`` accepts).

    Returns:
        str: the complete contents of the file.
    """
    # The context manager closes the handle even when reading fails; the
    # previous version never closed it.
    with open(f) as handle:
        return handle.read()


def tokenize_punctuation(t):
	"""Return the non-word characters found in the text 't' as a list of tokens."""
	# Verbose-mode pattern: whitespace and '#' comments inside it are ignored.
	pattern = r'''(?x)			# set to be verbose
	\W 						# searches for non-alphanumeric characters.
	'''
	return nltk.regexp_tokenize(t, pattern)
Exemplo n.º 9
0
Arquivo: test.py Projeto: bmw9t/woolf
def cfd_generator():
    """Yield a ``(fileid, last_letter)`` pair for every name in the names corpus.

    The pairs have the (condition, sample) shape accepted by
    ``nltk.ConditionalFreqDist``.

    Yields:
        tuple: the corpus file id and the final character of one name.
    """
    for fileid in names.fileids():
        for name in names.words(fileid):
            # The previous version built this tuple and discarded it, so the
            # function produced nothing and always returned None.
            yield (fileid, name[-1])
Exemplo n.º 10
0
#Load packages
from nltk.corpus import names
import random
from nltk import NaiveBayesClassifier
from nltk.classify import accuracy


#Feature extractor
def gender_features(word):
    """Return the feature dict for *word*: its final character under 'last_letter'.

    Args:
        word: the name to extract the feature from.

    Returns:
        dict: ``{'last_letter': <last character>}``; the value is the empty
        string when *word* is empty.
    """
    # Slice instead of index so that an empty string does not raise IndexError.
    return {'last_letter': word[-1:]}


gender_features('Maria')

# Exploring female names (bare expressions: useful in an interactive session)
names.fileids()
names.words('female.txt')[:5]

# Building the classifier: label each name with the gender of its corpus file,
# female names first.
labeled_names = [(name, 'female') for name in names.words('female.txt')]
labeled_names += [(name, 'male') for name in names.words('male.txt')]
labeled_names[:5]

# Shuffle so that a later split does not separate the two genders.
random.shuffle(labeled_names)
labeled_names[:5]

featuresets = [
    (gender_features(person), label)
    for person, label in labeled_names
]
featuresets[:5]

# Split data into training (80%) and test (20%) set
train_set_size = round(0.8 * len(featuresets))
Exemplo n.º 11
0
from nltk.corpus import names
from nltk import ConditionalFreqDist as CondFreqDist

# Two views of the same counts: initial letters per gender file (g2n) and
# gender file per initial letter (n2g).
g2n = CondFreqDist(
    (gender, name[0])
    for gender in names.fileids()
    for name in names.words(gender)
)
n2g = CondFreqDist(
    (name[0], gender)
    for gender in names.fileids()
    for name in names.words(gender)
)
g2n.plot()
n2g.plot()

Exemplo n.º 12
0
# force floating point division
from __future__ import division

import nltk
import collections
from nltk.corpus import names  # for exercise 8
from nltk.corpus import brown  # for exercise 15
from nltk.corpus import reuters
from nltk.corpus import stopwords

## 1 - EXERCISES ##
print("----QUESTION 1----")
# 8 #
# Count the first letter of every name, grouped by corpus file id.
initials = nltk.ConditionalFreqDist(
    (fileid, name[0])
    for fileid in names.fileids()
    for name in names.words(fileid)
)
# male initials, most frequent first
male_initials = str(initials["male.txt"].most_common())
print("Most common male first initials: " + male_initials)
# female initials, most frequent first
female_initials = str(initials["female.txt"].most_common())
print("\nMost common female first initials: " + female_initials)

# 15 #
# I left this commented out because it takes a while to generate a
# frequency distribution for the entire Brown Corpus
##brown_freq = nltk.FreqDist(w.lower() for w in brown.words())
##brownlist = []
##for word in brown_freq:
##    if brown_freq[word] >= 3:
##        brownlist.append(word)
##print ("Number of words in Brown Corpus that occur at least three times: "
##       + str(len(brownlist)))

# 17 #
# Find the name both for woman and man from nltk.corpus.names

import nltk
from nltk.corpus import names

names = nltk.corpus.names

# print() with a single argument behaves the same on Python 2 and Python 3;
# the bare print statement used before is a SyntaxError on Python 3.
print(names.fileids())

male_name = names.words('male.txt')
female_name = names.words('female.txt')

# Names listed for both genders. The set is built once so that each lookup
# does not scan the whole female word list.
female_name_set = set(female_name)
print([w for w in male_name if w in female_name_set])


# Build a conditional frequency distribution relating the gender file to the
# last letter of each name.

# Names ending in 'a', 'e' or 'i' are usually female;
# names ending in 'k', 'o', 'r', 's' or 't' are usually male.

cfd = nltk.ConditionalFreqDist(
        (fileid, name[-1])
        for fileid in names.fileids()
        for name in names.words(fileid))

cfd.plot()
Exemplo n.º 14
0
from nltk.corpus import names
# List the files of the names corpus, then the number of entries in each.
print(names.fileids())
for corpus_file in ('female.txt', 'male.txt'):
    print(len(names.words(corpus_file)))
Exemplo n.º 15
0
def is_name(word):
    """Tell whether *word* exactly equals an entry of the names corpus.

    Every file of the corpus is scanned in turn and the comparison is
    case-sensitive; scanning stops at the first match.

    Returns:
        bool: True on a match, False otherwise.
    """
    return any(
        candidate == word
        for fileid in names.fileids()
        for candidate in names.words(fileid)
    )
Exemplo n.º 16
0
    for mer in syn.substance_meronyms():
        print("Synset '{2}':\n\t{0}\n\nsubstance meronym '{1}':\n\t{3} ".format(syn.definition(),
              mer.lemma_names()[0],syn.lemma_names()[0],mer.definition()))
    for mer in syn.member_holonyms():
        print("Synset '{2}':\n\t{0}\n\nmember holonym '{1}':\n\t{3} ".format(syn.definition(),
              mer.lemma_names()[0],syn.lemma_names()[0],mer.definition()))
    for mer in syn.part_holonyms():
        print("Synset '{2}':\n\t{0}\n\npart holonym '{1}':\n\t{3} ".format(syn.definition(),
              mer.lemma_names()[0],syn.lemma_names()[0],mer.definition()))
    for mer in syn.substance_holonyms():
        print("Synset '{2}':\n\t{0}\n\nsubstance holonym '{1}':\n\t{3} ".format(syn.definition(),
              mer.lemma_names()[0],syn.lemma_names()[0],mer.definition()))

#8
from nltk.corpus import names
names.fileids()
# One (gender file, first letter) pair per name in the corpus.
tble = [
    (gender, w[0])
    for gender in names.fileids()
    for w in names.words(gender)
]
cfd = nltk.ConditionalFreqDist(tble)
cfd.plot()

#12

from nltk.corpus import cmudict
words = [word for word, _ in cmudict.entries()]
len(set(words)) - len(words)
i = 0
# m_words collects every occurrence of a word except its first one, i.e. the
# extra entries of words with several pronunciations, in their original order.
# The previous version copied the list and called list.remove() once per
# distinct word, which rescans the list each time; a single pass with a set
# of already-seen words yields the same list and the same progress output.
seen = set()
m_words = []
for word in words:
    if word in seen:
        m_words.append(word)
    else:
        seen.add(word)
        # Progress counter: one line per distinct word, as before.
        print(i)
        i = i + 1
Exemplo n.º 17
0
def gender_initials_plot():
    """Plot the distribution of first-name initials for each gender file.

    Returns:
        Whatever ``ConditionalFreqDist.plot()`` returns.
    """
    initial_pairs = (
        (gender, name[:1])
        for gender in names.fileids()
        for name in names.words(gender)
    )
    cfd = nltk.ConditionalFreqDist(initial_pairs)
    return cfd.plot()
Exemplo n.º 18
0
# The code below predicts gender from a list of names using Naive Bayes.
# The feature set is built from the last two letters of a given name.
# NLTK provides lists of male and female names to train the model.
# P(A|B) = P(B|A) * P(A) / P(B)
# For example, for a female name ending in one of "aeiouy", the Naive Bayes formula looks as follows:
# P(female|'[aeiouy]') = P('[aeiouy]'|female) * P(female) / P('[aeiouy]')
import nltk
import random  # used for random selection in our model
from nltk.corpus import names  # importing male and female list from nltk
import matplotlib  # required by the matplotlib.style call below
import matplotlib.pyplot as plt  # was misspelled as "matpoltlib.pypolt"
matplotlib.style.use('ggplot')
a = names.fileids()
print(a)

# Conditional frequency distribution over the corpus file ids, counting the
# last two letters of every name. The class is spelled ConditionalFreqDist;
# the lower-case "conditionalFreqDist" used before does not exist in nltk.
name_cfd = nltk.ConditionalFreqDist((fileid, name[-2:])
                                    for fileid in names.fileids()
                                    for name in names.words(fileid))

# Plot the distribution on a wide figure.
plt.figure(figsize=(50, 10))
name_cfd.plot()


# build a function to get last two letters
def name_features(name):
    """Build the feature dict of *name*, keyed 'pair', from its last two letters.

    Names shorter than two characters are used whole.
    """
    features = {}
    features['pair'] = name[-2:]
    return features


# calling function
Exemplo n.º 19
0
# Find the name both for woman and man from nltk.corpus.names

import nltk
from nltk.corpus import names

names = nltk.corpus.names

# print() with a single argument works on both Python 2 and Python 3; the
# print statement used before is a SyntaxError on Python 3.
print(names.fileids())

male_name = names.words('male.txt')
female_name = names.words('female.txt')

# Names that appear in both lists. A set built once replaces a scan of the
# female word list for every male name.
female_lookup = set(female_name)
print([w for w in male_name if w in female_lookup])

# Build a conditional frequency distribution relating the gender file to the
# last letter of each name.

# Names ending in 'a', 'e' or 'i' are usually female;
# names ending in 'k', 'o', 'r', 's' or 't' are usually male.

cfd = nltk.ConditionalFreqDist((fileid, name[-1])
                               for fileid in names.fileids()
                               for name in names.words(fileid))

cfd.plot()
Exemplo n.º 20
0
# Working with the names corpus
names = nltk.corpus.names
print(names.fileids())

male_names = names.words('male.txt')
print(male_names)
female_names = names.words('female.txt')
print(female_names)

# Names present in both lists, then the size of each list.
shared_names = [w for w in male_names if w in female_names]
print(shared_names)
print(len(male_names))
print(len(female_names))

# Number of distinct names that are listed as male only.
male_only = set(male_names) - set(female_names)
print(len(male_only))
# Small illustration of set difference.
print({1, 2, 3, 4} - {4, 5, 6, 7})

from nltk.corpus import names

print(names.fileids())
male_names = names.words('male.txt')
female_names = names.words('female.txt')
shared = [w for w in male_names if w in female_names]
print(shared)

# Lazily pair each corpus file id with the last two letters of its names and
# print every pair; the generator is exhausted once the loop finishes.
name_ends = (
    (fileid, name[-2:])
    for fileid in names.fileids()
    for name in names.words(fileid)
)
for name_end in name_ends:
    print(name_end)

# Count the same pairs in a conditional frequency distribution.
cfd = nltk.ConditionalFreqDist(
    (corpus_file, entry[-2:])
    for corpus_file in names.fileids()
    for entry in names.words(corpus_file)
)
cfd.tabulate()
cfd.plot()  # Figure 2-7: ending letters of male and female names

# 4.2. Pronouncing dictionary
Exemplo n.º 21
0
import nltk
nltk.corpus
from nltk.corpus import names

# First letter of every name, conditioned on the corpus file it comes from.
initial_by_file = (
    (genre, name[0])
    for genre in names.fileids()
    for name in names.words(genre)
)
cfd = nltk.ConditionalFreqDist(initial_by_file)
cfd.plot()
Exemplo n.º 22
0
# -*- coding: utf-8 -*-
import matplotlib
matplotlib.use('TkAgg')
import nltk 
'''
◑ Define a conditional frequency distribution over the Names corpus
that allows you to see which initial letters are more frequent for males
vs. females (cf. 4.4).
'''

from nltk import ConditionalFreqDist
from nltk.corpus import names
# One (gender file, initial letter) pair per name in the corpus.
pair = [(gender, word[0]) for gender in names.fileids() for word in names.words(gender)]
# print() with a single argument works on Python 2 and 3 alike; the print
# statement used before is a SyntaxError on Python 3.
print(pair)
cfd = ConditionalFreqDist(pair)
cfd.plot()
Exemplo n.º 23
0
def get_data():
    """Return the labelled names as a list of ``(lower-cased name, label)``.

    The two corpus file ids are unpacked as (female, male): entries of the
    first file get label 0 and entries of the second get label 1, in that
    order. Assumes ``names.fileids()`` lists the female file first - TODO
    confirm.
    """
    female_file, male_file = names.fileids()
    dataset = []
    for label, fileid in enumerate((female_file, male_file)):
        dataset.extend((name.lower(), label) for name in names.words(fileid))
    return dataset
Exemplo n.º 24
0
#!/usr/bin/env python

import sys
from zipfile import ZipFile

import nltk

# NAMES corpus
from nltk.corpus import names
# Every distinct name found in any file of the names corpus.
NAME_SET = set()
for f in names.fileids():
    NAME_SET.update(names.words(f))

# wget http://download.geonames.org/export/dump/cities15000.zip
GEONAMES_FILE = 'cities15000.zip'

# Prepare geonames: collect, as tuples of whitespace-separated tokens, every
# name variant (name, ASCII name, alternate names) of each city whose
# population field is at least 100000.
CITIES = set()
with ZipFile(GEONAMES_FILE) as zip_file:
    for filename in zip_file.namelist():
        # Close each archive member once it has been read; the previous
        # version left the handle open.
        with zip_file.open(filename) as contents:
            for line in contents:
                # Archive members are read in binary mode. On Python 3 that
                # yields bytes, which cannot be split on a str separator;
                # on Python 2 the line already is a str and is left alone.
                # NOTE(review): UTF-8 is assumed for the dump - confirm.
                if not isinstance(line, str):
                    line = line.decode('utf-8')
                geonameid, name, asciiname, alternatenames, other = line.split('\t', 4)
                other = other.split('\t')
                # Fifth field from the end; presumably the population column
                # of the geonames dump - verify against its readme.
                population = int(other[-5])
                if population < 100000:
                    continue
                for variant in [name, asciiname] + alternatenames.split(','):
                    tokens = tuple(variant.split())
                    # An empty variant (e.g. no alternate names) would add
                    # the meaningless empty tuple to the set.
                    if tokens:
                        CITIES.add(tokens)
Exemplo n.º 25
0
def is_name(word):
    """Return True if *word* exactly matches an entry of the names corpus.

    The files of the corpus are checked one after another (case-sensitive
    comparison) and the search ends at the first file containing a match.
    """
    for fileid in names.fileids():
        if any(name == word for name in names.words(fileid)):
            return True
    return False
Exemplo n.º 26
0
# Define a conditional frequency distribution over the Names Corpus that allows you to see which initial letters are
# more frequent for males versus females

import nltk
from nltk.corpus import names

name_fileids = names.fileids()
print(name_fileids)
# Show the words of the second file in the corpus.
second_file = name_fileids[1]
print(names.words(second_file))

# Initial letter of every name, conditioned on the corpus file it comes from.
initial_pairs = (
    (fileid, word[0])
    for fileid in names.fileids()
    for word in names.words(fileid)
)
cfd = nltk.ConditionalFreqDist(initial_pairs)

cfd.plot()
cfd.tabulate()
Exemplo n.º 27
0
########## WORDLIST CORPUS READER ###############

#Basic Corpus Reader
from nltk.corpus.reader import WordListCorpusReader
#List of a few thousand names organized by gender
from nltk.corpus import names
#List of english words
from nltk.corpus import words

nltkDir="C:\\Users\\Matrix\\AppData\\Roaming\\nltk_data\\corpora\\cookbook\\"
#nltkFile="mywords.txt"
#source=nltkDir+nltkFile

### One File WordListCorpusReader
reader=WordListCorpusReader(nltkDir,['wordlist.txt'])
# print() with a single argument works on Python 2 and 3 alike; the print
# statements used before are a SyntaxError on Python 3.
print(reader.words())
print(reader.fileids())

### MultiFile WordListCorpusReader
#To get the names of the files in the corpus use the "fileids" command
names.fileids()
print(len(names.words('female.txt')))
# The female count was printed twice; the second line now reports the other
# file of the corpus, which looks like what was intended.
print(len(names.words('male.txt')))

words.fileids()
print(len(words.words('en-basic')))
print(len(words.words('en')))

###Chunked Corpus Reader