Example #1
from nltk.corpus.reader import WordListCorpusReader
import nltk

# print(nltk.data.find('corpora/cookbook'))
# print(nltk.data.find('corpora/cookbook/wordlist.txt'))

d = nltk.data.find('corpora/cookbook')
reader = WordListCorpusReader(d, ['wordlist.txt'])
print(reader.words())
print(reader.fileids())
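Example #1 only works if a 'cookbook' corpus directory already exists somewhere on nltk.data.path; otherwise nltk.data.find() raises a LookupError. A minimal, hedged setup sketch (it assumes the default ~/nltk_data search path; the word list contents are placeholders):

# One-off setup sketch so that nltk.data.find('corpora/cookbook') resolves.
import os

cookbook_dir = os.path.expanduser('~/nltk_data/corpora/cookbook')
os.makedirs(cookbook_dir, exist_ok=True)
with open(os.path.join(cookbook_dir, 'wordlist.txt'), 'w') as f:
    f.write('nltk\ncorpus\ncorpora\nwordnet\n')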
Example #2
    def find_info_type(self):
        type_list = []  # list of all types ('abstract', 'speaker', ...)
        content_list = []  # list of the content for each type
        reader = WordListCorpusReader(self.folder_name, [self.file_name])
        all_words = reader.words()

        #is the mail a proper one?
        if (all_words == []):
            return ([], [])

        #append the first tag of the mail ex:<0.1....>
        type_list.append("")
        content_list.append(all_words[0])

        for w in all_words[1:]:
            #search for pattern like "Abstract: ..."
            type = re.search(r'^(\w+)(:)', w)

            #using group functionality to split the topic and content
            if type is not None:
                type_list.append(type.group(1))
                content = re.search(r'^(\w+:)(.*)', w)
                content_list.append(content.group(2))

            # not the best way to append the newline-split continuation lines, but...
            elif (len(content_list) > 0):
                last_element = content_list[-1]
                extra_content = w
                last_element = last_element + "\n" + extra_content
                content_list[-1] = last_element

        # if type_list[0] is 'abstract', then content_list[0] holds the abstract content
        return (type_list, content_list)
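The method returns two parallel lists: the tag names and the content accumulated for each tag. A hedged usage sketch of pairing them after the call ('mail' stands for an instance of the enclosing class, which is not shown in the snippet):

# Hedged usage sketch: pair each detected tag with its content.
types, contents = mail.find_info_type()
for t, c in zip(types, contents):
    print(t or '<header>', '->', c[:60])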
Example #3
 def __init__(self, punctuation_marks: str, corpus_dir: str,
              corpus_files: list):
     reader = WordListCorpusReader(corpus_dir, corpus_files)
     self.vi_dict = set(reader.words())
     # Add punctuation marks to the dictionary and treat them as correctly spelled
     self.vi_dict.update(list(punctuation_marks))
     # Add a few special words (units and abbreviations)
     self.vi_dict.update(
         ['m', 'g', 'gt', 'kg', 'km', 'mm', 'cm', 'c', 'f', 't'])
     self.re_d = re.compile(r'\d')
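The snippet only shows the constructor; vi_dict and re_d are presumably used for spell checking elsewhere in the class. A hedged, standalone sketch of one plausible check (the function name and logic are assumptions, not from the source):

# Hedged sketch: treat digit-bearing tokens (e.g. '25kg') as correct, otherwise
# require the lowercased token to be in the dictionary set.
def is_probably_correct(token, vi_dict, re_d):
    if re_d.search(token):
        return True
    return token.lower() in vi_dict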
Example #4
def addingCorpus():
    path = os.path.expanduser('~/nltk_data')
    if not os.path.exists(path):
        os.mkdir(path)
    print(os.path.exists(path))
    print(nltk.data.path)
    print(path in nltk.data.path)

    nltk.data.load('corpora/cookbook/cookbook.txt', format='raw')

    reader = WordListCorpusReader('/Users/Dell/nltk_data/corpora/cookbook/',
                                  ['wordlist.txt'])

    print(reader.words())
Example #5
# tokenize the input file, count words and characters, and remove stopwords
def tokenize_file(file, corpus_root, english_stops):
    tokenizer = RegexpTokenizer(r'\w+')
    total_chars = 0
    word_count = 0
    wordlist = []

    reader = WordListCorpusReader(corpus_root, file)
    chunks = reader.words()

    for item in chunks:
        total_chars += len(item)
        word_tokens = tokenizer.tokenize(item)
        word_count += len(word_tokens)
        wordlist.extend(word_tokens)
    stopsout = [word for word in wordlist if word.lower() not in english_stops]
    return wordlist, stopsout, word_count, total_chars
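A hedged sketch of calling tokenize_file (the corpus root and file name are placeholders; english_stops is built from NLTK's stopword list, which is lowercase as the function expects):

# Hedged usage sketch for the function above.
from nltk.corpus import stopwords

english_stops = set(stopwords.words('english'))
wordlist, stopsout, word_count, total_chars = tokenize_file(
    ['document.txt'], '/path/to/corpus', english_stops)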
Example #6
tagged_data_filepath = "/Users/aledjackson/Documents/University/Second Year/Modules/Natural Language Processing/Assignments/assignment1/training"
untagged_data_filepath = "/Users/aledjackson/Documents/University/Second Year/Modules/Natural Language Processing/Assignments/assignment1/seminar_testdata/test_untagged"
general_data_filepath = "/Users/aledjackson/Documents/University/Second Year/Modules/Natural Language Processing/Assignments/assignment1/Data"

l_names = WordListCorpusReader(general_data_filepath, ["names.family"]).words()

file_names = [
    f for f in listdir(untagged_data_filepath)
    if isfile(join(untagged_data_filepath, f))
]
file_names = file_names[1:]

reader = WordListCorpusReader(untagged_data_filepath, [file_names[0]])

corpus = reader.raw()
words = reader.words()


def get_tags_by_name(corpus, name):
    return re.findall(r"<" + name + r">.+</" + name + r">", corpus)


def tokenise(corpus):
    return re.findall(r"([^\s<>]+)[\s\n<>]", corpus)


def get_name_of_poster(corpus):
    return re.findall()


def names_in_file(corpus):
Example #7
# from nltk.stem import WordNetLemmatizer
# wnl = WordNetLemmatizer()
# print(wnl.lemmatize('monsters'))
'''
In each of the above cases we have handled one word.
Now print the stemmed and lemmatized versions of all the words in the document computerscience.txt.
Preview the document. Here is an overview of what you need to do:
    1. Load the file into a reader [Hint: reader = WordListCorpusReader( ... )]
    2. Use word_tokenize from nltk.tokenize to convert the text into words
    3. Loop through the text [Hint: use the for statement]
    4. Lemmatize and stem each word.
    5. Look at the difference between the two; notice how the lemmatizer makes mistakes in some cases - can you identify why and propose a solution?
'''

import nltk
from nltk.corpus.reader import WordListCorpusReader
tokens = []
reader = WordListCorpusReader('./', ['computerscience.txt'])
for count, ele in enumerate(reader.words()):
    print(f"{count}: {ele}\n")
    tokens += nltk.word_tokenize(ele)

print(tokens)
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
for token in tokens:
    print(token)
    print(wnl.lemmatize(token))
    print(stemmer.stem(token))
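On point 5 above: WordNetLemmatizer treats every token as a noun unless it is given a part of speech, which is why verb forms such as 'running' come back unchanged while the stemmer reduces them. A minimal, hedged sketch of passing a POS hint via nltk.pos_tag (the tag mapping below is a common convention and an assumption, not part of the exercise):

# Hedged sketch: lemmatize with a POS hint derived from nltk.pos_tag.
# Requires the 'averaged_perceptron_tagger' and 'wordnet' NLTK data packages.
from nltk import pos_tag
from nltk.corpus import wordnet

def wordnet_pos(treebank_tag):
    # Map Penn Treebank tags to WordNet POS constants; default to noun.
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

for token, tag in pos_tag(tokens):
    print(token, wnl.lemmatize(token, pos=wordnet_pos(tag)))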
Example #8
File: __init__.py  Project: wjt/fewerror
    # 35. Possessive wh-pronoun
    WP_ = 'WP$'

    # 36. Wh-adverb
    WRB = 'WRB'

    @staticmethod
    def nounish(word, pos):
        # nltk apparently defaults to 'NN' for smileys :) so special-case those
        return pos in (POS.NN, POS.NNS, POS.NNP, POS.NNPS) and \
            any(c.isalpha() for c in word)


mass_noun_corpora = WordListCorpusReader('wordlist/massnoun', r'[a-z]+')
mass_nouns = mass_noun_corpora.words()

QUANTITY_POS_TAGS = frozenset((
    POS.JJ,
    POS.VBN,
    POS.VBP,
    POS.NN,
    POS.NNP,
    POS.RB,
    POS.RBR,
    POS.RBS,
))

bad_words_corpora = WordListCorpusReader('wordlist/shutterstock-bad-words', r'[a-z]{2,3}')
bad_words_en = bad_words_corpora.words('en')
Example #9
for l in locs:
    file.write(l + "\n")
file.close()

file = open(nepath + "\\orgs.txt", "w")
for o in orgs:
    file.write(o + "\n")
file.close()

# extracting the new data into corpora
fullnames = WordListCorpusReader(nepath, ['names.txt'])
locs = WordListCorpusReader(nepath, ['locs.txt'])
orgs = WordListCorpusReader(nepath, ['orgs.txt'])

# get the words from the named entity corpora created earlier
names = names.words()
titles = titles.words()
fullnames = fullnames.words()
locations = locs.words()
organisations = orgs.words()
orgsuffs = orgsuffs.words()
daymonths = daymonths.words()


# extracting named entities (proper nouns)
# if a proper noun is found, add it to the list
# adjacent proper nouns are joined together
def findPropers(words):
    propers = []
    last = False
Example #10
    words = [word for word in tokens_nopunct if word.isalpha()]

    # Remove stopwords from the tokens
    words_nostops = [
        word for word in words if word not in english_stops_nopunct
    ]

    # Stem the words
    porter = PorterStemmer()
    stems = [porter.stem(word) for word in words_nostops]

    # Create a frequency distribution from the samples (words)
    freqdist = FreqDist(stems)

    # Create a dict with the frequency of the insect words only
    insect_freq = {word: freqdist.freq(word) for word in insect_words.words()}

    # Add the year from the file name to the dict
    year = re.findall(r'\d{4}', file)
    insect_freq['year'] = year[0]

    # Add the results from this file to the total results
    freq_data.append(insect_freq)

    count += 1

# Create a Pandas DataFrame
df = pd.DataFrame(freq_data)

print('PLOTTING...')
Example #11
x = nltk.data.load('big.txt', format='auto')
'''
reader = WordListCorpusReader('',['wordlist.txt','wordlist2.txt'])
print(reader.words())
print(reader.fileids())

stemmer = PorterStemmer()
print(stemmer.stem('running'))

wnl = WordNetLemmatizer()
print(wnl.lemmatize('dogs'))

'''

csReader = WordListCorpusReader('','computerscience.txt')
wnl = WordNetLemmatizer()
stemmer = PorterStemmer()

### Concatenate the reader's word list into one string, then tokenize it
csWords = nltk.word_tokenize(' '.join(csReader.words()))
print(type(csWords))
for word in csWords:
    print("%s,%s" % (wnl.lemmatize(word), stemmer.stem(word)))
    
inputList = ['16/12/2016']
    
for inputString in inputList:
    print(re.findall(r'(.*?)[\s\-/\\](.*?)[\s\-/\\](.*)', inputString))
    
Example #12
    if p_score > n_score:
        return "positive"

    elif n_score > p_score:
        return "negative"

    else:
        return "neutral"


reader = WordListCorpusReader('/path/to/sentiment/files',
                              ['positive.txt', 'negative.txt'])

pos_feats = [({word: True}, 'positive')
             for word in reader.words('positive.txt')]
neg_feats = [({word: True}, 'negative')
             for word in reader.words('negative.txt')]
train_feats = pos_feats + neg_feats
classifier = NaiveBayesClassifier.train(train_feats)

t = Twitter(auth=OAuth("TOKEN", "TOKEN_KEY", "CON_SECRET", "CON_SECRET_KEY"))

connection = pymongo.Connection()
db = connection.twitter
mentions = db.mentions

screen_names = ["YOUR_ACCOUNT", "YOUR_OTHER_ACCOUNT"]

re_RT = re.compile((r"(RT\s?@YOUR_ACCOUNT|"
                    r"RT\s?@YOUR_OTHER_ACCOUNT)"), re.UNICODE | re.IGNORECASE)
Example #13
		docWords = corpus.words(fileName)
		for word in docWords:
			#print(word)
			w = word.lower()
			if w in wordSet: 
				# I could also use the fd.inc approach here and it's probably better - just showing another option.
				print(w + " is in " + fileName)  
				counter += 1
		billCounts.append(counter)
	return billCounts

from nltk.corpus.reader import WordListCorpusReader
path = "/Volumes/Optibay-1TB/Dropbox/Content_Wilker/Gonzalez_Project/Gonzalez_Keywords"
reader = WordListCorpusReader(path, ['crime.txt']) #make an nltk word list

crime = reader.words()
crime = [word.lower().strip() for word in crime]	

crimeSet = set([w.lower() for w in crime])
crimeCount = make_count(billsCorpora, crimeSet)

fd = count_stems(billsCorpora)

counter = 0
# let's look at 200 of the most popular items and their counts

#you could use the csv writer methods or this which is kind of hacky
mywordlist = numpy.asarray([billsCorpora.fileids(), crimeCount])
mywordlist[0][1] #name
mywordlist[1][1] #count
Example #14
path = os.path.expanduser('~/nltk_data')
if not os.path.exists(path): os.mkdir(path)
os.path.exists(path)
import nltk.data
#path in nltk.data.path
print path
''' note that this should be a path in the Git_Workspace on D:\ '''

''' load a sample wordlist '''
#import nltk.data
nltk.data.load('corpora/cookbook/GL_Sequent.txt', format='raw')
# sample output: 'nltk\n'

from nltk.corpus.reader import WordListCorpusReader
reader = WordListCorpusReader(path + '/corpora/cookbook/', ['GL_Sequent.txt'])
reader.words()

''' reading a tagged corpus '''
from nltk.corpus.reader import TaggedCorpusReader
reader = TaggedCorpusReader(path + '/corpora/cookbook/', r'.*\.pos')
reader.words()
reader.tagged_words()
reader.sents()
reader.tagged_sents()
reader.paras()
reader.tagged_paras()

''' different Tokenizer - works? '''
from nltk.tokenize import SpaceTokenizer
reader = TaggedCorpusReader(path + '/corpora/cookbook/', r'.*\.pos',word_tokenizer=SpaceTokenizer())
reader.words()
Example #15
# Initialize constants
NLTK_HOME = '/home/administrator/nltk_data'

l_list = []
# cleaning, tokenizing, normalizing

# Read the Corpus
state_reader = WordListCorpusReader(NLTK_HOME, ['state_files.txt'])
city_reader = WordListCorpusReader(NLTK_HOME, ['city_files.txt'])
train_file = '/app/ai/train_file.txt'
test_results_file = '/app/ai/test_city_results_file.txt'


# Store the URLs in  a list
urls = ([(url,'city') for url in city_reader.words()]+
        [(url,'state') for url in state_reader.words()]
        )

for url in list(urls):
    # Remove HTMLtabs after reading the URL
    raw = nltk.clean_html(urlopen(url[0]).read())
    print 'Finished cleaning html for ', url[0]
    # Compute the frequency distribution of the words
    tokens=nltk.FreqDist(word_normalizer(word.lower() for word in wordpunct_tokenize(raw)))
    print 'Finished computing FD for ', url[0]
    l_list = l_list + [(geo_features(word),url[1]) for word in tokens.keys()[:10]]
    print 'Finished extracting feature for ', url[0]

with open(train_file, 'w') as f:
    pickle.dump(l_list, f)
Example #16
import nltk
from nltk.corpus.reader import WordListCorpusReader
reader = WordListCorpusReader('', ['computerscience.txt'])
words = [nltk.word_tokenize(i) for i in reader.words()]
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
from nltk.stem import WordNetLemmatizer
lemmer = WordNetLemmatizer()

stemmed = [[stemmer.stem(y) for y in i] for i in words]
lemmed = [[lemmer.lemmatize(y) for y in i] for i in words]

print(stemmed)
Example #17
import os
import re
import sys
import json
import nltk.test
import abbreviations
import portuguese_tagger_processor
from sentilex import sentiLexPairRdd
from nltk.corpus.reader import WordListCorpusReader

__output_path = "result.json"

stopwords = nltk.corpus.stopwords.words('portuguese')
reader = WordListCorpusReader('.', ['symbols.txt'])
symbols = reader.words()
reader = WordListCorpusReader('.', ['positive_emoticons.txt'])
positive_emoticons = reader.words()
reader = WordListCorpusReader('.', ['negative_emoticons.txt'])
negative_emoticons = reader.words()

tweet_tokenizer = portuguese_tagger_processor.get_tweet_tokenizer()
tagger = portuguese_tagger_processor.get_tagger()
json_result = []
tweet_dict = {}


def count_positive_emoticons(tokens):
    counter = 0
    for emoticon in positive_emoticons:
        if emoticon in tokens:
            counter += 1
Example #18
import nltk.data
from nltk.corpus.reader import WordListCorpusReader
from nltk.corpus import names
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
from nltk.corpus import treebank

wordlist = WordListCorpusReader("C:/nltk_data/corpora/cookbook", ['wordlist'])
print(wordlist.words())
print(wordlist.fileids())

print(names.fileids())
print(len(names.words('male.txt')))

reader = TaggedCorpusReader("C:/nltk_data/corpora/treebank/tagged",
                            r'.*\.pos',
                            word_tokenizer=SpaceTokenizer(),
                            tagset='en-brown')
print(reader.words('wsj_0001.pos'))
print(reader.tagged_words('wsj_0001.pos'))
print(reader.tagged_sents('wsj_0001.pos'))
print(reader.tagged_paras('wsj_0001.pos'))
print(reader.fileids())

print("\n")
print(reader.tagged_words('wsj_0001.pos', tagset='universal'))

print(treebank.tagged_words())
Example #19
########## WORDLIST CORPUS READER ###############

#Basic Corpus Reader
from nltk.corpus.reader import WordListCorpusReader
#List of a few thousand names organized by gender
from nltk.corpus import names
#List of english words
from nltk.corpus import words

nltkDir="C:\\Users\\Matrix\\AppData\\Roaming\\nltk_data\\corpora\\cookbook\\"
#nltkFile="mywords.txt"
#source=nltkDir+nltkFile

### One File WordListCorpusReader
reader=WordListCorpusReader(nltkDir,['wordlist.txt'])
print reader.words()
print reader.fileids()

### MultiFile WordListCorpusReader
#To get the file names in the corpus, use the fileids() method
names.fileids()
print len(names.words('female.txt'))
print len(names.words('male.txt'))

words.fileids()
print len(words.words('en-basic'))
print len(words.words('en'))

### Chunked Corpus Reader
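The snippet breaks off at the Chunked Corpus Reader heading. A minimal, hedged sketch of that reader (it assumes a bracketed chunk file such as treebank.chunk exists in the same cookbook directory, which is not shown above):

# Hedged sketch: ChunkedCorpusReader over bracketed chunk files.
from nltk.corpus.reader import ChunkedCorpusReader

chunkReader = ChunkedCorpusReader(nltkDir, r'.*\.chunk')
print(chunkReader.chunked_words())
print(chunkReader.chunked_sents())
print(chunkReader.chunked_paras())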