Example #1
from nltk.corpus.reader import WordListCorpusReader
import nltk

# print(nltk.data.find('corpora/cookbook'))
# print(nltk.data.find('corpora/cookbook/wordlist.txt'))

d = nltk.data.find('corpora/cookbook')
reader = WordListCorpusReader(d, ['wordlist.txt'])
print(reader.words())
print(reader.fileids())
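Example #1 assumes a cookbook corpus directory containing wordlist.txt already exists under nltk_data/corpora. A minimal, self-contained sketch of the same idea, using a temporary directory and made-up words instead of the real nltk_data tree:

import os
import tempfile
from nltk.corpus.reader import WordListCorpusReader

# Build a throwaway corpus directory with one wordlist file.
corpus_dir = tempfile.mkdtemp()
with open(os.path.join(corpus_dir, 'wordlist.txt'), 'w', encoding='utf-8') as f:
    f.write('nltk\ncorpus\ncookbook\n')  # one word per line

# WordListCorpusReader takes a root directory and a list of file ids.
reader = WordListCorpusReader(corpus_dir, ['wordlist.txt'])
print(reader.words())    # ['nltk', 'corpus', 'cookbook']
print(reader.fileids())  # ['wordlist.txt']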
Example #2
    def read_emails(self, path):
        # Get all files
        files = [f for f in listdir(path) if isfile(join(path, f))]

        try:
            del (files[files.index('DS_Store')])
        except:
            pass

        reader = WordListCorpusReader(path, files)

        cleaner = Cleaner()

        emails = list()

        # Creates the Email Object out of each email file and appends to list
        for file_id in reader.fileids():
            with open(path + file_id, 'r') as current_file:
                cleaned_contents = cleaner.clean_file(current_file.read())
                split_email_header, split_email_body, split_email_file_id = self.divide(
                    cleaned_contents, file_id)
                emails.append(
                    Email(split_email_header, split_email_body,
                          split_email_file_id))

        # Return list of Email objects
        return emails
Example #3
    def find_info_type(self):
        type_list = []  #list of all types('abstract','speaker')
        content_list = []  #list with content
        reader = WordListCorpusReader(self.folder_name, [self.file_name])
        all_words = reader.words()

        #is the mail a proper one?
        if (all_words == []):
            return ([], [])

        #append the first tag of the mail ex:<0.1....>
        type_list.append("")
        content_list.append(all_words[0])

        for w in all_words[1:]:
            #search for pattern like "Abstract: ..."
            type = re.search(r'^(\w+)(:)', w)

            #using group functionality to split the topic and content
            if (type != None):
                type_list.append(type.group(1))
                content = re.search(r'^(\w+:)(.*)', w)
                content_list.append(content.group(2))

            # not the best way to append the newline-split content, but it works
            elif (len(content_list) > 0):
                last_element = content_list[-1]
                extra_content = w
                last_element = last_element + "\n" + extra_content
                content_list[-1] = last_element

        # if type_list[i] is 'abstract', then content_list[i] holds the abstract content
        return (type_list, content_list)
Example #4
 def __init__(self, punctuation_marks: str, corpus_dir: str,
              corpus_files: list):
     reader = WordListCorpusReader(corpus_dir, corpus_files)
     self.vi_dict = set(reader.words())
     # Add the punctuation marks to the dictionary and treat them as correctly spelled
     self.vi_dict.update(list(punctuation_marks))
     # Add some special words
     self.vi_dict.update(
         ['m', 'g', 'gt', 'kg', 'km', 'mm', 'cm', 'c', 'f', 't'])
     self.re_d = re.compile(r'\d')
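A self-contained sketch of how a dictionary like the one built above might be queried; the miniature word set below is a made-up stand-in for set(reader.words()), and the digit regex mirrors self.re_d:

import re

# Hypothetical miniature dictionary standing in for set(reader.words()).
vi_dict = {'xin', 'chào', 'bạn'}
vi_dict.update(list('.,!?'))                         # punctuation counts as correct
vi_dict.update(['m', 'g', 'kg', 'km', 'mm', 'cm'])   # special tokens / units
re_d = re.compile(r'\d')

def is_spelled_correctly(token):
    # Tokens containing digits are accepted, mirroring the re_d check above.
    return bool(re_d.search(token)) or token in vi_dict

print([t for t in ['xin', 'chào', '123', 'zzz'] if not is_spelled_correctly(t)])  # ['zzz']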
Example #5
    def __init__(self, config_file):

        try:

            self.config = ConfigParser.RawConfigParser()
            self.config.optionxform = str
            self.config.read(config_file)

            tokenizers = self.config.get('post_training_corpus', 'regex_file')
            self.config_tokenizer = json.load(open(tokenizers, "r"))

            self.isWordList = self.config.getboolean('postaggers',
                                                     'isWordList')
            self.wordlist = self.config.items('postaggers.wordlist')
            self.training_portion = self.config.getfloat(
                'post_training_corpus', 'training_portion')
            self.taggers_path = self.config.get('postaggers', 'save_to')
            self.max_ngrams = self.config.getint('postaggers', 'max_ngrams')
            self.tagger_extension_file = self.config.get(
                'postaggers', 'ext_file')
            corpus = []

            for key, corpus_file in self.config.items(
                    'post_training_corpus.corpus'):
                print "Generate model from file:", corpus_file
                corpus.append(corpus_file)

            self.corpusReader = ConllChunkCorpusReader(
                self.config.get('post_training_corpus', 'corpora'), corpus,
                ('NP', 'PP', 'VP', 'AP'))
            self.corpusSents = self.corpusReader.tagged_sents()

            self.wordListReader = WordListCorpusReader(
                self.config.get('post_training_corpus', 'wordlist_path'),
                r'.*\.txt')

            self.regex_list = []

            for key in self.config_tokenizer.keys():

                if self.config_tokenizer[key]['isolate'] == "True":
                    regex = self.config_tokenizer[key]['regex'].encode(
                        'utf-8').decode('utf-8')
                    post = self.config_tokenizer[key]['post']
                    self.regex_list.append((regex, post))

            #logging.info(self.regex_list)

        except Exception, e:

            print "Error :", str(e)
            pdb.set_trace()
Example #6
def read_emails(path):
    files = [f for f in listdir(path) if isfile(join(path, f))]

    try:
        del (files[files.index('.DS_Store')])
    except:
        pass

    reader = WordListCorpusReader(path, files)

    text = clean(reader.raw())
    emails = split_emails(text, reader.fileids())

    return emails
Example #7
def addingCorpus():
    path = os.path.expanduser('~/nltk_data')
    if not os.path.exists(path):
        os.mkdir(path)
    print(os.path.exists(path))
    print(nltk.data.path)
    print(path in nltk.data.path)

    nltk.data.load('corpora/cookbook/cookbook.txt', format='raw')

    reader = WordListCorpusReader('/Users/Dell/nltk_data/corpora/cookbook/',
                                  ['wordlist.txt'])

    print(reader.words())
Example #8
                     user='******',
                     passwd='Webrowse@123',
                     db='article')
cur = db.cursor()
'''

dataset = load_files(
    '/home/soumen/projects/scikit-learn/doc/tutorial/text_analytics/data/languages/paragraphs'
)  # Read an article

file_id_argv = open(sys.argv[1])
file_id = file_id_argv.read()
file_list = file_id.split('\n')
file_list.pop(-1)

italian_stopwords = WordListCorpusReader('.', ['stop-words-it-en.txt'])


def language_detection(text):
    """Description here"""
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(dataset.data)
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    X_test_counts = count_vect.transform(text)
    X_test_tfidf = tfidf_transformer.transform(X_test_counts)

    clf = MultinomialNB().fit(X_train_tfidf, dataset.target)

    predicted = clf.predict(X_test_tfidf)
Example #9
    n_score = ret.prob("negative")

    if max(p_score, n_score) <= cutoff:
        return "neutral"

    if p_score > n_score:
        return "positive"

    elif n_score > p_score:
        return "negative"

    else:
        return "neutral"


reader = WordListCorpusReader('/path/to/sentiment/files',
                              ['positive.txt', 'negative.txt'])

pos_feats = [(dict([(word, True)]), 'positive')
             for word in reader.words('positive.txt')]
neg_feats = [(dict([(word, True)]), 'negative')
             for word in reader.words('negative.txt')]
train_feats = pos_feats + neg_feats
classifier = NaiveBayesClassifier.train(train_feats)

t = Twitter(auth=OAuth("TOKEN", "TOKEN_KEY", "CON_SECRET", "CON_SECRET_KEY"))

connection = pymongo.Connection()
db = connection.twitter
mentions = db.mentions

screen_names = ["YOUR_ACCOUNT", "YOUR_OTHER_ACCOUNT"]
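The classification helper at the top of Example #9 relies on a probability distribution from the classifier. A self-contained sketch of the same single-word feature scheme, trained on tiny inline word lists rather than positive.txt / negative.txt (the words and the cutoff value below are placeholders):

from nltk.classify import NaiveBayesClassifier

# Tiny stand-ins for reader.words('positive.txt') / reader.words('negative.txt').
pos_words = ['good', 'great', 'love', 'happy']
neg_words = ['bad', 'awful', 'hate', 'sad']

train_feats = ([({w: True}, 'positive') for w in pos_words] +
               [({w: True}, 'negative') for w in neg_words])
classifier = NaiveBayesClassifier.train(train_feats)

def classify_text(text, cutoff=0.6):
    feats = {w: True for w in text.lower().split()}
    dist = classifier.prob_classify(feats)
    p_score, n_score = dist.prob('positive'), dist.prob('negative')
    if max(p_score, n_score) <= cutoff:
        return 'neutral'
    return 'positive' if p_score > n_score else 'negative'

print(classify_text('what a great day'))  # expected: 'positive'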
Example #10
import nltk
from nltk.corpus.reader import WordListCorpusReader
reader = WordListCorpusReader('', ['computerscience.txt'])
words = [nltk.word_tokenize(i) for i in reader.words()]
from nltk.stem.porter import *
stemmer = PorterStemmer()
from nltk.stem import WordNetLemmatizer
lemmer = WordNetLemmatizer()

stemmed = [[stemmer.stem(y) for y in i] for i in words]
lemmed = [[lemmer.lemmatize(y) for y in i] for i in words]

print(stemmed)
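For reference, a quick comparison of the two normalisers used in Example #10 on a few sample tokens (the WordNet lemmatizer needs the 'wordnet' NLTK data package):

from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

stemmer = PorterStemmer()
lemmer = WordNetLemmatizer()

# Stemming clips suffixes; lemmatization maps tokens to dictionary forms.
for word in ['computers', 'studies', 'running']:
    print(word, '->', stemmer.stem(word), '/', lemmer.lemmatize(word))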
Example #11
 def __init__(self):
     '''
     Constructor for the BE06 word list corpus.

     @note: Initially the constructor tries to load the corpus from a .pkl file. If this has not
     been created, then a new instance is created by iterating through all files for BE06.
     '''
     
     try:
         # Attempt to open the .pkl file and load it.
         input = open("./Corpus/BE06/BE06.pkl", 'rb')
         reader = load(input)
         input.close()
     except IOError as e:
         filelist = []
         words = []
         
         # Find all .txt files in the /BE06 directory
         for files in os.listdir("./Corpus/BE06"):
             if files.endswith(".txt"):
                 filelist.append(files)
         
         if(len(filelist) == 500):
             # Iterate through the whole list of files
             for name in filelist:
                 f = open("./Corpus/BE06/" + name)
             
                 lines = f.readlines()
                 
                 # Read each line in the file, tokenize it into words, and
                 # remove all punctuation
                 for line in lines:
                     tmp1 = nltk.sent_tokenize(line)
                     for lin in tmp1:
                         tmp = nltk.word_tokenize(lin)
                         for word in tmp:
                             for c in string.punctuation:
                                 word = word.replace(c, "")
                             words.append(word)
                         
                 f.close()
             
             #Write wordlist to output file.
             a = open("./Corpus/BE06/finalcorpa.txt", "wb") 
             for word in words:
                 if word not in ".,;!?\"":
                     a.write(word + '\n')   
                     
             a.close()    
             
             # Create the NLTK corpus, and save a copy in the folder for later use
             reader = WordListCorpusReader('./Corpus/BE06', ['finalcorpa.txt'])
             output = open("./Corpus/BE06/BE06.pkl", 'wb')
             dump(reader, output, -1)
             output.close()
         else:
             reader = WordListCorpusReader('./Corpus/BE06', ['finalcorpa.txt'])
             output = open("./Corpus/BE06/BE06.pkl", 'wb')
             dump(reader, output, -1)
             output.close()
     
     # Store the corpus on the instance
     self.corpa = reader
Example #12
from nltk.corpus import brown
from nltk.corpus.reader import WordListCorpusReader
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer
import re
from os import listdir
from os.path import isfile, join

wnl = WordNetLemmatizer()
stemmer = PorterStemmer()

tagged_data_filepath = "/Users/aledjackson/Documents/University/Second Year/Modules/Natural Language Processing/Assignments/assignment1/training"
untagged_data_filepath = "/Users/aledjackson/Documents/University/Second Year/Modules/Natural Language Processing/Assignments/assignment1/seminar_testdata/test_untagged"
general_data_filepath = "/Users/aledjackson/Documents/University/Second Year/Modules/Natural Language Processing/Assignments/assignment1/Data"

l_names = WordListCorpusReader(general_data_filepath, ["names.family"]).words()

file_names = [
    f for f in listdir(untagged_data_filepath)
    if isfile(join(untagged_data_filepath, f))
]
file_names = file_names[1:]

reader = WordListCorpusReader(untagged_data_filepath, [file_names[0]])

corpus = reader.raw()
words = reader.words()


def get_tags_by_name(corpus, name):
    return re.findall(r"<" + name + r">.+</" + name + r">", corpus)
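A quick, self-contained check of the tag-extraction regex used by get_tags_by_name, run on an inline string instead of the machine-specific seminar corpus (the sample text is made up):

import re

def get_tags_by_name(corpus, name):
    return re.findall(r"<" + name + r">.+</" + name + r">", corpus)

sample = "<speaker>Jane Doe</speaker> will present at <stime>3:00 PM</stime>"
print(get_tags_by_name(sample, "speaker"))  # ['<speaker>Jane Doe</speaker>']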
Example #13
# Create a corpus reader with all the files
reader = PlaintextCorpusReader('.', files)

# Set up a translation table for punctuation to the empty string
table = str.maketrans('', '', string.punctuation)

# Get a list of English stopwords without punctuation
english_stops = set(stopwords.words('english'))
english_stops_nopunct = {
    stopword.translate(table)
    for stopword in english_stops
}

# Load the insect wordlist of stems
insect_words = WordListCorpusReader('.', ['wordlists/insect-wordstems.txt'])

# A list to hold the frequency data
freq_data = []

count = 1
# Read each file in turn
for file in files:
    text = reader.raw(file)

    print(f'{count}: TOKENISING {file}')

    # Tokenise and normalise to lowercase
    tokens = word_tokenize(text.lower())

    # Remove all punctuation marks
Example #14
import nltk.data
from nltk.corpus.reader import WordListCorpusReader
from nltk.corpus import names
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
from nltk.corpus import treebank

wordlist = WordListCorpusReader("C:/nltk_data/corpora/cookbook", ['wordlist'])
print(wordlist.words())
print(wordlist.fileids())

print(names.fileids())
print(len(names.words('male.txt')))

reader = TaggedCorpusReader("C:/nltk_data/corpora/treebank/tagged",
                            r'.*\.pos',
                            word_tokenizer=SpaceTokenizer(),
                            tagset='en-brown')
print(reader.words('wsj_0001.pos'))
print(reader.tagged_words('wsj_0001.pos'))
print(reader.tagged_sents('wsj_0001.pos'))
print(reader.tagged_paras('wsj_0001.pos'))
print(reader.fileids())

print("\n")
print(reader.tagged_words('wsj_0001.pos', tagset='universal'))

print(treebank.tagged_words())
Example #15
onlyfilessbsa1 = [
    f for f in listdir(corpora + '/golden_test_subset_a')
    if isfile(join(corpora + '/golden_test_subset_a', f))
]
onlyfilessbsa2 = [
    f for f in listdir(corpora + '/golden_tagged_subset_a')
    if isfile(join(corpora + '/golden_tagged_subset_a', f))
]
testc = nltk.corpus.reader.plaintext.PlaintextCorpusReader(
    corpora + '/golden_test_subset_a', onlyfilessbsa1)
tagdc = nltk.corpus.reader.plaintext.PlaintextCorpusReader(
    corpora + '/golden_tagged_subset_a', onlyfilessbsa2)

# getting named entity corpora

names = WordListCorpusReader(
    nepath,
    ['male.txt', 'female.txt', 'family.txt'])  # list of names, from canvas
titles = WordListCorpusReader(nepath, ['titles.txt'])  # list of common titles
orgsuffs = WordListCorpusReader(
    nepath, ['orgsuff.txt'])  # list of organisation suffixes
daymonths = WordListCorpusReader(nepath,
                                 ['daymonths.txt'])  # list of days and months

# extracting named entities from tagged data
# regex patterns to match each tag
pattern1 = r'<ENAMEX TYPE="PERSON">(.*?)</ENAMEX>'
pattern2 = r'<ENAMEX TYPE="LOCATION">(.*?)</ENAMEX>'
pattern3 = r'<ENAMEX TYPE="ORGANIZATION">(.*?)</ENAMEX>'

# finding every example in the data, storing in sets
people = set(re.findall(pattern1, trainingcorpus.raw()))
Example #16
import os
import re
import sys
import json
import nltk.test
import abbreviations
import portuguese_tagger_processor
from sentilex import sentiLexPairRdd
from nltk.corpus.reader import WordListCorpusReader

__output_path = "result.json"

stopwords = nltk.corpus.stopwords.words('portuguese')
reader = WordListCorpusReader('.', ['symbols.txt'])
symbols = reader.words()
reader = WordListCorpusReader('.', ['positive_emoticons.txt'])
positive_emoticons = reader.words()
reader = WordListCorpusReader('.', ['negative_emoticons.txt'])
negative_emoticons = reader.words()

tweet_tokenizer = portuguese_tagger_processor.get_tweet_tokenizer()
tagger = portuguese_tagger_processor.get_tagger()
json_result = []
tweet_dict = {}


def count_positive_emoticons(tokens):
    counter = 0
    for emoticon in positive_emoticons:
        if emoticon in tokens:
            counter += 1
Example #17
File: parse.py  Project: MMJ744/NLP
import nltk
from nltk import load_parser

from nltk.corpus.reader import WordListCorpusReader

reader = WordListCorpusReader('', ['words.txt'])
words = [nltk.word_tokenize(i) for i in reader.words()]
cp = load_parser('grammar.fcfg', trace=1)

# from nltk.corpus import treebank
# from nltk.tag import DefaultTagger
# train_set = treebank.tagged_sents()[:4000]
# test_set = treebank.tagged_sents()[2000:]
# from nltk.tag import UnigramTagger
# unigramTagger = UnigramTagger(train_set)
# from nltk.tag import BigramTagger, TrigramTagger
# bigramTagger = BigramTagger(train_set, cutoff=2)
# trigramTagger = TrigramTagger(train_set, cutoff=3)
# def backoff_tagger(train_sents, tagger_classes, backoff=None):
#    for cls in tagger_classes :
#        backoff = cls(train_sents, backoff=backoff)
#    return backoff
# tagger = backoff_tagger(train_set, [UnigramTagger, BigramTagger, TrigramTagger], backoff=DefaultTagger('NN'))
# for sentence in words:
#    print(tagger.tag(sentence))


for sentence in words:
    print(sentence)
    for tree in cp.parse(sentence):
        print(tree)
Example #18
#!/usr/bin/env python
# encoding: utf-8
"""
wordNet.py
Created by Aaron Erlich on 2013-02-13.
"""

import sys
import os
import nltk
from nltk.corpus.reader import WordListCorpusReader

path = ""  # insert your path
#path = "/Volumes/Optibay-1TB/Dropbox/Content_Wilker/Gonzalez_Project/Gonzalez_Keywords"
reader = WordListCorpusReader(path, ['crime.txt']) #make an nltk word list

crime = reader.words()
crime = [word.lower().strip() for word in crime]

from nltk.corpus import wordnet

#lemmas are the distinct meanings of a word and all of each meaning's possible morphologies
#we see that many of the student's words have both noun and verb meanings. Which does he care about?
#these words are polysemous -- they have similar but different meanings
for word in crime:
	print word
	print wordnet.synsets(word)
	print "\n"
	raw_input("Hit Enter")
	
[synset.lemma_names for synset in wordnet.synsets("stealing")]
Example #19
def open_places_wordlist():
    path = '/Users/tim/mycode/time/wordlists/'
    wordlist = 'ga_gazetteer_wordlist.txt'
    reader = WordListCorpusReader(path, [wordlist])
    return reader
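Hypothetical usage of the helper above; the gazetteer path inside open_places_wordlist() is machine-specific, so this only runs where that wordlist file exists:

# Load the gazetteer into a set for fast place-name lookups (assumes the
# wordlist file referenced by open_places_wordlist() is present).
reader = open_places_wordlist()
gazetteer = set(reader.words())
print(len(gazetteer), 'place names loaded')
print('Sydney' in gazetteer)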