from nltk.corpus.reader import WordListCorpusReader
import nltk

# print(nltk.data.find('corpora/cookbook'))
# print(nltk.data.find('corpora/cookbook/wordlist.txt'))

d = nltk.data.find('corpora/cookbook')
reader = WordListCorpusReader(d, ['wordlist.txt'])
print(reader.words())
print(reader.fileids())
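# A minimal setup sketch so the lookup above succeeds; it assumes you can
# write to ~/nltk_data (NLTK's default search path), and the sample word
# list contents are hypothetical.
import os

cookbook_dir = os.path.expanduser('~/nltk_data/corpora/cookbook')
os.makedirs(cookbook_dir, exist_ok=True)
with open(os.path.join(cookbook_dir, 'wordlist.txt'), 'w') as f:
    f.write('nltk\ncorpus\ncorpora\nwordnet\n')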
def read_emails(self, path):
    # Get all files in the directory, skipping macOS's .DS_Store if present
    files = [f for f in listdir(path) if isfile(join(path, f))]
    try:
        files.remove('.DS_Store')
    except ValueError:
        pass
    reader = WordListCorpusReader(path, files)
    cleaner = Cleaner()
    emails = list()
    # Create an Email object from each email file and append it to the list
    for file_id in reader.fileids():
        with open(join(path, file_id), 'r') as current_file:
            cleaned_contents = cleaner.clean_file(current_file.read())
            split_email_header, split_email_body, split_email_file_id = self.divide(
                cleaned_contents, file_id)
            emails.append(
                Email(split_email_header, split_email_body, split_email_file_id))
    # Return the list of Email objects
    return emails
def find_info_type(self):
    type_list = []     # list of all types ('abstract', 'speaker', ...)
    content_list = []  # list with the matching content
    reader = WordListCorpusReader(self.folder_name, [self.file_name])
    all_words = reader.words()
    # Is the mail a proper one?
    if not all_words:
        return ([], [])
    # Append the first tag of the mail, e.g. <0.1....>
    type_list.append("")
    content_list.append(all_words[0])
    for w in all_words[1:]:
        # Search for a pattern like "Abstract: ..."
        type = re.search(r'^(\w+)(:)', w)
        # Use the group functionality to split the topic from the content
        if type is not None:
            type_list.append(type.group(1))
            content = re.search(r'^(\w+:)(.*)', w)
            content_list.append(content.group(2))
        # Not the best way to re-attach content split across newlines, but...
        elif len(content_list) > 0:
            content_list[-1] = content_list[-1] + "\n" + w
    # If type_list[0] is 'abstract', then content_list[0] holds the abstract content
    return (type_list, content_list)
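# A quick illustration of the header pattern used above, on a hypothetical
# input line: group(1) of the first search captures the topic, and group(2)
# of the second search captures the content after the colon.
import re

line = 'Abstract: This talk surveys corpus readers.'
match = re.search(r'^(\w+)(:)', line)
if match is not None:
    print(match.group(1))  # Abstract
    print(re.search(r'^(\w+:)(.*)', line).group(2))  # ' This talk surveys corpus readers.'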
def __init__(self, punctuation_marks: str, corpus_dir: str, corpus_files: list):
    reader = WordListCorpusReader(corpus_dir, corpus_files)
    self.vi_dict = set(reader.words())
    # Add the punctuation marks to the dictionary, treating them as correctly spelled
    self.vi_dict.update(list(punctuation_marks))
    # Add some special tokens (units of measurement, etc.)
    self.vi_dict.update(
        ['m', 'g', 'gt', 'kg', 'km', 'mm', 'cm', 'c', 'f', 't'])
    self.re_d = re.compile(r'\d')
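# A minimal usage sketch; the enclosing class name SpellDictionary, the
# corpus directory, and the file name are all assumptions for illustration.
checker = SpellDictionary(punctuation_marks='.,!?',
                          corpus_dir='dicts',
                          corpus_files=['vi_words.txt'])
token = 'kg'
# Treat a token as well-formed if it is in the dictionary or contains a digit
is_known = token in checker.vi_dict or checker.re_d.search(token) is not None
print(is_known)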
def __init__(self, config_file):
    try:
        self.config = configparser.RawConfigParser()
        self.config.optionxform = str
        self.config.read(config_file)
        tokenizers = self.config.get('post_training_corpus', 'regex_file')
        self.config_tokenizer = json.load(open(tokenizers, "r"))
        self.isWordList = self.config.getboolean('postaggers', 'isWordList')
        self.wordlist = self.config.items('postaggers.wordlist')
        self.training_portion = self.config.getfloat(
            'post_training_corpus', 'training_portion')
        self.taggers_path = self.config.get('postaggers', 'save_to')
        self.max_ngrams = self.config.getint('postaggers', 'max_ngrams')
        self.tagger_extension_file = self.config.get(
            'postaggers', 'ext_file')
        corpus = []
        for key, corpus_file in self.config.items(
                'post_training_corpus.corpus'):
            print("Generate model from file:", corpus_file)
            corpus.append(corpus_file)
        self.corpusReader = ConllChunkCorpusReader(
            self.config.get('post_training_corpus', 'corpora'), corpus,
            ('NP', 'PP', 'VP', 'AP'))
        self.corpusSents = self.corpusReader.tagged_sents()
        self.wordListReader = WordListCorpusReader(
            self.config.get('post_training_corpus', 'wordlist_path'),
            r'.*\.txt')
        self.regex_list = []
        for key in self.config_tokenizer.keys():
            if self.config_tokenizer[key]['isolate'] == "True":
                regex = self.config_tokenizer[key]['regex']
                post = self.config_tokenizer[key]['post']
                self.regex_list.append((regex, post))
        # logging.info(self.regex_list)
    except Exception as e:
        print("Error :", str(e))
        pdb.set_trace()
def read_emails(path):
    # Get all files in the directory, skipping macOS's .DS_Store if present
    files = [f for f in listdir(path) if isfile(join(path, f))]
    try:
        files.remove('.DS_Store')
    except ValueError:
        pass
    reader = WordListCorpusReader(path, files)
    text = clean(reader.raw())
    emails = split_emails(text, reader.fileids())
    return emails
def addingCorpus():
    path = os.path.expanduser('~/nltk_data')
    if not os.path.exists(path):
        os.mkdir(path)
    print(os.path.exists(path))
    print(nltk.data.path)
    print(path in nltk.data.path)
    nltk.data.load('corpora/cookbook/cookbook.txt', format='raw')
    reader = WordListCorpusReader('/Users/Dell/nltk_data/corpora/cookbook/',
                                  ['wordlist.txt'])
    print(reader.words())
                     user='******', passwd='Webrowse@123', db='article')
cur = db.cursor()
'''

dataset = load_files(
    '/home/soumen/projects/scikit-learn/doc/tutorial/text_analytics/data/languages/paragraphs'
)

# Read an article
file_id_argv = open(sys.argv[1])
file_id = file_id_argv.read()
file_list = file_id.split('\n')
file_list.pop(-1)

italian_stopwords = WordListCorpusReader('.', ['stop-words-it-en.txt'])


def language_detection(text):
    """Description here"""
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(dataset.data)
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    X_test_counts = count_vect.transform(text)
    X_test_tfidf = tfidf_transformer.transform(X_test_counts)
    clf = MultinomialNB().fit(X_train_tfidf, dataset.target)
    predicted = clf.predict(X_test_tfidf)
    n_score = ret.prob("negative")
    if max(p_score, n_score) <= cutoff:
        return "neutral"
    if p_score > n_score:
        return "positive"
    elif n_score > p_score:
        return "negative"
    else:
        return "neutral"


reader = WordListCorpusReader('/path/to/sentiment/files',
                              ['positive.txt', 'negative.txt'])
pos_feats = [(dict([(word, True)]), 'positive')
             for word in reader.words('positive.txt')]
neg_feats = [(dict([(word, True)]), 'negative')
             for word in reader.words('negative.txt')]
train_feats = pos_feats + neg_feats
classifier = NaiveBayesClassifier.train(train_feats)

t = Twitter(auth=OAuth("TOKEN", "TOKEN_KEY", "CON_SECRET", "CON_SECRET_KEY"))
connection = pymongo.MongoClient()
db = connection.twitter
mentions = db.mentions
screen_names = ["YOUR_ACCOUNT", "YOUR_OTHER_ACCOUNT"]
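# A hedged sketch of how the trained classifier might be applied to a new
# word, mirroring the cutoff logic at the top of this snippet; the feature
# encoding matches train_feats above, and the cutoff value is an assumption.
def classify_word(word, cutoff=0.6):
    dist = classifier.prob_classify({word: True})
    if max(dist.prob('positive'), dist.prob('negative')) <= cutoff:
        return 'neutral'
    return dist.max()

print(classify_word('great'))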
import nltk
from nltk.corpus.reader import WordListCorpusReader
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

reader = WordListCorpusReader('', ['computerscience.txt'])
words = [nltk.word_tokenize(i) for i in reader.words()]

stemmer = PorterStemmer()
lemmer = WordNetLemmatizer()

stemmed = [[stemmer.stem(y) for y in i] for i in words]
lemmed = [[lemmer.lemmatize(y) for y in i] for i in words]
print(stemmed)
def __init__(self):
    '''
    Constructor for the BE06 word list corpora.

    @note: Initially the constructor tries to load the corpus from a .pkl
    file. If this has not been created, a new instance is built by
    iterating through all files for BE06.
    '''
    try:
        # Attempt to open the .pkl file and load it.
        infile = open("./Corpus/BE06/BE06.pkl", 'rb')
        reader = load(infile)
        infile.close()
    except IOError as e:
        filelist = []
        words = []
        # Find all .txt files in the /BE06 directory
        for files in os.listdir("./Corpus/BE06"):
            if files.endswith(".txt"):
                filelist.append(files)
        if len(filelist) == 500:
            # Iterate through the whole list of files
            for name in filelist:
                f = open("./Corpus/BE06/" + name)
                lines = f.readlines()
                # Read each line in the file, tokenize it into words,
                # and strip all punctuation
                for line in lines:
                    tmp1 = nltk.sent_tokenize(line)
                    for lin in tmp1:
                        tmp = nltk.word_tokenize(lin)
                        for word in tmp:
                            for c in string.punctuation:
                                word = word.replace(c, "")
                            words.append(word)
                f.close()
            # Write the word list to the output file.
            a = open("./Corpus/BE06/finalcorpa.txt", "w")
            for word in words:
                if word not in ".,;!?\"":
                    a.write(word + '\n')
            a.close()
            # Create the NLTK corpus and save a pickled copy for later use
            reader = WordListCorpusReader('./Corpus/BE06', ['finalcorpa.txt'])
            output = open("./Corpus/BE06/BE06.pkl", 'wb')
            dump(reader, output, -1)
            output.close()
        else:
            reader = WordListCorpusReader('./Corpus/BE06', ['finalcorpa.txt'])
            output = open("./Corpus/BE06/BE06.pkl", 'wb')
            dump(reader, output, -1)
            output.close()
    # Store the corpus
    self.corpa = reader
from nltk.corpus import brown
from nltk.corpus.reader import WordListCorpusReader
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
from os import listdir
from os.path import isfile, join

wnl = WordNetLemmatizer()
stemmer = PorterStemmer()

tagged_data_filepath = "/Users/aledjackson/Documents/University/Second Year/Modules/Natural Language Processing/Assignments/assignment1/training"
untagged_data_filepath = "/Users/aledjackson/Documents/University/Second Year/Modules/Natural Language Processing/Assignments/assignment1/seminar_testdata/test_untagged"
general_data_filepath = "/Users/aledjackson/Documents/University/Second Year/Modules/Natural Language Processing/Assignments/assignment1/Data"

l_names = WordListCorpusReader(general_data_filepath, ["names.family"]).words()

file_names = [
    f for f in listdir(untagged_data_filepath)
    if isfile(join(untagged_data_filepath, f))
]
file_names = file_names[1:]

reader = WordListCorpusReader(untagged_data_filepath, [file_names[0]])
corpus = reader.raw()
words = reader.words()


def get_tags_by_name(corpus, name):
    return re.findall(r"<" + name + r">.+</" + name + r">", corpus)
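# A quick demo of get_tags_by_name on a hypothetical tagged string; the
# greedy regex returns the whole tagged span, tags included.
sample = '<speaker>Dr. Jane Doe</speaker> will present the seminar.'
print(get_tags_by_name(sample, 'speaker'))  # ['<speaker>Dr. Jane Doe</speaker>']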
# Create a corpus reader with all the files
reader = PlaintextCorpusReader('.', files)

# Set up a translation table mapping punctuation to the empty string
table = str.maketrans('', '', string.punctuation)

# Get a list of English stopwords without punctuation
english_stops = set(stopwords.words('english'))
english_stops_nopunct = {
    stopword.translate(table)
    for stopword in english_stops
}

# Load the insect wordlist of stems
insect_words = WordListCorpusReader('.', ['wordlists/insect-wordstems.txt'])

# A list to hold the frequency data
freq_data = []
count = 1

# Read each file in turn
for file in files:
    text = reader.raw(file)
    print(f'{count}: TOKENISING {file}')
    # Tokenise and normalise to lowercase
    tokens = word_tokenize(text.lower())
    # Remove all punctuation marks
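    # A hedged sketch of how the loop might continue, given the setup above:
    # strip punctuation with `table`, drop stopwords, and tally tokens that
    # start with one of the insect word stems. The freq_data row format is
    # an assumption.
    words = [token.translate(table) for token in tokens]
    words = [w for w in words if w and w not in english_stops_nopunct]
    insect_hits = [
        w for w in words
        if any(w.startswith(stem) for stem in insect_words.words())
    ]
    freq_data.append((file, len(words), len(insect_hits)))
    count += 1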
import nltk.data
from nltk.corpus.reader import WordListCorpusReader
from nltk.corpus import names
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
from nltk.corpus import treebank

wordlist = WordListCorpusReader("C:/nltk_data/corpora/cookbook", ['wordlist'])
print(wordlist.words())
print(wordlist.fileids())

print(names.fileids())
print(len(names.words('male.txt')))

reader = TaggedCorpusReader("C:/nltk_data/corpora/treebank/tagged",
                            r'.*\.pos',
                            word_tokenizer=SpaceTokenizer(),
                            tagset='en-brown')
print(reader.words('wsj_0001.pos'))
print(reader.tagged_words('wsj_0001.pos'))
print(reader.tagged_sents('wsj_0001.pos'))
print(reader.tagged_paras('wsj_0001.pos'))
print(reader.fileids())
print("\n")
print(reader.tagged_words('wsj_0001.pos', tagset='universal'))
print(treebank.tagged_words())
onlyfilessbsa1 = [
    f for f in listdir(corpora + '/golden_test_subset_a')
    if isfile(join(corpora + '/golden_test_subset_a', f))
]
onlyfilessbsa2 = [
    f for f in listdir(corpora + '/golden_tagged_subset_a')
    if isfile(join(corpora + '/golden_tagged_subset_a', f))
]

testc = nltk.corpus.reader.plaintext.PlaintextCorpusReader(
    corpora + '/golden_test_subset_a', onlyfilessbsa1)
tagdc = nltk.corpus.reader.plaintext.PlaintextCorpusReader(
    corpora + '/golden_tagged_subset_a', onlyfilessbsa2)

# Getting named-entity corpora
names = WordListCorpusReader(
    nepath, ['male.txt', 'female.txt', 'family.txt'])  # list of names, from canvas
titles = WordListCorpusReader(nepath, ['titles.txt'])  # list of common titles
orgsuffs = WordListCorpusReader(
    nepath, ['orgsuff.txt'])  # list of organisation suffixes
daymonths = WordListCorpusReader(nepath,
                                 ['daymonths.txt'])  # list of days and months

# Extracting named entities from the tagged data
# Regex patterns to match each tag
pattern1 = r'<ENAMEX TYPE="PERSON">(.*?)<\/ENAMEX>'
pattern2 = r'<ENAMEX TYPE="LOCATION">(.*?)<\/ENAMEX>'
pattern3 = r'<ENAMEX TYPE="ORGANIZATION">(.*?)<\/ENAMEX>'

# Find every example in the data and store the results in sets
people = set(re.findall(pattern1, trainingcorpus.raw()))
import os
import re
import sys
import json
import nltk.test
import abbreviations
import portuguese_tagger_processor
from sentilex import sentiLexPairRdd
from nltk.corpus.reader import WordListCorpusReader

__output_path = "result.json"

stopwords = nltk.corpus.stopwords.words('portuguese')

reader = WordListCorpusReader('.', ['symbols.txt'])
symbols = reader.words()

reader = WordListCorpusReader('.', ['positive_emoticons.txt'])
positive_emoticons = reader.words()

reader = WordListCorpusReader('.', ['negative_emoticons.txt'])
negative_emoticons = reader.words()

tweet_tokenizer = portuguese_tagger_processor.get_tweet_tokenizer()
tagger = portuguese_tagger_processor.get_tagger()

json_result = []
tweet_dict = {}


def count_positive_emoticons(tokens):
    counter = 0
    for emoticon in positive_emoticons:
        if emoticon in tokens:
            counter += 1
    return counter
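# A minimal usage sketch, assuming the tokenizer returned by
# portuguese_tagger_processor exposes a tokenize() method like NLTK's
# TweetTokenizer; the sample tweet is hypothetical.
tokens = tweet_tokenizer.tokenize('adorei o filme :) :D')
print(count_positive_emoticons(tokens))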
import nltk
from nltk import load_parser
from nltk.corpus.reader import WordListCorpusReader

reader = WordListCorpusReader('', ['words.txt'])
words = [nltk.word_tokenize(i) for i in reader.words()]

cp = load_parser('grammar.fcfg', trace=1)

# from nltk.corpus import treebank
# from nltk.tag import DefaultTagger
# train_set = treebank.tagged_sents()[:4000]
# test_set = treebank.tagged_sents()[2000:]

# from nltk.tag import UnigramTagger
# unigramTagger = UnigramTagger(train_set)

# from nltk.tag import BigramTagger, TrigramTagger
# bigramTagger = BigramTagger(train_set, cutoff=2)
# trigramTagger = TrigramTagger(train_set, cutoff=3)

# def backoff_tagger(train_sents, tagger_classes, backoff=None):
#     for cls in tagger_classes:
#         backoff = cls(train_sents, backoff=backoff)
#     return backoff

# tagger = backoff_tagger(train_set,
#                         [UnigramTagger, BigramTagger, TrigramTagger],
#                         backoff=DefaultTagger('NN'))

# for sentence in words:
#     print(tagger.tag(sentence))

for sentence in words:
    print(sentence)
    for tree in cp.parse(sentence):
        print(tree)
#!/usr/bin/env python
# encoding: utf-8
"""
wordNet.py

Created by Aaron Erlich on 2013-02-13.
"""

import sys
import os
import nltk
from nltk.corpus.reader import WordListCorpusReader

path = ""  # insert your path here
# path = "/Volumes/Optibay-1TB/Dropbox/Content_Wilker/Gonzalez_Project/Gonzalez_Keywords"

# Make an NLTK word list
reader = WordListCorpusReader(path, ['crime.txt'])
crime = reader.words()
crime = [word.lower().strip() for word in crime]

from nltk.corpus import wordnet

# Lemmas are the distinct meanings of a word and all of each meaning's
# possible morphologies. We see that many of the student's words have both
# noun and verb meanings. Which does he care about?
# These words are polysemous -- they have similar but different meanings.
for word in crime:
    print(word)
    print(wordnet.synsets(word))
    print("\n")
    input("Hit Enter")

[synset.lemma_names() for synset in wordnet.synsets("stealing")]
def open_places_wordlist():
    path = '/Users/tim/mycode/time/wordlists/'
    wordlist = 'ga_gazetteer_wordlist.txt'
    reader = WordListCorpusReader(path, [wordlist])
    return reader
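# A minimal usage sketch of the helper above; the place name checked is
# hypothetical and depends on the gazetteer's contents.
reader = open_places_wordlist()
places = set(reader.words())
print('Canberra' in places)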