def __init__(self, root, fileids=None, encoding='utf8', skip_keywords=None, **kwargs):
    """
    :param root: the file root of the corpus directory
    :param fileids: the list of file ids to consider, or a wildcard expression
    :param skip_keywords: a list of words which indicate whole paragraphs that should
        be skipped by the paras() and words() methods
    :param encoding: the file encoding (default 'utf8')
    :param kwargs: any values to be passed to the NLTK super classes, such as
        sent_tokenizer, word_tokenizer.
    """
    if not fileids:
        fileids = r'.*\.txt'

    # Initialize the NLTK corpus reader object (this also runs CorpusReader.__init__);
    # encoding is passed by keyword so it is not mistaken for the word_tokenizer argument.
    PlaintextCorpusReader.__init__(self, root, fileids, encoding=encoding)

    if 'sent_tokenizer' in kwargs:
        self._sent_tokenizer = kwargs['sent_tokenizer']
    if 'word_tokenizer' in kwargs:
        self._word_tokenizer = kwargs['word_tokenizer']

    self.skip_keywords = skip_keywords
def create_spacy_corpus(text_corpus: PlaintextCorpusReader, lang: Language) -> Corpus:
    data = ((text_corpus.raw(fid), {'fileid': fid}) for fid in text_corpus.fileids())
    corpus = Corpus(lang, data)
    return corpus
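# Hypothetical usage sketch for create_spacy_corpus() above (not from the original
# source): build a PlaintextCorpusReader over a folder of .txt files, load a spaCy
# model, and convert the whole corpus. The directory name and model name are
# illustrative assumptions; Corpus is assumed to be textacy's Corpus, as in the helper.
import spacy
from nltk.corpus.reader import PlaintextCorpusReader

nlp = spacy.load("en_core_web_sm")                           # assumed installed model
text_corpus = PlaintextCorpusReader("my_texts", r".*\.txt")  # assumed folder of .txt files
spacy_corpus = create_spacy_corpus(text_corpus, nlp)
print(spacy_corpus)                                          # e.g. Corpus(N docs, ...)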
def __init__(self,
             sep="/",
             pattern=chinese_pattern,  # note that '.' needs to be escaped in the pattern
             root=None,
             fileids=None):
    """docstring for __init__"""
    # PlaintextCorpusReader has no `sep` parameter, so it is not forwarded here.
    PlaintextCorpusReader.__init__(
        self,
        root=root,
        fileids=fileids,
        sent_tokenizer=RegexpTokenizer(pattern, gaps=True),
        encoding="utf-8")
def __init__(self, input=None, cache_dir='/tmp/nupic_nlp', verbosity=0):
    # Create the cache directory if necessary.
    if not os.path.exists(cache_dir):
        os.mkdir(cache_dir)
    self.cache_dir = cache_dir
    self._verbosity = verbosity
    if input is not None:
        self.input_reader = PlaintextCorpusReader(input, r'.*\.txt')
    else:
        self.input_reader = None
def create_text_corpus_from_zipfile(
        zf: ZipFile,
        pattern=r'.*\.txt',
        ensure_loaded=True) -> PlaintextCorpusReader:
    '''Loads a text corpus contained in a zipfile.'''
    pointer = ZipFilePathPointer(zf)
    corpus = PlaintextCorpusReader(pointer, pattern)
    if ensure_loaded:
        corpus.ensure_loaded()
    return corpus
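# Hypothetical usage sketch for create_text_corpus_from_zipfile() above (not from the
# original source): "texts.zip" is an illustrative archive name, assumed to contain
# .txt members at its root; the reader is used inside the `with` block while the
# archive is still open.
from zipfile import ZipFile

with ZipFile("texts.zip") as zf:
    corpus = create_text_corpus_from_zipfile(zf)
    print(corpus.fileids())   # the .txt entries found inside the archive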
def load_sentences(text_file, stopwords, lang):
    path, f = ntsplit(text_file)
    reader = PlaintextCorpusReader(path, f)
    sentences = [sent for sent in reader.sents()]
    clean = []
    originalSentenceOf = {}
    if lang == "fr":
        stemmer = FrenchStemmer()
    elif lang == "en":
        stemmer = SnowballStemmer("english")

    # Data cleansing
    for sent in sentences:
        s = stemmize(stemmer, sent, stopwords)
        clean.append(" ".join(s))
        originalSentenceOf[clean[-1]] = sent
    setClean = set(clean)

    return setClean, originalSentenceOf, sentences, clean
def get_emails(path, file_name=False):
    """Returns a list of readers for all the files in the path."""
    full_path = getcwd() + path
    files = [file for file in listdir(full_path) if isfile(join(full_path, file))]
    if file_name:
        readers = [(file, PlaintextCorpusReader(full_path, file).raw())
                   for file in files]
        return readers
    readers = [PlaintextCorpusReader(full_path, file).raw() for file in files]
    return readers
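# Hypothetical call of get_emails() above (not from the original source); '/emails' is
# an illustrative path, resolved against the current working directory exactly as the
# helper does with getcwd() + path.
emails = get_emails('/emails')                         # raw text of every file in <cwd>/emails
named_emails = get_emails('/emails', file_name=True)   # (filename, raw_text) pairs
print(len(emails))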
def __init__(self, root, fields=DOC_PATTERN, sent_pattern=SENT_PATTERN,
             encoding='utf8', **kwargs):
    """
    :param root: directory containing the corpus
    :param fields: the corpus files to read
    :param encoding:
    """
    PlaintextCorpusReader.__init__(
        self, root, fields,
        word_tokenizer=JanomeTokenizer(),
        sent_tokenizer=RegexpTokenizer(sent_pattern),
        encoding=encoding)
def reader(ctx):
    u"""
    For reference, PlaintextCorpusReader's constructor signature:

        def __init__(self, root, fileids,
                     word_tokenizer=WordPunctTokenizer(),
                     sent_tokenizer=nltk.data.LazyLoader('tokenizers/punkt/english.pickle'),
                     para_block_reader=read_blankline_block,
                     encoding=None):
    """
    reader = PlaintextCorpusReader(ctx.textdatadir(), r'.*\.txt')
    return reader
def enronCorpus():
    # get all fileids
    file_id_list = []
    for relation in os.listdir(corpus_dir):
        if os.path.isfile(os.path.join(corpus_dir, relation)):
            tmp_corpus_file = os.path.join(corpus_dir, relation)
            file_id_list.append(relation)

    # make a corpus
    corpus = PlaintextCorpusReader(corpus_dir, file_id_list)
    return corpus
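# Hypothetical usage of enronCorpus() above (not from the original source); corpus_dir
# is assumed to be defined at module level, as the function itself references it.
corpus = enronCorpus()
print(len(corpus.fileids()), 'files in the corpus')
print(corpus.words(corpus.fileids()[0])[:10])   # first 10 tokens of the first file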
def __init__(self, root, fileids=None, encoding='utf8', skip_keywords=None, **kwargs):
    """
    :param root: the file root of the corpus directory
    :param fileids: the list of file ids to consider, or a wildcard expression
    :param skip_keywords: a list of words which indicate whole paragraphs that should
        be skipped by the paras() and words() methods
    :param encoding: the file encoding (default 'utf8')
    :param kwargs: any values to be passed to the NLTK super classes, such as
        sent_tokenizer, word_tokenizer.
    """
    # Initialize the NLTK corpus reader object; encoding is passed by keyword so it is
    # not mistaken for the word_tokenizer argument.
    PlaintextCorpusReader.__init__(self, root, fileids, encoding=encoding)
    # CorpusReader.__init__(self, root, fileids, encoding)

    if 'sent_tokenizer' in kwargs:
        self._sent_tokenizer = kwargs['sent_tokenizer']
    if 'word_tokenizer' in kwargs:
        self._word_tokenizer = kwargs['word_tokenizer']
    if 'pos_tagger' in kwargs:
        self.pos_tagger = kwargs['pos_tagger']
def untagged_reading(path=''):
    """
    Read the untagged data.

    :param path: the root of the directory where the files are located
    :return: a corpus containing all the words in the loaded files
    """
    word_list = PlaintextCorpusReader(path, r'.*\.txt')
    return word_list
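# A short, hypothetical use of untagged_reading() above (not from the original source);
# 'data/untagged' is an illustrative directory of .txt files.
corpus = untagged_reading('data/untagged')
print(corpus.fileids())       # the .txt files that were picked up
print(corpus.words()[:20])    # first 20 tokens across the corpus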
    y = np.zeros((batch_size), dtype=np.int32)
    for i in range(batch_size):
        for t, w in enumerate(sentence_list[index % len(sentence_list)]):
            x[i, t] = word2idx(w)
        y[i] = word2idx(next_word_list[index % len(next_word_list)])
        index = index + 1
    yield x, y


if __name__ == "__main__":
    directory = 'F:/Minhaz/GitHubRepo/News_Gen/Minhaz_Shahadat/Code/Bengali_Word2Vec_LSTM/'
    corpus_dir = directory + 'corpus/'
    examples = directory + 'examples.txt'
    vocabulary = directory + 'vocab.txt'

    w_t = RegexpTokenizer("[\u0980-\u09FF']+")
    corpus = PlaintextCorpusReader(corpus_dir, r'.*\.txt', word_tokenizer=w_t)

    text_in_words = []
    files = corpus.fileids()
    for f in files:
        words_in_doc = corpus.words(f)
        text_in_words.append(words_in_doc)

    text_in_words = [[re.sub(r'\d+', '<number>', word) for word in document]
                     for document in text_in_words]

    words = []
    for doc in text_in_words:
        for word in doc:
            words.append(word)
    words = sorted(set(words))
    print_vocabulary(vocabulary, words)
DEMO_DATA_ROOT = "../../../RepositoryData/data"

## Loading Corpus Raw Texts

import nltk
from nltk.corpus.reader import PlaintextCorpusReader
import numpy as np
import jieba, re

jieba.set_dictionary(DEMO_DATA_ROOT + "/jiaba/dict.txt.big.txt")

corpus_dir = DEMO_DATA_ROOT + "/TaiwanPresidentialInaugarationSpeech_en"
twp = PlaintextCorpusReader(corpus_dir, r".*\.txt")
len(twp.raw())

## Word Segmentation - Try two methods: `ckiptagger` vs. `jieba`

from ckiptagger import WS

````{margin}
```{note}
Please remember to download the CKIP model files and change the path accordingly.
```
````
def training_data(paths=None, file_count=0):
    r"""
    Use the general pattern of a tag <ENAMEX\sTYPE=".*?">.*?</ENAMEX> to extract the
    bits of text containing the relevant information and group them into a list.
    Chunk the elements of the list, leaving only a tuple represented by the type of
    the entity and its name.

    :param paths: the paths towards the files containing the training data
    :param file_count: the number of files to read
    :return: a list of lists where each element is a list formed from the type of the
        entity and its full name
    """
    # extract training data from WSJ
    # pattern : the general pattern of a tag
    # snd_pattern : the approximate pattern of the desired information from the tag
    pattern = re.compile(r'<.*?TYPE=".*?">.*?</.*?>', re.ASCII)
    snd_pattern = re.compile(r'[>"].*?[<"]', re.ASCII)

    # the strings representing the tags extracted from the files
    # (the patterns are already compiled with re.ASCII, so no flag is passed to findall)
    text = PlaintextCorpusReader(paths[0], r'.*\.txt')
    data = []
    for fid in text.fileids():
        data = data + pattern.findall(text.raw(fileids=fid))

    # from every tag in the list find the two sub-strings that correspond to the
    # snd_pattern; use sets to eliminate redundancy
    raw_entities = list(set(list(map(
        lambda e: (e[0], e[1].lower()),
        list(map(lambda x: (x[0], x[1]),
                 [list(map(lambda s: (s[:len(s) - 1])[1:], l))
                  for l in (re.findall(snd_pattern, tag) for tag in data)]))))))

    # extract data from the names folders
    del data
    data = PlaintextCorpusReader(paths[1], '.*')
    name_data = data.words('names.male') + data.words('names.female') + data.words('names.family')

    # extract the most common 350 organization tokens
    organization_words = list(map(lambda o: word_tokenize(o[1]),
                                  list(filter(lambda x: x[0] == 'ORGANIZATION', raw_entities))))
    organization_specific_tokens = []
    for wl in organization_words:
        organization_specific_tokens += wl
    organization_specific_tokens = list(map(lambda f: f[0],
                                            FreqDist(organization_specific_tokens).most_common(350)))

    # extract the most common 350 location tokens
    location_words = list(map(lambda o: word_tokenize(o[1]),
                              list(filter(lambda x: x[0] == 'LOCATION', raw_entities))))
    location_specific_tokens = []
    for wl in location_words:
        location_specific_tokens += wl
    location_specific_tokens = list(map(lambda f: f[0],
                                        FreqDist(location_specific_tokens).most_common(350)))

    # put the names in a dictionary for quicker access
    name_dict = {}
    for n in list(set(name_data + names.words())):
        if n.lower()[0] in name_dict:
            name_dict[n.lower()[0]] += [n.lower()]
        else:
            name_dict[n.lower()[0]] = [n.lower()]

    # put the location data in a dictionary for quicker access
    loc_dict = {}
    for l in location_specific_tokens[1:]:
        if l[0] in loc_dict:
            loc_dict[l[0]] += [l]
        else:
            loc_dict[l[0]] = [l]

    # put the organization data in a dictionary for quicker access
    org_dict = {}
    for o in organization_specific_tokens:
        if o[0] in org_dict:
            org_dict[o[0]] += [o]
        else:
            org_dict[o[0]] = [o]

    entity_dict1 = {
        'PERSON': list(map(lambda p: p[1], list(filter(lambda e: e[0] == 'PERSON', raw_entities)))),
        'LOCATION': list(map(lambda l: l[1], list(filter(lambda e: e[0] == 'LOCATION', raw_entities)))),
        'ORGANIZATION': list(map(lambda o: o[1], list(filter(lambda e: e[0] == 'ORGANIZATION', raw_entities))))
    }

    entity_dict2 = {}
    for l in ['PERSON', 'ORGANIZATION', 'LOCATION']:
        entity_dict2[l] = {}
        for e in entity_dict1[l]:
            if e[0] in entity_dict2[l]:
                entity_dict2[l][e[0]] += [e]
            else:
                entity_dict2[l][e[0]] = [e]

    return entity_dict2, org_dict, name_dict, loc_dict
'''
Script to tag text files and write them to an output directory
'''
import os

directory = "D:/Eigene Dateien_rklein/z_Forschung/_Konferenzen/_79_ICFCA - Dresden - Concept Analysis/Data/"
input_directory = directory + "Input/_Product_Management/"
output_directory = directory + "1_POS/"

if not os.path.exists(output_directory):
    os.mkdir(output_directory)

# reading stuff
file_list = os.listdir(input_directory)
print(file_list)  # just for testing

# create a corpus reader
from nltk.corpus.reader import PlaintextCorpusReader
reader = PlaintextCorpusReader(input_directory, r'.*\.txt')
reader.fileids()
reader.raw()
reader.sents()
reader.words()

## default POS tagger from NLTK ##
import nltk
# import pprint
# sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

pos = "nltk"
path = output_directory + pos
if not os.path.exists(path):
    os.mkdir(path)

for i in range(len(file_list)):
    # posting = []
if __name__ == '__main__':
    opts = docopt(__doc__)

    # load the model
    filename = opts['-i']
    f = open(filename, 'rb')
    model = pickle.load(f)
    f.close()

    # load the data
    # WORK HERE!! LOAD YOUR EVALUATION CORPUS
    # sents = gutenberg.sents('austen-persuasion.txt')
    corpora_dir = find(os.path.join(os.getcwd(), 'corpora'))
    custom_tokenizer = RegexpTokenizer('[^.!?]+')
    reader = PlaintextCorpusReader(corpora_dir, r'.*\.txt', sent_tokenizer=custom_tokenizer)
    sents = reader.sents('test-utf8.txt')

    # compute the cross entropy
    # WORK HERE!!
    log_prob = model.log_prob(sents)
    e = model.cross_entropy(sents)
    p = model.perplexity(sents)
if status.lang == "en":
    file = open(
        f"C:/Users/olgur/natural_language_toolkit_data/twitter_corpus/tweets_{topic}.txt",
        "a", encoding="utf-8")
    file.write(status.full_text)
    file.close()


reader = CategorizedPlaintextCorpusReader(
    "C:/Users/olgur/natural_language_toolkit_data/twitter_corpus",
    r'tweets_.*\.txt',
    cat_pattern=r'tweets_(\w+)\.txt')

# setting up stopwords
stopword_reader = PlaintextCorpusReader(
    "C:/Users/olgur/natural_language_toolkit_data/twitter_corpus/twitterstopwords/",
    r'.*\.txt',
    encoding='latin-1')

stop_words = set(['“', '”', '’', ",", "#", "—", "__", "_", "___"])
for file in stopword_reader.fileids():
    stops = stopword_reader.raw(file).replace("\n", ",").split(",")
    for word in stops:
        stop_words.add(word)


# text wrangling functions:
def remove_emoji(string):
    # github https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
    emoji_pattern = re.compile(
import nltk
from nltk.corpus.reader import WordListCorpusReader
from nltk.corpus.reader import PlaintextCorpusReader
from os import listdir
from os.path import isfile, join
from tag import tag_data, save_tagged_data
from evaluation import eval_all
from pickle import load

read = open('data/bestTagger.pkl', 'rb')
tagger = load(read)
read.close()

current = 301
self = []
for c in range(301, 485):  # 485
    reader = PlaintextCorpusReader('data/test_untagged/', [str(c) + '.txt'])
    file = open('data/untagged/' + str(c) + '.txt', 'r')
    text = file.read()
    file.close()
    entities = tag_data(text)
    self.append(entities)
    save_tagged_data(text, entities, c)

eval = eval_all(self)
for key, value in eval.items():
    print('***-' + key.upper() + '-***')
    for k, v in value.items():
        print(k + ': ' + str(v * 100) + '%')
import pyodbc
from random import randint
import nltk

cnxn = pyodbc.connect(r'DRIVER={SQL Server};SERVER=R0224576\RYANSQLSERVER;DATABASE=FAQ;UID=m097654;Trusted_Connection=yes')
cursor = cnxn.cursor()
data = cursor.execute('select msg from FACT').fetchall()

tokens = nltk.word_tokenize(str(data))
text = nltk.Text(tokens)
nwords = [w.lower() for w in text if w.isalpha()]
text = nltk.Text(nwords)

corpus_root = r'C:\Python_workspace\FAQ Scripts\corpus'
newcorpus = PlaintextCorpusReader(corpus_root, '.*')
postxt = newcorpus.words('positive-words.txt')
negtxt = newcorpus.words('negative-words.txt')

neglist = []
poslist = []
for i in range(0, len(negtxt)):
    neglist.append('negative')
for i in range(0, len(postxt)):
    poslist.append('positive')

postagged = zip(postxt, poslist)
negtagged = zip(negtxt, neglist)
import numpy as np
import nltk
import pandas as pd
from nltk.corpus.reader import PlaintextCorpusReader
from sklearn.feature_extraction.text import CountVectorizer

mycorpus = PlaintextCorpusReader(r"CSI58100TextFiles", r".*\.txt")

vec = CountVectorizer()
indx = 0
lst = []
for i in mycorpus.fileids():
    nlst = mycorpus.raw(i)
    indx = indx + 1
    lst.append(nlst)
corpus = np.array(lst)

# -----------Stop Words---------
vec = CountVectorizer(stop_words="english")
vec.fit(corpus)

# Sparse matrix
X = vec.transform(corpus)
bM = pd.DataFrame(X.toarray(),
                  columns=vec.get_feature_names(),
                  index=mycorpus.fileids()).T

print(type(corpus))
print(corpus)
print(bM)
# bM.to_csv('booleanMatrix.csv')

# # Jaccards similarity
# from sklearn.metrics import jaccard_score
# similarity = []
"""
The main driver function for data processing, and collecting features.
"""
if __name__ == '__main__':
    t = time.time()

    # Initialization
    output = []
    d = cmudict.dict()
    parser = English()

    # get corpus directories
    corpus_root_xml = nltk.data.find(
        'C:\\Users\\James\\PycharmProjects\\FIT3036\\xml')
    corpus_root_plain = 'C:\\Users\\James\\PycharmProjects\\FIT3036\\plain_text'

    # get all xml and plain text files from specified directories
    corpus_xml = CHILDESCorpusReader(corpus_root_xml, '.*.xml')
    corpus_plain = PlaintextCorpusReader(corpus_root_plain, '.*.cha')

    # get all the words spoken by a child
    all_words = [w.lower() for w in corpus_xml.words(speaker=['CHI'])]

    # init wordnet and language model
    corpus_ic = wn.ic(corpus_xml, True, 1.0)
    lm = LanguageModel(all_words)

    # collect all the features for each corpus
    for j in range(len(corpus_xml.fileids())):
        current_features = []  # init empty array to store features

        # Text initialization
        text_xml = corpus_xml.fileids()[j]
        text_plain = corpus_plain.fileids()[j]
        # Remove blank lines or colons from the beginning of the abstract
        if abstract[0] == []:
            del(abstract[0])
        else:
            del(abstract[0][0])
        # Indicate that we've found the abstract
        abstract_found = 1

# Clear the abstracts text file
with open("abstracts.txt", "w") as out_file:
    out_file.write("")

# Create a corpus from the files using NLTK
corpus = PlaintextCorpusReader("./Part1/", r".*\.txt")

# Loop through each file in the corpus
for fileid in corpus.fileids():
    # Set flags to 0
    org_found = 0       # Flag for when the NSF organization name has been found in the file
    amt_found = 0       # Flag for when the award amount has been found in the file
    abstract_found = 0  # Flag for when the abstract has been found in the file

    # Try to loop through each sentence in the file and apply GetOrg and GetAmt functions.
    try:
        for sent in corpus.sents(fileid):
            GetOrg()
            GetAmt()
class NLTKReader(object):

    ERROR = 0
    WARN = 1
    INFO = 2
    DEBUG = 3

    def __init__(self, input=None, cache_dir='/tmp/nupic_nlp', verbosity=0):
        # Create the cache directory if necessary.
        if not os.path.exists(cache_dir):
            os.mkdir(cache_dir)
        self.cache_dir = cache_dir
        self._verbosity = verbosity
        if input is not None:
            self.input_reader = PlaintextCorpusReader(input, r'.*\.txt')
        else:
            self.input_reader = None

    def _log(self, lvl, msg):
        if lvl <= self._verbosity:
            print(msg)

    def _is_noun(self, word):
        synonyms = len(wn.synsets(word, NOUN))
        self._log(self.DEBUG, 'found %i noun synonyms for %s' % (synonyms, word))
        return synonyms > 0

    def _get_cache_file(self, cache_name):
        return os.path.join(self.cache_dir, cache_name)

    def _write_cache(self, cache_name, data):
        cache_file = self._get_cache_file(cache_name)
        self._log(self.INFO, 'writing cache to %s' % cache_file)
        with open(cache_file, 'w') as f:
            f.write(data)

    def _cache_exists(self, cache_name):
        cache_file = self._get_cache_file(cache_name)
        return os.path.exists(cache_file)

    def _read_cache(self, cache_name):
        cache_file = self._get_cache_file(cache_name)
        self._log(self.INFO, 'reading cache from %s' % cache_file)
        return open(cache_file, 'r').read()

    def _check_text_availability(self, text_name):
        if text_name not in self.available_texts():
            raise Exception('No corpus available named "%s".' % text_name)

    def _get_reader_for(self, text_name):
        if text_name in gutenberg.fileids():
            return gutenberg
        else:
            return self.input_reader

    def available_texts(self):
        available = gutenberg.fileids()
        if self.input_reader is not None:
            available = available + self.input_reader.fileids()
        return available

    def text_report(self):
        print('%40s %10s %10s' % ('text', 'words', 'sentences'))
        for txt in self.available_texts():
            word_count = len(self.get_words(txt))
            sent_count = len(self.get_sentences(txt))
            print('%40s %10i %10i' % (txt, word_count, sent_count))

    def get_words_from_text(self, text_name):
        self._check_text_availability(text_name)
        words_with_punctuation = self.get_words(text_name)
        # Strip punctuation and make lower case.
        words = [w.lower() for w in words_with_punctuation
                 if w not in string.punctuation and len(w) > 3]
        # Remove duplicate words.
        words = list(set(words))
        self._log(self.INFO, 'Found %i unique words from %s' % (len(words), text_name))
        return words

    def get_nouns_from_text(self, text_name):
        self._log(self.INFO, '\nGetting nouns from %s' % text_name)
        cache_name = 'nouns_' + text_name
        if self._cache_exists(cache_name):
            nouns = self._read_cache(cache_name).split(',')
        else:
            words = self.get_words_from_text(text_name)
            self._log(self.WARN, 'Noun identification beginning. This might take awhile...')
            self._log(self.INFO, 'Tagging part of speech for %i words...' % len(words))
            tagged_words = pos_tag(words)
            self._log(self.INFO, 'Extracting all non-nouns based on POS tag...')
            nouns = [word for word, pos in tagged_words
                     if len(word) > 2 and pos == 'NN']
            self._log(self.INFO, '\t%i left' % len(nouns))
            self._log(self.INFO, 'Extracting further non-nouns based on Wordnet synonyms...')
            nouns = [noun for noun in nouns if self._is_noun(noun)]
            self._log(self.INFO, '\t%i left' % len(nouns))
            self._write_cache(cache_name, ','.join(nouns))
        self._log(self.INFO, 'Found %i total nouns from %s' % (len(nouns), text_name))
        return nouns

    def get_noun_pairs_from_all_texts(self):
        """Retrieves all nouns from the NLTK corpus of texts."""
        singulars = []
        for text in self.available_texts():
            singulars += self.get_nouns_from_text(text)
        singulars = list(set(singulars))
        return [(singular, plural(singular)) for singular in singulars]

    def get_words(self, text_name):
        self._check_text_availability(text_name)
        return self._get_reader_for(text_name).words(text_name)

    def get_sentences(self, text_name):
        self._check_text_availability(text_name)
        return self._get_reader_for(text_name).sents(text_name)

    def get_tagged_sentences(self, text_name, exclude_punctuation=False):
        for sent in self.get_sentences(text_name):
            if exclude_punctuation:
                sent = [word for word in sent if not is_punctuation(word)]
            yield pos_tag(sent)

    def get_parts_of_speech(self, text_name, exclude_punctuation=False):
        self._log(self.INFO, 'Parts of speech extraction beginning. This might take awhile...')
        pos = set()
        for sent in self.get_tagged_sentences(text_name,
                                              exclude_punctuation=exclude_punctuation):
            words, parts = zip(*sent)
            pos.update(parts)
        # Strip blanks (not sure why there are blanks, but there are sometimes).
        return sorted([p for p in pos if p != ''])

    def get_tag_descriptions(self):
        return tag_descriptions

    def describe_tag(self, tag):
        if tag not in tag_descriptions.keys():
            # Return the original tag if we don't know it
            return (tag, tag)
        return tag_descriptions[tag]
import nltk
from nltk.corpus.reader import TaggedCorpusReader, WordListCorpusReader, ChunkedCorpusReader, PlaintextCorpusReader
from nltk.tokenize import SpaceTokenizer, sent_tokenize, word_tokenize, PunktSentenceTokenizer
from nltk.corpus import gutenberg

## Corpus example ############################
sample = gutenberg.raw("bible-kjv.txt")
sent = sent_tokenize(sample)
for x in range(5):
    print("Sentence - %s\n" % (sent[x]))
    print("Words - %s\n" % (nltk.word_tokenize(sent[x])))

## Reading corpora from text files ############
## No POS tags, chunks or categories ##########
reader = PlaintextCorpusReader("/Users/atul/nltk_data/corpora/gutenberg", r'^.*\.txt')
files = reader.fileids()
print("File IDs:", files)
print("Number of files:", len(files))
print(reader.words(files[0]))
print(reader.sents(files[0]))

## Reading tagged corpora #####################
reader = TaggedCorpusReader('/Users/atul/nltk_data', r'brown.pos', tagset='en-brown')
reader1 = TaggedCorpusReader('/Users/atul/nltk_data', r'brown.pos', word_tokenizer=SpaceTokenizer())
print(reader.words())
import pandas as pd
import matplotlib.pyplot as plt

start = time()
now = datetime.now()
print(f'Started at {now}...')

# Create list of files to be read
data_path = os.path.join('corpora', 'bughunt', '2-clean-by-decade')
files = [
    os.path.join(root, filename)
    for root, _, files in os.walk(data_path)
    for filename in files
]

# Create a corpus reader with all the files
reader = PlaintextCorpusReader('.', files)

# Set up a translation table for punctuation to the empty string
table = str.maketrans('', '', string.punctuation)

# Get a list of English stopwords without punctuation
english_stops = set(stopwords.words('english'))
english_stops_nopunct = {stopword.translate(table) for stopword in english_stops}

# Load the insect wordlist of stems
insect_words = WordListCorpusReader('.', ['wordlists/insect-wordstems.txt'])

# A list to hold the frequency data