Example #1
    def __init__(self,
                 root,
                 fileids=None,
                 encoding='utf8',
                 skip_keywords=None,
                 **kwargs):
        """

        :param root: The file root of the corpus directory
        :param fileids: the list of file ids to consider, or wildcard expression
        :param skip_keywords: a list of words that mark whole paragraphs to be
        skipped by the paras() and words() methods
        :param encoding: the file encoding (default 'utf8')
        :param kwargs: any values to be passed to the NLTK superclasses, such as
        sent_tokenizer or word_tokenizer
        """
        if not fileids:
            fileids = r'.*\.txt'

        # Initialize the NLTK corpus reader objects
        PlaintextCorpusReader.__init__(self, root, fileids, encoding)
        CorpusReader.__init__(self, root, fileids, encoding)
        if 'sent_tokenizer' in kwargs:
            self._sent_tokenizer = kwargs['sent_tokenizer']
        if 'word_tokenizer' in kwargs:
            self._word_tokenizer = kwargs['word_tokenizer']
        self.skip_keywords = skip_keywords
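A minimal usage sketch for the reader defined above. The subclass name SkippingCorpusReader, the corpus path, and the keyword list are assumptions, since the example only shows __init__:

import nltk

# Hypothetical instantiation -- only __init__ above comes from the example.
reader = SkippingCorpusReader(
    '/path/to/corpus',
    skip_keywords=['DRAFT', 'CONFIDENTIAL'],
    sent_tokenizer=nltk.data.LazyLoader('tokenizers/punkt/english.pickle'))
print(reader.fileids())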
Example #2
def create_spacy_corpus(text_corpus: PlaintextCorpusReader,
                        lang: Language) -> Corpus:
    data = ((text_corpus.raw(fid), {
        'fileid': fid
    }) for fid in text_corpus.fileids())
    corpus = Corpus(lang, data)
    return corpus
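A hedged usage sketch for create_spacy_corpus: build an NLTK reader, load a spaCy pipeline, and pass both in (the directory name and model name are assumptions):

import spacy
from nltk.corpus.reader import PlaintextCorpusReader

nlp = spacy.load('en_core_web_sm')                       # any installed spaCy pipeline
reader = PlaintextCorpusReader('corpus_dir', r'.*\.txt')  # assumed corpus directory
corpus = create_spacy_corpus(reader, nlp)
print(len(corpus), 'documents loaded')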
Example #3
 def __init__(self, sep="/", 
              # Note that . needs to be escaped
              pattern = chinese_pattern,
              root=None, fileids=None):
     """docstring for __init__"""
     PlaintextCorpusReader.__init__(
         self,
         sep=sep, root=root, fileids=fileids,
         sent_tokenizer = RegexpTokenizer(pattern, gaps=True),
         encoding="utf-8")
Example #4
 def __init__(self, input=None, cache_dir='/tmp/nupic_nlp', verbosity=0):
     # Create the cache directory if necessary.
     if not os.path.exists(cache_dir):
         os.mkdir(cache_dir)
     self.cache_dir = cache_dir
     self._verbosity = verbosity
     if input is not None:
         self.input_reader = PlaintextCorpusReader(input, '.*\.txt')
     else:
         self.input_reader = None
Example #5
def create_text_corpus_from_zipfile(
        zf: ZipFile,
        pattern='.*\.txt',
        ensure_loaded=True) -> PlaintextCorpusReader:
    '''
    Loads a text corpus contained in a zipfile.
    '''
    pointer = ZipFilePathPointer(zf)
    corpus = PlaintextCorpusReader(pointer, pattern)

    if ensure_loaded:
        corpus.ensure_loaded()

    return corpus
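For example, the loader above could be driven like this (the archive name is an assumption):

from zipfile import ZipFile

with ZipFile('texts.zip') as zf:
    corpus = create_text_corpus_from_zipfile(zf)
    print(corpus.fileids())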
Example #6
 def __init__(
         self,
         sep="/",
         # Note that . needs to be escaped
         pattern=chinese_pattern,
         root=None,
         fileids=None):
     """docstring for __init__"""
     PlaintextCorpusReader.__init__(self,
                                    sep=sep,
                                    root=root,
                                    fileids=fileids,
                                    sent_tokenizer=RegexpTokenizer(
                                        pattern, gaps=True),
                                    encoding="utf-8")
Example #7
def load_sentences(text_file, stopwords, lang):
    path, f = ntsplit(text_file)
    reader = PlaintextCorpusReader(path, f)
    sentences = [sent for sent in reader.sents()]
    clean = []
    originalSentenceOf = {}
    if lang == "fr":
        stemmer = FrenchStemmer()
    elif lang == "en":
        stemmer = SnowballStemmer("english")
    # Data cleansing
    for sent in sentences:
        s = stemmize(stemmer, sent, stopwords)
        clean.append(" ".join(s))
        originalSentenceOf[clean[-1]] = sent
    setClean = set(clean)
    return setClean, originalSentenceOf, sentences, clean
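A possible call, assuming an English text file and NLTK's stopword list:

from nltk.corpus import stopwords

clean_set, original_of, sentences, clean = load_sentences(
    'docs/article.txt', set(stopwords.words('english')), 'en')  # assumed path
print(len(sentences), 'sentences,', len(clean_set), 'unique after stemming')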
Example #8
def get_emails(path, file_name=False):
    """
    Returns a list of readers for all the files in the path
    """

    full_path = getcwd() + path
    files = [
        file for file in listdir(full_path) if isfile(join(full_path, file))
    ]

    if file_name:
        readers = [(file, PlaintextCorpusReader(full_path, file).raw())
                   for file in files]
        return readers

    readers = [PlaintextCorpusReader(full_path, file).raw() for file in files]
    return readers
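Hypothetical usage (the subdirectory name is an assumption); note that path is joined onto the current working directory:

raw_emails = get_emails('/emails')                    # list of raw texts
named_emails = get_emails('/emails', file_name=True)  # list of (filename, raw text) pairs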
Example #9
    def __init__(self,
                 root,
                 fields=DOC_PATTERN,
                 sent_pattern=SENT_PATTERN,
                 encoding='utf8',
                 **kargs):
        """
        :param root: the directory containing the corpus
        :param fields: the fileid pattern of the target corpus files
        :param encoding: text encoding of the corpus files
        """

        PlaintextCorpusReader.__init__(
            self,
            root,
            fields,
            word_tokenizer=JanomeTokenizer(),
            sent_tokenizer=RegexpTokenizer(sent_pattern),
            encoding=encoding)
Example #10
 def __init__(self, input=None, cache_dir='/tmp/nupic_nlp', verbosity=0):
   # Create the cache directory if necessary.
   if not os.path.exists(cache_dir):
     os.mkdir(cache_dir)
   self.cache_dir = cache_dir
   self._verbosity = verbosity
   if input is not None:
     self.input_reader = PlaintextCorpusReader(input, '.*\.txt')
   else:
     self.input_reader = None
Example #11
def reader(ctx):
    u"""
        def __init__(self, root, fileids,
                 word_tokenizer=WordPunctTokenizer(),
                 sent_tokenizer=nltk.data.LazyLoader(
                     'tokenizers/punkt/english.pickle'),
                 para_block_reader=read_blankline_block,
                 encoding=None):
    """
    reader = PlaintextCorpusReader(ctx.textdatadir(), '.*.txt')
    return reader
Example #12
def enronCorpus():
    #get all fileids
    file_id_list = []
    for relation in os.listdir(corpus_dir):
        if (os.path.isfile(os.path.join(corpus_dir, relation))):
            tmp_corpus_file = os.path.join(corpus_dir, relation)
            file_id_list.append(relation)
    #make a corpus
    corpus = PlaintextCorpusReader(corpus_dir, file_id_list)

    return corpus
Example #13
 def __init__(self, root, fileids=None, encoding='utf8', skip_keywords=None,
              **kwargs):
     """
     :param root: The file root of the corpus directory
     :param fileids: the list of file ids to consider, or wildcard expression
      :param skip_keywords: a list of words that mark whole paragraphs to be
      skipped by the paras() and words() methods
      :param encoding: the file encoding (default 'utf8')
      :param kwargs: any values to be passed to the NLTK superclasses, such as
      sent_tokenizer or word_tokenizer
     """
     # Initialize the NLTK corpus reader objects
     PlaintextCorpusReader.__init__(self, root, fileids, encoding)
     # CorpusReader.__init__(self, root, fileids, encoding)
     if 'sent_tokenizer' in kwargs:
         self._sent_tokenizer = kwargs['sent_tokenizer']
     if 'word_tokenizer' in kwargs:
         self._word_tokenizer = kwargs['word_tokenizer']
     if 'pos_tagger' in kwargs:
         self.pos_tagger = kwargs['pos_tagger']
Example #14
def untagged_reading(path=''):

    """
    Read the untagged data

    :param path: the root of the directory where the files are located
    :return: a corpus reader containing all the words in the loaded files

    """

    word_list = PlaintextCorpusReader(path, '.*\.txt')
    return word_list
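Hypothetical usage (the directory name is an assumption):

reader = untagged_reading('data/untagged')
print(len(reader.words()), 'words loaded')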
Example #15
        y = np.zeros((batch_size), dtype=np.int32)
        for i in range(batch_size):
            for t, w in enumerate(sentence_list[index % len(sentence_list)]):
                x[i, t] = word2idx(w)
            y[i] = word2idx(next_word_list[index % len(next_word_list)])
            index = index + 1
        yield x, y

if __name__ == "__main__":
    directory = 'F:/Minhaz/GitHubRepo/News_Gen/Minhaz_Shahadat/Code/Bengali_Word2Vec_LSTM/'
    corpus_dir = directory + 'corpus/'
    examples = directory + 'examples.txt'
    vocabulary = directory + 'vocab.txt'
    
    w_t = RegexpTokenizer("[\u0980-\u09FF']+")
    corpus = PlaintextCorpusReader(corpus_dir, r'.*\.txt', word_tokenizer=w_t)
    
    text_in_words = []
    files = corpus.fileids()
    for f in files:    
        words_in_doc = corpus.words(f)
        text_in_words.append(words_in_doc)
    text_in_words = [[re.sub(r'\d+', '<number>', word) for word in document]for document in text_in_words]
    
    words = []
    for doc in text_in_words:
        for word in doc:
            words.append(word)
    words = sorted(set(words))
    print_vocabulary(vocabulary, words)
    
Example #16

DEMO_DATA_ROOT = "../../../RepositoryData/data"

## Loading Corpus Raw Texts

import nltk
from nltk.corpus.reader import PlaintextCorpusReader
import numpy as np
import jieba, re

jieba.set_dictionary(DEMO_DATA_ROOT + "/jiaba/dict.txt.big.txt")

corpus_dir = DEMO_DATA_ROOT+"/TaiwanPresidentialInaugarationSpeech_en"

twp = PlaintextCorpusReader(corpus_dir, ".*\.txt")

len(twp.raw())


## Word Segmentation

- Try two methods: `ckiptagger` vs. `jieba`

from ckiptagger import WS

````{margin}
```{note}
Please remember to download the CKIP model files and change the path accordingly.
```
````
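A minimal sketch of the ckiptagger route, assuming the model files were downloaded to ./data:

ws = WS('./data')                            # word segmentation model directory (assumption)
segmented = ws([twp.raw(twp.fileids()[0])])  # segment the first speech in the corpus
print(segmented[0][:20])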
Example #17
def training_data(paths=None, file_count=0):

    """
        Use the general pattern of a tag <ENAMEX\sTYPE=".*?">.*?</ENAMEX>
        to extract the bits of text containing the relevant information and
        group them into a list.
        Chunk the elements of the list, leaving only tuples formed from the type
        of each entity and its name.

        :param paths:        the paths towards the files containing the training data
        :param file_count:   the number of files to read
        :return:             a list of lists where each element is formed from the type of the entity and its full name
    """

    # extract training data from WSJ
    # pattern : the general pattern of a tag
    # snd_pattern : the approximate pattern of the desired information from the tag
    pattern = re.compile(r'<.*?TYPE=".*?">.*?</.*?>', re.ASCII)
    snd_pattern = re.compile(r'[>"].*?[<"]', re.ASCII)

    # the strings representing the tags extracted from the files

    text = PlaintextCorpusReader(paths[0], '.*\.txt')

    data = []
    for fid in text.fileids():
        # the flags were applied when the pattern was compiled; passing re.ASCII
        # to findall() would be treated as a start position and skip the first 256 characters
        data = data + pattern.findall(text.raw(fileids=fid))

    # from every tag form the list find the two sub-strings
    # that correspond to the snd_pattern
    # use sets to eliminate redundancy
    raw_entities = list(set(list(map(lambda re: (re[0], re[1].lower()), list(map(lambda x: (x[0], x[1]), [list(map(lambda s: (s[:len(s)-1])[1:], l)) for l in (re.findall(snd_pattern, tag) for tag in data)]))))))

    # extract data from names folders
    del data
    data = PlaintextCorpusReader(paths[1], '.*')

    name_data = data.words('names.male') + data.words('names.female') + data.words('names.family')

    # extract the most common 350 organization tokens

    organization_words = list(map(lambda o: word_tokenize(o[1]), list(filter(lambda x: x[0] == 'ORGANIZATION', raw_entities))))

    organization_specific_tokens = []
    for wl in organization_words:
        organization_specific_tokens += wl

    organization_specific_tokens = list(map(lambda f: f[0], FreqDist(organization_specific_tokens).most_common(350)))

    location_words = list(map(lambda o: word_tokenize(o[1]), list(filter(lambda x: x[0] == 'LOCATION', raw_entities))))
    location_specific_tokens = []
    for wl in location_words:
        location_specific_tokens += wl

    location_specific_tokens = list(map(lambda f: f[0], FreqDist(location_specific_tokens).most_common(350)))

    # put the names in a dictionary for quicker access
    name_dict = {}
    for n in list(set(name_data + names.words())):
        if n.lower()[0] in name_dict:
            name_dict[n.lower()[0]] += [n.lower()]
        else:
            name_dict[n.lower()[0]] = [n.lower()]

    # put the location data in a dictionary for quicker access
    loc_dict = {}
    for l in location_specific_tokens[1:]:
        if l[0] in loc_dict:
            loc_dict[l[0]] += [l]
        else:
            loc_dict[l[0]] = [l]

    # put the organization data in a dictionary for quicker access
    org_dict = {}
    for o in organization_specific_tokens:
        if o[0] in org_dict:
            org_dict[o[0]] += [o]
        else:
            org_dict[o[0]] = [o]

    entity_dict1 = {
        'PERSON': list(map(lambda p: p[1], list(filter(lambda e: e[0] == 'PERSON', raw_entities)))),
        'LOCATION': list(map(lambda l: l[1], list(filter(lambda e: e[0] == 'LOCATION', raw_entities)))),
        'ORGANIZATION': list(
            map(lambda o: o[1], list(filter(lambda e: e[0] == 'ORGANIZATION', raw_entities))))
    }

    entity_dict2 = {}
    for l in ['PERSON', 'ORGANIZATION', 'LOCATION']:
        entity_dict2[l] = {}
        for e in entity_dict1[l]:
            if e[0] in entity_dict2[l]:
                entity_dict2[l][e[0]] += [e]
            else:
                entity_dict2[l][e[0]] = [e]

    return entity_dict2, org_dict, name_dict, loc_dict
Example #18
''' Script to tag text files and write them to an output directory '''

import os
directory = "D:/Eigene Dateien_rklein/z_Forschung/_Konferenzen/_79_ICFCA - Dresden - Concept Analysis/Data/"
input_directory = directory + "Input/_Product_Management/"
output_directory = directory + "1_POS/"
if not os.path.exists(output_directory): os.mkdir(output_directory)

# reading stuff
file_list = os.listdir(input_directory)
print(file_list)

# just for testing create a corpus reader
from nltk.corpus.reader import PlaintextCorpusReader
reader = PlaintextCorpusReader(input_directory,'.*.txt')

reader.fileids()
reader.raw()
reader.sents()
reader.words()

## default POS tagger from NLTK ##
import nltk
# import pprint
# sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')
pos = "nltk"
path = output_directory + pos
if not os.path.exists(path): os.mkdir(path)
for i in range(len(file_list)):
#    posting = []
Example #19
if __name__ == '__main__':
    opts = docopt(__doc__)

    # load the model
    filename = opts['-i']
    f = open(filename, 'rb')
    model = pickle.load(f)
    f.close()

    # load the data
    # WORK HERE!! LOAD YOUR EVALUATION CORPUS
    #sents = gutenberg.sents('austen-persuasion.txt')
    corpora_dir = find(os.path.join(os.getcwd(), 'corpora'))
    custom_tokenizer = RegexpTokenizer('[^.!?]+')
    reader = PlaintextCorpusReader(corpora_dir,
                                   '.*\.txt',
                                   sent_tokenizer=custom_tokenizer)
    sents = reader.sents('test-utf8.txt')

    # load the model
    filename = opts['-i']
    f = open(filename, 'rb')
    model = pickle.load(f)
    f.close()

    # compute the cross entropy
    # WORK HERE!!
    log_prob = model.log_prob(sents)
    e = model.cross_entropy(sents)
    p = model.perplexity(sents)
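    # Sketch of reporting the metrics computed above (output format is an assumption).
    print('Log probability: {:.2f}'.format(log_prob))
    print('Cross entropy: {:.2f}'.format(e))
    print('Perplexity: {:.2f}'.format(p))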
Example #20
        if status.lang == "en":
            file = open(
                f"C:/Users/olgur/natural_language_toolkit_data/twitter_corpus/tweets_{topic}.txt",
                "a",
                encoding="utf-8")
            file.write(status.full_text)
            file.close()

reader = CategorizedPlaintextCorpusReader(
    "C:/Users/olgur/natural_language_toolkit_data/twitter_corpus",
    r'tweets_.*\.txt',
    cat_pattern=r'tweets_(\w+)\.txt')

# setting up stopwords
stopword_reader = PlaintextCorpusReader(
    "C:/Users/olgur/natural_language_toolkit_data/twitter_corpus/twitterstopwords/",
    r'.*\.txt',
    encoding='latin-1')
stop_words = set(['“', '”', '’', ",", "#", "—", "__", "_", "___"])

for file in stopword_reader.fileids():
    stops = stopword_reader.raw(file).replace("\n", ",").split(",")
    for word in stops:
        stop_words.add(word)

# text wrangling functions:


def remove_emoji(
    string
):  # github https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
    emoji_pattern = re.compile(
Example #21
import nltk
from nltk.corpus.reader import WordListCorpusReader
from nltk.corpus.reader import PlaintextCorpusReader
from os import listdir
from os.path import isfile, join
from tag import tag_data, save_tagged_data
from evaluation import eval_all
from pickle import load

read = open('data/bestTagger.pkl', 'rb')
tagger = load(read)
read.close()
current = 301
self = []
for c in range(301, 485):  #485
    reader = PlaintextCorpusReader('data/test_untagged/', [str(c) + '.txt'])
    file = open('data/untagged/' + str(c) + '.txt', 'r')
    text = file.read()
    file.close()
    entities = tag_data(text)
    self.append(entities)
    save_tagged_data(text, entities, c)
eval = eval_all(self)
for key, value in eval.items():
    print('***-' + key.upper() + '-***')
    for k, v in value.items():
        print(k + ': ' + str(v * 100) + '%')
Example #22
import nltk
import pyodbc
from random import randint

cnxn = pyodbc.connect(r'DRIVER={SQL Server};SERVER=R0224576\RYANSQLSERVER;DATABASE=FAQ;UID=m097654;Trusted_Connection=yes')
cursor = cnxn.cursor()

data = cursor.execute('select msg from FACT').fetchall()
tokens = nltk.word_tokenize(str(data))
text = nltk.Text(tokens)
nwords = [w.lower() for w in text if w.isalpha()]
text = nltk.Text(nwords)

corpus_root = r'C:\Python_workspace\FAQ Scripts\corpus'

newcorpus = PlaintextCorpusReader(corpus_root,'.*')
postxt = newcorpus.words('positive-words.txt')
negtxt = newcorpus.words('negative-words.txt')

neglist = []
poslist = []

for i in range(0,len(negtxt)):
	neglist.append('negative')

for i in range(0,len(postxt)):
	poslist.append('positive')

postagged = zip(postxt,poslist)
negtagged = zip(negtxt,neglist)
Example #23
import numpy as np
import nltk
import pandas as pd
from nltk.corpus.reader import PlaintextCorpusReader
from sklearn.feature_extraction.text import CountVectorizer

mycorpus = PlaintextCorpusReader(r"CSI58100TextFiles", r".*\.txt")
vec = CountVectorizer()
indx = 0
lst = []
for i in mycorpus.fileids():
    nlst = mycorpus.raw(i)
    indx = indx + 1
    lst.append(nlst)
corpus = np.array(lst)

#-----------Stop Words---------
vec = CountVectorizer(stop_words="english")
vec.fit(corpus)
#Sparse matrix
X = vec.transform(corpus)
bM = pd.DataFrame(X.toarray(),
                  columns=vec.get_feature_names(),
                  index=mycorpus.fileids()).T
print(type(corpus))
print(corpus)
print(bM)
# bM.to_csv('booleanMatrix.csv')
# # Jaccards similarity
# from sklearn.metrics import jaccard_score
# similarity = []
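A possible continuation of the commented-out idea above: pairwise Jaccard similarity between the boolean document vectors (the columns of bM are the corpus fileids):

from itertools import combinations
from sklearn.metrics import jaccard_score

binary = (bM > 0).astype(int)
for doc_a, doc_b in combinations(binary.columns, 2):
    print(doc_a, doc_b, jaccard_score(binary[doc_a], binary[doc_b]))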
Example #24
"""
The main driver function for data processing, and collecting features.
"""
if __name__ == '__main__':
    t = time.time()  # Initialization
    output = []
    d = cmudict.dict()
    parser = English()

    # get corpus directories
    corpus_root_xml = nltk.data.find(
        'C:\\Users\\James\\PycharmProjects\\FIT3036\\xml')
    corpus_root_plain = 'C:\\Users\\James\\PycharmProjects\\FIT3036\\plain_text'

    # get all xml and plain text files from specified directories
    corpus_xml = CHILDESCorpusReader(corpus_root_xml, '.*.xml')
    corpus_plain = PlaintextCorpusReader(corpus_root_plain, '.*.cha')

    # get all the words spoken by a child
    all_words = [w.lower() for w in corpus_xml.words(speaker=['CHI'])]

    # init wordnet and language model
    corpus_ic = wn.ic(corpus_xml, True, 1.0)
    lm = LanguageModel(all_words)

    # collect all the features for each corpus
    for j in range(len(corpus_xml.fileids())):
        current_features = []  # init empty array to store features
        # Text initialization
        text_xml = corpus_xml.fileids()[j]
        text_plain = corpus_plain.fileids()[j]
Example #25
                    # Remove blank lines or colons from the beginning of the abstract
                    if abstract[0] == []:
                        del(abstract[0])
                    else:
                        del(abstract[0][0])

        # Indicate that we've found the abstract
        abstract_found = 1


# Clear the abstracts text file
with open("abstracts.txt", "w") as out_file:
    out_file.write("")

# Create a corpus from the files using NLTK
corpus = PlaintextCorpusReader("./Part1/", ".*\.txt")

# Loop through each file in the corpus
for fileid in corpus._fileids:

    # Set flags to 0
    org_found = 0           # Flag for when the NSF organization name has been found in the file
    amt_found = 0           # Flag for when the award amount has been found in the file
    abstract_found = 0      # Flag for when the abstract has been found in the file

    # Try to loop through each sentence in the file and apply GetOrg and GetAmt functions.
    try:
        for sent in corpus.sents(fileid):
            GetOrg()
            GetAmt()
Example #26
class NLTKReader(object):

  ERROR = 0
  WARN = 1
  INFO = 2
  DEBUG = 3

  def __init__(self, input=None, cache_dir='/tmp/nupic_nlp', verbosity=0):
    # Create the cache directory if necessary.
    if not os.path.exists(cache_dir):
      os.mkdir(cache_dir)
    self.cache_dir = cache_dir
    self._verbosity = verbosity
    if input is not None:
      self.input_reader = PlaintextCorpusReader(input, '.*\.txt')
    else:
      self.input_reader = None


  def _log(self, lvl, msg):
    if lvl <= self._verbosity:
      print(msg)


  def _is_noun(self, word):
    synonyms = len(wn.synsets(word, NOUN))
    self._log(self.DEBUG, 'found %i noun synonyms for %s' % (synonyms, word))
    return synonyms > 0


  def _get_cache_file(self, cache_name):
    return os.path.join(self.cache_dir, cache_name)


  def _write_cache(self, cache_name, data):
    cache_file = self._get_cache_file(cache_name)
    self._log(self.INFO, 'writing cache to %s' % cache_file)
    with open(cache_file, 'w') as f:
      f.write(data)


  def _cache_exists(self, cache_name):
    cache_file = self._get_cache_file(cache_name)
    return os.path.exists(cache_file)


  def _read_cache(self, cache_name):
    cache_file = self._get_cache_file(cache_name)
    self._log(self.INFO, 'reading cache from %s' % cache_file)
    return open(cache_file, 'r').read()


  def _check_text_availability(self, text_name):
    if text_name not in self.available_texts():
      raise Exception('No corpus available named "%s".' % text_name)


  def _get_reader_for(self, text_name):
    if text_name in gutenberg.fileids():
      return gutenberg
    else:
      return self.input_reader


  def available_texts(self):
    available = gutenberg.fileids()
    if self.input_reader is not None:
      available = available + self.input_reader.fileids()
    return available


  def text_report(self):
    print('%40s %10s %10s' % ('text', 'words', 'sentences'))
    for txt in self.available_texts():
      word_count = len(self.get_words(txt))
      sent_count = len(self.get_sentences(txt))
      print('%40s %10i %10i' % (txt, word_count, sent_count))


  def get_words_from_text(self, text_name):
    self._check_text_availability(text_name)
    words_with_puncuation = self.get_words(text_name)
    # Strip punctuation and make lower case.
    words = [w.lower()
      for w in words_with_puncuation
      if w not in string.punctuation and len(w) > 3]
    # Remove duplicate nouns.
    words = list(set(words))
    self._log(self.INFO, 'Found %i unique words from %s' % (len(words), text_name))
    return words


  def get_nouns_from_text(self, text_name):
    self._log(self.INFO, '\nGetting nouns from %s' % text_name)
    cache_name = 'nouns_' + text_name
    if self._cache_exists(cache_name):
      nouns = self._read_cache(cache_name).split(',')
    else:
      words = self.get_words_from_text(text_name)
      self._log(self.WARN, 'Noun identification beginning. This might take awhile...')
      self._log(self.INFO, 'Tagging part of speech for %i words...' % len(words))
      tagged_words = pos_tag(words)

      self._log(self.INFO, 'Extracting all non-nouns based on POS tag...')
      nouns = [ word for word, pos in tagged_words if len(word) > 2 and pos == 'NN']
      self._log(self.INFO, '\t%i left' % len(nouns))

      self._log(self.INFO, 'Extracting further non-nouns based on Wordnet synonyms...')
      nouns = [ noun for noun in nouns if self._is_noun(noun) ]
      self._log(self.INFO, '\t%i left' % len(nouns))

      self._write_cache(cache_name, ','.join(nouns))

    self._log(self.INFO, 'Found %i total nouns from %s' \
      % (len(nouns), text_name))
    return nouns


  def get_noun_pairs_from_all_texts(self):
    """Retrieves all nouns from the NLTK corpus of texts."""
    singulars = []
    for text in self.available_texts():
      singulars += self.get_nouns_from_text(text)
    singulars = list(set(singulars))
    return [(singular, plural(singular)) for singular in singulars]


  def get_words(self, text_name):
    self._check_text_availability(text_name)
    return self._get_reader_for(text_name).words(text_name)


  def get_sentences(self, text_name):
    self._check_text_availability(text_name)
    return self._get_reader_for(text_name).sents(text_name)


  def get_tagged_sentences(self, text_name, exclude_punctuation=False):
    for sent in self.get_sentences(text_name):
      if exclude_punctuation:
        sent = [ word for word in sent if not is_punctuation(word) ]
      yield pos_tag(sent)


  def get_parts_of_speech(self, text_name, exclude_punctuation=False):
    self._log(self.INFO, 'Parts of speech extraction beginning. This might take awhile...')
    pos = set()
    for sent in self.get_tagged_sentences(text_name,
                                          exclude_punctuation=exclude_punctuation):
      words, parts = zip(*sent)
      pos.update(parts)
    # String blanks (not sure why there are blanks, but there are sometimes).
    return sorted([p for p in pos if p != ''])


  def get_tag_descriptions(self):
    return tag_descriptions


  def describe_tag(self, tag):
    if tag not in tag_descriptions.keys():
      # Return original tag if we don't know it
      return (tag,tag)
    return tag_descriptions[tag]
Example #27
class NLTKReader(object):

    ERROR = 0
    WARN = 1
    INFO = 2
    DEBUG = 3

    def __init__(self, input=None, cache_dir='/tmp/nupic_nlp', verbosity=0):
        # Create the cache directory if necessary.
        if not os.path.exists(cache_dir):
            os.mkdir(cache_dir)
        self.cache_dir = cache_dir
        self._verbosity = verbosity
        if input is not None:
            self.input_reader = PlaintextCorpusReader(input, '.*\.txt')
        else:
            self.input_reader = None

    def _log(self, lvl, msg):
        if lvl <= self._verbosity:
            print(msg)

    def _is_noun(self, word):
        synonyms = len(wn.synsets(word, NOUN))
        self._log(self.DEBUG,
                  'found %i noun synonyms for %s' % (synonyms, word))
        return synonyms > 0

    def _get_cache_file(self, cache_name):
        return os.path.join(self.cache_dir, cache_name)

    def _write_cache(self, cache_name, data):
        cache_file = self._get_cache_file(cache_name)
        self._log(self.INFO, 'writing cache to %s' % cache_file)
        with open(cache_file, 'w') as f:
            f.write(data)

    def _cache_exists(self, cache_name):
        cache_file = self._get_cache_file(cache_name)
        return os.path.exists(cache_file)

    def _read_cache(self, cache_name):
        cache_file = self._get_cache_file(cache_name)
        self._log(self.INFO, 'reading cache from %s' % cache_file)
        return open(cache_file, 'r').read()

    def _check_text_availability(self, text_name):
        if text_name not in self.available_texts():
            raise Exception('No corpus available named "%s".' % text_name)

    def _get_reader_for(self, text_name):
        if text_name in gutenberg.fileids():
            return gutenberg
        else:
            return self.input_reader

    def available_texts(self):
        available = gutenberg.fileids()
        if self.input_reader is not None:
            available = available + self.input_reader.fileids()
        return available

    def text_report(self):
        print('%40s %10s %10s' % ('text', 'words', 'sentences'))
        for txt in self.available_texts():
            word_count = len(self.get_words(txt))
            sent_count = len(self.get_sentences(txt))
            print('%40s %10i %10i' % (txt, word_count, sent_count))

    def get_words_from_text(self, text_name):
        self._check_text_availability(text_name)
        words_with_puncuation = self.get_words(text_name)
        # Strip punctuation and make lower case.
        words = [
            w.lower() for w in words_with_puncuation
            if w not in string.punctuation and len(w) > 3
        ]
        # Remove duplicate nouns.
        words = list(set(words))
        self._log(self.INFO,
                  'Found %i unique words from %s' % (len(words), text_name))
        return words

    def get_nouns_from_text(self, text_name):
        self._log(self.INFO, '\nGetting nouns from %s' % text_name)
        cache_name = 'nouns_' + text_name
        if self._cache_exists(cache_name):
            nouns = self._read_cache(cache_name).split(',')
        else:
            words = self.get_words_from_text(text_name)
            self._log(
                self.WARN,
                'Noun identification beginning. This might take awhile...')
            self._log(self.INFO,
                      'Tagging part of speech for %i words...' % len(words))
            tagged_words = pos_tag(words)

            self._log(self.INFO,
                      'Extracting all non-nouns based on POS tag...')
            nouns = [
                word for word, pos in tagged_words
                if len(word) > 2 and pos == 'NN'
            ]
            self._log(self.INFO, '\t%i left' % len(nouns))

            self._log(
                self.INFO,
                'Extracting further non-nouns based on Wordnet synonyms...')
            nouns = [noun for noun in nouns if self._is_noun(noun)]
            self._log(self.INFO, '\t%i left' % len(nouns))

            self._write_cache(cache_name, ','.join(nouns))

        self._log(self.INFO, 'Found %i total nouns from %s' \
          % (len(nouns), text_name))
        return nouns

    def get_noun_pairs_from_all_texts(self):
        """Retrieves all nouns from the NLTK corpus of texts."""
        singulars = []
        for text in self.available_texts():
            singulars += self.get_nouns_from_text(text)
        singulars = list(set(singulars))
        return [(singular, plural(singular)) for singular in singulars]

    def get_words(self, text_name):
        self._check_text_availability(text_name)
        return self._get_reader_for(text_name).words(text_name)

    def get_sentences(self, text_name):
        self._check_text_availability(text_name)
        return self._get_reader_for(text_name).sents(text_name)

    def get_tagged_sentences(self, text_name, exclude_punctuation=False):
        for sent in self.get_sentences(text_name):
            if exclude_punctuation:
                sent = [word for word in sent if not is_punctuation(word)]
            yield pos_tag(sent)

    def get_parts_of_speech(self, text_name, exclude_punctuation=False):
        self._log(
            self.INFO,
            'Parts of speech extraction beginning. This might take awhile...')
        pos = set()
        for sent in self.get_tagged_sentences(
                text_name, exclude_punctuation=exclude_punctuation):
            words, parts = zip(*sent)
            pos.update(parts)
        # String blanks (not sure why there are blanks, but there are sometimes).
        return sorted([p for p in pos if p != ''])

    def get_tag_descriptions(self):
        return tag_descriptions

    def describe_tag(self, tag):
        if tag not in tag_descriptions.keys():
            # Return original tag if we don't know it
            return (tag, tag)
        return tag_descriptions[tag]
Example #28
import nltk
from nltk.corpus.reader import TaggedCorpusReader, WordListCorpusReader, ChunkedCorpusReader, PlaintextCorpusReader
from nltk.tokenize import SpaceTokenizer, sent_tokenize, word_tokenize, PunktSentenceTokenizer
from nltk.corpus import gutenberg

## Corpus example ############################
sample = gutenberg.raw("bible-kjv.txt")
sent = sent_tokenize(sample)

for x in range(5):
    print("Sentence - %s\n" % (sent[x]))
    print("Words - %s\n" % (nltk.word_tokenize(sent[x])))

## Reading corpora from a text files ##########
## No POS tags, chunks or categories ##########
reader = PlaintextCorpusReader("/Users/atul/nltk_data/corpora/gutenberg",
                               r'^.*\.txt')
files = reader.fileids()
print("File IDs:", files)
print("Number of files:", len(files))
print(reader.words(files[0]))
print(reader.sents(files[0]))

## Reading tagged corpora #####################
reader = TaggedCorpusReader('/Users/atul/nltk_data',
                            r'brown.pos',
                            tagset='en-brown')
reader1 = TaggedCorpusReader('/Users/atul/nltk_data',
                             r'brown.pos',
                             word_tokenizer=SpaceTokenizer())

print(reader.words())
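The tagged readers also expose tagged views of the same file; a short sketch:

print(reader.tagged_words()[:10])   # list of (word, tag) pairs
print(reader1.sents()[0])           # first sentence, words split with SpaceTokenizer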
Example #29
import os
import string
from time import time
from datetime import datetime

import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.corpus.reader import PlaintextCorpusReader, WordListCorpusReader

start = time()
now = datetime.now()
print(f'Started at {now}...')

# Create list of files to be read
data_path = os.path.join('corpora', 'bughunt', '2-clean-by-decade')
files = [
    os.path.join(root, filename) for root, _, files in os.walk(data_path)
    for filename in files
]

# Create a corpus reader with all the files
reader = PlaintextCorpusReader('.', files)

# Set up a translation table for punctuation to the empty string
table = str.maketrans('', '', string.punctuation)

# Get a list of English stopwords without punctuation
english_stops = set(stopwords.words('english'))
english_stops_nopunct = {
    stopword.translate(table)
    for stopword in english_stops
}

# Load the insect wordlist of stems
insect_words = WordListCorpusReader('.', ['wordlists/insect-wordstems.txt'])

# A list to hold the frequency data