Example #1
def create_spacy_corpus(text_corpus: PlaintextCorpusReader,
                        lang: Language) -> Corpus:
    data = ((text_corpus.raw(fid), {
        'fileid': fid
    }) for fid in text_corpus.fileids())
    corpus = Corpus(lang, data)
    return corpus
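
A hedged usage sketch of the function above (the `Corpus` type appears to be textacy's; the spaCy model name, corpus directory, and file pattern below are illustrative assumptions):

import spacy
from nltk.corpus.reader import PlaintextCorpusReader
# `Corpus` is assumed to come from textacy (e.g. `from textacy import Corpus`)

nlp = spacy.load('en_core_web_sm')                              # illustrative model name
plain = PlaintextCorpusReader('corpora/articles', r'.*\.txt')   # illustrative path
spacy_corpus = create_spacy_corpus(plain, nlp)
print(len(spacy_corpus))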
Example #2
    corpus_xml = CHILDESCorpusReader(corpus_root_xml, '.*.xml')
    corpus_plain = PlaintextCorpusReader(corpus_root_plain, '.*.cha')

    # get all the words spoken by a child
    all_words = [w.lower() for w in corpus_xml.words(speaker=['CHI'])]

    # init wordnet and language model
    corpus_ic = wn.ic(corpus_xml, True, 1.0)
    lm = LanguageModel(all_words)

    # collect all the features for each corpus
    for j in range(len(corpus_xml.fileids())):
        current_features = []  # init empty array to store features
        # Text initialization
        text_xml = corpus_xml.fileids()[j]
        text_plain = corpus_plain.fileids()[j]

        # list of words spoken by the child in lowercase
        child_words_xml = [
            w.lower() for w in corpus_xml.words(text_xml, speaker=['CHI'])
        ]

        # list of words spoken by the child in lowercase with replaced words
        child_words_replaced_xml = [
            w.lower()
            for w in corpus_xml.words(text_xml, speaker=['CHI'], replace=True)
        ]

        # list of words spoken by the child in lowercase with the stemmed words
        child_words_stemmed_xml = [
            w.lower()
            for w in corpus_xml.words(text_xml, speaker=['CHI'], stem=True)
        ]
Example #3
## Word Segmentation

- Try two methods: `ckiptagger` vs. `jieba`
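
A minimal jieba counterpart for comparison (a sketch, assuming `jieba` is installed; the sample sentence is made up):

import jieba

# jieba.lcut returns the segmented tokens as a list
print(jieba.lcut('今天天氣很好，我們一起去公園散步'))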

from ckiptagger import WS

````{margin}
```{note}
Please remember to download the CKIP model files and change the path accordingly.
```
````

ws = WS("/Users/Alvin/Dropbox/Corpus/CKIP_WordSeg/data")

## Print first 200 chars of file 13
print(twp.raw(fileids=twp.fileids()[13])[:200])

# word-seg the raw text and return a long string
def tokenize_raw1(raw):
    word_tok = [' '.join(para) for para in ws(nltk.regexp_tokenize(raw, r'[^\s]+'))] # para-like units
    raw_tok  = ' '.join(word_tok)
    return raw_tok

# word-seg the raw text and return list of words
def tokenize_raw2(raw):
    para_list = nltk.regexp_tokenize(raw, r'[^\s]+') # para-like units
    word_list = sum(ws(para_list),[]) 
    return word_list
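
A quick sanity check of the two helpers above, assuming `ws` has been initialized and `nltk` imported as shown earlier (the sample string is made up, and the outputs in the comments are only indicative):

sample_raw = '今天天氣很好 我們去公園散步'
print(tokenize_raw1(sample_raw))   # one long space-delimited string, e.g. '今天 天氣 很 好 ...'
print(tokenize_raw2(sample_raw))   # a flat token list, e.g. ['今天', '天氣', '很', '好', ...]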


def tokenize_raw3(raw):
Example #4
def training_data(paths=None, file_count=0):

    """
        Use the general pattern of a tag <ENAMEX\sTYPE=".*?">.*?</ENAMEX>
        in order to extract the bits of text containing the relevant information and
        group them into a list
        Chunk the elements of the list leaving only a tuple reprezented by the type of the entity
        and its name

        :param paths          the paths towards the file containing the training data
        :param file_count     the number of files to read
        :return               a list of lists where each element is a list formed from the type of the entity and its ful name
    """

    # extract training data from WSJ
    # pattern : the general pattern of a tag
    # snd_pattern : the approximate pattern of the desired information from the tag
    pattern = re.compile(r'<.*?TYPE=".*?">.*?</.*?>', re.ASCII)
    snd_pattern = re.compile(r'[>"].*?[<"]', re.ASCII)

    # the strings representing the tags extracted from the files

    text = PlaintextCorpusReader(paths[0], r'.*\.txt')

    data = []
    for fid in text.fileids():
        data = data + pattern.findall(text.raw(fileids=fid))

    # from every tag form the list find the two sub-strings
    # that correspond to the snd_pattern
    # use sets to eliminate redundancy
    raw_entities = set()
    for tag in data:
        # strip the leading and trailing delimiter from each matched sub-string
        parts = [s[1:-1] for s in snd_pattern.findall(tag)]
        raw_entities.add((parts[0], parts[1].lower()))
    raw_entities = list(raw_entities)

    # extract data from names folders
    del data
    data = PlaintextCorpusReader(paths[1], '.*')

    name_data = data.words('names.male') + data.words('names.female') + data.words('names.family')

    # extract the most common 350 organization tokens

    organization_words = [
        word_tokenize(name) for etype, name in raw_entities
        if etype == 'ORGANIZATION'
    ]

    organization_specific_tokens = []
    for wl in organization_words:
        organization_specific_tokens += wl

    organization_specific_tokens = [
        w for w, _ in FreqDist(organization_specific_tokens).most_common(350)
    ]

    location_words = [
        word_tokenize(name) for etype, name in raw_entities
        if etype == 'LOCATION'
    ]

    location_specific_tokens = []
    for wl in location_words:
        location_specific_tokens += wl

    location_specific_tokens = [
        w for w, _ in FreqDist(location_specific_tokens).most_common(350)
    ]

    # put the names in a dictionary for quicker access
    name_dict = {}
    for n in list(set(name_data + names.words())):
        if n.lower()[0] in name_dict:
            name_dict[n.lower()[0]] += [n.lower()]
        else:
            name_dict[n.lower()[0]] = [n.lower()]

    # put the location data in a dictionary for quicker access
    loc_dict = {}
    for l in location_specific_tokens[1:]:
        if l[0] in loc_dict:
            loc_dict[l[0]] += [l]
        else:
            loc_dict[l[0]] = [l]

    # put the organization data in a dictionary for quicker access
    org_dict = {}
    for o in organization_specific_tokens:
        if o[0] in org_dict:
            org_dict[o[0]] += [o]
        else:
            org_dict[o[0]] = [o]

    entity_dict1 = {
        'PERSON': [name for etype, name in raw_entities if etype == 'PERSON'],
        'LOCATION': [name for etype, name in raw_entities if etype == 'LOCATION'],
        'ORGANIZATION': [name for etype, name in raw_entities if etype == 'ORGANIZATION']
    }

    entity_dict2 = {}
    for l in ['PERSON', 'ORGANIZATION', 'LOCATION']:
        entity_dict2[l] = {}
        for e in entity_dict1[l]:
            if e[0] in entity_dict2[l]:
                entity_dict2[l][e[0]] += [e]
            else:
                entity_dict2[l][e[0]] = [e]

    return entity_dict2, org_dict, name_dict, loc_dict
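
To make the tag-chunking step described in the docstring concrete, here is a small standalone sketch of the two regexes on an invented ENAMEX line (the sentence and entity name are made up for illustration):

import re

pattern = re.compile(r'<.*?TYPE=".*?">.*?</.*?>', re.ASCII)
snd_pattern = re.compile(r'[>"].*?[<"]', re.ASCII)

sample = 'Shares of <ENAMEX TYPE="ORGANIZATION">Acme Corp.</ENAMEX> rose sharply.'
for tag in pattern.findall(sample):
    # the first sub-string holds the entity type, the second the entity text
    parts = [s[1:-1] for s in snd_pattern.findall(tag)]
    print(parts[0], parts[1].lower())   # -> ORGANIZATION acme corp.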
Example #5
            file.write(status.full_text)
            file.close()

reader = CategorizedPlaintextCorpusReader(
    "C:/Users/olgur/natural_language_toolkit_data/twitter_corpus",
    r'tweets_.*\.txt',
    cat_pattern=r'tweets_(\w+)\.txt')

# setting up stopwords
stopword_reader = PlaintextCorpusReader(
    "C:/Users/olgur/natural_language_toolkit_data/twitter_corpus/twitterstopwords/",
    r'.*\.txt',
    encoding='latin-1')
stop_words = set(['“', '”', '’', ",", "#", "—", "__", "_", "___"])

for file in stopword_reader.fileids():
    stops = stopword_reader.raw(file).replace("\n", ",").split(",")
    for word in stops:
        stop_words.add(word)

# text wrangling functions:


def remove_emoji(
    string
):  # github https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
from nltk.tokenize import SpaceTokenizer, sent_tokenize, word_tokenize, PunktSentenceTokenizer
from nltk.corpus import gutenberg

## Corpus example ############################
sample = gutenberg.raw("bible-kjv.txt")
sent = sent_tokenize(sample)

for x in range(5):
    print("Sentence - %s\n" % (sent[x]))
    print("Words - %s\n" % (nltk.word_tokenize(sent[x])))

## Reading corpora from text files ##########
## No POS tags, chunks or categories ##########
reader = PlaintextCorpusReader("/Users/atul/nltk_data/corpora/gutenberg",
                               r'^.*\.txt')
files = reader.fileids()
print("File IDs:", files)
print("Number of files:", len(files))
print(reader.words(files[0]))
print(reader.sents(files[0]))

## Reading tagged corpora #####################
reader = TaggedCorpusReader('/Users/atul/nltk_data',
                            r'brown.pos',
                            tagset='en-brown')
reader1 = TaggedCorpusReader('/Users/atul/nltk_data',
                             r'brown.pos',
                             word_tokenizer=SpaceTokenizer())

print(reader.words())
print(reader.sents())
Example #7
class NLTKReader(object):

    ERROR = 0
    WARN = 1
    INFO = 2
    DEBUG = 3

    def __init__(self, input=None, cache_dir='/tmp/nupic_nlp', verbosity=0):
        # Create the cache directory if necessary.
        if not os.path.exists(cache_dir):
            os.mkdir(cache_dir)
        self.cache_dir = cache_dir
        self._verbosity = verbosity
        if input is not None:
            self.input_reader = PlaintextCorpusReader(input, r'.*\.txt')
        else:
            self.input_reader = None

    def _log(self, lvl, msg):
        if lvl <= self._verbosity:
            print(msg)

    def _is_noun(self, word):
        synonyms = len(wn.synsets(word, NOUN))
        self._log(self.DEBUG,
                  'found %i noun synonyms for %s' % (synonyms, word))
        return synonyms > 0

    def _get_cache_file(self, cache_name):
        return os.path.join(self.cache_dir, cache_name)

    def _write_cache(self, cache_name, data):
        cache_file = self._get_cache_file(cache_name)
        self._log(self.INFO, 'writing cache to %s' % cache_file)
        with open(cache_file, 'w') as f:
            f.write(data)

    def _cache_exists(self, cache_name):
        cache_file = self._get_cache_file(cache_name)
        return os.path.exists(cache_file)

    def _read_cache(self, cache_name):
        cache_file = self._get_cache_file(cache_name)
        self._log(self.INFO, 'reading cache from %s' % cache_file)
        return open(cache_file, 'r').read()

    def _check_text_availability(self, text_name):
        if text_name not in self.available_texts():
            raise Exception('No corpus available named "%s".' % text_name)

    def _get_reader_for(self, text_name):
        if text_name in gutenberg.fileids():
            return gutenberg
        else:
            return self.input_reader

    def available_texts(self):
        available = gutenberg.fileids()
        if self.input_reader is not None:
            available = available + self.input_reader.fileids()
        return available

    def text_report(self):
        print('%40s %10s %10s' % ('text', 'words', 'sentences'))
        for txt in self.available_texts():
            word_count = len(self.get_words(txt))
            sent_count = len(self.get_sentences(txt))
            print('%40s %10i %10i' % (txt, word_count, sent_count))

    def get_words_from_text(self, text_name):
        self._check_text_availability(text_name)
        words_with_puncuation = self.get_words(text_name)
        # Strip punctuation and make lower case.
        words = [
            w.lower() for w in words_with_puncuation
            if w not in string.punctuation and len(w) > 3
        ]
        # Remove duplicate nouns.
        words = list(set(words))
        self._log(self.INFO,
                  'Found %i unique words from %s' % (len(words), text_name))
        return words

    def get_nouns_from_text(self, text_name):
        self._log(self.INFO, '\nGetting nouns from %s' % text_name)
        cache_name = 'nouns_' + text_name
        if self._cache_exists(cache_name):
            nouns = self._read_cache(cache_name).split(',')
        else:
            words = self.get_words_from_text(text_name)
            self._log(
                self.WARN,
                'Noun identification beginning. This might take awhile...')
            self._log(self.INFO,
                      'Tagging part of speech for %i words...' % len(words))
            tagged_words = pos_tag(words)

            self._log(self.INFO,
                      'Extracting all non-nouns based on POS tag...')
            nouns = [
                word for word, pos in tagged_words
                if len(word) > 2 and pos == 'NN'
            ]
            self._log(self.INFO, '\t%i left' % len(nouns))

            self._log(
                self.INFO,
                'Extracting further non-nouns based on Wordnet synonyms...')
            nouns = [noun for noun in nouns if self._is_noun(noun)]
            self._log(self.INFO, '\t%i left' % len(nouns))

            self._write_cache(cache_name, ','.join(nouns))

        self._log(self.INFO, 'Found %i total nouns from %s' \
          % (len(nouns), text_name))
        return nouns

    def get_noun_pairs_from_all_texts(self):
        """Retrieves all nouns from the NLTK corpus of texts."""
        singulars = []
        for text in self.available_texts():
            singulars += self.get_nouns_from_text(text)
        singulars = list(set(singulars))
        return [(singular, plural(singular)) for singular in singulars]

    def get_words(self, text_name):
        self._check_text_availability(text_name)
        return self._get_reader_for(text_name).words(text_name)

    def get_sentences(self, text_name):
        self._check_text_availability(text_name)
        return self._get_reader_for(text_name).sents(text_name)

    def get_tagged_sentences(self, text_name, exclude_punctuation=False):
        for sent in self.get_sentences(text_name):
            if exclude_punctuation:
                sent = [word for word in sent if not is_punctuation(word)]
            yield pos_tag(sent)

    def get_parts_of_speech(self, text_name, exclude_punctuation=False):
        self._log(
            self.INFO,
            'Parts of speech extraction beginning. This might take awhile...')
        pos = set()
        for sent in self.get_tagged_sentences(
                text_name, exclude_punctuation=exclude_punctuation):
            words, parts = zip(*sent)
            pos.update(parts)
        # Strip blanks (not sure why there are blanks, but there are sometimes).
        return sorted([p for p in pos if p != ''])

    def get_tag_descriptions(self):
        return tag_descriptions

    def describe_tag(self, tag):
        if tag not in tag_descriptions.keys():
            # Return original tag if we don't know it
            return (tag, tag)
        return tag_descriptions[tag]
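
A hedged usage sketch of the class above (it also depends on module-level imports such as `gutenberg`, `pos_tag`, `wn`, `string`, and a `plural` helper that are not part of this excerpt; the corpus path is an illustrative assumption):

reader = NLTKReader(input='/tmp/my_corpus', verbosity=NLTKReader.INFO)
print(reader.available_texts()[:5])                     # Gutenberg texts plus any local *.txt files
nouns = reader.get_nouns_from_text('austen-emma.txt')   # results are cached under /tmp/nupic_nlp
print(len(nouns))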
Example #9
import numpy as np
import nltk
import pandas as pd
from nltk.corpus.reader import PlaintextCorpusReader
from sklearn.feature_extraction.text import CountVectorizer

mycorpus = PlaintextCorpusReader(r"CSI58100TextFiles", r".*\.txt")
vec = CountVectorizer()
indx = 0
lst = []
for i in mycorpus.fileids():
    nlst = mycorpus.raw(i)
    indx = indx + 1
    lst.append(nlst)
corpus = np.array(lst)

#-----------Stop Words---------
vec = CountVectorizer(stop_words="english")
vec.fit(corpus)
#Sparse matrix
X = vec.transform(corpus)
bM = pd.DataFrame(X.toarray(),
                  columns=vec.get_feature_names(),
                  index=mycorpus.fileids()).T
print(type(corpus))
print(corpus)
print(bM)
# bM.to_csv('booleanMatrix.csv')
# # Jaccards similarity
# from sklearn.metrics import jaccard_score
# similarity = []
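
The commented-out Jaccard block could be continued along these lines (a sketch, not part of the original; it binarizes the count matrix and compares the first two documents, an arbitrary choice):

from sklearn.metrics import jaccard_score

binary = (X.toarray() > 0).astype(int)        # presence/absence version of the term matrix
similarity = jaccard_score(binary[0], binary[1])
print(similarity)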
Example #10
import os
directory = "D:/Eigene Dateien_rklein/z_Forschung/_Konferenzen/_79_ICFCA - Dresden - Concept Analysis/Data/"
input_directory = directory + "Input/_Product_Management/"
output_directory = directory + "1_POS/"
if not os.path.exists(output_directory): os.mkdir(output_directory)

# reading stuff
file_list = os.listdir(input_directory)
print(file_list)

# just for testing create a corpus reader
from nltk.corpus.reader import PlaintextCorpusReader
reader = PlaintextCorpusReader(input_directory,'.*.txt')

reader.fileids()
reader.raw()
reader.sents()
reader.words()

## default POS tagger from NLTK ##
import nltk
# import pprint
# sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')
pos = "nltk"
path = output_directory + pos
if not os.path.exists(path): os.mkdir(path)
for i in range(len(file_list)):
#    posting = []
    output = path + "/" + str(file_list[i])
    jfile = open(output, "w")
Example #11
                x[i, t] = word2idx(w)
            y[i] = word2idx(next_word_list[index % len(next_word_list)])
            index = index + 1
        yield x, y

if __name__ == "__main__":
    directory = 'F:/Minhaz/GitHubRepo/News_Gen/Minhaz_Shahadat/Code/Bengali_Word2Vec_LSTM/'
    corpus_dir = directory + 'corpus/'
    examples = directory + 'examples.txt'
    vocabulary = directory + 'vocab.txt'
    
    w_t = RegexpTokenizer("[\u0980-\u09FF']+")
    corpus = PlaintextCorpusReader(corpus_dir, r'.*\.txt', word_tokenizer=w_t)
    
    text_in_words = []
    files = corpus.fileids()
    for f in files:    
        words_in_doc = corpus.words(f)
        text_in_words.append(words_in_doc)
    text_in_words = [[re.sub(r'\d+', '<number>', word) for word in document]
                     for document in text_in_words]
    
    words = []
    for doc in text_in_words:
        for word in doc:
            words.append(word)
    words = sorted(set(words))
    print_vocabulary(vocabulary, words)
    
    if not os.path.isdir(directory + 'checkpoints/'):
        os.makedirs(directory + 'checkpoints/')
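
The fragment above calls `word2idx` and `print_vocabulary`, which are not part of this excerpt; a minimal sketch of what they might look like given the sorted `words` list built earlier (both are hypothetical reconstructions):

word_indices = {w: i for i, w in enumerate(words)}   # hypothetical word-to-index lookup

def word2idx(word):
    # map a word to its index in the sorted vocabulary
    return word_indices[word]

def print_vocabulary(path, vocab_words):
    # write one vocabulary entry per line
    with open(path, 'w', encoding='utf-8') as f:
        for w in vocab_words:
            f.write(w + '\n')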