Example #1
    def __init__(self, root, **kwargs):
        """
        Initialize a PLoS reader with a specific corpus. Corpus
        information is contained in the 'root/corpus_info.json' file.

        @type  root: string
        @param root: The path to the corpus directory.
        """
        self._root = root
        fp = open( '%s/corpus_info.json' % (root), 'r' )
        self._corpus_info = info = json.load(fp)
        fp.close()

        # doc_part is specific to PLoS and research articles in general.
        # 'abstract' and 'body' are currently supported.
        # The corpus contains separate text for each, but the
        # reader is initialized to read only one.
        if 'doc_part' in kwargs:
            self._doc_part = doc_part = kwargs['doc_part']
            del kwargs['doc_part']
        else:
            self._doc_part = doc_part = 'body'
        if 'fileids' not in kwargs:
            fileids = [ doi2fn(d, doc_part) for d in info['d2c'].keys() ]
        else:
            fileids = kwargs['fileids']
        # cat_map: fileid -> [ c1, c2, ...]
        # The fileids depend on what the doc_part is ('body', 'abstract').
        cat_map = {}
        for d, cat in info['d2c'].iteritems():
            cat_map[doi2fn(d, doc_part)] = cat

        kwargs['cat_map'] = cat_map
        # Subclass of CategorizedPlaintextCorpusReader
        CategorizedPlaintextCorpusReader.__init__(self, root, fileids, **kwargs)
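
A minimal usage sketch of the reader above (hedged: the class name PlosCorpusReader and the 'plos_corpus' directory are assumptions, not taken from the source):

# Hypothetical usage; `PlosCorpusReader` and 'plos_corpus' are assumed names.
reader = PlosCorpusReader('plos_corpus', doc_part='abstract')
print(reader.categories())                    # categories come from corpus_info.json
first_cat = reader.categories()[0]
print(reader.words(categories=[first_cat])[:20])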
Example #2
    def _extract_meta_data(self, corpus_path):
        self.db = CategorizedPlaintextCorpusReader(corpus_path,
                                                   r'.*\.txt',
                                                   cat_pattern=r'(\w+)/*')
        new_corpus = Corpus(self.db, corpus_path, self.limit_memory,
                            self.verbose)
        return new_corpus.get_meta_features()
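
The cat_pattern used above takes the category from the first path component of each fileid. A sketch with an assumed two-folder layout (corpus_path/pos/*.txt and corpus_path/neg/*.txt, both hypothetical) shows what the reader would report:

from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

# The '(\w+)' group in cat_pattern captures the top-level folder name
# ('pos' or 'neg') as the file's category.
db = CategorizedPlaintextCorpusReader('corpus_path', r'.*\.txt',
                                      cat_pattern=r'(\w+)/*')
print(db.categories())        # e.g. ['neg', 'pos']
print(db.fileids('pos'))      # fileids located under 'pos/'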
Example #3
    def __init__(self,
                 input_folder_name,
                 doc_pattern,
                 categ_pattern,
                 encoding='utf-8'):
        CategorizedPlaintextCorpusReader.__init__(self,
                                                  input_folder_name,
                                                  doc_pattern,
                                                  cat_pattern=categ_pattern)
        self.input_folder_name = input_folder_name
        self.encoding = encoding
        self.root_reader = PlaintextCorpusReader(input_folder_name,
                                                 fileids=r'[^\/]*.' +
                                                 doc_pattern[-3:])
        #self.root_ids =[ os.path.join(input_folder_name,item) for item in self.root_reader.fileids()]

        self.root_ids = list(self.root_reader.fileids())
Example #4
    def predict(self, test_path):
        documents, self.y_test = self._read_corpus(
            CategorizedPlaintextCorpusReader(test_path,
                                             r'.*\.txt',
                                             cat_pattern=r'(\w+)/*'),
            test_path)
        self.X_test = self.representation.transform(documents)
        return self.automl.predict(self.X_test)
Example #5
  def __init__(self, root, **kwargs):
    """
    Initialize a PLoS reader with a specific corpus. Corpus
    information is contained in the 'root/corpus_info.json' file.

    @type  root: string
    @param root: The directory path to the corpus.
    """
    self._root = root

    # corpus_type is specific to Plos_builder
    # full - all documents that were built.
    # partial - documents excluding training
    # training - documents intended for training
    if 'corpus_type' in kwargs:
      self._corpus_type = kwargs['corpus_type']
      del kwargs['corpus_type']
    else:
      self._corpus_type = 'full'

    fn = '{d}/{t}_corpus_info.json'.format(d=root, t=self._corpus_type)
    with open( fn, 'r' ) as fp:
      self._corpus_info = info = json.load(fp)

    # doc_part is specific to PLoS and research articles.
    # 'abstract' and 'body' are currently supported.
    # The corpus contains separate text for each, but the
    # reader is initialized to read only one.
    if 'doc_part' in kwargs:
      self._doc_part = doc_part = kwargs['doc_part']
      del kwargs['doc_part']
    else:
      self._doc_part = doc_part = 'body'

    if 'fileids' not in kwargs:
      fileids = [ doi2fn(d, doc_part) for d in self.dois() ]
    else:
      fileids = kwargs['fileids']
    # cat_map: fileid -> [ c1, c2, ...]
    # The fileids depend on what the doc_part is ('body', 'abstract').
    kwargs['cat_map'] = { doi2fn(d, doc_part): cat for d, cat in info['dois_to_categories'].iteritems() }
    # Subclass of CategorizedPlaintextCorpusReader
    CategorizedPlaintextCorpusReader.__init__(self, root, fileids, **kwargs)
Example #6
	def load_documents(self,path):
		docs = CategorizedPlaintextCorpusReader(path,r'.*/.*',cat_pattern=r'(.*)/.*')
		print docs.categories()
		documents = [(list(docs.words(fileid)), category)
				for category in docs.categories()
				for fileid in docs.fileids(category)
		]
		random.shuffle(documents)
		return documents
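
A possible continuation (a sketch, not part of the original class): turning the shuffled (words, category) pairs into bag-of-words feature sets for an NLTK Naive Bayes classifier. The `loader` instance and the 80/20 split are assumptions.

import nltk

def bow_features(words):
    # Simple presence features over the document's words.
    return {w.lower(): True for w in words}

documents = loader.load_documents('corpus/')      # `loader` is hypothetical
featuresets = [(bow_features(words), cat) for words, cat in documents]
split = int(len(featuresets) * 0.8)
classifier = nltk.NaiveBayesClassifier.train(featuresets[:split])
print(nltk.classify.accuracy(classifier, featuresets[split:]))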
Example #7
	def load_documents(self,path):
		docs = CategorizedPlaintextCorpusReader(path,r'.*/.*',cat_pattern=r'(.*)/.*')
		for cat in docs.categories():
			self.cat_gram_freq[cat] = {}
			self.cat_word_freq[cat]={}
		return ((category,list(docs.words(fileid))) 
			for category in docs.categories() 
			for fileid in docs.fileids(category))
Example #8
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('directory',
                        help="the bill directory")
    parser.add_argument('--bigrams', action='store_true', dest='bigrams',
                        default=False, help='use bigrams')
    args = parser.parse_args()

    if args.bigrams:
        featurizer = bigram_feats
    else:
        featurizer = word_feats

    corpus = CategorizedPlaintextCorpusReader(
        root=args.directory,
        fileids=".*/.*\.txt",
        cat_pattern=r'(dem|rep)/')

    best_words = most_informative_words(corpus)

    dem_ids = corpus.fileids(categories=['dem'])
    rep_ids = corpus.fileids(categories=['rep'])

    dem_feats = [(featurizer(corpus.words(fileids=[f])), 'dem')
                 for f in dem_ids]
    rep_feats = [(featurizer(corpus.words(fileids=[f])), 'rep')
                 for f in rep_ids]

    dem_cutoff = len(dem_feats) * 5 / 6
    rep_cutoff = len(rep_feats) * 5 / 6
Example #9
import nltk
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

DOC_PATTERN = r'[\w_\s]+/[\w\s\d\-]+\.TXT'
CAT_PATTERN = r'([\w_\s]+)/.*'

corpus = CategorizedPlaintextCorpusReader('ENGLISH',
                                          DOC_PATTERN,
                                          cat_pattern=CAT_PATTERN)

print(corpus.categories())
print(corpus.fileids()[100:110])
print(corpus.words())
classifier.show_most_informative_features(5)

Example #10
#Document Classification

#Load Libraries

import os
import random
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

#Read the dataset into the categorized corpus

# Directory of the corpus
corpusdir = 'corpus/' 
review_corpus = CategorizedPlaintextCorpusReader(corpusdir, r'.*\.txt', cat_pattern=r'\d+_(\w+)\.txt')

# list of documents(fileid) and category (pos/neg)
documents = [(list(review_corpus.words(fileid)), category)
              for category in review_corpus.categories()
              for fileid in review_corpus.fileids(category)]
random.shuffle(documents)

for category in review_corpus.categories():
    print(category)

type(review_corpus)

len(documents)

#Compute word frequency
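
The example stops at the word-frequency step; a hedged continuation using nltk.FreqDist (the 2000-feature cutoff is an arbitrary choice for illustration) might look like this:

import nltk

# Frequency distribution over all corpus words; keep the most common as features.
all_words = nltk.FreqDist(w.lower() for w in review_corpus.words())
word_features = [w for w, _ in all_words.most_common(2000)]

def document_features(document_words):
    words = set(document_words)
    return {'contains({})'.format(w): (w in words) for w in word_features}

featuresets = [(document_features(doc), category) for doc, category in documents]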
Example #11
    parser = argparse.ArgumentParser()
    parser.add_argument('directory', help="the bill directory")
    parser.add_argument('--bigrams',
                        action='store_true',
                        dest='bigrams',
                        default=False,
                        help='use bigrams')
    args = parser.parse_args()

    if args.bigrams:
        featurizer = bigram_feats
    else:
        featurizer = word_feats

    corpus = CategorizedPlaintextCorpusReader(root=args.directory,
                                              fileids=".*/.*\.txt",
                                              cat_pattern=r'(dem|rep)/')

    best_words = most_informative_words(corpus)

    dem_ids = corpus.fileids(categories=['dem'])
    rep_ids = corpus.fileids(categories=['rep'])

    dem_feats = [(featurizer(corpus.words(fileids=[f])), 'dem')
                 for f in dem_ids]
    rep_feats = [(featurizer(corpus.words(fileids=[f])), 'rep')
                 for f in rep_ids]

    dem_cutoff = len(dem_feats) * 5 / 6
    rep_cutoff = len(rep_feats) * 5 / 6
Example #12
import os
import nltk
import re
from math import log
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader
from nltk.text import Text
from nltk import ConditionalFreqDist, FreqDist
from nltk.stem.snowball import FrenchStemmer

stemmer = FrenchStemmer()
stopwordsdir = "C:/Projects/Allocine/stopwords/used"
stopwords = []
root = "C:/Projects/Allocine/corpus2/"
cats = ['cine', 'autre', 'critique', 'critique_a']
reader = CategorizedPlaintextCorpusReader(root,
                                          r'.*\.txt',
                                          cat_pattern=r'(\w+)/*',
                                          encoding='latin-1')

text_all = Text(reader.words())
text_cine = Text(reader.words(categories='cine'))
text_autre = Text(reader.words(categories='autre'))
text_critique = Text(reader.words(categories='critique'))
text_critique_a = Text(reader.words(categories='critique_a'))
texts_list = [text_cine, text_autre, text_critique, text_critique_a]


def remove_accents(text):
    text = re.sub("[àâäÄÂÀ]", "a", text)
    text = re.sub("[éèêëÈÊËÉ]", "e", text)
    text = re.sub("[ïîìÏÎÌ]", "i", text)
    text = re.sub("[öôòÖÔÒ]", "o", text)
Example #13

# fileids_ = corpus_dir + '/rt-polarity*'

corpus_dir = '/home/mayank/IdeaProjects/Lab_Machine_Learning/src/Text_Analytics/data/rt-polaritydata'

cat_map_ = {'rt-polarity.pos': ['pos'], 'rt-polarity.neg': ['neg']}

corpus_treatment(corpus_dir)

encoded_corpus_dir = os.path.join(corpus_dir, 'encoded_data')
fileids_ = '^rt-polarity.*'

categorized_plaintext_corpusreader = CategorizedPlaintextCorpusReader(
    root=encoded_corpus_dir,
    cat_map=cat_map_,
    fileids=fileids_,
)

pos_words = categorized_plaintext_corpusreader.words(categories=['pos'])
pos_sents = categorized_plaintext_corpusreader.sents(categories=['pos'])
pos_paras = categorized_plaintext_corpusreader.paras(categories=['pos'])

neg_words = categorized_plaintext_corpusreader.words(categories=['neg'])
neg_sents = categorized_plaintext_corpusreader.sents(categories=['neg'])
neg_paras = categorized_plaintext_corpusreader.paras(categories=['neg'])

# NOTE: para views are not working; to be looked into later

# classification
train = pos_words
Example #14
import sys
import cPickle as pickle

from itertools import chain

# from nltk import trigrams, word_tokenize, sent_tokenize, FreqDist
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader
from nltk.util import ingrams

n = 3

train_path = "data/task1_train"

print "Loading categorized corpus in", train_path, "..."

cr = CategorizedPlaintextCorpusReader(train_path, ".*", cat_pattern=r"(\w*)")

# Get categories
print "%d categories: %s" % (len(cr.categories()), ", ".join(cr.categories()))

for c in [cr.categories()[0]]:
    print c + "..."
    sys.stdout.flush()

    ngrams = {}
    for i in range(n, 0, -1):
        print str(i) + "-grams..."
        ngrams[i] = {}
        prefix = ("",) * (i - 1)
        for ngram in ingrams(chain(prefix, cr.words(categories=[c])), n):
            if not ngram in ngrams[i]:
Example #15
        if type(i) == Tree:
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
                current_chunk = []
        else:
            continue
    return continuous_chunk



# create a corpus from the txt files given, with a file of categories to apply to the texts
corpus = CategorizedPlaintextCorpusReader(
                           'corpus/', 
                           r'.*\.txt',
                           cat_file="../textcats.prn")
"""
fileid="nytimes-2017.txt"
raw = corpus.raw(fileid)
raw = raw.replace("N.H.S.", "NHS")
words = word_tokenize(raw)
words = corpus.words(fileid)
clean0 = [word for word in words if word not in stoplist]
"""

bloblist = corpus.fileids()
#bloblist = corpus.fileids(categories='2016')
M=len(bloblist)
# Look at the categories
corpus.categories()
Example #16
import sys
from time import time
from itertools import chain
import cPickle as pickle


from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader
from nltk.probability import ConditionalProbDist, ConditionalFreqDist, MLEProbDist
from nltk.util import ingrams


print 'Loading corpus...',
t = time()

train_path = 'data/task1_train'
cr = CategorizedPlaintextCorpusReader(train_path, '.*', cat_pattern=r'(\w*)')

t = time() - t
print str(t) + 's'

# Test generation of CFD
print 'Creating CFD...',
sys.stdout.flush()
t = time()

cat = cr.categories()[0]

n = 3

cfd = ConditionalFreqDist()
prefix = ('',) * (n - 1)
Example #17
import os
import nltk
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

path = os.path.join(os.getcwd(), "debates")

DOC_PATTERN = r'(?!\.)[\w_\s]+/[\w\s\d\-]+\.txt'
CAT_PATTERN = r'([\w_\s]+)/.*'

corpus = CategorizedPlaintextCorpusReader(path,
                                          DOC_PATTERN,
                                          cat_pattern=CAT_PATTERN)


def tag_corpus(corpus):
    return [nltk.pos_tag(sent) for sent in corpus.sents()]


tagged_corpus = tag_corpus(corpus)

import spacy

nlp = spacy.load('en')


def spacy_ner(tokenized_sent):
    doc = nlp(' '.join(tokenized_sent))
    for ent in doc.ents:
        return ent.text, ent.label_

Example #18
def create_corpus(directory):
    word_tokenize = RegexpTokenizer(r'\w+(?:-\w+)*(?:[?!.,:])*')
    sent_tokenize = nltk.data.load('tokenizers/punkt/french.pickle')
    translation = str.maketrans("", "", ",.?!:")
    corpus = CategorizedPlaintextCorpusReader(directory, r"^[^.]*$",
                                              cat_file='cats.txt',
                                              encoding="iso-8859-1",
                                              word_tokenizer=word_tokenize,
                                              sent_tokenizer=sent_tokenize)
    return corpus
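
A possible call site for create_corpus (a sketch; the 'reviews_fr' directory and its cats.txt file, which maps each fileid to one or more categories, are assumptions):

# Hypothetical usage; the directory and its cats.txt mapping are assumed.
corpus = create_corpus('reviews_fr')
print(corpus.categories())
print(corpus.sents(categories=corpus.categories()[:1])[:2])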
Example #19
                    label=target_name)
    plt.legend(loc='best', shadow=False, scatterpoints=1)
    plt.title('PCA of BULATS dataset')
    plt.show()

    return model


if __name__ == "__main__":
    PATH = "model.pickle"
    # Loading speech features
    speech = pd.read_csv("/ExamplePath.csv")

    if not os.path.exists(PATH):
        nli = CategorizedPlaintextCorpusReader(CORPUS,
                                               DOC_PATTERN,
                                               cat_pattern=CAT_PATTERN)
        # since `nli` already has all the information (text and ids)
        # you don't need to iterate over it multiple times so
        # construct `X` and `y` in one go.
        X = []
        y = []
        for fileid in nli.fileids():
            X.append({
                'text': nli.raw(fileid),
                'id': fileid.split('/')[-1].split('.')[0]
            })
            y.append(nli.categories(fileid)[0])
        clf = PCA(n_components=2)
        model = build_and_evaluate(X, y, clf, speech)
Example #20
import os
import re
import csv

from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

srm_data_dir = '/home/mayank/work/projects/SRM/Data'

training_file_path = os.path.join(srm_data_dir, 'TrainingData.csv')
training_file = open(training_file_path, 'r')

root_dir = os.path.join(srm_data_dir, 'sub_data')

# normal_reader = PlaintextCorpusReader(root = root_dir,
#                                       fileids = ['Financial.csv'])

cat_map_ = {
    'Compliance': 'Compliance.csv',
    'Financial': 'Financial.csv',
    'Operational': 'Operational.csv',
    'Strategic': 'Strategic.csv'
}

cat_reader = CategorizedPlaintextCorpusReader(root=root_dir,
                                              fileids=r'.*\.csv',
                                              cat_map=cat_map_)
Example #21
import os
import re
from sklearn import metrics
import string
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader
from nltk.corpus import stopwords
from poslemma import LemmatizationWithPOSTagger
from ordinal_classification import OrdinalSVC
from sklearn.model_selection import StratifiedKFold

base_dir = '/Users/ja/Documents/www'
corpus_name = 'corpus'

min_topics = 10
max_topics = 60

corpus = CategorizedPlaintextCorpusReader(
    os.path.join(base_dir, corpus_name), fileids=r'(?!\.).*\.txt', cat_pattern=r'(\w+)/*')


def clean(doc):
    lemma = LemmatizationWithPOSTagger()
    stop = set(stopwords.words('english') + stopwords.words('numbers'))
    exclude = set(string.punctuation)
    wordchars = set(string.ascii_letters)
    wordchars |= set(string.digits)

    def contains_any(str, set):
        """Check whether 'str' contains ANY of the chars in 'set'"""
        return 1 in [c in str for c in set]

    def is_number(s):
        try:
Example #22
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader
DOC_PATTERN = r'(?!\.)[\w_\s]+/[\w\s\d\-]+\.txt'
CAT_PATTERN = r'([\w_\s]+)/.*'

corpus = CategorizedPlaintextCorpusReader('corpus/text',
                                          DOC_PATTERN,
                                          cat_pattern=CAT_PATTERN)

print(corpus.categories())
print(corpus.fileids('2019'))