def load_documents(self, path):
    # Build a categorized corpus where each subdirectory names a category.
    docs = CategorizedPlaintextCorpusReader(path, r'.*/.*', cat_pattern=r'(.*)/.*')
    for cat in docs.categories():
        self.cat_gram_freq[cat] = {}
        self.cat_word_freq[cat] = {}
    # Lazily yield one (category, tokens) pair per file.
    return ((category, list(docs.words(fileid)))
            for category in docs.categories()
            for fileid in docs.fileids(category))
def load_documents(self, path):
    docs = CategorizedPlaintextCorpusReader(path, r'.*/.*', cat_pattern=r'(.*)/.*')
    print(docs.categories())
    documents = [(list(docs.words(fileid)), category)
                 for category in docs.categories()
                 for fileid in docs.fileids(category)]
    random.shuffle(documents)
    return documents
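# Typical downstream use of the shuffled (tokens, label) pairs returned above
# (a sketch; `loader` is a hypothetical instance of the enclosing class):
documents = loader.load_documents('corpus/')
cutoff = int(len(documents) * 0.8)
train_docs, test_docs = documents[:cutoff], documents[cutoff:]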
import nltk
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

DOC_PATTERN = r'[\w_\s]+/[\w\s\d\-]+\.TXT'
CAT_PATTERN = r'([\w_\s]+)/.*'

corpus = CategorizedPlaintextCorpusReader('ENGLISH', DOC_PATTERN, cat_pattern=CAT_PATTERN)
print(corpus.categories())
print(corpus.fileids()[100:110])
print(corpus.words())
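# A small follow-on sketch (assuming the ENGLISH/ root is laid out as
# <category>/<file>.TXT, which is what DOC_PATTERN and CAT_PATTERN encode):
# count the files in each category.
for cat in corpus.categories():
    print(cat, len(corpus.fileids(categories=[cat])))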
# Fragment: `parser`, `bigram_feats`, `word_feats`, and
# `most_informative_words` are defined earlier in the original script.
                    default=False, help='use bigrams')
args = parser.parse_args()

if args.bigrams:
    featurizer = bigram_feats
else:
    featurizer = word_feats

corpus = CategorizedPlaintextCorpusReader(root=args.directory,
                                          fileids=r'.*/.*\.txt',
                                          cat_pattern=r'(dem|rep)/')
best_words = most_informative_words(corpus)

dem_ids = corpus.fileids(categories=['dem'])
rep_ids = corpus.fileids(categories=['rep'])
dem_feats = [(featurizer(corpus.words(fileids=[f])), 'dem') for f in dem_ids]
rep_feats = [(featurizer(corpus.words(fileids=[f])), 'rep') for f in rep_ids]

# Integer division so the 5/6 train cutoffs are valid slice indices.
dem_cutoff = len(dem_feats) * 5 // 6
rep_cutoff = len(rep_feats) * 5 // 6
train_feats = dem_feats[:dem_cutoff] + rep_feats[:rep_cutoff]
test_feats = dem_feats[dem_cutoff:] + rep_feats[rep_cutoff:]
print('training on %d instances, testing on %d instances' % (
    len(train_feats), len(test_feats)))
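# `word_feats` and `bigram_feats` are not defined in this fragment. A minimal
# sketch of what they typically look like in scripts of this shape (an
# assumption, not the original author's code):
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

def word_feats(words):
    # Bag-of-words: every token becomes a True-valued feature.
    return dict((word, True) for word in words)

def bigram_feats(words, n=200):
    # Word features plus the n highest-scoring bigrams (chi-squared).
    words = list(words)
    finder = BigramCollocationFinder.from_words(words)
    bigrams = finder.nbest(BigramAssocMeasures.chi_sq, n)
    return dict((feat, True) for feat in words + bigrams)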
# Load libraries
import os
import random
import nltk
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

# Read the dataset into the categorized corpus.
# Directory of the corpus; the category (pos/neg) is encoded in each filename.
corpusdir = 'corpus/'
review_corpus = CategorizedPlaintextCorpusReader(corpusdir, r'.*\.txt',
                                                 cat_pattern=r'\d+_(\w+)\.txt')

# List of (document tokens, category) pairs.
documents = [(list(review_corpus.words(fileid)), category)
             for category in review_corpus.categories()
             for fileid in review_corpus.fileids(category)]
random.shuffle(documents)

for category in review_corpus.categories():
    print(category)
print(type(review_corpus))
print(len(documents))

# Compute word frequency; use most_common() because iterating a FreqDist
# does not return words sorted by frequency.
all_words = nltk.FreqDist(w.lower() for w in review_corpus.words())
word_features = [w for w, _ in all_words.most_common(200)]
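# A plausible next step (an assumption, following the classic NLTK
# movie-review classification recipe): featurize each document by word
# presence and train a Naive Bayes classifier.
def document_features(document):
    document_words = set(document)
    return {'contains(%s)' % word: (word in document_words)
            for word in word_features}

featuresets = [(document_features(d), c) for (d, c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))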
                    label=target_name)
    plt.legend(loc='best', shadow=False, scatterpoints=1)
    plt.title('PCA of BULATS dataset')
    plt.show()
    return model


if __name__ == "__main__":
    PATH = "model.pickle"
    # Loading speech features
    speech = pd.read_csv("/ExamplePath.csv")
    if not os.path.exists(PATH):
        nli = CategorizedPlaintextCorpusReader(CORPUS, DOC_PATTERN,
                                               cat_pattern=CAT_PATTERN)
        # Since `nli` already has all the information (text and ids),
        # you don't need to iterate over it multiple times, so
        # construct `X` and `y` in one go.
        X = []
        y = []
        for fileid in nli.fileids():
            X.append({
                'text': nli.raw(fileid),
                'id': fileid.split('/')[-1].split('.')[0]
            })
            y.append(nli.categories(fileid)[0])
        clf = PCA(n_components=2)
        model = build_and_evaluate(X, y, clf, speech)
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

DOC_PATTERN = r'(?!\.)[\w_\s]+/[\w\s\d\-]+\.txt'
CAT_PATTERN = r'([\w_\s]+)/.*'

corpus = CategorizedPlaintextCorpusReader('corpus/text', DOC_PATTERN,
                                          cat_pattern=CAT_PATTERN)
print(corpus.categories())
print(corpus.fileids('2019'))
# Fragment: `corpus`, `lemma`, `stop`, `wordchars`, `exclude`, `min_topics`,
# and `max_topics` are defined elsewhere in the original script.

def contains_any(str, set):
    # Assumed signature: the function is called as `contains_any(w, wordchars)` below.
    return 1 in [c in str for c in set]

def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

def clean(doc):
    # Assumed signature: the function is called as `clean(corpus.words(fid))` below.
    doc_lowercase = [w.lower() for w in doc]
    # lemmatize() expects a single word, so lemmatize token by token
    # rather than passing the whole list.
    return [lemma.lemmatize(w) for w in doc_lowercase
            if not is_number(w)
            and len(w) > 1
            and contains_any(w, wordchars)
            and not contains_any(w, exclude)
            and w not in stop]

doc_dict = {fid: clean(corpus.words(fid))
            for cat in corpus.categories()
            for fid in corpus.fileids(cat)}  # XXX
docs = list(doc_dict.values())
dictionary = gensim.corpora.Dictionary(docs)
doc_ids = [k for k in doc_dict.keys()]
doc_term_matrix = [dictionary.doc2bow(doc) for doc in docs]
# Rows have different lengths, so build an object array.
bow_array = np.array(doc_term_matrix, dtype=object)

def find_best_lda_model(texts, bow, id2word, min_n=min_topics, max_n=max_topics):
    best_model = None
    max_coherence = -1
    for n in range(min_n, max_n + 1):
        ctm = CtmModel(bow, id2word=id2word, num_topics=n)
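        # The loop is truncated in the original. A hedged sketch of how it
        # presumably continues, given `best_model`/`max_coherence` above:
        # score each model with gensim's CoherenceModel and keep the best.
        # (Assumes `from gensim.models import CoherenceModel`.)
        coherence = CoherenceModel(model=ctm, texts=texts, dictionary=id2word,
                                   coherence='c_v').get_coherence()
        if coherence > max_coherence:
            max_coherence = coherence
            best_model = ctm
    return best_model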
# Create a corpus from the txt files given, with a file of categories
# to apply to the texts.
corpus = CategorizedPlaintextCorpusReader('corpus/', r'.*\.txt',
                                          cat_file="../textcats.prn")

"""
fileid = "nytimes-2017.txt"
raw = corpus.raw(fileid)
raw = raw.replace("N.H.S.", "NHS")
words = word_tokenize(raw)
words = corpus.words(fileid)
clean0 = [word for word in words if word not in stoplist]
"""

bloblist = corpus.fileids()
# bloblist = corpus.fileids(categories='2016')
M = len(bloblist)

# Look at the categories
print(corpus.categories())

# For each file in the corpus, normalise a few spellings before tokenizing.
for fileid in bloblist:
    raw = corpus.raw(fileid)
    raw = raw.replace("N.H.S.", "NHS")
    raw = raw.replace("per cent", "%")
    raw = raw.replace("votes", "vote")
    raw = raw.replace("voted", "vote")
    words = word_tokenize(raw)
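    # The loop is truncated in the original; a plausible continuation
    # (assumption, mirroring the commented-out cleaning step above) drops
    # stopwords from the tokenized text.
    clean0 = [word for word in words if word not in stoplist]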