from nltk.corpus import stopwords
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from wordcloud import WordCloud

# German stopwords. The list is kept under its original name for any later
# code that uses it; the frozenset copy gives O(1) membership tests while
# filtering tokens instead of a linear scan of the list per token.
stopwordlist = stopwords.words('german')
_stopword_set = frozenset(stopwordlist)

rootDir = "../01access/GERMAN"
# Selects files of the form <category>/RSS/FeedText/<dir>/<name>.txt
# (paths relative to rootDir).
filepattern = r"(?!\.)[\w_]+(/RSS/FeedText/)[\w-]+/[\w-]+\.txt"
# Alternative pattern: read from the FullText directory instead of FeedText.
#filepattern=r"(?!\.)[\w_]+(/RSS/FullText/)[\w-]+/[\w-]+\.txt"
# The first path component of each file id is captured as its category.
catpattern = r"([\w_]+)/.*"
rssreader = CategorizedPlaintextCorpusReader(rootDir, filepattern, cat_pattern=catpattern)


def _filter_paragraphs(paragraphs, excluded=_stopword_set):
    """Return one flat list of lowercased tokens per paragraph.

    Each paragraph is an iterable of sentences, each sentence an iterable of
    word strings. A word is kept when it is longer than one character and its
    lowercased form is not in ``excluded``.

    Args:
        paragraphs: Iterable of paragraphs (iterables of sentences of words).
        excluded: Container of lowercased words to drop.

    Returns:
        A list with one list of lowercased tokens for every input paragraph,
        in the original order.
    """
    docs = []
    for paragraph in paragraphs:
        tokens = []
        for sentence in paragraph:
            for word in sentence:
                lowered = word.lower()
                # The length check is on the original word, not the lowered one.
                if len(word) > 1 and lowered not in excluded:
                    tokens.append(lowered)
        docs.append(tokens)
    return docs


# In[3]:

# Build the TECH paragraph view once and reuse it for all three uses below.
_tech_paras = rssreader.paras(categories="TECH")
singleDoc = _tech_paras[0]
print("The first paragraph:\n", singleDoc)
# NOTE: the count below covers only the TECH category, not the whole corpus.
print("Number of paragraphs in the corpus: ", len(_tech_paras))


# In[4]:

# Each paragraph of the TECH category is treated as one document.
techdocs = _filter_paragraphs(_tech_paras)
print("Number of documents in category Tech: ", len(techdocs))


# In[5]:

# Same preprocessing for the GENERAL category.
generaldocs = _filter_paragraphs(rssreader.paras(categories="GENERAL"))