Example #1
 def extractDataAndRunExperiment(self):
     print("Extracting data from NLTK corpus.")
     category_pairs = parameters.reuters_category_pairs
     for pair in category_pairs:  #do the experiment on selected pairs
         #Extract data for corresponding categories
         spam_category = pair[0]
         ham_category = pair[1]
         print("Experiment pair: " + str(spam_category) + ", " +
               str(ham_category))
         self.dataset_name = str(spam_category) + ", " + str(ham_category)
         spam_doc_ids = reuters.fileids(
             spam_category)  #acquire documents using all docids for spam
         ham_doc_ids = reuters.fileids(
             ham_category)  #acquire documents using all docids for ham
         spamlist = []  #dataset for spam words
         hamlist = []  #dataset for ham words
         for docid in spam_doc_ids:
             wordlist = reuters.words(docid)  #get the words for document
             spamlist.append(
                 wordlist)  #append the documents words to spamlist as a row
         for docid in ham_doc_ids:
             wordlist = reuters.words(docid)  #get the words for document
             hamlist.append(
                 wordlist)  #append the documents words to hamlist as a row
         #preprocess extracted data and merge them into self.dataset
         self.preprocessDocuments(
             spamlist, 1
         )  #will tokenize (remove stopwords, punct.etc.), insert labels and dictionarize
         self.preprocessDocuments(
             hamlist, 0
         )  #will tokenize (remove stopwords, punct.etc.), insert labels and dictionarize
         #call generate training test and experiment
         self.runExperiment()
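The helpers self.preprocessDocuments(), self.runExperiment() and the parameters module are external to this snippet. parameters.reuters_category_pairs is assumed to be a list of (spam_category, ham_category) tuples drawn from reuters.categories(); a hypothetical sketch of such a config:

# Hypothetical config value; 'acq', 'earn', 'crude' and 'grain' are real Reuters category names.
reuters_category_pairs = [('acq', 'earn'), ('crude', 'grain')]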
Example #2
def find_suitable_text():
    """Find suitable text for background with checking length of Gutenberg texts,
    Brown categories and Reuters categories. Texts or categories over 50k words are marked green.
    Total length of the corpus is at the top as header"""
    print('\033[95m')
    print("--------------- Gutenberg ---------------")
    print("Total Length: ", len(gutenberg.words()))
    print('\033[0m')
    for fid in gutenberg.fileids():
        words = gutenberg.words(fid)
        length = len(words)
        if length > 50000:
            print('\033[92m')
        print("Text: ", fid)
        print("Length: ", length)
        print("Content preview: ", words[:20])
        if length > 50000:
            print('\033[0m')
        else:
            print("")

    # brown texts are too short, therefore check categories
    print('\033[95m')
    print("--------------- Brown ---------------")
    print("Total Length: ", len(brown.words()))
    print('\033[0m')
    for cat in brown.categories():
        words = brown.words(categories=cat)
        length = len(words)
        if length > 50000:
            print('\033[92m')
        print("Text category: ", cat)
        print("Length: ", length)
        print("Content preview: ", words[:20])
        if length > 50000:
            print('\033[0m')
        else:
            print("")

    # reuters texts are too short, therefore check categories
    # reuters actually has some funny categories
    # reuters categories are rather small, however the total corpus is quite large
    print('\033[95m')
    print("--------------- Reuters ---------------")
    print("Total Length: ", len(reuters.words()))
    print('\033[0m')
    for cat in reuters.categories():
        words = reuters.words(categories=[cat])
        length = len(words)
        if length > 50000:
            print('\033[92m')
        print("Text category: ", cat)
        print("Length: ", length)
        print("Content preview: ", words[:20])
        if length > 50000:
            print('\033[0m')
        else:
            print("")
Example #3
def get_train_test_docs():
    test_docs, train_docs = get_train_test_splits()
    # print(test_docs)
    test_doc = []
    train_doc = []
    for doc in test_docs:
        test_doc.extend(reuters.words(doc))
    for doc in train_docs:
        train_doc.extend(reuters.words(doc))
    return test_doc, train_doc
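get_train_test_splits() is not shown here; a minimal sketch, assuming it splits the Reuters fileids by their 'test/' and 'training/' prefixes and returns them in the (test, train) order used above:

from nltk.corpus import reuters

def get_train_test_splits():
    # Reuters fileids look like 'test/14826' and 'training/1'.
    fileids = reuters.fileids()
    test_docs = [f for f in fileids if f.startswith('test')]
    train_docs = [f for f in fileids if f.startswith('training')]
    return test_docs, train_docs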
Example #4
 def __init__(self) -> None:
     global basewords
     self.nltkwords = dict()
     self.fdist = nltk.FreqDist([w.lower() for w in reuters.words()])
     total_samples = sum(self.fdist.values())
     for e in self.fdist.elements():
         self.nltkwords[e] = self.fdist.freq(e) * total_samples
     self.N = len(list(self.nltkwords.keys()))
     self.M = sum(self.nltkwords.values())
     self.media = self.M / self.N
     self.bigrm = list(nltk.bigrams(reuters.words()))
     basewords = self.nltkwords
Example #5
    def __init__(self) -> None:
        global basewords

        # Load previously saved word frequencies, then rebuild them from the corpus.
        with open("reuters.json", "r", encoding="utf8") as file:
            basewords = json.load(file)

        self.nltkwords = dict()
        self.fdist = nltk.FreqDist([w.lower() for w in reuters.words()])
        total_samples = sum(self.fdist.values())
        for e in self.fdist.elements():
            self.nltkwords[e] = self.fdist.freq(e) * total_samples
        self.N = len(list(self.nltkwords.keys()))
        self.M = sum(self.nltkwords.values())
        self.media = self.M / self.N
        self.bigrm = list(nltk.bigrams(reuters.words()))
        basewords = self.nltkwords

    def estimate(self, word):
        """Score a phrase by the corpus frequency of its words, normalised by phrase length."""
        stemmer = PorterStemmer()
        tmp = list()
        nword = len(word.split())
        for i, w in enumerate(word.split()):
            if w.strip() in self.nltkwords:
                if w.strip() in stops:
                    # Stopwords contribute their frequency relative to the mean frequency.
                    tmp.append(self.nltkwords[w] / self.media)
                else:
                    # Content words are weighted by their length; known stems add their counts.
                    stem = stemmer.stem(w)
                    if stem != w and stem in self.nltkwords:
                        tmp.append((self.nltkwords[w] + self.nltkwords[stem]) * (len(w) - 2))
                    else:
                        tmp.append(self.nltkwords[w] * (len(w) - 2))
            else:
                # Unknown words are penalised by the mean frequency.
                tmp.append(-self.media)

        if len(tmp) > 0:
            atmp = np.asarray(tmp)
            return np.nansum(atmp) / nword
        else:
            return 0
Example #6
def compute_cosine(code_doc_1,code_doc_2):
	'''
		Compute the similarity between two documents from the angle between them
		Input:
			code_doc_1 : document code, i.e. its code in the documents list
			code_doc_2 : document 2
		Output:
			The cosine between the two docs
	'''
	text_1,_,_ = st.tokenize_sentences([pre_processing(list(reuters.words(code_doc_1)))])
	text_2,_,_ = st.tokenize_sentences([pre_processing(list(reuters.words(code_doc_2)))])
	pred_1 = model.predict(text_1)
	pred_2 = model.predict(text_2)
	return(cosine(pred_1,pred_2),text_1)
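st.tokenize_sentences, pre_processing, model and cosine are all defined elsewhere in that project. If cosine is meant as a similarity rather than SciPy's cosine distance, a minimal NumPy sketch would be:

import numpy as np

def cosine(u, v):
    # Cosine similarity between two prediction vectors.
    u, v = np.ravel(u), np.ravel(v)
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))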
Example #7
def main():
    # gutenberg
    gu_words = gutenberg.words()
    gu_words_exclude_stops = exclude_stopwords(gu_words)
    gu_fd1 = get_frequency_distribution(gu_words)
    gu_fd2 = get_frequency_distribution(gu_words_exclude_stops)

    pylab.plot(gu_fd1, color='red')
    pylab.plot(gu_fd2, color='orange')

    # inaugural
    in_words = inaugural.words()
    in_words_exclude_stops = exclude_stopwords(in_words)
    in_fd1 = get_frequency_distribution(in_words)
    in_fd2 = get_frequency_distribution(in_words_exclude_stops)

    pylab.plot(in_fd1, color='black')
    pylab.plot(in_fd2, color='gray')

    # reuters
    yen_words = reuters.words(categories='yen')
    yen_words_exclude_stops = exclude_stopwords(yen_words)
    yen_fd1 = get_frequency_distribution(yen_words)
    yen_fd2 = get_frequency_distribution(yen_words_exclude_stops)

    pylab.plot(yen_fd1, color='blue')
    pylab.plot(yen_fd2, color='green')

    pylab.xscale('log')
    pylab.yscale('log')
    pylab.show()
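exclude_stopwords() and get_frequency_distribution() are not shown; a minimal sketch consistent with the rank/frequency log-log plot above (names taken from the call sites, so treat this as an assumption):

import nltk
from nltk.corpus import stopwords

def exclude_stopwords(words):
    # Keep alphabetic tokens that are not English stopwords.
    stops = set(stopwords.words('english'))
    return [w.lower() for w in words if w.isalpha() and w.lower() not in stops]

def get_frequency_distribution(words):
    # Frequencies ordered from most to least common, ready for a rank/frequency plot.
    fd = nltk.FreqDist(words)
    return [count for _, count in fd.most_common()]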
Example #8
def collection_stats():
    # List of documents
    documents = reuters.fileids()
    print(len(documents), "documents")

    train_docs = list(filter(lambda doc: doc.startswith("train"), documents))
    print(str(len(train_docs)) + " training documents")

    test_docs = list(filter(lambda doc: doc.startswith("test"), documents))
    print(str(len(test_docs)) + " test documents")

    # List of categories
    categories = reuters.categories()
    print(str(len(categories)) + " categories")

    # Documents in a category
    category_docs = reuters.fileids("acq")

    # Words in a document
    document_id = category_docs[0]
    document_words = reuters.words(category_docs[0])
    print("Document words:\n", document_words)

    # Raw document
    print("Raw document:\n", reuters.raw(document_id))
Example #9
def getWeightedQuestionKeywords(query):
    browntext = brown.words()
    browndist = nltk.FreqDist(browntext)

    reuterstext = reuters.words()
    reutersdist = nltk.FreqDist(reuterstext)

    text = nltk.word_tokenize(query)
    tagged = nltk.pos_tag(text)

    filteredparts = []
    for pair in tagged:
        if pair[1] in [
                'FW', 'JJ', 'JJR', 'JJS', 'JJT', 'N', 'NN', 'NNP', 'NNS', 'NP',
                'NPS', 'NR', 'RB', 'RBR', 'RBT',
                'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'NUM', 'CD', 'OD'
        ]:
            filteredparts.append(pair)

    filtereddist = {}
    for pair in filteredparts:
        frequency = browndist[pair[0]] + reutersdist[pair[0]]
        if frequency < 600 or (pair[1]
                               in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'] and
                               (frequency < 1500)):  # more types here?
            filtereddist[pair[0]] = (frequency, pair[1])
    return filtereddist  # ensure there are at least a certain number of keywords? add stemming or synonyms? add phrases?
Example #10
 def __init__(self) -> None:
     with open("reuters.json", "r", encoding="utf8") as file:
         self.nltkwords = json.load(file)
     self.N = len(list(self.nltkwords.keys()))
     self.M = sum(self.nltkwords.values())
     self.media = self.M / self.N
     self.bigrm = list(nltk.bigrams(reuters.words()))
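reuters.json is assumed to hold the lower-cased word-count dictionary used elsewhere in these examples; it could be produced with something like the following (an assumption, not part of the original project):

import json
import nltk
from nltk.corpus import reuters

# Dump lower-cased Reuters word counts to reuters.json.
fdist = nltk.FreqDist(w.lower() for w in reuters.words())
with open("reuters.json", "w", encoding="utf8") as f:
    json.dump(dict(fdist), f)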
Example #11
def simple_results(query, added_vocab=None):
    wnl = WordNetLemmatizer()
    invertedindex = pickle.load(open("invertedindex_test.pkl", "rb"))
    l = []
    flag = 0
    zero_result = 0
    words_used = []
    for i, j in pos_tag(word_tokenize(query.lower())):
        if j[0].lower() in ['a', 'n', 'v']:
            q = wnl.lemmatize(i, j[0].lower())
        else:
            q = wnl.lemmatize(i)
        if q not in invertedindex:
            words_used = []
            zero_result = 1
            break
        if q not in reuters.words("stopwords"):
            if flag == 0:
                l = list(invertedindex[q].keys())
                words_used.append(q)
            else:
                l1 = [
                    value for value in l
                    if value in list(invertedindex[q].keys())
                ]
                l = l1
                if len(l) == 0:
                    if added_vocab is not None:
                        words_used = []
                        zero_result = 1
                        break
                    else:
                        return l, words_used
            flag = 1
    if added_vocab is not None:
        if zero_result or len(l) < 5:
            for v in added_vocab:
                try:
                    if v[1].isalpha() and len(list(
                            invertedindex[v[1]].keys())) > 0:
                        return list(invertedindex[
                            v[1]].keys()) + l, words_used + [v[1]]
                except:
                    pass
        for v in added_vocab:
            if v[1].isalpha() == False or v[1] not in invertedindex.keys():
                continue
            if len(l) > 20:
                l1 = [
                    value for value in l
                    if value in list(invertedindex[v[1]].keys())
                ]
                if len(l1) < 20:
                    break
                l = l1
                words_used.append(v[1])
            else:
                return l, words_used
        return l, words_used
    return l, words_used
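invertedindex_test.pkl is loaded but never built in this snippet; it is assumed to map each lemma to a dict keyed by the Reuters fileids that contain it. A hypothetical builder consistent with the lemmatization above:

import pickle
from collections import defaultdict
from nltk import pos_tag, word_tokenize
from nltk.corpus import reuters
from nltk.stem import WordNetLemmatizer

def build_inverted_index(path="invertedindex_test.pkl"):
    # Map lemma -> {fileid: term frequency} over the whole Reuters corpus.
    wnl = WordNetLemmatizer()
    index = defaultdict(dict)
    for fid in reuters.fileids():
        for tok, tag in pos_tag(word_tokenize(reuters.raw(fid).lower())):
            pos = tag[0].lower()
            lemma = wnl.lemmatize(tok, pos) if pos in ('a', 'n', 'v') else wnl.lemmatize(tok)
            index[lemma][fid] = index[lemma].get(fid, 0) + 1
    with open(path, "wb") as f:
        pickle.dump(dict(index), f)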
Example #12
def build_word_count():
    if os.path.isfile('pickled/wcount.pickle'):
        return read_pickle('pickled/wcount.pickle')
    wcount = Counter()
    for fid in words.fileids():
        for word in words.words(fid):
            word = word.lower()
            if only_words.match(word) is not None:
                wcount[word] += 1
    for fid in gutenberg.fileids():
        for word in gutenberg.words(fid):
            word = word.lower()
            if only_words.match(word) is not None:
                wcount[word] += 1
    for fid in brown.fileids():
        for word in brown.words(fid):
            word = word.lower()
            if only_words.match(word) is not None:
                wcount[word] += 1
    for fid in reuters.fileids():
        for word in reuters.words(fid):
            word = word.lower()
            if only_words.match(word) is not None:
                wcount[word] += 1
    for fid in inaugural.fileids():
        for word in inaugural.words(fid):
            word = word.lower()
            if only_words.match(word) is not None:
                wcount[word] += 1
    dump_pickle(wcount, 'pickled/wcount.pickle')
    return wcount
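only_words, read_pickle and dump_pickle are module-level names assumed by build_word_count(); one possible sketch:

import os
import pickle
import re

# Keep purely alphabetic, lower-cased tokens.
only_words = re.compile(r'^[a-z]+$')

def read_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

def dump_pickle(obj, path):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f)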
Example #13
def getKeywordsDraft(query):
    browntext = brown.words()
    browndist = nltk.FreqDist(browntext)

    reuterstext = reuters.words()
    reutersdist = nltk.FreqDist(reuterstext)

    text = nltk.word_tokenize(query)

    tagged = nltk.pos_tag(text)

    print(tagged)

    filteredparts = []
    for pair in tagged:
        if pair[1] in ['FW', 'JJ', 'JJR', 'JJS', 'JJT', 'N', 'NN', 'NNP', 'NNS', 'NP', 'NPS', 'NR', 'RB', 'RBR', 'RBT', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'NUM', 'CD', 'OD']:
            filteredparts.append(pair[0])
    print(filteredparts)
    filtereddist = {}
    for word in filteredparts:
        frequency = browndist[word] + reutersdist[word]
        print(word)
        print(frequency)
        if frequency < 600:
            filtereddist[word] = frequency
    sortedlist = sorted(filtereddist.items(), key=itemgetter(1))
    print(sortedlist)
    return sortedlist
Example #14
def main():
    # Data
    k = 100
    docs = [corpus.words(fileid) for fileid in corpus.fileids()[:k]]

    # Preprocessing
    pp_docs = preprocess_documents(docs)

    # Vectorization
    vectorizer = TfidfVectorizer(max_features=50,
                                 token_pattern=u'(?u)\\b\\w+\\b')
    tf_idf = vectorizer.fit_transform(pp_docs)

    # K-means
    num_clusters = 8
    km = KMeans(n_clusters=num_clusters,
                init='k-means++',
                max_iter=300,
                random_state=0,
                precompute_distances=True)

    clusters = km.fit_predict(tf_idf)
    categories = [
        ','.join(corpus.categories(fileid)) for fileid in corpus.fileids()[:k]
    ]
    keys = []
    for k, _ in sorted(vectorizer.vocabulary_.items(), key=lambda x: x[1]):
        keys.append(k)

    w_df = pd.DataFrame({'class': clusters, 'category': categories})
    k_df = pd.DataFrame({'key': keys})
    print(k_df)
    print(w_df)
    w_df.to_csv('result/kmeans_' + str(num_clusters) + '.csv')
    k_df.to_csv('result/tf_idf_key_' + str(num_clusters) + '.csv')
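preprocess_documents() is not shown; a minimal sketch, assuming it turns each tokenized document into the whitespace-joined string that TfidfVectorizer expects:

from nltk.corpus import stopwords

def preprocess_documents(docs):
    # Lower-case, drop non-alphabetic tokens and stopwords, join back into strings.
    stops = set(stopwords.words('english'))
    return [' '.join(w.lower() for w in doc if w.isalpha() and w.lower() not in stops)
            for doc in docs]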
Example #15
def reuters_dataframe(n=9160):
    def clean(words):
        stopwords = set(nltk.corpus.stopwords.words('english'))
        words_lower = [w.lower() for w in words]
        return [w for w in words_lower if w not in stopwords and len(w) >= 3]

    def title(words):
        return words[:20]

    fileids = [i for i in reuters.fileids() if len(reuters.categories(i)) == 1][:n]
    df = pd.DataFrame({'text': [' '.join(clean(reuters.words(i))) for i in fileids],
                       'category': [reuters.categories(i)[0] for i in fileids],
                       'title': [' '.join(title(reuters.words(i))) for i in fileids],
                       'fileids': fileids,
                       'words': [reuters.words(i) for i in fileids]})
    return df
Example #16
def load_english_frequencies():
    nltk.download(['brown', 'gutenberg', 'reuters'])

    global english_frequencies
    english_frequencies = FreqDist(w.lower() for w in brown.words())
    english_frequencies.update(w.lower() for w in gutenberg.words())
    english_frequencies.update(w.lower() for w in reuters.words())
Example #17
 def compute_word_frequencies(self):
     nltk.download('brown')
     nltk.download('reuters')
     counter = Counter()
     counter.update(self.normalize_words(brown.words()))
     counter.update(self.normalize_words(reuters.words()))
     return counter
Example #18
def retrieve_reuters_documents(max_documents=-1, filter_words=True):
    # List of documents
    documents = []

    training_files = [
        file_id for file_id in reuters.fileids() if 'training/' in file_id
    ]
    test_files = [
        file_id for file_id in reuters.fileids() if 'test/' in file_id
    ]

    for file_id in interleave([training_files, test_files]):
        if max_documents > -1 and len(documents) >= max_documents:
            return documents

        words = list(reuters.words(fileids=file_id))
        words_filtered = do_filter_words(words)
        document = {
            'words': words,
            'title': reuters.raw(fileids=file_id).split("\n")[0],
            'categories': reuters.categories(fileids=file_id),
            'is_training_example': True if 'training/' in file_id else False,
            'is_test_example': True if 'test/' in file_id else False,
            'words_filtered': words_filtered if filter_words else words,
            'file_id': file_id
        }
        if len(words_filtered) < 30:
            continue
        documents.append(document)

    return documents
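interleave() and do_filter_words() are external helpers; a sketch of what they are assumed to do:

from itertools import chain, zip_longest
from nltk.corpus import stopwords

def interleave(lists):
    # Alternate elements from the given lists, dropping the padding of shorter ones.
    return [x for x in chain.from_iterable(zip_longest(*lists)) if x is not None]

def do_filter_words(words):
    # Lower-cased alphabetic tokens with English stopwords removed.
    stops = set(stopwords.words('english'))
    return [w.lower() for w in words if w.isalpha() and w.lower() not in stops]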
Example #19
    def __init__(self, category, index, m, k, contigous=True, blob_length=100000):
        '''
            :param category: the name of the document's category
            :param m: the number of top features
            :param k: the length of each feature
            :param index: the index of the document into the Reuters data-set
            :param contigous: Boolean, True if features are contiguous, False otherwise
            :param blob_length: maximum number of words kept when features are non-contiguous
        '''

        self.m = m
        self.k = k
        self.index = index
        self.category = category
        self.contigous = contigous
        self.features = set()
        self.words = reuters.words(index)
        self.clean_data = self.remove_stops()

        if not self.contigous:
            self.blob_length = min(blob_length, len(self.words))
            self.noncont_features = defaultdict(lambda: {'count':0, 'weights':[]}, {})
            self.words = self.words[:self.blob_length]

        self.set_features()
        self.m = min(self.m, len(self.features))
        self.sort_features()
Example #20
def collection_stats():
    # List of documents
    documents = reuters.fileids()
    print(str(len(documents)) + " documents")

    train_docs = list(filter(lambda doc: doc.startswith("train"), documents))
    print(str(len(train_docs)) + " total train documents")

    test_docs = list(filter(lambda doc: doc.startswith("test"), documents))
    print(str(len(test_docs)) + " total test documents")

    # List of categories
    categories = reuters.categories()
    print(str(len(categories)) + " categories")

    # Documents in a category
    category_docs = reuters.fileids("acq")

    # Words for a document
    document_id = category_docs[0]
    document_words = reuters.words(category_docs[0])
    print(document_words)

    # Raw document
    print(reuters.raw(document_id))
Example #21
def reuters_high_info_words(score_fn=BigramAssocMeasures.chi_sq):
    labeled_words = []

    for label in reuters.categories():
        labeled_words.append((label, reuters.words(categories=[label])))

    return high_information_words(labeled_words, score_fn=score_fn)
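high_information_words() is not part of NLTK itself; it is assumed here to follow the familiar per-label chi-square scoring recipe, roughly:

from nltk.metrics import BigramAssocMeasures
from nltk.probability import ConditionalFreqDist, FreqDist

def high_information_words(labeled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    # Count word occurrences overall and per label, then keep words whose
    # association score with some label exceeds min_score.
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for label, words in labeled_words:
        for word in words:
            word_fd[word] += 1
            label_word_fd[label][word] += 1
    n_xx = label_word_fd.N()
    high_info = set()
    for label in label_word_fd.conditions():
        n_xi = label_word_fd[label].N()
        for word, n_ii in label_word_fd[label].items():
            score = score_fn(n_ii, (word_fd[word], n_xi), n_xx)
            if score >= min_score:
                high_info.add(word)
    return high_info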
Example #22
def collection_stats():
    # List of documents
    documents = reuters.fileids()
    print(str(len(documents)) + " documents")

    train_docs = list(filter(lambda doc: doc.startswith("train"), documents))
    print(str(len(train_docs)) + " total train documents")

    test_docs = list(filter(lambda doc: doc.startswith("test"), documents))
    print(str(len(test_docs)) + " total test documents")

    # List of categories
    categories = reuters.categories()
    print(str(len(categories)) + " categories")

    # Documents in a category
    category_docs = reuters.fileids("acq")

    # Words for a document
    document_id = category_docs[0]
    document_words = reuters.words(category_docs[0])
    print(document_words)

    # Raw document
    print(reuters.raw(document_id))
    tokens = []
    for docid in train_docs:
        t = tokenize(reuters.raw(docid))
        tokens.extend(t)
    print(tokens[0])
    v = set(tokens)
    print("number of terms=", len(tokens))
    print("voc size=", len(v))
Example #23
def get_reuters_ids_cnt(num_doc=100, max_voca=10000, remove_top_n=5):
    """To get test data for training a model
    reuters, stopwords, english words corpora should be installed in nltk_data: nltk.download()

    Parameters
    ----------
    num_doc: int
        number of documents to be returned
    max_voca: int
        maximum number of vocabulary size for the returned corpus
    remove_top_n: int
        remove top n frequently used words

    Returns
    -------
    voca_list: ndarray
        list of vocabulary used to construct a corpus
    doc_ids: list
        list of list of word id for each document
    doc_cnt: list
        list of list of word count for each document
    """
    file_list = reuters.fileids()
    corpus = [reuters.words(file_list[i]) for i in range(num_doc)]

    return get_ids_cnt(corpus, max_voca, remove_top_n)
Example #24
def generateKnownPatterns():
    from nltk.corpus import brown, reuters, words, wordnet
    from string import ascii_lowercase as ALPH

    patterns = {}

    wordlist = sorted(
        set([x.lower()
             for x in brown.words()] + [x.lower() for x in reuters.words()] +
            [x.lower() for x in words.words()] +
            [x.lower() for x in wordnet.all_lemma_names()]))
    for word in list(wordlist):
        if any(x not in ALPH for x in word):
            wordlist.remove(word)
    with open("static/txt/wordlist.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(wordlist))

    for word in wordlist:
        p = pattern(word)
        if p in patterns:
            patterns[p].append(word)
        else:
            patterns[p] = [word]

    with open("static/txt/patterns.json", "w", encoding="utf-8") as f:
        json.dump(patterns, f)
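pattern() is not shown; it is assumed to map a word to its letter-repetition pattern (e.g. "hello" -> "ABCCD"), which the known-patterns file is keyed by:

def pattern(word):
    # Assign each distinct letter the next capital letter, in order of first appearance.
    mapping = {}
    out = []
    for ch in word:
        if ch not in mapping:
            mapping[ch] = chr(ord('A') + len(mapping))
        out.append(mapping[ch])
    return ''.join(out)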
Example #25
def get_word_features(category):
    """"Returns list of words that occur in all the documensts
    in specified category"""
    #print "Getting word features.."
    all_words = nltk.FreqDist(w.lower()
                              for w in reuters.words(categories=category))
    word_features = list(all_words)[:5000]
    return word_features
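A possible (assumed) way to use these word features in the classic NLTK bag-of-words setup:

def document_features(document_words, word_features):
    # Binary "contains(word)" features for an NLTK classifier.
    words = set(w.lower() for w in document_words)
    return {'contains({})'.format(w): (w in words) for w in word_features}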
Example #26
 def build(self, vocab_size=5000):
     words = reuters.words()
     words = [self.trim(w) for w in words]
     words = [w for w in words if w]
     freq = Counter(words)
     freq = freq.most_common(vocab_size)
     self.vocab = [w_c[0] for w_c in freq]
     self.vocab = [self.PAD, self.UNK] + self.vocab
Example #27
def main(categories, document_ids, verbose=False):
    print("categories: {}".format(categories))
    print("number of categories: {}".format(len(categories)))
    cat2catid = {}
    for catid, cat in enumerate(sorted(categories)):
        cat2catid[cat] = catid

    documents = document_ids
    test = [d for d in documents if d.startswith('test/')]
    train = [d for d in documents if d.startswith('training/')]
    print("train documents: {}".format(len(train)))
    print("test documents: {}".format(len(test)))

    # make it easy to map data to label
    # gather simple statistics
    id2cats = defaultdict(list)
    cat2count = {}
    for cat in categories:
        for fid in reuters.fileids(cat):
            id2cats[fid].append(cat)
            if cat not in cat2count:
                cat2count[cat] = {'train': 0, 'test': 0, 'words': []}
            if fid in train:
                cat2count[cat]['train'] += 1
            else:
                cat2count[cat]['test'] += 1
            cat2count[cat]['words'].append(len(reuters.words(fid)))

    print("How many labels do documents usually have?")
    labelcount2doccount = defaultdict(int)
    for _, cats in id2cats.items():
        labelcount2doccount[len(cats)] += 1
    s = sorted(labelcount2doccount.items(), reverse=True, key=lambda n: n[1])
    for labelcount, documentcount in s:
        print("\tlabelcount={:>3}, documentcount={:>3}".format(
            labelcount, documentcount))

    # Analyze data distribution to classes
    analyze_data_distribution(cat2count)

    # Build corpus
    corpus = []
    for document_id in train:
        corpus += list(reuters.words(document_id))

    analyze_vocabulary(corpus)
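analyze_data_distribution() and analyze_vocabulary() are defined elsewhere; a minimal sketch of the latter:

from collections import Counter

def analyze_vocabulary(corpus):
    # Basic token and vocabulary statistics for the training corpus.
    counts = Counter(w.lower() for w in corpus)
    print("tokens: {}".format(len(corpus)))
    print("vocabulary size: {}".format(len(counts)))
    print("10 most common words: {}".format(counts.most_common(10)))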
Example #28
 def docs_to_matrix(self, docs, seq_size):
     docs_i = []
     for d in docs:
         words = reuters.words(d)
         words = self.sentence_to_ids(words, seq_size)
         docs_i.append(words)
     docs_i = np.array(docs_i)
     return docs_i
Example #29
def create_dictionary(data, corpus):
    # for p in data.split('\r\n'):
    # 	untrimmed_docs.append(p)
    # 	docs.append(clean_text(p))
    # untrimmed_docs.append(data)
    #docs.append(clean_text(data))
    tt = nltk.tokenize.texttiling.TextTilingTokenizer()
    for segment in tt.tokenize(data):
        docs.append(clean_text(segment))

    if corpus == "gutenberg":
        one_doc = []
        for i in gt.fileids():
            one_doc = one_doc + [
                x.lower() for x in gt.words(i) if not x.lower() in stopwords
            ]
            #docs.append(cleaned_doc)
        docs.append(one_doc)
    if corpus == "reuters":
        one_doc = list()
        for i in rt.fileids():
            #cleaned_doc = [x.lower() for x in rt.words(i) if not x.lower() in stopwords]
            #docs.append(cleaned_doc)
            one_doc = list(
                set(one_doc + [
                    x.lower()
                    for x in rt.words(i) if not x.lower() in stopwords
                ]))
        docs.append(one_doc)
    if corpus == "brown":
        categories = [
            'adventure', 'belles_lettres', 'editorial', 'fiction',
            'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery',
            'news', 'religion', 'reviews', 'romance', 'science_fiction'
        ]
        #categories = ['adventure', 'editorial', 'fiction', 'hobbies', 'learned', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
        for c in categories:
            cleaned_doc = [
                x.lower() for x in brown.words(categories=c)
                if not x.lower() in stopwords
            ]
            docs.append(cleaned_doc)
        #cleaned_doc = [x.lower() for x in brown.words(categories=categories) if not x.lower() in stopwords]
        #docs.append(cleaned_doc)
    my_dictionary = corpora.Dictionary(docs)
    return my_dictionary
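create_dictionary() relies on several module-level names (gt, rt, brown, stopwords, docs, corpora); they are assumed to be set up roughly like this:

import nltk
from gensim import corpora
from nltk.corpus import brown
from nltk.corpus import gutenberg as gt
from nltk.corpus import reuters as rt
from nltk.corpus import stopwords as sw

stopwords = set(sw.words('english'))
docs = []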
Example #30
def reuter():
    '''
    Create the reuters corpus from nltk 
    '''
    word = []
    for i in reuters.fileids():
        word.append(reuters.words(fileids=[i]))
    return word