label=target_name) plt.legend(loc='best', shadow=False, scatterpoints=1) plt.title('PCA of BULATS dataset') plt.show() return model if __name__ == "__main__": PATH = "model.pickle" # Loading speech features speech = pd.read_csv("/ExamplePath.csv") if not os.path.exists(PATH): nli = CategorizedPlaintextCorpusReader(CORPUS, DOC_PATTERN, cat_pattern=CAT_PATTERN) # since `nli` already has all the information (text and ids) # you don't need to iterate over it multiple times so # construct `X` and `y` in one go. X = [] y = [] for fileid in nli.fileids(): X.append({ 'text': nli.raw(fileid), 'id': fileid.split('/')[-1].split('.')[0] }) y.append(nli.categories(fileid)[0]) clf = PCA(n_components=2) model = build_and_evaluate(X, y, clf, speech)
words = word_tokenize(raw) words = corpus.words(fileid) clean0 = [word for word in words if word not in stoplist] """ bloblist = corpus.fileids() #bloblist = corpus.fileids(categories='2016') M=len(bloblist) # Look at the categories corpus.categories() # for each file in the corpus for fileid in bloblist: raw = corpus.raw(fileid) raw = raw.replace("N.H.S.", "NHS") raw = raw.replace("per cent", "%") raw = raw.replace("votes", "vote") raw = raw.replace("voted", "vote") words = word_tokenize(raw) # Bring in the default English NLTK stop words stoplist = stopwords.words('english') # Define additional stopwords in a string this will preserve the word image (without capital) mid sentence additional_stopwords = """also one The Media playback is unsupported on your device caption Image Images copyright Reuters AP Getty EPA said BBC""" # Split the additional stopwords string on each word and then add # those words to the NLTK stopwords list stoplist += additional_stopwords.split() # words = corpus.words(fileid) #fdist = nltk.FreqDist(words)