Example #1
import os

import matplotlib.pyplot as plt
import pandas as pd
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from sklearn.decomposition import PCA

    # (the opening of this snippet was truncated; the line below is the
    #  tail of the plt.scatter call, inside a function that plots the
    #  PCA projection and returns the fitted model)
                    label=target_name)
    plt.legend(loc='best', shadow=False, scatterpoints=1)
    plt.title('PCA of BULATS dataset')
    plt.show()

    return model


if __name__ == "__main__":
    PATH = "model.pickle"
    # Loading speech features
    speech = pd.read_csv("/ExamplePath.csv")

    if not os.path.exists(PATH):
        # CORPUS, DOC_PATTERN and CAT_PATTERN are assumed to be defined
        # earlier: the corpus root plus regexes for fileids and categories,
        # e.g. DOC_PATTERN = r'.*\.txt' and CAT_PATTERN = r'(\w+)/.*'
        nli = CategorizedPlaintextCorpusReader(CORPUS,
                                               DOC_PATTERN,
                                               cat_pattern=CAT_PATTERN)
        # `nli` already holds all the information (text and ids), so
        # there is no need to iterate over it more than once: build
        # `X` and `y` in a single pass.
        X = []
        y = []
        for fileid in nli.fileids():
            X.append({
                'text': nli.raw(fileid),
                'id': fileid.split('/')[-1].split('.')[0]
            })
            y.append(nli.categories(fileid)[0])
        clf = PCA(n_components=2)
        model = build_and_evaluate(X, y, clf, speech)
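
The `os.path.exists(PATH)` guard above implies the fitted model was meant to be cached on disk, but the snippet never writes it. A minimal sketch of the missing save/load step with pickle, assuming the `model` returned by build_and_evaluate is what should be persisted:

import pickle

if not os.path.exists(PATH):
    model = build_and_evaluate(X, y, clf, speech)  # as above
    with open(PATH, "wb") as f:
        pickle.dump(model, f)
else:
    with open(PATH, "rb") as f:
        model = pickle.load(f)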
Example #2
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# `corpus` is assumed to be a categorized NLTK corpus reader built
# earlier in the script (the commented-out `categories='2016'` call
# below relies on that).

# Earlier draft kept for reference:
# words = word_tokenize(raw)
# words = corpus.words(fileid)
# clean0 = [word for word in words if word not in stoplist]

bloblist = corpus.fileids()
# bloblist = corpus.fileids(categories='2016')
M = len(bloblist)
# Look at the categories
print(corpus.categories())

# Build the stoplist once, outside the loop.
# Bring in the default English NLTK stop words
stoplist = stopwords.words('english')
# Define additional stopwords in a string; only the capitalised
# "Image"/"Images" are listed, so the lower-case word "image"
# mid-sentence is preserved
additional_stopwords = """also one The Media playback is unsupported on your device caption Image Images copyright Reuters AP Getty EPA said BBC"""
# Split the additional-stopwords string on whitespace and add those
# words to the NLTK stopword list
stoplist += additional_stopwords.split()

# For each file in the corpus
for fileid in bloblist:
    raw = corpus.raw(fileid)
    # Normalise a few domain-specific spellings before tokenising
    raw = raw.replace("N.H.S.", "NHS")
    raw = raw.replace("per cent", "%")
    raw = raw.replace("votes", "vote")
    raw = raw.replace("voted", "vote")
    words = word_tokenize(raw)
    # words = corpus.words(fileid)

    # fdist = nltk.FreqDist(words)
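
    # The original snippet stops before the stoplist is applied. A sketch
    # of the likely next step, following the draft `clean0` line quoted
    # at the top (`clean_words` is an illustrative name):
    clean_words = [word for word in words if word not in stoplist]
    fdist = nltk.FreqDist(clean_words)
    # e.g. inspect the ten most frequent remaining words per file
    print(fileid, fdist.most_common(10))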