# Example #1
# 0
# Launch NLTK's interactive download shell (enter "d" then a package name such
# as "stopwords" to download it, "l" to list packages, "q" to quit).
# NOTE(review): this call blocks waiting for keyboard input, so the snippet is
# only meant to be run in an interactive session.
from nltk import download_shell

download_shell()
# Example #2
# 0
# In[1]:

# Standard analysis stack. The matplotlib inline magic (and get_ipython itself)
# only works inside an IPython/Jupyter session — this file is a notebook export.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
get_ipython().run_line_magic('matplotlib', 'inline')

# In[2]:

import nltk

# In[ ]:

# Interactive downloader: used here to fetch the "stopwords" package.
# Blocks waiting for keyboard input ("d" -> "stopwords" -> "q").
nltk.download_shell()  #for downloading stopwords package

# In[ ]:

# Load the SMS Spam Collection data set (tab-separated, no header row:
# column 1 is the spam/ham label, column 2 is the raw message text).
# NOTE(review): assumes 'SMSSpamCollection' is in the current working
# directory — confirm before running.

# In[5]:

messages = pd.read_csv('SMSSpamCollection',
                       sep='\t',
                       names=['label', 'message'])

# In[6]:

# Report how many messages were loaded.
print(len(messages))
# Example #3
# 0
# compares their features. Use Bag of Words and improve it by adjusting word
# counts based on their frequency in corpus (the group of all the documents)
# with TF-IDF (Term Frequency - Inverse Document Frequency).
# Term Frequency TF is the importance of the term within that document
# TF(d,t) = num of occurrences of term t in document d
# Inverse Document Frequency is the importance of the term in the corpus
# IDF(t) = log(D/t) D is total num of docs, t is num of docs with the term
# TF-IDF combines the term importance to the document and to all documents

# a collection of texts is sometimes called "corpus"

# install nltk library if it is not installed
# we will use the stopwords file from this library
import nltk
# optionally, download the stopwords file to check its content
# (interactive call — blocks waiting for keyboard input)
nltk.download_shell()
# enter d to enter download mode, or enter l to list out all files
# enter stopwords to download the file, then enter q to quit the shell

# read sample messages and convert it to a list
# NOTE(review): the file handle from open() is never closed here
file = './Documents/Workspace/Python-For-Data-Science/sample_SMSSpamCollection'
messages = [line.rstrip() for line in open(file)]

# check the number of messages and some specific message content
# notice there is a tab separator (\t) between message class and content
print(len(messages))
messages[50]

# use enumerate method to number and print the first ten messages
# message_no is not part of the file but from the enumerate call
# NOTE(review): the loop body is missing — this snippet appears truncated;
# presumably it printed each numbered message (confirm against the source).
for message_no,message in enumerate(messages[:10]):
# Example #4
# 0
# -*- coding: utf-8 -*-
"""
Spyder Editor

This is a temporary script file.

Loads the SMS Spam Collection data set twice: first as a raw list of lines,
then as a pandas DataFrame with 'label'/'message' columns.
"""
import nltk

nltk.download_shell()  # download stopwords (interactive shell: d -> stopwords -> q)
#get the working directory and set the working directory
from os import chdir, getcwd

# chdir(getcwd()) is effectively a no-op; kept from the original script.
wd = getcwd()
wd
chdir(wd)

#rstrip() plus a list comprehension to get a list of all the lines of text messages
# BUGFIX: the original used 'Desktop\SMSSpamCollection' — '\S' is an invalid
# escape sequence (SyntaxWarning on modern CPython) and was inconsistent with
# the forward-slash path used for read_csv below. The file handle is now also
# closed promptly via a context manager instead of being leaked.
with open('Desktop/SMSSpamCollection') as sms_file:
    messages = [line.rstrip() for line in sms_file]
print(len(messages))

#print the first ten messages and number them using enumerate
for message_no, message in enumerate(messages[:10]):
    print(message_no, message)
    print('\n')
# data is a tab separated file import using pandas
import pandas as pd

messages = pd.read_csv('Desktop/SMSSpamCollection',
                       sep='\t',
                       names=["label", "message"])
messages.head()