Example #1
def api():
    content = request.json
    text = content["message"]
    nltk.download('punkt')
    nltk.download('stopwords')
    related_news = relatednews(text)
    # joblib.load accepts a file path directly, so no dangling file handles are needed
    cv = joblib.load('NBVocab.pkl')
    clf = joblib.load('model.pkl')
    ps = PorterStemmer()
    sw = set(stopwords.words('english'))
    sw.remove('not')
    sw.remove('no')
    sw.add('\n')
    text = text.lower()
    tokenizer = RegexpTokenizer('[A-Za-z]+')  # '[A-z]' would also match the punctuation between 'Z' and 'a'
    word_list = tokenizer.tokenize(text)
    clean_list = [w for w in word_list if w not in sw]
    stemmed_list = [ps.stem(w) for w in clean_list]
    clean_text = ' '.join(stemmed_list)
    X_vec = cv.transform([clean_text])
    pred = clf.predict(X_vec)
    pred = pred[0]
    return jsonify({"prediction": pred, "related_news": related_news})
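The snippet above is a Flask view body shown without its route decorator or imports; a minimal sketch of how it might be wired into an app follows (the route path, app object, and port are assumptions, not taken from the original):

# Hypothetical wiring for the handler above; route name and app object are assumptions
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route("/predict", methods=["POST"])
def predict():
    # delegate to the api() function defined above, inside the request context
    return api()

if __name__ == "__main__":
    app.run(port=5000)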
Example #2
def pipeline_csv(headlines):
    headlines['headline'] = headlines['headline'].apply(nltk.word_tokenize)
    stemmer = PorterStemmer()
    headlines['headline'] = headlines['headline'].apply(
        lambda x: [stemmer.stem(y) for y in x])
    lemmatizer = nltk.WordNetLemmatizer()
    headlines['headline'] = headlines['headline'].apply(
        lambda x: [lemmatizer.lemmatize(y) for y in x])
    stopwords = nltk.corpus.stopwords.words('english')
    stemmed_stops = [stemmer.stem(t) for t in stopwords]
    headlines['headline'] = headlines['headline'].apply(
        lambda x: [stemmer.stem(y) for y in x if y not in stemmed_stops])
    headlines['headline'] = headlines['headline'].apply(
        lambda x: [e for e in x if len(e) >= 3])
    headlines['headline'] = headlines['headline'].str.join(" ")
    return headlines
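A short usage sketch for pipeline_csv, assuming the required NLTK resources (punkt, wordnet, stopwords) are already downloaded; the sample headlines are invented:

# Hypothetical call to pipeline_csv on a tiny DataFrame
import pandas as pd

sample = pd.DataFrame({"headline": [
    "Stocks are rising sharply after the announcement",
    "New study links jumping exercises to better health",
]})
print(pipeline_csv(sample)["headline"].tolist())
# roughly: ['stock rise sharpli announc', 'new studi link jump exercis better health']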
Example #3
def stemming_phrases(self, phrase):
    porter_stemmer = PorterStemmer()
    tokenize = [docs for docs in phrase.split(" ")]
    stemmazied_phrase_lists = []
    for word in tokenize:
        stemmazied_phrase_lists.append(porter_stemmer.stem(word))
    return " ".join(stemmazied_phrase_lists)
Example #4
def removepunct_tokenize_stem(text):
    text = "".join([ch for ch in text
                    if ch not in string.punctuation])  #Remove punctuation
    tokens = word_tokenize(text)
    stemmer = PorterStemmer()
    final = [stemmer.stem(item) for item in tokens]
    return final
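A quick usage sketch; the sample sentence is made up:

# Hypothetical call to removepunct_tokenize_stem
print(removepunct_tokenize_stem("The foxes were jumping over the lazy dogs!"))
# roughly: ['the', 'fox', 'were', 'jump', 'over', 'the', 'lazi', 'dog']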
Example #5
def doStemming(tokens):
    ps = PorterStemmer()
    stemmed_tokens = []
    for w in tokens:
        stemmed_tokens.append(ps.stem(w))

    return stemmed_tokens
Example #6
def generate_frequencies(labeled_data, filter_threshold=0.03):
    stemmer = PorterStemmer()
    stop_words = stopwords.words('english')
    categories = dict()  # dict(category_name, {num_docs : int, counts : Counter(words)})
    # word_tokenize = lambda x: RegexpTokenizer(r'\w+').tokenize(x)

    for doc in labeled_data:
        category = doc["Category"].lower()  # some of the labels are inconsistent in case
        # if category == 'uninformative':
        #    continue
        if category not in categories.keys():
            categories[category] = {'num_docs': 1, 'counts': Counter()}
        else:
            categories[category]['num_docs'] += 1

        # use word_tokenize to parse words, make unique, remove stopwords
        # leaves non word things like '?', and "`", in input
        # NOTE: 2/27/20 -- Found forgot to call lower here
        message = doc["message"].lower().strip()
        message = word_tokenize(message)

        segmented_message = []
        for wd in message:
            segmented_message.append(wd)
            segments = wordsegment.segment(wd)
            if len(segments) > 1:
                segmented_message.extend(segments)

        processed_message = [stemmer.stem(wd) for wd in segmented_message
                             if wd not in stop_words and
                             sum(map((lambda x: 1 if x[1].isalnum() else 0),
                                     enumerate(wd))) > 0]

        for wd in processed_message:
            categories[category]['counts'][wd] += 1

    term_freqs = deepcopy(categories)
    doc_freqs = Counter()

    for cat in categories:
        category = categories[cat]
        for wd in category['counts']:

            # calculate term frequency % (within a single category)
            # Note: can also do number of times word appears across all categories
            count = category['counts'][wd]
            freq = count / category['num_docs']
            if freq < filter_threshold:
                del term_freqs[cat]['counts'][wd]
            # else:
                # print(cat, " : ('", wd, "', ", freq, ")", sep='')

            # Increase document frequency (here doc refers to category)
            # each word should appear only once per category,
            # so this counts number of categories a word appears in

            doc_freqs[wd] += 1

    return term_freqs, doc_freqs
Example #7
def tokenizer_porter(text):
    """
    Split the comment text into a single word and extract the stem,
    :param text:
    :return:
    """
    porter = PorterStemmer()
    return [porter.stem(word) for word in text.split()]
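A tokenizer like this is typically handed to a scikit-learn vectorizer; a minimal sketch with a recent scikit-learn (the sample comments are invented):

# Hypothetical use of tokenizer_porter inside a TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["I loved this movie", "totally loving these movies"]
tfidf = TfidfVectorizer(tokenizer=tokenizer_porter, lowercase=True, token_pattern=None)
X = tfidf.fit_transform(docs)
print(tfidf.get_feature_names_out())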
Example #8
def stemmer(tokens):
	'''
	Simple stemming loop for general use throughout project. Will stem all tokens in a list.
	'''
	ps = PorterStemmer()
	stemmed = list()
	for t in tokens:
		stemmed.append(ps.stem(t))
	return stemmed
Example #9
def doStemming(tokens):
    ps = PorterStemmer()
    stemmed_tokens = []
    for w in tokens:
        stemmed_tokens.append(ps.stem(w))

    print("\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nAfter Stemming = ",
          stemmed_tokens)
    return stemmed_tokens
Example #10
def create_matrix(tweets: List, name: str = 'oscar pistorius') -> csr_matrix:
    matrix_loc = Path('data', name, 'tf_idf_matrix.pickle')

    if matrix_loc.exists():
        logger.info("Matrix exists! loading...")
        with matrix_loc.open('rb') as f:
            matrix = pickle.loads(f.read())
            return matrix

    stemmer = PorterStemmer()
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True)

    texts = []
    for tweet in tqdm(tweets, desc="(create_matrix) iterating over tweets..."):
        text = tweet.text

        tokens = tokenizer.tokenize(text)
        text_proc = []
        for token in tokens:
            token = token.strip()
            if len(token) < 3:
                continue
            elif token in stopwords.words('english'):
                continue
            elif nlp_utils.match_url(token):
                continue
            elif token in string.punctuation:
                continue
            # elif token.startswith(("#", "$")):
            #     continue

            token = token.translate({ord(k): "" for k in string.punctuation})
            token = stemmer.stem(token)

            token = token.strip()
            if token == "":
                continue

            text_proc.append(token)

        texts.append(text_proc)

    vectorizer = TfidfVectorizer(analyzer="word",
                                 tokenizer=lambda x: x,
                                 lowercase=False)
    m = vectorizer.fit_transform(texts)

    logger.info("Saving computed matrix...")
    with matrix_loc.open('wb') as f:
        f.write(pickle.dumps(m))

    return m
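A hedged driver for create_matrix; the Tweet container below is a stand-in for whatever object the original project passes in (only a .text attribute is assumed), and the data/<name> directory must already exist for the pickle to be written:

# Hypothetical usage; Tweet is a stand-in type, not from the original project
from collections import namedtuple

Tweet = namedtuple("Tweet", ["text"])
tweets = [Tweet("Oscar Pistorius trial resumes today"),
          Tweet("Verdict expected this week http://t.co/abc")]
matrix = create_matrix(tweets, name="oscar pistorius")
print(matrix.shape)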
Example #11
def textprocessing(text):
    text = str(text)
    stemmer = PorterStemmer()
    # str.replace returns a new string, so the result must be reassigned
    text = text.replace('`', "")
    text = text.replace("\"", "")
    re_sp = re.sub(r'\s*(?:([^a-zA-Z0-9._\s "])|\b(?:[a-z])\b)', " ",
                   text.lower())
    text = re.sub("[!@#$%\n^'*)\\(-=]", " ", re_sp)
    no_char = ' '.join([w for w in text.split() if len(w) > 3]).strip()
    filtered_sp = [
        w for w in no_char.split(" ") if w not in stopwords.words('english')
    ]
    # return the stemmed text (the original built stemmed_sp but returned the unstemmed tokens)
    stemmed_sp = [stemmer.stem(item) for item in filtered_sp]
    return ' '.join(stemmed_sp)
Example #12
def stemmed_words(doc):
    '''
    This function is normally passed as the sklearn vectorizer's analyzer,
    so that tokenization and stemming are performed when the vectorizer runs.

    Inputs:
    doc: the untokenized text body of a document

    Returns:
    A generator over the stemmed tokens of the document

    E.g. vectorizer = CountVectorizer(lowercase=True, analyzer=stemmed_words)
    '''
    stemmer = PorterStemmer()
    analyzer = TfidfVectorizer().build_analyzer()
    return (stemmer.stem(w) for w in analyzer(doc))
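Spelling out the docstring's own suggestion as a runnable sketch (sample corpus invented; get_feature_names_out assumes scikit-learn 1.0+):

# Hypothetical use of stemmed_words as a CountVectorizer analyzer
from sklearn.feature_extraction.text import CountVectorizer

corpus = ["Cats are running", "A cat runs quickly"]
vectorizer = CountVectorizer(lowercase=True, analyzer=stemmed_words)
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names_out())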
Example #13
def extract_keywords(sentence):
    sentence = sentence.lower()
    not_stopw = ["no", "nor", "not", "over", "under", "again", "further",
                        "but", "against", "too", "very"]
    stopw = stopwords.words('english')
    for x in not_stopw:
        stopw.remove(x)
    print(stopw)
    pattern = re.compile(r'\b(' + r'|'.join(stopw) + r')\b\s*')
    sentence = sentence.replace('\n', '')
    sentence = sentence.replace("n't", " not")
    sentence = clean_string(sentence)
    sentence = pattern.sub('', sentence)
    stemmer = Stemmer()
    s = [stemmer.stem(w) for w in sentence.split()]
    b = zip(*[s[i:] for i in [0, 1]])
    b = [bigram[0] + " " + bigram[1] for bigram in b]
    return s + b
Example #14
def process_text(text):
    # Lowercase
    text = text.lower()

    # Remove URLS
    text = re.sub(r'^https?:\/\/.*[\r\n]*', "", text)

    # Extract Alphanumeric Tokens
    tokens = re.findall(r'\w+', text)

    # Remove Stopwords
    list_stopwords = stopwords.words("portuguese")
    tokens = [word for word in tokens if word not in list_stopwords]

    # Stemming
    snow_stemmer = PorterStemmer()
    tokens = [snow_stemmer.stem(word) for word in tokens]

    return " ".join(tokens)
Example #16
def getCleanedReview(review):
    review = review.replace('<br /><br />', " ")

    # Tokenization of text
    tokenizer = RegexpTokenizer(r'\w+')
    wordsList = tokenizer.tokenize(review)
    wordsList = [word.lower() for word in wordsList]

    # Removing stopwords
    sw = stopwords.words('english')
    sw = set(sw)
    wordsList = [word for word in wordsList if word not in sw]

    # Text stemming
    ps = PorterStemmer()
    wordsList = [ps.stem(word) for word in wordsList]
    # print(wordsList)

    # Return clean review
    cleaned_review = " ".join(wordsList)

    return cleaned_review
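A quick usage sketch; the sample review is invented:

# Hypothetical call to getCleanedReview
review = "This movie was surprisingly good!<br /><br />The actors were amazing."
print(getCleanedReview(review))
# roughly: "movi surprisingli good actor amaz"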
Example #17
import streamlit as st  # st.title below is Streamlit's API
st.title('Sentiment Analysis using Python')
import pandas as pd
df = pd.read_csv('Re_Data.csv')
df = df.iloc[0:5000]
import numpy as np
df = df.replace(np.nan, ' ', regex=True)
import string
df['clean_comment'] = df['clean_comment'].str.replace(r'[^\w\s]', '', regex=True)
df.clean_comment = df.clean_comment.str.replace(r'\d+', '', regex=True)
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem.snowball import PorterStemmer
ps = PorterStemmer()
for i in range(0, len(df)):
    # stem word by word; stemming the whole comment as one token only affects its tail
    df.loc[i, 'clean_comment'] = ' '.join(
        ps.stem(word) for word in df.loc[i, 'clean_comment'].split())
x = df['clean_comment'].values  # text features
y = df['category'].values  # sentiment labels
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)
import numpy as np
np.unique(y_train, return_counts=True)
np.unique(y_test, return_counts=True)
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
text_model = Pipeline([('tfidf', TfidfVectorizer()), ('model', SVC())])
text_model.fit(x_train, y_train)
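Once the pipeline is fitted, the held-out split can be scored; a short sketch assuming the corrected feature/label assignment above:

# Hypothetical evaluation of the fitted pipeline
print("test accuracy:", text_model.score(x_test, y_test))
print(text_model.predict(["this product is really great"]))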
Example #18
import re
import nltk
import pandas as pd
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem.snowball import PorterStemmer
ps = PorterStemmer()
corpus = []

# Data Preprocessing
for i in range(0, len(Messages)):
    sentence = re.sub('[^a-zA-Z]', ' ', Messages['Message'][i])
    sentence = sentence.lower()
    sentence = sentence.split()

    words = [
        ps.stem(word) for word in sentence
        if word not in stopwords.words("english")
    ]
    words = ' '.join(words)
    corpus.append(words)

#Creating the BOW

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000)
X = cv.fit_transform(corpus).toarray()

y = pd.get_dummies(Messages['labels'])
y = y.iloc[:, 0].values

from sklearn.model_selection import train_test_split
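The snippet stops right after importing the splitter; a hedged continuation with a Multinomial Naive Bayes classifier (a common choice for bag-of-words data, not taken from the original) might look like:

# Hypothetical continuation: split the BOW matrix and fit a Naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
model = MultinomialNB()
model.fit(X_train, y_train)
print("accuracy:", accuracy_score(y_test, model.predict(X_test)))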
Example #19
            # special punctuation
            content = content.replace('’re', ' are')
            content = content.replace('n’t', ' not')
            content = content.replace('s’', 's')
            content = content.replace('-', ' ')

            # Remove stop words
            # content = [w for w in content if not w in stopwords.words("english")]
            for stop_word in stop_words:
                content = content.replace(stop_word, ' ')

            # Stemming
            stemmer = PorterStemmer()
            words = word_tokenize(content)
            stem_words = [stemmer.stem(w) for w in words]
            content = " ".join(stem_words)

            # write the preprocessed content
            outputFile.write(content)

# The words in all documents
word_list = []

# Iterate over the directories to find the words in all the documents
for directory in directories:
    for filename in np.ravel(
            df_categories['file'].loc[df_categories['category'] == directory]):

        # Setting up the relative path for the file
        filename = directories[directory] + str(filename) + '.txt'
useful_words = filter_words(word_list)
# print(useful_words)

from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer("[a-zA-Z0-9]+")
sentence = "send the 50 documents to abc, def, ghi."
# print(tokenizer.tokenize(sentence))

# ******************** STEMMING *****************************
# -process that transforms particular words into root words *
# -jumping, jumps, jump, jumped -> jump                     *
# ***********************************************************

text = "The quick brown fox was seen jumping over the lazy dog from high wall. Foxes love to make jumps."

word_list = tokenizer.tokenize(text.lower())
# print(word_list)

# ****** TYPES OF STEMMERS **********
# -Snowball stemmer (Multilingual)  *
# -Porter stemmer                   *
# -Lancaster stemmer                *
# ***********************************

from nltk.stem.snowball import PorterStemmer, SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
ps = PorterStemmer()
print(ps.stem("crowded"))
            processed_text.append({s[0]: tag})
print(processed_text)
execution_time = time() - start_time
print(str(timedelta(seconds=execution_time)))
print()

snowball_stemmer = SnowballStemmer("spanish")
porter_stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()
snowball_stemmed_list = list()
porter_stemmed_list = list()
lemmatized_list = list()
for word in tokenized_text:
    stemmed_word = snowball_stemmer.stem(word)
    snowball_stemmed_list.append(stemmed_word)
    stemmed_word = porter_stemmer.stem(word)
    porter_stemmed_list.append(stemmed_word)
    lemmatized_word = wordnet_lemmatizer.lemmatize(word)
    lemmatized_list.append(lemmatized_word)
print(snowball_stemmed_list)
print()
print(porter_stemmed_list)
print()
print(lemmatized_list)
print()
start_time = time()
tagged_text = sum(spanish_pos_tagger.tag_sents([snowball_stemmed_list]), [])
processed_text = []

for s in tagged_text:
    for tag in eagles_standard:
            lines[i] = pattern.sub('', lines[i])
    return lines

if __name__ == "__main__":
    not_stopw = ["no", "nor", "not", "over", "under", "again", "further",
            "but", "against", "too", "very"]
    stopw = stopwords.words('english')
    for x in not_stopw:
        stopw.remove(x)
    stemmer = Stemmer()
    pattern = re.compile(r'\b(' + r'|'.join(stopw) + r')\b\s*')
    script, fin, fout = sys.argv
    with open(fin, 'r') as f_in:
        lines = f_in.readlines()
        grades = []
        for i in range(len(lines)):
            line = lines[i].split("\t")
            grades.append(line[0])
            lines[i] = line[1].replace("\n", "")
            lines[i] = cls(lines[i])
        for i in range(len(lines)):
            lines[i] = lines[i].replace("n't", " not")
        for i in range(len(lines)):
            lines[i] = lines[i].lower()
            lines[i] = pattern.sub('', lines[i])
            lines[i] = " ".join([stemmer.stem(w) for w in lines[i].split()])
        lines = clean_occurences(lines)
        with open(fout, 'w') as f_out:
            for i in range(len(lines)):
                f_out.write(grades[i] + "\t" + lines[i] + "\n")
Example #23
text = "i am bothered by her very much"
text_words = word_tokenize(text)
print(text_words)
useful_text = remove_stopwords(text_words, sw)
print(useful_text)
# Tokenization Using Regular Expressions
tokenizer = RegexpTokenizer('[a-zA-Z@]+')
useful_text = tokenizer.tokenize(sentence)
print(useful_text)

# Stemming
text = """Foxes love to make jumps. The quick brown fox was seen jumping over the 
lovely dog from a 6 feet high wall"""
ps = PorterStemmer()
print(ps.stem('jumping'))
print(ps.stem('jumped'))
print(ps.stem('jumps'))
# We see that the stemmer converts all three of the above into 'jump' only

# Lemmatization
# pos tells what kind of word it is, like 'a' for adjective or 'v' for verb, etc.
wn = WordNetLemmatizer()
print(wn.lemmatize('jumping', pos='v'))
print(wn.lemmatize('jumped', pos='v'))
print(wn.lemmatize('jumps'))

# Building a Vocab and Vectorization
corpus = [
    'Indian cricket team will win the world cup says captain virat kohli. World Cup will be held at Sri Lanka this year',
    'We will win next Lok Sabha elections, says confident Indian PM.',
# In[10]:


from nltk.stem.snowball import SnowballStemmer , PorterStemmer


# In[11]:


ps=PorterStemmer()


# In[12]:


ps.stem('lovely')


# In[13]:


ps.stem('jumping')


# In[14]:


ps.stem('calling')


# In[15]:
Example #25
class ApiClient(object):

    API_URL = "http://api.rottentomatoes.com/api/public/v1.0/movies.json"
    MOVIE_URL = "http://api.rottentomatoes.com/api/public/v1.0/movies/{}.json"

    def __init__(self):
        self.api_key = KEY
        self.tokenizer = nltk.WordPunctTokenizer()
        self.stm = PorterStemmer()

    def _load(self, **kwargs):
        """
        Loads list of movies via filter
        """
        params = dict(kwargs)
        params["apikey"] = self.api_key
        response = requests.get(self.API_URL, params=params).json()
        if response and "Error" in response:
            raise ValueError(response.get("Error", "Unknown error"))
        else:
            return response

    def _load_movie(self, movie_id, **kwargs):
        """
        Loads extra movie information such as directors, genres, etc.
        """
        params = dict(kwargs)
        params["apikey"] = self.api_key
        response = requests.get(self.MOVIE_URL.format(str(movie_id)),
                                params=params).json()
        if response and "Error" in response:
            raise ValueError(response.get("Error", "Unknown error"))
        else:
            return response

    def normalize(self, text):
        tokens = list()
        for token in self.tokenizer.tokenize(text.lower()):

            # Excludes stopwords, punctuation; stemming
            if token in stopwords.words('english'):
                continue
            token = self.stm.stem(token)
            if token.isalpha():
                tokens.append(token)

        return tokens

    def get_extra_params(self, movie_id, movie):
        """
        Saves extra features of movie
        """
        m = self._load_movie(movie_id)
        if ('genres' in m and 'runtime' in m
                and 'critics_consensus' in m
                and 'abridged_cast' in m
                and 'abridged_directors' in m and 'studio' in m):
            movie.genres = m.get("genres")
            movie.runtime = m.get("runtime")
            movie.critics_consensus = self.normalize(
                m.get("critics_consensus"))
            movie.abridged_cast_names = [
                ac['name'] for ac in m.get("abridged_cast")
            ]
            try:
                movie.first_director = m.get("abridged_directors")[0]['name']
            # This never happened: check type of exception
            except ValueError:
                return False
            movie.studio = m.get("studio")
            return True
        return False

    def search_movies(self, keyword, movie_ids, page_limit=50):
        #DBG
        logging.debug("Searching movies by keyword '%s'", keyword)

        # Get list of movies
        response = self._load(q=keyword, page_limit=1, page=1)
        n = response.get("total")

        # Load all 25 pages x 50 movies
        for i in range(min(n // page_limit, 25)):
            response = self._load(q=keyword, page_limit=page_limit, page=i + 1)
            if response:
                movies = response.get("movies")
                if movies:
                    for result in movies:
                        movie_id = result.get("id")
                        print(movie_id)

                        if not movie_id or movie_id in movie_ids:
                            continue
                        movie_ids.add(movie_id)

                        title = result.get("title")
                        synopsis = result.get("synopsis")
                        # Convert rating into linear scale [0-4]
                        rating = self.set_rating(result.get("mpaa_rating"))

                        if title and rating >= 0:
                            movie = Movie(movie_id, title)
                            if not synopsis:
                                movie.synopsis = ['EMPTY']
                            else:
                                movie.synopsis = self.normalize(synopsis)
                            movie.mpaa_rating = rating

                            # Load extra movie information
                            if self.get_extra_params(movie_id, movie):
                                yield movie

    @staticmethod
    def set_rating(rating):
        if rating == 'G':
            return 0
        elif rating == 'PG':
            return 1
        elif rating == 'PG-13':
            return 2
        elif rating == 'R':
            return 3
        elif rating == 'NC-17':
            return 4
        else:
            return -1
Example #26
    for row in csv_file:
        target.append(int(row[0])) # Class index
        data.append(row[2].encode('utf-8', 'ignore')) # Text description (ignore the entity name)
data = np.asarray(data)
target = np.asarray(target)
target = target - 1 # Labels starting from 0
print("Dataset DBPEDIA loaded...")
###############################################################################
### Pre-process the dataset
###############################################################################

print("Pre-processing the dataset...")
stemmer = PorterStemmer() # Define the type of stemmer to use
additional_stop_words = []
stop_words = ENGLISH_STOP_WORDS.union(additional_stop_words)
stop_words = set([stemmer.stem(word) for word in stop_words]) # Stem the stop words for larger detection
processed_data = []
id_to_delete = []
for i, doc in enumerate(data):
    tokenized_doc = list(simple_preprocess(doc, deacc=True, min_len=2))
    stemmed_doc = []
    for word in tokenized_doc:
        stemmed_word = stemmer.stem(word)
        if stemmed_word not in stop_words:
            stemmed_doc.append(stemmed_word)
    #[stemmer.stem(word) for word in tokenized_doc if word not in stop_words]
    if stemmed_doc == []: # Empty document after pre-processing: to be removed
        id_to_delete.append(i)
    else:
        processed_data.append(stemmed_doc)
data = processed_data
Example #27
    return result


import joblib  # sklearn.externals.joblib was removed in newer scikit-learn releases
import json

worddict = json.load(open('json_dict', 'r'))
file_list = joblib.load('file_list.sav')
word_1 = input()
word_2 = input()

table = str.maketrans("", "", string.punctuation)
word_1 = word_1.translate(table)
word_2 = word_2.translate(table)
stemmer = PorterStemmer()
word_1 = stemmer.stem(word_1)
word_2 = stemmer.stem(word_2)

l1 = worddict.get(word_1, [0, []])
l2 = worddict.get(word_2, [0, []])

print(len(l1[1]))
print(len(l2[1]))
print("OR : ", len(queryOR(l1[1], l2[1])))
print("OR : ", len(queryORMerge(l1[1], l2[1])[0]))

a = queryAND(l1[1], l2[1])
print("AND : ", len(a[0]), " ", a[1])
b = queryAND_Skip(l1[1], l2[1], 1)
print(len(b[0]), " ", b[1])
Example #29
class TextProcessor:
    def __init__(self, corpus, expanded_urls):
        self.tokenizer = TweetTokenizer()
        self.stemmer = PorterStemmer()
        self.stopwords = stopwords.words('english')
        self.corpus = corpus
        self.expanded_urls = expanded_urls
        self.re_url = r'http\S+'
        self.punctuation = string.punctuation
        self.stanford_pos_pwd = '/Users/mquezada/stanford-postagger-full-2015-12-09/'
        self.stanford_pos = StanfordPOSTagger(
            self.stanford_pos_pwd + 'models/english-left3words-distsim.tagger',
            self.stanford_pos_pwd + 'stanford-postagger.jar')
        self.tag_vocab = defaultdict(Counter)
        self.tag_token = dict()
        self.vocab = defaultdict(set)
        self.tags = Counter()

    def __iter__(self):
        yield from self.process()

    def process(self):
        for tokens in self.stanford_pos.tag_sents(self.tokenseq_generator()):
            #for tokens in self.tokenseq_generator():
            res = []
            for token, tag in tokens:
                #for token in tokens:
                processed = self.process_token(token)
                if processed:
                    #most_similar = self.w2v.most_similar(token)
                    self.tag_vocab[processed].update({tag: 1})
                    self.tag_token[token] = tag
                    self.tags.update({tag: 1})

                    res.append(processed)
            if res:
                yield res

    @staticmethod
    def clean_url(url):
        spl = urlsplit(url)
        spl = urlsplit(spl.geturl())
        return urlunsplit((spl[0], spl[1], spl[2], '', ''))

    def process_token(self, token):
        if re.match(self.re_url, token):
            return TextProcessor.clean_url(self.expanded_urls.get(
                token, token))

        t = token.lower()
        #t = token

        if t in self.stopwords or t in self.punctuation:
            return None

        if len(t) < 3 or t.startswith('@'):
            return None

        if not t.startswith('#'):
            t = t.translate({ord(k): "" for k in self.punctuation})

        t = self.stemmer.stem(t)

        self.vocab[t].add(token)
        return t

    def tokenseq_generator(self):
        for text in self.corpus:
            yield self.tokenizer.tokenize(text)
Example #30
from nltk.stem.snowball import PorterStemmer
import food_detection_root
import os
import codecs

stemmer = PorterStemmer()
path = food_detection_root.ROOT_DIR + os.path.sep + 'data' + os.path.sep
what_food_list_file = codecs.open(path + "list - what_food.txt",
                                  encoding='utf-8')
what_food_list = what_food_list_file.read().splitlines()
stemmed_list = list()
what_food_list_file.close()
for word in what_food_list:
    stemmed_word = stemmer.stem(word)
    stemmed_list.append(stemmed_word)
what_food_stemmed_list_file = codecs.open(path +
                                          "list - stemmed_what_food.txt",
                                          encoding='utf-8',
                                          mode='a')
for word in stemmed_list:
    what_food_stemmed_list_file.write(word + "\n")
what_food_stemmed_list_file.close()
from nltk.stem.snowball import SnowballStemmer

ess = SnowballStemmer('english', ignore_stopwords=True)

print(ess.stem('flies'))

fss = SnowballStemmer('french', ignore_stopwords=True)
print(fss.stem('courais'))

from nltk.stem.snowball import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

print(ess.stem('teeth'))

ps = PorterStemmer()
print(ps.stem('teeth'))

ls = LancasterStemmer()
print(ls.stem('teeth'))

print(ps.stem('teen'))
print(ps.stem('teenager'))

print(ls.stem('teen'))
print(ls.stem('teenager'))

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
vectorized_corpus = cv.fit_transform(imdb_df.review)
print(vectorized_corpus.todense())
Example #32
_20news = fetch_20newsgroups(subset="all")
print("Dataset 20NEWS loaded...")
data = _20news.data
target = _20news.target
###############################################################################
# Pre-process the dataset
###############################################################################
print("Pre-processing the dataset...")
stemmer = PorterStemmer()  # Define the type of stemmer to use
additional_stop_words = [
    'edu', 'com', 'gov', 'ca', 'mit', 'uk', 'subject', 'lines', 'organization',
    'writes', 'msg', 'article', 'university', 'does', 'posting', 'thanks',
    'don', 'know', 'help', 'use', 'copy'
]
stop_words = ENGLISH_STOP_WORDS.union(additional_stop_words)
stop_words = set([stemmer.stem(word) for word in stop_words
                  ])  # Stem the stop words for larger detection
processed_data = []
id_to_delete = []
for i, doc in enumerate(data):
    tokenized_doc = list(simple_preprocess(doc, deacc=True, min_len=2))
    stemmed_doc = []
    for word in tokenized_doc:
        stemmed_word = stemmer.stem(word)
        if stemmed_word not in stop_words:
            stemmed_doc.append(stemmed_word)
    #[stemmer.stem(word) for word in tokenized_doc if word not in stop_words]
    if stemmed_doc == []:  # Empty document after pre-processing: to be removed
        id_to_delete.append(i)
    else:
        processed_data.append(stemmed_doc)
Example #33
for corpus_idx in [1, 2, 3, 4, 5]:
    corpus_name = 'corpus'+str(corpus_idx)
    print('--------------------- Start Process %s ------------------------' % corpus_name)
    # initialize path
    corpus_path = retval + '/../corpus/' + corpus_name + '/'
    output_path = retval + '/../vector_model/' + corpus_name + '/'
    vocabulary_path = '/home/zhans/nltk_data/corpora/words/my_vocabulary'

    #nltk.corpus.words
    vocabulary_list = open(vocabulary_path, 'r').read().split()

    # set stemming or without stemming
    if stem_flag:
        stemmer = PorterStemmer()
        vocabulary_stem_list = [stemmer.stem(x) for x in vocabulary_list]
        vocabulary_list = list(set(vocabulary_stem_list))

    vocabulary_length = len(vocabulary_list)

    # clean the raw text and generate a clean word list for each document
    corpus_clean = {}
    print('--------------------- Clean document ------------------------')
    for document_name in os.listdir(corpus_path):
        document_path = corpus_path + document_name
        document = open(document_path, 'r').read()
        doc_clean_word = clean_words(document, stem_flag)
        # save the clean word of document into a dictionary
        corpus_clean[document_name] = doc_clean_word
    print("Clean document done!")
    print('--------- Initial values for vector representation ----------')