def api():
    content = request.json
    text = content["message"]
    nltk.download('punkt')
    nltk.download('stopwords')
    related_news = relatednews(text)

    # Load the fitted vectorizer vocabulary and the trained classifier
    with open('NBVocab.pkl', 'rb') as NBVocab:
        cv = joblib.load(NBVocab)
    with open('model.pkl', 'rb') as model:
        clf = joblib.load(model)

    ps = PorterStemmer()
    sw = set(stopwords.words('english'))
    sw.remove('not')
    sw.remove('no')
    sw.add('\n')

    text = text.lower()
    # '[A-z]' also matches the punctuation between 'Z' and 'a'; use an explicit class
    tokenizer = RegexpTokenizer('[a-zA-Z]+')
    word_list = tokenizer.tokenize(text)
    clean_list = [w for w in word_list if w not in sw]
    stemmed_list = [ps.stem(w) for w in clean_list]
    clean_text = ' '.join(stemmed_list)

    X_vec = cv.transform([clean_text])
    pred = clf.predict(X_vec)[0]
    return jsonify({"prediction": pred, "related_news": related_news})
def pipeline_csv(headlines):
    headlines['headline'] = headlines['headline'].apply(nltk.word_tokenize)
    stemmer = PorterStemmer()
    headlines['headline'] = headlines['headline'].apply(
        lambda x: [stemmer.stem(y) for y in x])
    lemmatizer = nltk.WordNetLemmatizer()
    headlines['headline'] = headlines['headline'].apply(
        lambda x: [lemmatizer.lemmatize(y) for y in x])
    stopwords = nltk.corpus.stopwords.words('english')
    stemmed_stops = [stemmer.stem(t) for t in stopwords]
    headlines['headline'] = headlines['headline'].apply(
        lambda x: [stemmer.stem(y) for y in x if y not in stemmed_stops])
    headlines['headline'] = headlines['headline'].apply(
        lambda x: [e for e in x if len(e) >= 3])
    headlines['headline'] = headlines['headline'].str.join(" ")
    return headlines
def stemming_phrases(self, phrase):
    porter_stemmer = PorterStemmer()
    tokens = phrase.split(" ")
    stemmed_words = []
    for word in tokens:
        stemmed_words.append(porter_stemmer.stem(word))
    return " ".join(stemmed_words)
def removepunct_tokenize_stem(text):
    # Remove punctuation
    text = "".join([ch for ch in text if ch not in string.punctuation])
    tokens = word_tokenize(text)
    stemmer = PorterStemmer()
    final = [stemmer.stem(item) for item in tokens]
    return final
def doStemming(tokens):
    ps = PorterStemmer()
    stemmed_tokens = []
    for w in tokens:
        stemmed_tokens.append(ps.stem(w))
    return stemmed_tokens
def generate_frequencies(labeled_data, filter_threshold=0.03):
    stemmer = PorterStemmer()
    stop_words = stopwords.words('english')
    categories = dict()  # dict(category_name, {num_docs : int, counts : Counter(words)})
    # word_tokenize = lambda x: RegexpTokenizer(r'\w+').tokenize(x)
    for doc in labeled_data:
        category = doc["Category"].lower()  # some of the labels are inconsistent in case
        # if category == 'uninformative':
        #     continue
        if category not in categories.keys():
            categories[category] = {'num_docs': 1, 'counts': Counter()}
        else:
            categories[category]['num_docs'] += 1
        # use word_tokenize to parse words, make unique, remove stopwords
        # leaves non word things like '?' and "`" in input
        # NOTE: 2/27/20 -- Found forgot to call lower here
        message = doc["message"].lower().strip()
        message = word_tokenize(message)
        segmented_message = []
        for wd in message:
            segmented_message.append(wd)
            segments = wordsegment.segment(wd)
            if len(segments) > 1:
                segmented_message.extend(segments)
        processed_message = [stemmer.stem(wd) for wd in segmented_message
                             if wd not in stop_words
                             and sum(map((lambda x: 1 if x[1].isalnum() else 0), enumerate(wd))) > 0]
        for wd in processed_message:
            categories[category]['counts'][wd] += 1

    term_freqs = deepcopy(categories)
    doc_freqs = Counter()
    for cat in categories:
        category = categories[cat]
        for wd in category['counts']:
            # calculate term frequency % (within a single category)
            # Note: can also do number of times word appears across all categories
            count = category['counts'][wd]
            freq = count / category['num_docs']
            if freq < filter_threshold:
                del term_freqs[cat]['counts'][wd]
            # else:
            #     print(cat, " : ('", wd, "', ", freq, ")", sep='')
            # Increase document frequency (here doc refers to category)
            # each word should appear only once per category,
            # so this counts number of categories a word appears in
            doc_freqs[wd] += 1
    return term_freqs, doc_freqs
def tokenizer_porter(text):
    """
    Split the comment text into individual words and return the stem of each.
    :param text:
    :return:
    """
    porter = PorterStemmer()
    return [porter.stem(word) for word in text.split()]
def stemmer(tokens):
    '''
    Simple stemming loop for general use throughout the project.
    Will stem all tokens in a list.
    '''
    ps = PorterStemmer()
    stemmed = list()
    for t in tokens:
        stemmed.append(ps.stem(t))
    return stemmed
def doStemming(tokens):
    ps = PorterStemmer()
    stemmed_tokens = []
    for w in tokens:
        stemmed_tokens.append(ps.stem(w))
    print("\nAfter Stemming =", stemmed_tokens)
    return stemmed_tokens
def create_matrix(tweets: List, name: str = 'oscar pistorius') -> csr_matrix:
    matrix_loc = Path('data', name, 'tf_idf_matrix.pickle')
    if matrix_loc.exists():
        logger.info("Matrix exists! loading...")
        with matrix_loc.open('rb') as f:
            matrix = pickle.loads(f.read())
        return matrix

    stemmer = PorterStemmer()
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True)
    texts = []
    for tweet in tqdm(tweets, desc="(create_matrix) iterating over tweets..."):
        text = tweet.text
        tokens = tokenizer.tokenize(text)
        text_proc = []
        for token in tokens:
            token = token.strip()
            if len(token) < 3:
                continue
            elif token in stopwords.words('english'):
                continue
            elif nlp_utils.match_url(token):
                continue
            elif token in string.punctuation:
                continue
            # elif token.startswith(("#", "$")):
            #     continue
            token = token.translate({ord(k): "" for k in string.punctuation})
            token = stemmer.stem(token)
            token = token.strip()
            if token == "":
                continue
            text_proc.append(token)
        texts.append(text_proc)

    vectorizer = TfidfVectorizer(analyzer="word", tokenizer=lambda x: x, lowercase=False)
    m = vectorizer.fit_transform(texts)
    logger.info("Saving computed matrix...")
    with matrix_loc.open('wb') as f:
        f.write(pickle.dumps(m))
    return m
def textprocessing(text):
    text = str(text)
    stemmer = PorterStemmer()
    # str.replace returns a new string, so the result must be reassigned
    text = text.replace('`', "")
    text = text.replace("\"", "")
    re_sp = re.sub(r'\s*(?:([^a-zA-Z0-9._\s "])|\b(?:[a-z])\b)', " ", text.lower())
    text = re.sub("[!@#$%\n^'*)\\(-=]", " ", re_sp)
    no_char = ' '.join([w for w in text.split() if len(w) > 3]).strip()
    filtered_sp = [w for w in no_char.split(" ")
                   if w not in stopwords.words('english')]
    stemmed_sp = [stemmer.stem(item) for item in filtered_sp]
    # join the stemmed tokens (the original joined the unstemmed list, discarding the stemming step)
    return ' '.join(stemmed_sp)
def stemmed_words(doc):
    '''
    This function is normally passed as the sklearn vectorizer's analyzer so that
    tokenization is performed when the vectorizer is initialized.

    Inputs:
        doc: the untokenized text body of a document
    Returns:
        The tokenized, stemmed version of the document
    E.g. vectorizer = CountVectorizer(lowercase=True, analyzer=stemmed_words)
    '''
    stemmer = PorterStemmer()
    analyzer = TfidfVectorizer().build_analyzer()
    return (stemmer.stem(w) for w in analyzer(doc))
def extract_keywords(sentence):
    sentence = sentence.lower()
    not_stopw = ["no", "nor", "not", "over", "under", "again", "further",
                 "but", "against", "too", "very"]
    stopw = stopwords.words('english')
    for x in not_stopw:
        stopw.remove(x)
    print(stopw)
    pattern = re.compile(r'\b(' + r'|'.join(stopw) + r')\b\s*')
    sentence = sentence.replace('\n', '')
    sentence = sentence.replace("n't", " not")
    sentence = clean_string(sentence)
    sentence = pattern.sub('', sentence)
    stemmer = Stemmer()
    s = [stemmer.stem(w) for w in sentence.split()]
    b = zip(*[s[i:] for i in [0, 1]])
    b = [bigram[0] + " " + bigram[1] for bigram in b]
    return s + b
def process_text(text):
    # Lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'^https?:\/\/.*[\r\n]*', "", text)
    # Extract alphanumeric tokens
    tokens = re.findall(r'\w+', text)
    # Remove stopwords
    list_stopwords = stopwords.words("portuguese")
    tokens = [word for word in tokens if word not in list_stopwords]
    # Stemming (note: PorterStemmer is English-specific; SnowballStemmer("portuguese")
    # would match the stopword language used above)
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    return " ".join(tokens)
def getCleanedReview(review):
    review = review.replace('<br /><br />', " ")

    # Tokenization of text
    tokenizer = RegexpTokenizer(r'\w+')
    wordsList = tokenizer.tokenize(review)
    wordsList = [word.lower() for word in wordsList]

    # Removing stopwords
    sw = set(stopwords.words('english'))
    wordsList = [word for word in wordsList if word not in sw]

    # Text stemming
    ps = PorterStemmer()
    wordsList = [ps.stem(word) for word in wordsList]
    # print(wordsList)

    # Return clean review
    cleaned_review = " ".join(wordsList)
    return cleaned_review
st.title('Sentiment Analysis using Python')

import pandas as pd
df = pd.read_csv('Re_Data.csv')
df = df.iloc[0:5000]

import numpy as np
df = df.replace(np.nan, ' ', regex=True)

import string
df['clean_comment'] = df['clean_comment'].str.replace(r'[^\w\s]', '', regex=True)
df.clean_comment = df.clean_comment.str.replace(r'\d+', '', regex=True)

import nltk
nltk.download('punkt')
nltk.download('wordnet')

from nltk.stem.snowball import PorterStemmer
ps = PorterStemmer()
for i in range(0, len(df)):
    # Note: this stems each comment as a single string; stemming word by word would be more usual
    df.loc[i, 'clean_comment'] = ps.stem(df.loc[i, 'clean_comment'])

# Features are the comments, labels are the categories
x = df['clean_comment'].values
y = df['category'].values

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
np.unique(y_train, return_counts=True)
np.unique(y_test, return_counts=True)

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
text_model = Pipeline([('tfidf', TfidfVectorizer()), ('model', SVC())])
text_model.fit(x_train, y_train)
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.snowball import PorterStemmer

ps = PorterStemmer()
corpus = []

# Data Preprocessing
for i in range(0, len(Messages)):
    sentence = re.sub('[^a-zA-Z]', ' ', Messages['Message'][i])
    sentence = sentence.lower()
    sentence = sentence.split()
    words = [ps.stem(word) for word in sentence
             if word not in stopwords.words("english")]
    words = ' '.join(words)
    corpus.append(words)

# Creating the BOW
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000)
X = cv.fit_transform(corpus).toarray()
y = pd.get_dummies(Messages['labels'])
y = y.iloc[:, 0].values

from sklearn.model_selection import train_test_split
# special punctuation
content = content.replace('’re', ' are')
content = content.replace('n’t', ' not')
content = content.replace('s’', 's')
content = content.replace('-', ' ')

# Remove stop words
# content = [w for w in content if not w in stopwords.words("english")]
for stop_word in stop_words:
    content = content.replace(stop_word, ' ')

# Stemming
stemmer = PorterStemmer()
words = word_tokenize(content)
stem_words = [stemmer.stem(w) for w in words]
content = " ".join(stem_words)

# write the preprocessed content
outputFile.write(content)

# The words in all documents
word_list = []

# Iterate over the directories to find the words in all the documents
for directory in directories:
    for filename in np.ravel(df_categories['file'].loc[df_categories['category'] == directory]):
        # Setting up the relative path for the file
        filename = directories[directory] + str(filename) + '.txt'
useful_words = filter_words(word_list)
# print(useful_words)

from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer("[a-zA-Z0-9]+")
sentence = "send the 50 documents to abc, def, ghi."
# print(tokenizer.tokenize(sentence))

# ******************** STEMMING ******************************
# - process that transforms particular words into root words *
# - jumping, jumps, jump, jumped -> jump                      *
# *************************************************************
text = "The quick brown fox was seen jumping over the lazy dog from high wall. Foxes love to make jumps."
word_list = tokenizer.tokenize(text.lower())
# print(word_list)

# ****** TYPES OF STEMMERS **********
# - Snowball stemmer (Multilingual) *
# - Porter stemmer                  *
# - Lancaster stemmer               *
# ***********************************
from nltk.stem.snowball import PorterStemmer, SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
ps = PorterStemmer()
print(ps.stem("crowded"))
        processed_text.append({s[0]: tag})
print(processed_text)
execution_time = time() - start_time
print(str(timedelta(seconds=execution_time)))
print()

snowball_stemmer = SnowballStemmer("spanish")
porter_stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()
snowball_stemmed_list = list()
porter_stemmed_list = list()
lemmatized_list = list()
for word in tokenized_text:
    stemmed_word = snowball_stemmer.stem(word)
    snowball_stemmed_list.append(stemmed_word)
    stemmed_word = porter_stemmer.stem(word)
    porter_stemmed_list.append(stemmed_word)
    lemmatized_word = wordnet_lemmatizer.lemmatize(word)
    lemmatized_list.append(lemmatized_word)
print(snowball_stemmed_list)
print()
print(porter_stemmed_list)
print()
print(lemmatized_list)
print()

start_time = time()
tagged_text = sum(spanish_pos_tagger.tag_sents([snowball_stemmed_list]), [])
processed_text = []
for s in tagged_text:
    for tag in eagles_standard:
        lines[i] = pattern.sub('', lines[i])
    return lines


if __name__ == "__main__":
    not_stopw = ["no", "nor", "not", "over", "under", "again", "further",
                 "but", "against", "too", "very"]
    stopw = stopwords.words('english')
    for x in not_stopw:
        stopw.remove(x)
    stemmer = Stemmer()
    pattern = re.compile(r'\b(' + r'|'.join(stopw) + r')\b\s*')
    script, fin, fout = sys.argv
    with open(fin, 'r') as f_in:
        lines = f_in.readlines()
    grades = []
    for i in range(len(lines)):
        line = lines[i].split("\t")
        grades.append(line[0])
        lines[i] = line[1].replace("\n", "")
        lines[i] = cls(lines[i])
    for i in range(len(lines)):
        lines[i] = lines[i].replace("n't", " not")
    for i in range(len(lines)):
        lines[i] = lines[i].lower()
        lines[i] = pattern.sub('', lines[i])
        lines[i] = " ".join([stemmer.stem(w) for w in lines[i].split()])
    lines = clean_occurences(lines)
    with open(fout, 'w') as f_out:
        for i in range(len(lines)):
            f_out.write(grades[i] + "\t" + lines[i] + "\n")
text = "i am bothered by her very much"
text_words = word_tokenize(text)
print(text_words)
useful_text = remove_stopwords(text_words, sw)
print(useful_text)

# Tokenization Using Regular Expressions
tokenizer = RegexpTokenizer('[a-zA-Z@]+')
useful_text = tokenizer.tokenize(sentence)
print(useful_text)

# Stemming
text = """Foxes love to make jumps. The quick brown fox was seen jumping over the lovely dog from a 6 feet high wall"""
ps = PorterStemmer()
print(ps.stem('jumping'))
print(ps.stem('jumped'))
print(ps.stem('jumps'))
# We see that the stemmer converts all three of the above into 'jump' only

# Lemmatization
# pos tells what kind of word it is, e.g. 'a' for adjective or 'v' for verb
wn = WordNetLemmatizer()
print(wn.lemmatize('jumping', pos='v'))
print(wn.lemmatize('jumped', pos='v'))
print(wn.lemmatize('jumps'))

# Building a Vocab and Vectorization
corpus = [
    'Indian cricket team will win the world cup says captain virat kohli. World Cup will be held at Sri Lanka this year',
    'We will win next Lok Sabha elections, says confident Indian PM.',
# In[10]:

from nltk.stem.snowball import SnowballStemmer, PorterStemmer

# In[11]:

ps = PorterStemmer()

# In[12]:

ps.stem('lovely')

# In[13]:

ps.stem('jumping')

# In[14]:

ps.stem('calling')

# In[15]:
class ApiClient(object):
    API_URL = "http://api.rottentomatoes.com/api/public/v1.0/movies.json"
    MOVIE_URL = "http://api.rottentomatoes.com/api/public/v1.0/movies/{}.json"

    def __init__(self):
        self.api_key = KEY
        self.tokenizer = nltk.WordPunctTokenizer()
        self.stm = PorterStemmer()

    def _load(self, **kwargs):
        """ Loads list of movies via filter """
        params = dict(kwargs)
        params["apikey"] = self.api_key
        response = requests.get(self.API_URL, params=params).json()
        if response and "Error" in response:
            raise ValueError(response.get("Error", "Unknown error"))
        else:
            return response

    def _load_movie(self, movie_id, **kwargs):
        """ Loads extra movie information such as directors, genres, etc. """
        params = dict(kwargs)
        params["apikey"] = self.api_key
        response = requests.get(self.MOVIE_URL.format(str(movie_id)), params=params).json()
        if response and "Error" in response:
            raise ValueError(response.get("Error", "Unknown error"))
        else:
            return response

    def normalize(self, text):
        tokens = list()
        for token in self.tokenizer.tokenize(text.lower()):
            # Excludes stopwords, punctuation; stemming
            if token in stopwords.words('english'):
                continue
            token = self.stm.stem(token)
            if token.isalpha():
                tokens.append(token)
        return tokens

    def get_extra_params(self, movie_id, movie):
        """ Saves extra features of movie """
        m = self._load_movie(movie_id)
        # dict.has_key() no longer exists in Python 3; use the 'in' operator
        if ('genres' in m and 'runtime' in m and 'critics_consensus' in m
                and 'abridged_cast' in m and 'abridged_directors' in m and 'studio' in m):
            movie.genres = m.get("genres")
            movie.runtime = m.get("runtime")
            movie.critics_consensus = self.normalize(m.get("critics_consensus"))
            movie.abridged_cast_names = [ac['name'] for ac in m.get("abridged_cast")]
            try:
                movie.first_director = m.get("abridged_directors")[0]['name']
            # This never happened: check type of exception
            except ValueError:
                return False
            movie.studio = m.get("studio")
            return True
        return False

    def search_movies(self, keyword, movie_ids, page_limit=50):
        #DBG
        logging.debug("Searching movies by keyword '%s'", keyword)
        # Get list of movies
        response = self._load(q=keyword, page_limit=1, page=1)
        n = response.get("total")
        # Load all 25 pages x 50 movies
        for i in range(min(n // page_limit, 25)):
            response = self._load(q=keyword, page_limit=page_limit, page=i + 1)
            if response:
                movies = response.get("movies")
                if movies:
                    for result in movies:
                        movie_id = result.get("id")
                        print(movie_id)
                        if not movie_id or movie_id in movie_ids:
                            continue
                        movie_ids.add(movie_id)
                        title = result.get("title")
                        synopsis = result.get("synopsis")
                        # Convert rating into linear scale [0-4]
                        rating = self.set_rating(result.get("mpaa_rating"))
                        if title and rating >= 0:
                            movie = Movie(movie_id, title)
                            if not synopsis:
                                movie.synopsis = ['EMPTY']
                            else:
                                movie.synopsis = self.normalize(synopsis)
                            movie.mpaa_rating = rating
                            # Load extra movie information
                            if self.get_extra_params(movie_id, movie):
                                yield movie

    @staticmethod
    def set_rating(rating):
        if rating == 'G':
            return 0
        elif rating == 'PG':
            return 1
        elif rating == 'PG-13':
            return 2
        elif rating == 'R':
            return 3
        elif rating == 'NC-17':
            return 4
        else:
            return -1
for row in csv_file:
    target.append(int(row[0]))  # Class index
    data.append(row[2].encode('utf-8', 'ignore'))  # Text description (ignore the entity name)
data = np.asarray(data)
target = np.asarray(target)
target = target - 1  # Labels starting from 0
print("Dataset DBPEDIA loaded...")

###############################################################################
### Pre-process the dataset
###############################################################################
print("Pre-processing the dataset...")
stemmer = PorterStemmer()  # Define the type of stemmer to use
additional_stop_words = []
stop_words = ENGLISH_STOP_WORDS.union(additional_stop_words)
stop_words = set([stemmer.stem(word) for word in stop_words])  # Stem the stop words for larger detection

processed_data = []
id_to_delete = []
for i, doc in enumerate(data):
    tokenized_doc = list(simple_preprocess(doc, deacc=True, min_len=2))
    stemmed_doc = []
    for word in tokenized_doc:
        stemmed_word = stemmer.stem(word)
        if stemmed_word not in stop_words:
            stemmed_doc.append(stemmed_word)
    # [stemmer.stem(word) for word in tokenized_doc if word not in stop_words]
    if stemmed_doc == []:  # Empty document after pre-processing: to be removed
        id_to_delete.append(i)
    else:
        processed_data.append(stemmed_doc)
data = processed_data
    return result


from sklearn.externals import joblib
import json

worddict = json.load(open('json_dict', 'r'))
file_list = joblib.load('file_list.sav')

word_1 = input()
word_2 = input()
table = str.maketrans("", "", string.punctuation)
word_1 = word_1.translate(table)
word_2 = word_2.translate(table)

stemmer = PorterStemmer()
word_1 = stemmer.stem(word_1)
word_2 = stemmer.stem(word_2)

l1 = worddict.get(word_1, [0, []])
l2 = worddict.get(word_2, [0, []])
print(len(l1[1]))
print(len(l2[1]))
print("OR : ", len(queryOR(l1[1], l2[1])))
print("OR : ", len(queryORMerge(l1[1], l2[1])[0]))
a = queryAND(l1[1], l2[1])
print("AND : ", len(a[0]), " ", a[1])
b = queryAND_Skip(l1[1], l2[1], 1)
print(len(b[0]), " ", b[1])
class TextProcessor:
    def __init__(self, corpus, expanded_urls):
        self.tokenizer = TweetTokenizer()
        self.stemmer = PorterStemmer()
        self.stopwords = stopwords.words('english')
        self.corpus = corpus
        self.expanded_urls = expanded_urls
        self.re_url = r'http\S+'
        self.punctuation = string.punctuation
        self.stanford_pos_pwd = '/Users/mquezada/stanford-postagger-full-2015-12-09/'
        self.stanford_pos = StanfordPOSTagger(
            self.stanford_pos_pwd + 'models/english-left3words-distsim.tagger',
            self.stanford_pos_pwd + 'stanford-postagger.jar')
        self.tag_vocab = defaultdict(Counter)
        self.tag_token = dict()
        self.vocab = defaultdict(set)
        self.tags = Counter()

    def __iter__(self):
        yield from self.process()

    def process(self):
        for tokens in self.stanford_pos.tag_sents(self.tokenseq_generator()):
            # for tokens in self.tokenseq_generator():
            res = []
            for token, tag in tokens:
                # for token in tokens:
                processed = self.process_token(token)
                if processed:
                    # most_similar = self.w2v.most_similar(token)
                    self.tag_vocab[processed].update({tag: 1})
                    self.tag_token[token] = tag
                    self.tags.update({tag: 1})
                    res.append(processed)
            if res:
                yield res

    @staticmethod
    def clean_url(url):
        spl = urlsplit(url)
        spl = urlsplit(spl.geturl())
        return urlunsplit((spl[0], spl[1], spl[2], '', ''))

    def process_token(self, token):
        if re.match(self.re_url, token):
            return TextProcessor.clean_url(self.expanded_urls.get(token, token))
        t = token.lower()
        # t = token
        if t in self.stopwords or t in self.punctuation:
            return None
        if len(t) < 3 or t.startswith('@'):
            return None
        if not t.startswith('#'):
            t = t.translate({ord(k): "" for k in self.punctuation})
        t = self.stemmer.stem(t)
        self.vocab[t].add(token)
        return t

    def tokenseq_generator(self):
        for text in self.corpus:
            yield self.tokenizer.tokenize(text)
from nltk.stem.snowball import PorterStemmer
import food_detection_root
import os
import codecs

stemmer = PorterStemmer()
path = food_detection_root.ROOT_DIR + os.path.sep + 'data' + os.path.sep
what_food_list_file = codecs.open(path + "list - what_food.txt", encoding='utf-8')
what_food_list = what_food_list_file.read().splitlines()
what_food_list_file.close()

stemmed_list = list()
for word in what_food_list:
    stemmed_word = stemmer.stem(word)
    stemmed_list.append(stemmed_word)

what_food_stemmed_list_file = codecs.open(path + "list - stemmed_what_food.txt",
                                          encoding='utf-8', mode='a')
for word in stemmed_list:
    what_food_stemmed_list_file.write(word + "\n")
what_food_stemmed_list_file.close()
from nltk.stem.snowball import SnowballStemmer

ess = SnowballStemmer('english', ignore_stopwords=True)
print(ess.stem('flies'))
fss = SnowballStemmer('french', ignore_stopwords=True)
print(fss.stem('courais'))

from nltk.stem.snowball import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

print(ess.stem('teeth'))
ps = PorterStemmer()
print(ps.stem('teeth'))
ls = LancasterStemmer()
print(ls.stem('teeth'))
print(ps.stem('teen'))
print(ps.stem('teenager'))
print(ls.stem('teen'))
print(ls.stem('teenager'))

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
vectorized_corpus = cv.fit_transform(imdb_df.review)
print(vectorized_corpus.todense())
_20news = fetch_20newsgroups(subset="all")
print("Dataset 20NEWS loaded...")
data = _20news.data
target = _20news.target

###############################################################################
# Pre-process the dataset
###############################################################################
print("Pre-processing the dataset...")
stemmer = PorterStemmer()  # Define the type of stemmer to use
additional_stop_words = ['edu', 'com', 'gov', 'ca', 'mit', 'uk', 'subject', 'lines',
                         'organization', 'writes', 'msg', 'article', 'university',
                         'does', 'posting', 'thanks', 'don', 'know', 'help', 'use', 'copy']
stop_words = ENGLISH_STOP_WORDS.union(additional_stop_words)
stop_words = set([stemmer.stem(word) for word in stop_words])  # Stem the stop words for larger detection

processed_data = []
id_to_delete = []
for i, doc in enumerate(data):
    tokenized_doc = list(simple_preprocess(doc, deacc=True, min_len=2))
    stemmed_doc = []
    for word in tokenized_doc:
        stemmed_word = stemmer.stem(word)
        if stemmed_word not in stop_words:
            stemmed_doc.append(stemmed_word)
    # [stemmer.stem(word) for word in tokenized_doc if word not in stop_words]
    if stemmed_doc == []:  # Empty document after pre-processing: to be removed
        id_to_delete.append(i)
    else:
        processed_data.append(stemmed_doc)
for corpus_idx in [1, 2, 3, 4, 5]:
    corpus_name = 'corpus' + str(corpus_idx)
    print('--------------------- Start Process %s ------------------------' % corpus_name)

    # initialize paths
    corpus_path = retval + '/../corpus/' + corpus_name + '/'
    output_path = retval + '/../vector_model/' + corpus_name + '/'
    vocabulary_path = '/home/zhans/nltk_data/corpora/words/my_vocabulary'  # nltk.corpus.words
    vocabulary_list = open(vocabulary_path, 'r').read().split()

    # set stemming or without stemming
    if stem_flag:
        stemmer = PorterStemmer()
        vocabulary_stem_list = [stemmer.stem(x) for x in vocabulary_list]
        vocabulary_list = list(set(vocabulary_stem_list))
    vocabulary_length = len(vocabulary_list)

    # clean the raw text and generate a clean word list for each document
    corpus_clean = {}
    print('--------------------- Clean document ------------------------')
    for document_name in os.listdir(corpus_path):
        document_path = corpus_path + document_name
        document = open(document_path, 'r').read()
        doc_clean_word = clean_words(document, stem_flag)
        # save the clean words of the document into a dictionary
        corpus_clean[document_name] = doc_clean_word
    print("Clean document done!")
    print('--------- Initial values for vector representation ----------')