import re
import pickle

import nltk
from nltk.tokenize import wordpunct_tokenize


def create_global_topic_list(articleList):
    e = re.compile(r"\s(de)\s")
    u = re.compile(r"\s(du)\s")
    globalTopicList = []
    i = 0
    for commList in articleList.values():
        # Article body + all comments
        art = commList[0].artBody
        for comm in commList:
            art += comm.body
        # Global list of named entities (extract_entities is a project helper)
        art = u.sub(" Du ", art)
        art = e.sub(" De ", art)
        entities = extract_entities(wordpunct_tokenize(art))
        globalTopicList += entities
        i += 1
        if i % 100 == 0:
            print(i, "comments processed for global vector")
    # Keep the 100 most frequent entities as the keys of the global topic vector
    globalTopicList = nltk.FreqDist(globalTopicList)
    tempVector = dict()
    for entity, _count in globalTopicList.most_common(100):
        tempVector[entity] = 0
    with open("globalTopics" + ".pkl", "wb") as f:
        pickle.dump(tempVector, f, pickle.HIGHEST_PROTOCOL)
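# extract_entities is not defined in this snippet. Below is a minimal sketch of
# such a helper using NLTK's built-in chunker (requires the
# 'averaged_perceptron_tagger', 'maxent_ne_chunker' and 'words' data packages);
# the original project may implement it differently, so treat the name and
# behaviour here as assumptions.
def extract_entities(tokens):
    # POS-tag the tokens and chunk them into a tree of named entities
    tree = nltk.ne_chunk(nltk.pos_tag(tokens))
    # Return the surface form of every named-entity subtree (PERSON, GPE, ...)
    return [" ".join(word for word, tag in subtree.leaves())
            for subtree in tree.subtrees()
            if subtree.label() != "S"]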
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import wordpunct_tokenize


def preprocessText(movie_id):
    # readPlot (defined elsewhere) returns the raw plot text for the movie id
    doc = readPlot(movie_id)
    stopset = set(stopwords.words('english'))
    stemmer = SnowballStemmer('english', ignore_stopwords=True)
    tokens = wordpunct_tokenize(doc)
    # Keep lowercased tokens that are not stopwords and are longer than 2 characters
    clean = [token.lower() for token in tokens
             if token.lower() not in stopset and len(token) > 2]
    stemmed_text = [stemmer.stem(word) for word in clean]
    return stemmed_text
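# Hypothetical stand-in for the readPlot helper used above, assuming plots are
# stored as one UTF-8 text file per movie id; the real storage layout is not
# shown in this snippet.
def readPlot(movie_id):
    with open("plots/%s.txt" % movie_id, encoding="utf-8") as f:
        return f.read()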
def get_text_words(text, stopwords=sw):
    text = preprocess_text(text)
    user_set = set(["http", "://"])
    text_words = set(wordpunct_tokenize(text.lower()))
    text_words = text_words.difference(stopwords)
    text_words = text_words.difference(user_set)
    text_words = [w for w in text_words if len(w) > 2]
    return text_words
def get_tweet_words(_tweet, stopwords=[]):
    tweet = preprocess_tweet(_tweet)
    user_set = set(["http", "://"])
    tweet_words = set(wordpunct_tokenize(tweet.lower()))
    tweet_words = tweet_words.difference(stopwords)
    tweet_words = tweet_words.difference(user_set)
    tweet_words = [w for w in tweet_words if len(w) > 2]
    return tweet_words
from collections import Counter, defaultdict

from nltk.tokenize import wordpunct_tokenize


def word_indicator(text, **kwargs):
    # CLASSIFIER and the stopword set sw are module-level globals defined elsewhere
    if CLASSIFIER == 'MultinomialNB':
        # Bag-of-words counts, minus stopwords and URL fragments
        features = dict(Counter(wordpunct_tokenize(text.lower())))
        for el in list(features.keys()):
            if el in sw or el in ["http", "://"]:
                del features[el]
    else:
        # Boolean presence features built from the cleaned word set
        features = defaultdict(list)
        text_words = get_text_words(text, **kwargs)
        for w in text_words:
            features[w] = True
    return features
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import wordpunct_tokenize


def get_clean_text_tokens(in_text):
    """Return a list of lemmatised tokens, after removing stopwords, numbers,
    and one-letter words from in_text."""
    stop_words = set(stopwords.words("english"))
    tokens = wordpunct_tokenize(in_text)
    tokens = [t for t in tokens if t not in stop_words]
    tokens = [t for t in tokens if t.isalpha()]
    tokens = [t for t in tokens if len(t) > 1]
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens]
    return lemmatized_tokens
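# Quick check (needs the NLTK 'stopwords' and 'wordnet' corpora). Note that the
# function does not lowercase, so capitalised stopwords such as "The" survive;
# the output below is roughly what the WordNet lemmatiser produces:
#
#   >>> get_clean_text_tokens("The 3 cats were chasing mice in the garden")
#   ['The', 'cat', 'chasing', 'mouse', 'garden']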
def tokenize(text):
    # is_stop_word and lemmatize_word are helpers defined elsewhere in the project
    ret = []
    last_offset = 0
    if not text:
        return ret
    for token in wordpunct_tokenize(text):
        processed_token = token.lower().strip()
        if not processed_token or len(processed_token) < 3 or is_stop_word(token):
            continue
        processed_token = lemmatize_word(processed_token)
        # Record the character offset of the original token in the input text
        last_offset = text.find(token, last_offset)
        ret.append((processed_token, last_offset))
    return ret
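# Example of the (token, offset) output of the tokenize defined just above,
# using trivial, clearly hypothetical stand-ins for the project's is_stop_word
# and lemmatize_word helpers so the snippet runs on its own:
def is_stop_word(token):
    return token.lower() in {"the", "a", "an", "of", "in"}


def lemmatize_word(word):
    return word  # identity stand-in; the real helper lemmatises


print(tokenize("The tokenizer records character offsets"))
# -> [('tokenizer', 4), ('records', 14), ('character', 22), ('offsets', 32)]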
def tokenize(x):
    return [w for w in wordpunct_tokenize(x) if len(w) >= 3]
def inject_term_list(in_lemma_list, in_text, lemmatizer):
    """Replace each term with its lemma, prefixed by an underscore, if the term
    is included in in_lemma_list."""
    # lemma_replace is a project helper; see the sketch below
    return ' '.join(lemma_replace(in_lemma_list, t, lemmatizer)
                    for t in wordpunct_tokenize(in_text))
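# A minimal sketch of the lemma_replace helper assumed above: if the token's
# lemma appears in in_lemma_list, return the lemma prefixed with an underscore,
# otherwise return the token unchanged (the name and exact behaviour are
# assumptions, not the project's confirmed implementation).
def lemma_replace(in_lemma_list, token, lemmatizer):
    lemma = lemmatizer.lemmatize(token.lower())
    return "_" + lemma if lemma in in_lemma_list else token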