def checkStopWordList(word, TOPIC):
    # Relies on a mutable module-level `stopwords` set; adds the topic's own words to it,
    # then returns False for stopwords (including topic words) and True otherwise.
    common_words = [s.lower() for s in TOPIC.split()]
    stopwords.update(common_words)
    if word.lower() in stopwords:
        return False
    else:
        return True

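# Hedged usage sketch (an assumption, not part of the original snippet): checkStopWordList
# expects a mutable module-level `stopwords` set to exist before it is called; the topic
# and sentence below are hypothetical.
from nltk.corpus import stopwords as nltk_stopwords
stopwords = set(nltk_stopwords.words('english'))
kept = [w for w in "climate change is real".split()
        if checkStopWordList(w, "climate change")]
# kept == ['real']: the topic's own words and ordinary English stopwords are both filtered out
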
def preprocess(self):
    preprocessed_docs_tmp = self.documents
    preprocessed_docs_tmp = [doc.lower() for doc in preprocessed_docs_tmp]
    preprocessed_docs_tmp = [
        doc.translate(
            str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
        for doc in preprocessed_docs_tmp
    ]
    stopwords = self.stopwords
    stopwords.update(self.new_stopwords)
    preprocessed_docs_tmp = [
        ' '.join(
            [w for w in doc.split() if len(w) > 0 and w not in stopwords])
        for doc in preprocessed_docs_tmp
    ]
    vectorizer = CountVectorizer(max_features=self.vocabulary_size,
                                 token_pattern=r'\b[a-zA-Z]{2,}\b')
    vectorizer.fit_transform(preprocessed_docs_tmp)
    vocabulary = set(vectorizer.get_feature_names())
    preprocessed_docs_tmp = [
        ' '.join([w for w in doc.split() if w in vocabulary])
        for doc in preprocessed_docs_tmp
    ]
    preprocessed_docs, unpreprocessed_docs = [], []
    for i, doc in enumerate(preprocessed_docs_tmp):
        if len(doc) > 0:
            preprocessed_docs.append(doc)
            unpreprocessed_docs.append(self.documents[i])
    return preprocessed_docs, unpreprocessed_docs, list(vocabulary)

def process(self, inputs: ValueMap, outputs: ValueMap):
    stopwords = set()

    _languages = inputs.get_value_obj("languages")
    if _languages.is_set:
        all_stopwords = get_stopwords()
        languages: ListModel = _languages.data
        for language in languages.list_data:
            if language not in all_stopwords.fileids():
                raise KiaraProcessingException(
                    f"Invalid language: {language}. Available: {', '.join(all_stopwords.fileids())}."
                )
            stopwords.update(get_stopwords().words(language))

    _stopword_lists = inputs.get_value_obj("stopword_lists")
    if _stopword_lists.is_set:
        stopword_lists: ListModel = _stopword_lists.data
        for stopword_list in stopword_lists.list_data:
            if isinstance(stopword_list, str):
                stopwords.add(stopword_list)
            else:
                stopwords.update(stopword_list)

    outputs.set_value("stopwords_list", sorted(stopwords))

def parse_corpus(corpus):
    import nltk
    from nltk import wordpunct_tokenize
    from nltk.corpus import stopwords
    from nltk.stem.porter import PorterStemmer
    #from nltk.tag.stanford import StanfordNERTagger
    from nltk.tag import pos_tag
    import string
    import itertools
    """
    commonWords = ['the', 'of', 'and', 'a', 'to', 'in', 'is', 'you', 'that', 'it', 'he', 'was',
                   'for', 'on', 'are', 'as', 'with', 'his', 'they', 'I', 'at', 'be', 'this',
                   'have', 'from', 'or', 'one', 'had', 'by', 'word', 'but', 'not', 'what', 'all',
                   'were', 'we', 'when', 'your', 'can', 'said', 'there', 'use', 'an', 'each',
                   'which', 'she', 'do', 'how', 'their', 'if', 'will', 'up', 'other', 'about',
                   'out', 'many', 'then', 'them', 'these', 'so', 'some', 'her', 'would', 'make',
                   'like', 'him', 'into', 'time', 'has', 'look', 'two', 'more', 'write', 'go',
                   'see', 'number', 'no', 'way', 'could', 'people', 'my', 'than', 'first',
                   'water', 'been', 'call', 'who', 'oil', 'its', 'now', 'find', 'long', 'down',
                   'day', 'did', 'get', 'come', 'made', 'may', 'part']
    listOfCharToExclude = ['.', ',', ':', '"', '+', '!', '?', '/', "'", '*', '(', ')', '$', '@', '&', '*', ']', '[']
    """
    # Build the stopword set: English stopwords plus single and double punctuation marks.
    stopwords = set(stopwords.words('english'))
    stopwords.update(string.punctuation)
    stopwords.update([p[0] + p[1] for p in itertools.product(string.punctuation, string.punctuation)])
    #stopwords.update(commonWords)
    #stopwords.update(listOfCharToExclude)
    #st = StanfordNERTagger('/home/orange63/TextMining/Project2/stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz',
    #                       '/home/orange63/TextMining/Project2/stanford-ner-2014-06-16/stanford-ner-3.4.jar')
    porter = PorterStemmer()
    corpus_token = wordpunct_tokenize(corpus)
    corpus_token = [word for word in corpus_token if word not in stopwords]
    #corpus_no_people = [p[0] for p in filter(lambda x: x[1] != 'PERSON', st.tag(corpus_token))]
    # Keep only (proper) nouns, then stem them.
    corpus_NN = [p[0] for p in filter(lambda x: (x[1] == 'NN') or (x[1] == 'NNP'), pos_tag(corpus_token))]
    corpus_stem = [porter.stem(word) for word in corpus_NN]
    return ' '.join(corpus_stem)

def make_chunk(text):
    from nltk.corpus import stopwords
    from string import punctuation
    stopwords = set(stopwords.words('english') + list(punctuation))
    stopwords.update([
        '.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}',
        '”', '“', "Advertisement"
    ])
    print(text)
    f_tokens = []
    #text = remove_stopwords(text)
    #text = normalise(text)
    tokens = nltk.tokenize.word_tokenize(text)
    for w in tokens:
        if w not in stopwords:
            f_tokens.append(w)
    pos = nltk.pos_tag(f_tokens)
    # chunkGram is expected to be defined at module level.
    chunk_parser = nltk.RegexpParser(chunkGram)
    chunked = chunk_parser.parse(pos)
    # print(chunked)
    array = []
    for subtree in chunked.subtrees(filter=lambda t: t.label() == "#Chunk"):
        array.append(" ".join([a for (a, b) in subtree.leaves()]))
    arr = nltk.FreqDist(array)
    key_chunk = arr.most_common(5)
    return key_chunk

def remove_english_stopwords(text):
    from nltk.corpus import stopwords as nltk_stopwords
    stopwords = set(nltk_stopwords.words('english'))
    stopwords.update(
        ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'])  # remove this line if you need punctuation
    list_of_words = [
        i.lower() for i in wordpunct_tokenize(text)
        if i.lower() not in stopwords
    ]
    return list_of_words

def prepCloud(Topic_text, Topic):
    Topic = str(Topic).lower()
    Topic = ' '.join(re.sub('([^0-9A-Za-z \t])', ' ', Topic).split())
    Topic = re.split(r"\s+", str(Topic))
    stopwords = set(STOPWORDS)
    stopwords.update(Topic)  # add the topic words to the stopwords so they don't appear in the word cloud
    text_new = " ".join(
        [txt for txt in Topic_text.split() if txt not in stopwords])
    return text_new

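# Hedged usage sketch (an assumption, not from the original source): prepCloud strips the
# topic's own words before a word cloud is built; the sample text below is hypothetical.
from wordcloud import WordCloud, STOPWORDS
sample_text = "bitcoin rallies as bitcoin miners expand mining capacity"
cleaned = prepCloud(sample_text, "bitcoin")  # the word "bitcoin" is dropped from the text
cloud = WordCloud(stopwords=STOPWORDS, background_color="white").generate(cleaned)
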
def remove_stopwords(text):
    from nltk.corpus import stopwords
    from string import punctuation
    from nltk.tokenize import sent_tokenize, word_tokenize
    filt_text = []
    stopwords = set(stopwords.words('english') + list(punctuation))
    stopwords.update(
        ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'])
    words = word_tokenize(text)
    for w in words:
        if w not in stopwords:
            filt_text.append(w)
    filter_str = ' '.join(filt_text)  # join with spaces so the remaining words stay separated
    return filter_str

def wordcloud(brand1, df1, brand2, df2):
    stopwords = set(STOPWORDS)
    tags = []
    tags.extend(brand1.split())
    tags.extend(brand2.split())
    tags.extend(subStrings("".join(brand1.split())))
    tags.extend(subStrings("".join(brand2.split())))
    stopwords.update(tags)

    plt.figure(figsize=(10, 8), dpi=150)

    plt.subplot(2, 2, 1)
    pos = df1.loc[df1['sentiment_type'] == 'POSITIVE']
    poswords = " ".join(word for word in pos.tweet if brand1 not in word)
    wordcloud1 = WordCloud(stopwords=stopwords).generate(poswords)
    plt.imshow(wordcloud1, interpolation='bilinear')
    plt.title(str.title(brand1) + ' positive')
    plt.axis("off")

    plt.subplot(2, 2, 2)
    pos = df2.loc[df2['sentiment_type'] == 'NEGATIVE']
    poswords = " ".join(word for word in pos.tweet if brand2 not in word)
    wordcloud2 = WordCloud(stopwords=stopwords).generate(poswords)
    plt.imshow(wordcloud2, interpolation='bilinear')
    plt.title(str.title(brand2) + ' negative')
    plt.axis("off")

    plt.subplot(2, 2, 3)
    pos = df1.loc[df1['sentiment_type'] == 'NEGATIVE']
    poswords = " ".join(word for word in pos.tweet if brand1 not in word)
    wordcloud3 = WordCloud(stopwords=stopwords).generate(poswords)
    plt.imshow(wordcloud3, interpolation='bilinear')
    plt.title(str.title(brand1) + ' negative')
    plt.axis("off")

    plt.subplot(2, 2, 4)
    pos = df2.loc[df2['sentiment_type'] == 'POSITIVE']
    poswords = " ".join(word for word in pos.tweet if brand2 not in word)
    wordcloud4 = WordCloud(stopwords=stopwords).generate(poswords)
    plt.imshow(wordcloud4, interpolation='bilinear')
    plt.title(str.title(brand2) + ' positive')
    plt.axis("off")

    plt.savefig('cloud.png')
    mediaobj = anvil.media.from_file('cloud.png')
    return mediaobj

def tokenize_text(self, text):
    tokens = []
    # Adding to stopwords
    stopwords = STOPWORDS.copy()
    stopwords = set(stopwords)
    spanish = self._get_spanish_stopwords()
    stopwords.update(spanish)
    stopwords.update(['http', 'f**k', 'rt'])
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            # if word not in stopwords:
            if len(word) < 2:
                continue
            # tokens.append(self._lemmatize_stemming(word.lower()))
            tokens.append(word.lower())
    return tokens

def process(self, inputs: ValueMap, outputs: ValueMap) -> None:
    import pyarrow as pa

    custom_stopwords = inputs.get_value_data("additional_stopwords")

    if inputs.get_value_obj("languages").is_set:
        _languages: ListModel = inputs.get_value_data("languages")
        languages = _languages.list_data
    else:
        languages = []

    stopwords = set()
    if languages:
        for language in languages:
            if language not in get_stopwords().fileids():
                raise KiaraProcessingException(
                    f"Invalid language: {language}. Available: {', '.join(get_stopwords().fileids())}."
                )
            stopwords.update(get_stopwords().words(language))

    if custom_stopwords:
        stopwords.update(custom_stopwords)

    orig_array = inputs.get_value_obj("tokens_array")  # type: ignore
    if not stopwords:
        outputs.set_value("tokens_array", orig_array)
        return

    # if hasattr(orig_array, "to_pylist"):
    #     token_lists = orig_array.to_pylist()

    tokens_array = orig_array.data.arrow_array

    # TODO: use vaex for this
    result = []
    for token_list in tokens_array:
        cleaned_list = [
            x for x in token_list.as_py() if x.lower() not in stopwords
        ]
        result.append(cleaned_list)

    outputs.set_value("tokens_array", pa.chunked_array(pa.array(result)))

class TwitterUser:
    auth = tweepy.AppAuthHandler(Config.CONSUMER_KEY, Config.CONSUMER_SECRET)
    # Construct the API instance
    api = tweepy.API(auth, wait_on_rate_limit=True)  # create an API object

    # create tokenizer that gets words (only alphabetical words)
    tokenizer = RegexpTokenizer(r'\w+')
    stopwords = set(stopwords.words('english'))
    stopwords.update({"https", "http"})

    def __init__(self, handle: str):
        self.handle = handle
        self.user = self.api.get_user(handle)
        self.bio = self.user.description
        self.timeline = self.user.timeline(count=200)
        self.num_of_tweets = len(self.timeline)
        self.stopwords.add(handle)

    def get_top_words(self, limit: int = 10, word_len_min: int = 2) -> list:
        '''Return top common words from tweets'''
        all_words = list()
        for tweet in self.timeline:
            # print(tweet.text, "\n-----\n\n\n")
            words = self.tokenizer.tokenize(tweet.text)
            for word in words:
                if len(word) > word_len_min and word not in self.stopwords:
                    all_words.append(word.lower())
        word_distribution = nltk.FreqDist(all_words)
        top_words = word_distribution.most_common(limit)
        return top_words

    # We can cross-reference the text

    def get_top_hastags(self, limit: int = 10, word_len_min: int = 2) -> list:
        '''Return top common hashtags from tweets'''
        all_words = list()
        for tweet in self.timeline:
            # print(tweet.text, "\n-----\n\n\n")
            words = re.findall(r"(\#[a-zA-Z]+\b)(?!;)", tweet.text)
            for word in words:
                if len(word) > word_len_min and word not in self.stopwords:
                    all_words.append(word.lower())
        word_distribution = nltk.FreqDist(all_words)
        hashtags = word_distribution.most_common(limit)
        return hashtags

def main():
    from nltk.corpus import stopwords
    stopwords = set(stopwords.words('polish'))
    stopwords.update(['zgłoś', 'naruszenie', 'wczytuję', 'działam'])
    # Add the numbers 0-99 to the stopword set (note: these are added as integers, not strings).
    stopwords.update(np.arange(0, 100))
    np.random.seed(0)

    fNames = readTags()
    numberLabels = {}
    for i in range(len(fNames)):
        numberLabels[fNames[i]] = i

    dataSet = open("dataSet.csv", 'w')
    model = Word2Vec.load('embeddings.bin')
    for name in fNames:
        print('creating data set for tag:', name)
        ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
        allArticlesForTag = glob.glob(ROOT_DIR + '/texts/' + name + '/*.txt')
        for article in allArticlesForTag:
            with open(article) as f:
                text = ''
                for line in f:
                    if len(line) > 200:
                        text += line
            postVecs = []
            if len(text) > 0:
                if len(text) > 2000:
                    texts = re.split(r'\s{4,}', text)
                    for i in texts:
                        if len(i) > 50:
                            postVecs.append(
                                postsToAveEmbeddings(i, model, stopwords))
                else:
                    postVecs.append(
                        postsToAveEmbeddings(text, model, stopwords))
            for v in postVecs:
                for x in v:
                    dataSet.write(str(x) + ',')
                dataSet.write(str(numberLabels[name]) + '\n')
    dataSet.close()

def word_cloud(source_name, subjectivity_floor):
    list_words = all_words.word[(all_words.subjectivity < subjectivity_floor)]
    df2 = final_df[(final_df.source_name == source_name)]
    all_text = []
    for blob in df2.text:
        all_text.append(blob)
    corpus = '-'.join(all_text)
    corpus = corpus.lower()
    stopwords = set(STOPWORDS)
    stopwords.update(list_words)

    # Generate a word cloud image
    wordcloud = WordCloud(background_color="white",
                          stopwords=stopwords).generate(corpus)

    # Display the generated image:
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()

def content_text(text):
    # Remove common stopwords, including "he's" and "she's"
    stopwords = set(nltk.corpus.stopwords.words('english'))
    stopwords.update(("he's", "she's"))  # lower-case so they match the lower-cased tokens below
    with_stp = Counter()
    without_stp = Counter()
    with open(text) as f:
        for line in f:
            spl = line.split()
            # update count of all words in the line that are in stopwords
            with_stp.update(w.lower().rstrip(punctuation) for w in spl
                            if w.lower() in stopwords)
            # update count of all words in the line that are not in stopwords
            without_stp.update(w.lower().rstrip(punctuation) for w in spl
                               if w.lower() not in stopwords)
    # return a list with the top ten most common words from each counter
    return [x for x in with_stp.most_common(10)], [y for y in without_stp.most_common(10)]

class NLProcessor:
    stemmer = PorterStemmer()
    stopwords = set(stopwords.words('english'))
    stopwords.update([
        '.', ',', '.,', '-', '–', '"', "'", '?', '!', ':', ';', '(', ')', '[',
        ']', '{', '}'
    ])
    stopwords.update(['$', '%', '#', '/', '‘', '’', '“', '”', '·', '`'])

    @staticmethod
    def split_sentence(sentence: str):
        return wordpunct_tokenize(sentence)

    @staticmethod
    def norm_word(word: str):
        word = word.lower()
        if word in NLProcessor.stopwords:
            return None
        return NLProcessor.stemmer.stem(word)

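# Hedged usage sketch (an assumption, not from the original source): the sentence below is
# hypothetical and shows the intended split_sentence -> norm_word pipeline, with stopwords
# and punctuation mapped to None and the remaining words stemmed.
tokens = NLProcessor.split_sentence("The cats are sleeping, quietly.")
normalized = [w for w in (NLProcessor.norm_word(t) for t in tokens) if w is not None]
# roughly ['cat', 'sleep', 'quietli']; 'the', 'are', ',' and '.' are dropped
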
def create_word_cloud(text_vec):
    text = " ".join(review for review in text_vec)
    # print("There are {} words in the combination of all review.".format(len(text)))

    # Create stopword list:
    stopwords = set(STOPWORDS)
    stopwords.update(["my", "trade"])

    # Generate a word cloud image
    wordcloud = WordCloud(stopwords=stopwords,
                          background_color="white").generate(text)

    # Display the generated image the matplotlib way:
    # plt.figure(figsize=(8, 8), facecolor=None)
    # plt.imshow(wordcloud, interpolation='bilinear')
    # plt.axis("off")
    # plt.tight_layout(pad=0)
    # plt.show()
    return wordcloud, get_vader_sentiment(text), get_textblob_sentiment(text)

def countMostCommonWords():
    from nltk.corpus import stopwords
    file = open('analysis.txt', 'a')
    stopwords = set(stopwords.words('polish'))
    stopwords.update(
        ['zgłoś', 'naruszenie', '1', '2', '3', '4', '5', '6', '7', '8', '9'])
    fNames = readTags()
    for fName in fNames:
        print("\nTag:", fName)
        content = ''
        ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
        allArticlesForTag = glob.glob(ROOT_DIR + '/texts/' + fName + '/*.txt')
        for article in allArticlesForTag:
            with open(article) as f:
                content += f.read()
        content = content.lower()
        tokenizer = nltk.RegexpTokenizer(r'\w+')
        content = tokenizer.tokenize(content)
        mostCommon = mostCommonWords(content, stopwords)
        file.write(fName + ': ')
        for word in mostCommon:
            file.write(word + ' ')
        file.write('\n')

def wordcloud(df, group):
    matplotlib.use('Agg')
    text = " ".join(review for review in df.review)
    print("There are {} words in the combination of all review.".format(
        len(text)))

    # Create stopword list:
    stopwords = set(STOPWORDS)
    stopwords.update(["room", "hotel", "desk", "Chicago", "stay", "day"])

    # Generate a word cloud image
    wordcloud = WordCloud(stopwords=stopwords,
                          collocations=False,
                          max_words=100,
                          background_color="white").generate(text)

    # Display the generated image the matplotlib way:
    plt.figure(figsize=(10, 8))
    plt.title("Word Cloud for " + group + ' reviews', fontsize=18)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    img_title = "Word Cloud for " + group + ' reviews'
    plt.savefig(img_title + '.png')

from sklearn.feature_extraction import stop_words
import random


def grey_color_func(word, font_size, position, orientation, random_state=None,
                    **kwargs):
    return "hsl(0, 0%%, %d%%)" % random.randint(60, 100)


stopwords = set(STOPWORDS)
newStopWords = ['job_querry', 'description', 'contrat', 'city']
stopwords.update(newStopWords)

wordcloud = WordCloud(
    background_color='black',
    stopwords=stopwords,
    max_words=1500,
    max_font_size=200,
    width=1000,
    height=600,
    random_state=42,
).generate(" ".join(df['title'].astype(str)))

fig = plt.figure(figsize=(12, 12))
plt.imshow(wordcloud.recolor(color_func=grey_color_func, random_state=3),
           interpolation="bilinear")
plt.title("WORD CLOUD title", fontsize=25)

text = open('Building_Global_Community.txt').read()
norm_text = text.lower()

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))

import string
stopwords.update(string.punctuation)

from nltk import wordpunct_tokenize
words = wordpunct_tokenize(norm_text)
filtered_words = [word for word in words
                  if word not in stopwords if word.isalpha()]

from collections import Counter
counter = Counter(filtered_words)
for word, count in counter.most_common(20):
    print("%s: %d" % (word, count))

def stopwords_cleaned(sentence):
    res = []
    for word in sentence:
        if word not in stopwords:
            res.append(word)
    return res


df['tokenized_removed_stopwords'] = df['tokenized_sentences'].apply(
    stopwords_cleaned)

from wordcloud import WordCloud, STOPWORDS

text_words = ''
stopwords = set(STOPWORDS)
stopwords.update([
    "br", "href", "amazon", "food", "gp", "ve", "grocery", "store",
    "although", "suscribe", "though", "think", "thought", "maybe"
])

# wordcloud
text = " ".join(review for review in df.Text)
wordcloud = WordCloud(width=800,
                      height=800,
                      background_color='black',
                      stopwords=stopwords,
                      min_font_size=10).generate(text)

plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.tight_layout(pad=0)

list_join_wd_permutations = set()
# make a list of multi-word-expression tokens joined by a space
mwe_tokenizer = MWETokenizer(separator=' ')
# dictionary to count the food words
counter = dict()
# total food count
total = 0
#smallword_list = ['of', 'the', 'and', 'out', 'na', 'vit', 'n']
stopwords = set(stopwords.words('english'))
# add more words to the stopwords list
stopwords.update([
    'n', 'na', 'new', 'vit', 'style', 'low', 'sprd', 'it\'s', 'dried', 'fungi', 'wonder', 'one',
    'tongue', 'flavor', 'flavors', 'w', 'always', 'made', 'vegan', 'white', 'good', 'little', 'go',
    'eye', 'end', 'delight', 'cloud', 'blue', 'back', 'without', 'warm', 'stuff', 'skin', 'right',
    'real', 'past', 'outside', 'next', 'morning', 'hi', 'heart', 'head', 'gold', 'general', 'fr',
    'eat', 'drink', 'big', 'baby', 'way', 'use', 'ultra', 'super', 'sub', 'start', 'soft', 'si',
    'shaped', 'power', 'plus', 'part', 'old', 'november', 'mixed', 'meal', 'less', 'late', 'kit',
    'game', 'kit', 'friends', 'eight', 'dog', 'deep', 'de', 'combination', 'blends', 'bear',
    'animal', 'add', 'ear', 'kid', 'boy', 'oh', 'top', 'tree', 'side', 'shapes', 'prior', 'neck',
    'mix', 'french', 'food', 'balls', 'young', 'wld', 'wash', 'user', 'types', 'type', 'store',
    'southern', 'smart', 'slices', 'pods', 'plate', 'party', 'ones', 'lunch', 'leg', 'label', 'jo',
    'item', 'inch', 'iced', 'higher', 'half', 'giant', 'g', 'foods', 'filling', 'filled', 'family',
    'eyes', 'es', 'energy', 'dove', 'dogs', 'cup', 'cubes', 'cooking', 'child', 'christmas',
    'character', 'ch', 'box', 'bowl', 'boo', 'black', 'bite', 'bar', 'b', 'arizona', 'rl', 'r',
    'butt', 'mr', 'mt', 'pm', 'post', 'ross', 'x', 'touch', 'well', 'life', 'long', 'great',
    'covered', 'year', 'spread', 'mini', 'straight', 'feet', 'weed', 'sea', 'sweet', 'fluffy',
    'healthy', 'treats', 'light', 'snacks', 'fat', 'pink', 'cool', 'sunshine', 'rainbow', 'rolled',
    'louis', 'sun', 'ground', 'mixture', 'home', 'full', 'summer', 'stars', 'star', 'recipe', 'la',
    'ocean', 'hawaiian', 'chick', 'bright', 'blood', 'bit', 'bamboo', 'yellow', 'wrapped', 'women',
    'winter', 'whole', 'vitamin', 'thick', 'smoked', 'slip', 'slice', 'silver', 'silk', 'serving',
    'seeded', 'savory', 'restaurant', 'red', 'quick', 'quarters', 'proof', 'pound', 'popeyes',
    'pockets', 'pillow', 'oscar', 'original', 'non', 'necks', 'moist', 'mediterranean', 'japanese',
    'inside', 'includes', 'heat', 'hand', 'green', 'fun', 'form', 'done', 'art', 'adventure',
    'break', 'beach', 'base', 'balance', 'brisk', 'baking'
])
# the stopwords below are words that are part of multi-word foods; they stop only the
# individual words, not the multi-word expression
stopwords2 = [
    'mashed', 'mash', 'colada', 'hot', 'whipped', 'whip', 'cold', 'fresh', 'vanilla ice', 'purple',
    'moose', 'fried', 'edible', 'roll', 'rolls', 'wedding', 'soy', 'peanut', 'pop', 'homemade',
    'blueberry', 'almond', 'vegetable', 'tap', 'tuna', 'sugared', 'straw', 'stone', 'spring',
    'shake', 'sauce', 'rose', 'roasted', 'ricotta', 'puffed', 'powdered', 'pina', 'olive', 'oil',
    'oat', 'mineral', 'maple', 'liquid', 'joy', 'grilled', 'greens', 'golden', 'goddess', 'glazed',
    'frosted', 'fiji', 'dry', 'drop', 'double', 'dinner', 'desser', 'dark', 'cut', 'crunchy',
    'crunch', 'crisp', 'chinese', 'chews', 'cheesy', 'buttery', 'bright', 'brand', 'bliss', 'apple',
    'alcoholic', 'wasabi', 'cotton', 'lucky', 'seeds', 'flakes', 'tropical', 'chicken', 'drops',
    'buds', 'bud', 'bakery', 'bites', 'cliff', 'coconut cotton', 'breakfast'
]

for food in list_of_report:
    description = food["Description"].lower()  # converts everything to lower case
    tokens = word_tokenize(description)
    words = [word for word in tokens if word.isalpha() and word not in stopwords]
    # iterates through the food descriptions
    for word in words:
        # if the food description contains certain words, generate permutations of the
        # description and append them to the list

jsonArray = json_data.get("value")  # 5,433 documents in total
jsonArray1 = list({v['title']: v for v in jsonArray}.values())  # de-duplicate by title
#print(jsonArray)
print(len(jsonArray))
print(len(jsonArray1))

wlem = nltk.WordNetLemmatizer()
#lemmatized_words = []

# Register stop words
stopwords = set(STOPWORDS)
stopwords.update([
    'system', 'service', 'paper', 'software', 'business', 'process',
    'information'
])
# print(stopwords)  # print the stopword list

year2002 = []
year2003 = []
year2004 = []
year2005 = []
year2006 = []
year2007 = []
year2008 = []
year2009 = []
year2010 = []
year2011 = []
year2012 = []
year2013 = []

# wiesliam
# these should be the only imports you need
import tweepy
import nltk
nltk.download('averaged_perceptron_tagger')
import json
import sys
from nltk.corpus import stopwords

stopwords = set(stopwords.words("english"))
stopwords.update(["http", "https", "RT"])

# write your code here
# usage should be: python3 part1.py <username> <num_tweets>
#print("hey")
username = sys.argv[1]
num_tweets = sys.argv[2]


def iterate(tagged_words, tag):
    #print(tagged_words, tag)
    relevant_terms = {}
    for term in tagged_words:
        #print(term[1])
        if term[1][:2] == tag:
            #print(term[1])
            if term[0] not in relevant_terms:
                relevant_terms[term[0]] = 1
            else:
                relevant_terms[term[0]] = relevant_terms[term[0]] + 1
    sorted_words = sorted(relevant_terms.items(),

from collections import Counter
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

d = path.dirname(__file__)

# Read the whole text.
text = open(path.join(d, 'mydissertation.txt')).read()
wordcount = Counter(text.split())

# read the mask / color image taken from the image
grad_coloring = np.array(Image.open(path.join(d, "Graduation-cap-blue.jpg")))

stopwords = set(STOPWORDS)
stopwords.update(
    ('Figure', 'based', 'et', 'al', 'Therefore', 'used', 'using', 'show',
     'shown', 'and', 'I', 'A', 'And', 'So', 'arnt', 'This', 'When', 'It',
     'many', 'Many', 'so', 'cant', 'Yes', 'yes', 'No', 'no', 'These',
     'these'))

filtered_words = [word for word in text.split() if word not in stopwords]
filterwordcount = Counter(filtered_words)
filterwordcount.most_common(1)

wc = WordCloud(background_color="white",
               max_words=1000,
               mask=grad_coloring,
               stopwords=stopwords,
               max_font_size=40,
               random_state=42)
# generate word cloud
wc.generate(text)

import csv
import pandas as pd
from pandas import Series, DataFrame
from sklearn import cluster, datasets, metrics
from sklearn.cluster import SpectralClustering

pathProg = 'C:\\Python27'
os.chdir(pathProg)

stopwords = set(stopwords.words('english'))

import string
cc = string.punctuation
# add every punctuation symbol, plus a few tokenizer artefacts, to the stopword set
stopwords.update(cc)
stopwords.add("--")
stopwords.add("'s")
stopwords.add("'ve")
stopwords.add("'re")
stopwords.add("n't")
stopwords.add("``")
stopwords.add("''")

bb = []
file = open(pathProg + '/building_global_community.txt', 'r')
f = file.read()
bb = f.lower()
bb = word_tokenize(bb)

# PROCESSING DONE ////////////////////////////////


def rejoin_words(row):
    my_list = row['lemmatized_words']
    joined_words = (" ".join(my_list))
    return joined_words


df['processed'] = df.apply(rejoin_words, axis=1)

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

stopwords = set(STOPWORDS)
stopwords.update([
    'we', 'will', 'aren', 'couldn', 'didn', 'doesn', 'don', 'hadn', 'dont',
    'doesnt', 'cant', 'couldnt', 'couldve', 'im', 'ive', 'isnt', 'theres',
    'wasnt', 'wouldnt', 'a', 'also', 'like', 'hasn', 'haven', 'isn', 'let',
    'll', 'mustn', 're', 'shan', 'shouldn', 've', 'wasn', 'weren', 'won',
    'wouldn', 'ha', 'wa', 'ldnont'
])

# VECTORIZING ////////////////////////////////
bow_vectorizer = CountVectorizer(max_df=0.90,
                                 min_df=2,
                                 max_features=1000,
                                 stop_words=stopwords)
bow = bow_vectorizer.fit_transform(df['processed'])
top_sum = bow.toarray().sum(axis=0)
top_sum_cv = [top_sum]
columns_cv = bow_vectorizer.get_feature_names()
x_traincvdf = pd.DataFrame(top_sum_cv, columns=columns_cv)

tfidf_vectorizer = TfidfVectorizer(max_df=0.90,

#!/usr/bin/env python
from os import path
import os
import nltk
import matplotlib.pyplot as plt
from wordcloud import WordCloud  #, STOPWORDS

# Download a list of stopwords in various languages
# (Spanish in this case; several dictionaries can be used)
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = set(stopwords.words('spanish'))
stopwords.update(["Media", "omitted", "1", "2", "3", "4", "5", "6", "7", "8",
                  "9", "0", "/", "AM", "PM", "-"])
#stopwords.discard("qué")  # Remove a word from the stopwords so it is taken into account again

# get data directory (using getcwd() is needed to support running example in generated IPython notebook)
d = path.dirname(__file__) if "__file__" in locals() else os.getcwd()

# Read the whole text.
text = open(path.join(d, 'compilado.txt'), encoding="utf-8").read()

# Generate the word cloud; additional parameters are described in the documentation
wc = WordCloud(background_color="white", max_words=500, width=4000, height=2000,
               repeat=False, stopwords=stopwords, contour_width=3,
               contour_color='steelblue')
# generate word cloud
wc.generate(text)

# store to file
wc.to_file(path.join(d, "output.png"))

from nltk.stem.snowball import SnowballStemmer
from nltk import word_tokenize
from nltk import pos_tag
from nltk import FreqDist

# input from user (Python 2: input() evaluates the entry, raw_input() returns a string)
test_num = 0
test_num = input("Input the number of Research articles to be classified:")
testpaths = []
for i in range(test_num):
    test = str(raw_input("Enter path:"))
    testpaths.append(test)

#################################### Pre-processing Test Data ####################################
stopwords = set(stopwords.words('english'))
stopwords.update([',', '.', '?', '!', '}', '(', ')', ']', '[', '=', '|', '*', '0', '.', ':', ';',
                  '@', '^', '%', '$', '+', '_', '-', '9', '8', '7', '6', '1', '2', '3', '4', '5', '*'])
num_classes = 5

for i in range(test_num):
    with codecs.open(testpaths[i], 'r', encoding='utf8') as file:
        text1 = file.read()
    text1 = re.sub(r"^\d+\s|\s\d+\s|\s\d+$", " ", text1)
    # tag tokens (pos_tag expects a token list, not a raw string) and drop pronouns,
    # determiners and conjunctions from the text
    taggedlist = pos_tag(word_tokenize(text1))
    for w in taggedlist:
        if w[1] == 'PRP' or w[1] == 'DT' or w[1] == 'CC':
            text1 = text1.replace(" " + w[0] + " ", "")
    tokens = word_tokenize(text1)
    tokens = [x.encode('UTF8') for x in tokens]

text = []
compoundscore = []
sid = SentimentIntensityAnalyzer()
track_name = list(dict.fromkeys(track_name))  # de-duplicate track names, preserving order

for i in range(len(track_name)):
    song = genius.search_song(track_name[i], artist_name[i])
    songlyrics = song.lyrics.replace("\n", " ").replace("\\'", "\'")
    lyrics[track_name[i]] = songlyrics
    songlyrics = songlyrics.replace('(', '').replace(')', '')
    # strip section markers such as [Chorus] before scoring
    text.append(re.sub("[\\[].*?[\\]]", "", songlyrics))
    scores = sid.polarity_scores(text[i])
    compoundscore.append(scores['compound'])

text = ' '.join(map(str, text))
print(text.encode("utf-8"))

stopwords = set(stopwords.words('english'))
stopwords.update(["br", "href", "la", "yeah", "yuh", "wan", "i'm"])

sentences = sent_tokenize(text)
words = word_tokenize(text)

words_no_punc = []
for w in words:
    if w.isalpha():
        words_no_punc.append(w.lower())

ps = PorterStemmer()
clean_words = []
for w in words_no_punc:
    if w not in stopwords:
        clean_words.append(ps.stem(w))

fdist = FreqDist(clean_words)
print(fdist.most_common(10))