def rmStopWords(l, lang):
    if lang == 'english':
        stops = set(get_stop_words('en'))
    elif lang == 'catalan':
        stops = set(get_stop_words('ca'))
    elif lang == 'spanish':
        stops = set(get_stop_words('es'))
    else:
        # avoid a NameError for unsupported languages
        stops = set()
    return [w for w in l if w not in stops]
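# A minimal usage sketch for rmStopWords, assuming get_stop_words comes from a
# stop-word package that accepts ISO codes such as many_stop_words or stop_words;
# the sample tokens below are illustrative only.
from many_stop_words import get_stop_words

tokens = ['the', 'cat', 'sat', 'on', 'the', 'mat']
print(rmStopWords(tokens, 'english'))  # common words like 'the' and 'on' are dropped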
def _get_stopwords():
    """Extract stop words from multiple sources for both Russian and English."""
    all_stopwords = many_stop_words.get_stop_words('ru')
    all_stopwords.update(many_stop_words.get_stop_words('en'))
    more_stopwords = set(stopwords.words(['russian', 'english']))
    all_stopwords.update(more_stopwords)
    return all_stopwords
def __init__(self, word_threshold=10):
    super().__init__()
    self.selected_replace_words = dict()
    self.stop_words = many_stop_words.get_stop_words("en")
    self.replaced_target_artifacts = None
    self.impacted_artifacts = None
    self.word_threshold = word_threshold
def get_expression_vector(strInput: str, embeddings: OrderedDict, targetWord=None) -> list:
    if language.lower() == 'japanese':
        stop = list(many_stop_words.get_stop_words('ja')) + list(string.punctuation)
    else:
        stop = stopwords.words(language) + list(string.punctuation)
    stop.append('<TRG>')
    if targetWord:
        stop.append(targetWord)
    def_tokens = [i for i in word_tokenize(strInput.lower()) if i not in stop]
    n_found_tokens = 0
    expression_vector = np.array([0.0 for i in range(embeddings_dim)])
    for token in def_tokens:
        if token in embeddings:
            n_found_tokens += 1
            expression_vector += np.array(embeddings[token])
    if n_found_tokens != 0:
        expression_vector = expression_vector / n_found_tokens
        return expression_vector.tolist()
    else:
        return None
def tagging(rawtekst):
    logger = open('logger.txt', 'a', encoding='UTF-8')
    stop_words = list(get_stop_words('pl'))
    tokenizer = RegexpTokenizer(r'\w+')
    if rawtekst is not None:
        d = tokenizer.tokenize(rawtekst.lower())
    else:
        d = []
    text = [i for i in d if i not in stop_words]
    parser = ListParser()
    stemmer = Morfologik()
    y = stemmer.stem(text, parser)
    lista = []
    for index, i in enumerate(y):
        rtext = str(y[index][1]).replace('[', '').replace(']', '').replace('}', '') \
            .replace('{', '').replace("'", '').split(':')[0]
        if rtext != '' and len(rtext) > 1:
            lista.append(rtext)
            logger.write(rtext + ', ')
    counts = Counter(lista)
    top10 = []
    for index, i in enumerate(counts.most_common()):
        if index < 21:
            top10.append(i[0])
    logger.write('\n')
    logger.close()
    return top10
def generate(self):
    self.ignored_words = set()
    stopwords_from_file = self.stopwords_file.read()
    for word in stopwords_from_file.split():
        self.ignored_words.add(word)
    self.stopwords_file.close()
    self.ignored_words = set.union(many_stop_words.get_stop_words("ja"), self.ignored_words)
    longstring = ""
    if self.mask_img:
        mask = np.array(self.mask_img)
    else:
        mask = None
    amount_scs = 0
    for superchat in self.sc_log:
        if superchat["message"]:
            amount_scs += 1
            if '_' not in superchat["message"]:
                mecabbed = do_mecab(superchat["message"], '-Owakati')
                longstring += " " + mecabbed
    print("generating wordcloud from %d messages" % amount_scs)
    STOPWORDS.update(self.ignored_words)
    wordcloud = WordCloud(font_path=self.font, collocations=False, background_color="white",
                          width=1280, height=720, mask=mask).generate(longstring)
    if isinstance(self.logpath, Path):
        dest_image = self.target_dir + self.logpath.stem + "-wordcloud.png"
    else:
        dest_image = self.target_dir + self.logpath + "-wordcloud.png"
    wordcloud.to_file(dest_image)
def __init__(self, corpus_size):
    self.vector_size = 300
    self.speller_obj = Speller(lang='en')
    self.stop_words = many_stop_words.get_stop_words("en")
    self.spacy_obj = spacy.load('en_core_web_sm')
    self.tokenizer_obj = Tokenizer(num_words=corpus_size, oov_token="<OOV>")
    with open("normalize_mapping.json") as normalize_file_obj:
        self.normalize_mapping = json.load(normalize_file_obj)
def __init__(self, min_cut=0.1, max_cut=0.9):
    """
    Initialize the text summarizer.
    Words whose term frequency is lower than min_cut or higher than max_cut are ignored.
    """
    self._min_cut = min_cut
    self._max_cut = max_cut
    self._stopwords = set(get_stop_words('bn'))
def _summary(self, **kwargs):
    str_corpus = " ".join(self.answers).lower()
    words = re.sub(r"[^\w]", " ", str_corpus).split()
    stop_words = many_stop_words.get_stop_words(kwargs.get('language', 'en'))
    filtered_words = [word for word in words if word not in stop_words]
    summary_series = pd.Series(filtered_words).value_counts()[:20]
    summary_series.name = self.label
    return summary_series
def remove_stopwords(self, doc):
    for word in many_stop_words.get_stop_words("en"):
        lexeme = self.nlp.vocab[word]
        lexeme.is_stop = True
    doc = [token.text for token in doc
           if not token.is_stop and not token.is_punct]
    return doc
def get_stopwords(self) -> list:
    """
    Get stop words for the configured language (self._lang).

    :return: list of stop words.
    """
    return many_stop_words.get_stop_words(self._lang)
def delete_stop_words_from_list(l):
    stop_words = list(get_stop_words('ru'))        # about 900 stop words
    nltk_words = list(stopwords.words('russian'))  # about 150 stop words
    stop_words.extend(nltk_words)
    out = []
    for x in l:
        if x[:x.find('_')] in stop_words:
            continue
        out.append(x)
    return out
def filter_words(words):
    new_words = FreqDist(words)
    stopwords = get_stop_words('ar')
    # Iterate over a copy of the keys: popping while iterating the live view
    # would raise a RuntimeError, and a word could otherwise be popped twice.
    for word in list(new_words.keys()):
        if word in stopwords or len(word) <= 2:
            new_words.pop(word)
    return new_words
def tokenize(body):
    tokens = word_tokenize(body)
    tokens = [w.lower() for w in tokens]
    tokens = [w for w in tokens if len(w) > 2]
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    words = [word for word in stripped if word.isalpha()]
    stop_words = list(get_stop_words('nl'))
    nltk_words = list(stopwords.words('dutch'))
    stop_words.extend(nltk_words)
    words = [w for w in words if w not in stop_words]
    stemmer = SnowballStemmer("dutch")
    words = [stemmer.stem(word) for word in words]
    return words
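# Usage sketch for tokenize, assuming nltk's Dutch resources ('punkt', stopwords)
# have been downloaded; the sample sentence is illustrative only.
print(tokenize("De katten zaten rustig op de mat."))  # lowercased, stemmed Dutch content words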
def replace_word_in_targetArtifact(self, replace_list):
    replaced_artifact_tokens = []
    stop_words = many_stop_words.get_stop_words("en")
    for word, replacement in replace_list:
        for artif in self.targetArtifact:
            content = self.targetArtifact[artif]
            for token in content.split():
                token = token.lower()
                if token not in stop_words and len(token) >= 2:
                    if token == word:
                        replaced_artifact_tokens.append(replacement)
                    else:
                        replaced_artifact_tokens.append(token)
    return " ".join(replaced_artifact_tokens)
def tokenize_and_normalize_sentences(sentence, language=None, clean_http=True, debug=False):
    stemmer = LancasterStemmer()
    regex_set = regexEnJa().regex_en_ja_characters_set(whitespace=True, tabs_newlines=False, url=True)
    matches = re.finditer(regex_set, sentence, re.MULTILINE | re.IGNORECASE | re.VERBOSE | re.UNICODE)
    matches = [match.group() for match in matches]
    if debug:
        print('all matches')
        print(matches)
    if clean_http:
        matches = [x for x in matches if 'http' not in x]
    s = ''.join(matches)
    if debug:
        print('from: ', '<start>' + sentence + '<end>')
        print('=' * 100)
        print('to: ', '<start>' + s + '<end>')
        print('')
    if language:
        lang_code = language
    else:
        lang_code = detect_language_code(sentence)
    # set ignored words (overly common words) and tokenize
    if lang_code == 'en':
        ignore_words = set(stopwords.words('english'))
        words = english_tokenize(s)  # nltk's word_tokenize for English
    elif lang_code == 'ja':
        ignore_words = get_stop_words(lang_code)  # includes Japanese stop words
        words = mecab_tokenize(s)
        words = [w for w in words if w != ' ']  # clean blanks (Japanese only)
    else:
        # TODO: handle other languages properly;
        # currently the English tokenizer is used as a stand-in.
        ignore_words = set(stopwords.words('english'))
        words = english_tokenize(s)
    root_words = [stemmer.stem(w.lower()) for w in words if w not in ignore_words]
    return root_words
def load_stop_words():
    '''
    Appends English stopwords from nltk and many_stop_words, custom stopwords from
    custom_stopwords.txt, and specific stopwords to the list STOP_WORDS.
    '''
    global STOP_WORDS
    STOP_WORDS = list(get_stop_words('en'))        # about 900 stop words
    nltk_words = list(stopwords.words('english'))  # about 150 stop words
    custom_stop_words = [line.strip() for line in open('custom_stopwords.txt')]
    specific_stop_words = ['came', 'told', 'dont', 'outside', 'okay', 'ok', 'oh', 'really',
                           'never', 'everyone', 'went', 'sat', 'well', 'definitely']
    STOP_WORDS.extend(nltk_words)
    STOP_WORDS.extend(custom_stop_words)
    STOP_WORDS.extend(specific_stop_words)
def filter_token_tag(tok_tag, stopwords=get_stop_words('ar')):
    allowed_tags = ['NN', 'DTNN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS']
    w, t = tok_tag
    if w in stopwords:
        return False
    if len(w) <= 2:
        return False
    try:
        if detect(w) != 'ar':
            return False
    except:
        return False
    if t not in allowed_tags:
        return False
    return True
def read_all_stop_words() -> Set[str]:
    # Data source: https://wenku.baidu.com/view/7ca26338376baf1ffc4fad6a.html
    with open("data/chinese_stop_words.txt", mode="r", encoding="utf-8") as local_file:
        text_lines = local_file.readlines()
    text_lines = list(x.replace("\n", "") for x in text_lines)
    with open("data/chinese_stop_symbols.txt", mode="r", encoding="utf-8") as local_file:
        symbol_lines = local_file.readlines()
    symbol_lines = list(x.replace("\n", "") for x in symbol_lines)
    public_stop_words = get_stop_words("zh")
    stop_words: Set[str] = set()
    stop_words = stop_words.union(text_lines)
    stop_words = stop_words.union(symbol_lines)
    stop_words = stop_words.union(public_stop_words)
    return stop_words
def get_keywords(sentence, allowed_tags):
    sentence = _remove_by_regex(_replace_punct(sentence))
    tokens = nltk.word_tokenize(sentence)
    tokens = [token.strip("'") for token in tokens]
    tagged_tokens = nltk.pos_tag(tokens)
    stop_words = get_stop_words('en')
    stop_words = {word.decode('utf-8') for word in stop_words}
    stop_words |= {'read'}
    keywords = []
    for word, tag in tagged_tokens:
        word = word.lower()
        if is_proper_keyword(word, tag, allowed_tags, stop_words):
            keywords.append(word)
    bigrams_keywords = list(bigrams(keywords))
    trigrams_keywords = list(trigrams(keywords))
    for k in bigrams_keywords:
        keywords.append(' '.join(k))
    for k in trigrams_keywords:
        keywords.append(' '.join(k))
    return keywords
def generate(self):
    conn = psycopg2.connect(dbname=self.pgsql_creds["database"],
                            user=self.pgsql_creds["username"],
                            host=self.pgsql_creds["host"],
                            password=self.pgsql_creds["password"])
    cur = conn.cursor()
    cur.execute("SELECT message_txt FROM messages WHERE video_id = %s;", (self.video_id, ))
    results = cur.fetchall()
    conn.close()
    self.ignored_words = set()
    stopwords_from_file = self.stopwords_file.read()
    for word in stopwords_from_file.split():
        self.ignored_words.add(word)
    self.stopwords_file.close()
    self.ignored_words = set.union(many_stop_words.get_stop_words("ja"), self.ignored_words)
    longstring = ""
    if self.mask_img:
        mask = np.array(self.mask_img)
    else:
        mask = None
    amount_scs = 0
    for superchat in results:
        if superchat[0]:
            amount_scs += 1
            if '_' not in superchat[0]:
                mecabbed = do_mecab(superchat[0], '-Owakati')
                longstring += " " + mecabbed
    print("generating wordcloud from %d messages" % amount_scs)
    STOPWORDS.update(self.ignored_words)
    wordcloud = WordCloud(font_path=self.font, collocations=False, background_color="white",
                          width=1280, height=720, mask=mask).generate(longstring)
    dest_image = self.target_dir + self.video_id + "-wordcloud.png"
    wordcloud.to_file(dest_image)
def find_topics(comments, quantity):
    tokenizer = RegexpTokenizer(r'\w+')
    # Load stop word list
    stop_words = list(stopwords.words('arabic'))
    stop_words.extend(set(get_stop_words('ar')))
    # Stemmer definition
    p_stemmer = PorterStemmer()
    # Add comments to a local list
    raw_data = []
    raw_data.extend(comments)
    # List for tokenized texts
    texts = []
    # Loop through raw texts
    for text in raw_data:
        # Clean and tokenize
        raw = text.lower()
        tokens = tokenizer.tokenize(raw)
        # Remove stop words from tokens
        stopped_tokens = [i for i in tokens if i not in stop_words and len(i) > 4]
        # Stem tokens
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        # Add tokens to final list
        texts.append(stemmed_tokens)
    # Turn the tokenized documents into an id <-> term dictionary
    dictionary = corpora.Dictionary(texts)
    # Convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]
    # Generate the LDA model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=int(quantity),
                                               id2word=dictionary, passes=20)
    return ldamodel.print_topics(num_topics=int(quantity), num_words=5)
def get_words_to_remove():
    """
    Generate the set of words to remove for a better cleaning of tweets.

    Returns:
        set: the words to remove
    """
    punctuation = list(string.punctuation)
    stop_word_list_english = stopwords.words('english')
    stop_word_list_french = stopwords.words('french')
    others_words = ['rt', 'via', '...', '…', '»:', '«:', '’:', 'les', '-']
    words_to_remove = punctuation + stop_word_list_english + \
        stop_word_list_french + others_words
    congo_words = {'congo', 'congolais', 'rdc', 'drc', '-', 'https', 'rdcongo', 'drcongo'}
    words_to_remove = set(words_to_remove).union(congo_words)
    words_to_remove = words_to_remove.union(set(many_stop_words.get_stop_words('fr')))
    return words_to_remove
def filter_edges(edges, words):
    new_edges = []
    stopwords = get_stop_words('ar')
    edges_word = FreqDist()
    max_e = 10
    for e, w in edges:
        if e[0] in stopwords or e[1] in stopwords:
            continue
        if len(e[0]) <= 2 or len(e[1]) <= 2:
            continue
        if e[0] not in words and e[1] not in words:
            continue
        if edges_word[e[0]] >= max_e or edges_word[e[1]] >= max_e:
            continue
        new_edges.append((e, w))
        edges_word[e[0]] += 1
        edges_word[e[1]] += 1
    return new_edges
def test_get_two():
    stop_words = get_stop_words('en', 'it')
    assert 'been' in stop_words        # English
    assert 'buono' in stop_words       # Italian
    assert 'bardzo' not in stop_words  # Polish
def remove_stop_word(self, token_list, language="en", stop_words=None):
    if stop_words is None:
        if language == "ko":
            language = "kr"  # many_stop_words uses "kr" for its Korean list
        stop_words = many_stop_words.get_stop_words(language)
    return [x for x in token_list if x not in stop_words]
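# Hypothetical call sketch, assuming `preproc` is an instance of the class that
# defines remove_stop_word above; the Korean tokens are illustrative only.
tokens = ["그리고", "데이터", "분석"]
filtered = preproc.remove_stop_word(tokens, language="ko")  # "ko" is mapped to "kr" internally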
import json
from ttp import ttp
from nltk.corpus import stopwords
import string
import jsonpickle
from many_stop_words import get_stop_words
import time
from datetime import datetime
import pytz
from _datetime import tzinfo
from dateutil import parser

# stop_corpus = set(stopwords.words('english'))
# from konlpy.tag import Twitter; t = Twitter()
stop_corpus = get_stop_words('kr')
p = ttp.Parser()


def remove_remaining_punctuations(text):
    for c in string.punctuation:
        text = text.replace(c, '')
    return text


def remove_stop_words(text, stop_corpus):
    text = ' '.join([i for i in text.lower().split() if i not in stop_corpus])
    # text = ' '.join([i for i in t.morphs(text) if i not in stop_corpus])
    return text
def get_stopwords(language_code: str, extra_stopwords: {str}) -> {str}:
    available_languages = set(many_stop_words.available_languages)
    if language_code in available_languages:
        my_stopwords = many_stop_words.get_stop_words(language_code)
        return my_stopwords.union(extra_stopwords)
    # implicitly returns None when the language code is not available
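# A small usage sketch for get_stopwords, assuming many_stop_words exposes
# available_languages as in the snippet above; the extra words and the "de"
# code are illustrative, and the call yields None if that code is unavailable.
extra = {"lorem", "ipsum"}
german_stops = get_stopwords("de", extra)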
import logging
import string

import dblogger
from gensim import corpora
import many_stop_words
from nltk.tokenize import RegexpTokenizer
from nltk.util import ngrams
import regex as re
import streamcorpus
from streamcorpus_pipeline._clean_html import clean_html
from streamcorpus_pipeline._clean_visible import clean_visible
import yakonfig

logger = logging.getLogger(__name__)

stop_words = many_stop_words.get_stop_words()


def find_soft_selectors(ids_and_clean_visible, start_num_tokens='10',
                        max_num_tokens='20', filter_punctuation='0'):
    '''External interface for dossier.models.soft_selectors.

    This scans through `num_tokens` values between `start_num_tokens` and
    `max_num_tokens` and calls `find_soft_selectors_at_n` looking for results.

    All of the params can be passed from URL parameters, in which case they can
    be strings and this function will type cast them appropriately.
    '''
    start_num_tokens = int(start_num_tokens)
foldernamelistNew = []
foldernamelistTitle = []
ListOfStagsPerFolder = []
Stemmer = PorterStemmer()


# Relies on `soup` and `titletagslines1` being defined elsewhere in the script.
def TagExtractionFuction(String):
    string = String
    for alltagsExtr in soup.find_all(string):
        titletagslines = []
        tt = "".join(str(alltagsExtr))
        soup2 = bs(tt, "html.parser")
        Ttag = "".join(str(soup2.text))
        Ttags = re.sub(r'[\'\n]', '', Ttag)
        TtagsWT = Ttags.split()  # tokenizing
        for Twords in TtagsWT:
            # Twords = Twords.strip()
            if Twords not in totalstopwords:  # stop word removal
                Twordss = re.sub(r'\\n', '', Twords)
                Twordss = Twordss.replace('\s', "")
                words2 = Stemmer.stem(Twordss)
                titletagslines.append(words2)
        titletagslines1.append(titletagslines)


mystopwords = ['&', '#', '*', 'A', '--', '$', '\\', '_', "'n", "'", '\\n', "', '", "n't", "'s",
               "'\\n", ' ', ',', '.', '"', '""', "''", '``', ':', '?', 'I', '%', '+', '!', '(',
               ')', '-', ';', 'The']
stpw = list(get_stop_words('en'))
totalstopwords = mystopwords + stpw

for i in glob.glob("C:/Users/aa/.spyder/dataset/docs.with.sentence.breaks/*"):
    file = np.array("")
    print(file)
import string
from unidecode import unidecode
from nameparser import HumanName
from enum import Enum
from many_stop_words import get_stop_words

from .author_names import AMBIGUOUS_NAMES

punctuation_dict = str.maketrans({key: None for key in string.punctuation})
whitespace_dict = str.maketrans({key: None for key in string.whitespace.replace(" ", "")})
ascii_dict = str.maketrans({key: None for key in string.printable})

suffix_list = ["jr", "jnr", "sr", "snr"]
stop_word_list = get_stop_words("en")

# TODO: regex for LaTeX and HTML


def normalize_title(title, latex=False):
    # translate unicode characters to the closest ascii characters
    name_split = title.replace("-", " ")
    ascii_decoded = unidecode(name_split)
    remove_punctuation = ascii_decoded.translate(punctuation_dict)
    remove_whitespace = remove_punctuation.translate(whitespace_dict)
    lowered = remove_whitespace.lower()
    # removing certain unicode characters may have introduced multiple spaces;
    # collapse them into a single space
    only_one_space = lowered
    while '  ' in only_one_space:
        only_one_space = only_one_space.replace('  ', ' ')
import re
import nltk
from many_stop_words import get_stop_words
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

city = 'Kansas City'
tweetsPath = 'TweetScraper\\TweetScraper\\Data\\' + city + '\\'
dirPath = 'TweetScraper\\TweetScraper\\Data\\'

stop_words = list(get_stop_words('en'))        # about 900 stop words
nltk_words = list(stopwords.words('english'))  # about 150 stop words
stop_words.extend(nltk_words)

# LOAD DICTIONARIES
with open(dirPath + 'pos_dictionary_lemmatized') as pdl:
    pos_dict_lem = pdl.read()
pos_words_lem = pos_dict_lem.split(' ')

with open(dirPath + 'neg_dictionary_lemmatized') as ndl:
    neg_dict_lem = ndl.read()
neg_words_lem = neg_dict_lem.split(' ')

with open(dirPath + 'pos_dictionary_stemmed') as pds:
    pos_dict_stem = pds.read()
pos_words_stem = pos_dict_stem.split(' ')
# get tokens and corpus for kashi (lyrics)
aimer = pickle.load(open("data/aimer.pickle", "rb"))
dic_loc = "data/pn_ja.dic.txt"
filter_pos = ["記号", "助詞", "助動詞", "接頭詞", "連体詞", "接続詞"]
aimer_lyrics = list(map(lambda x: x[3], aimer.song_pack))
aimer_token = JaToken(aimer_lyrics, dic_loc, "1", filter_pos)

stop_words = {"する", "られる", "さん", "てる", "ん", "の", "dont", "こと", "よう", "まま", "そう",
              "あなた", "もの", "いつ", "いつか", "ため", "いる", "なる", "れる", "ない", "くい",
              "mum", "いい", "ほしい", "しまう", "ある", "くれる", "できる", "来る", "ゆく", "行く",
              "言う", "せる", "くる", "いく", "日々", "今日", "明日"}
stop_words = many_stop_words.get_stop_words("ja", "en").union(stop_words)

# part-of-speech analysis (noun, verb, adjective, all)
aimer_noun = word_by_pos(aimer_token, stop_words, "名詞", filtered_length=0, most_common_show=20)
aimer_verb = word_by_pos(aimer_token, stop_words, "動詞", filtered_length=0, most_common_show=20)
aimer_adj = word_by_pos(aimer_token, stop_words, "形容詞", filtered_length=0, most_common_show=20)
aimer_all = word_by_pos(aimer_token, stop_words, "", filtered_length=0, most_common_show=20)

aimerW = [pd.DataFrame(aimer_noun[3].most_common(), columns=["word", "cnt"]),
          pd.DataFrame(aimer_verb[3].most_common(), columns=["word", "cnt"]),
          pd.DataFrame(aimer_adj[3].most_common(), columns=["word", "cnt"])]
aimer_viz = [aimerW, aimer_all,
             pd.DataFrame(aimer.song_pack, columns=["title", "lyricist", "composer", "lyrics"])]
pickle.dump(
def test_get_one(lang_code):
    stop_words = get_stop_words(lang_code)
    for word in stop_words:
        assert isinstance(word, unicode)
        assert u'\uFEFF' not in word
    assert len(stop_words) > 0
def test_get_all_basic():
    stop_words = get_stop_words()
    assert 'if' in stop_words
def test_get_all_equals_getting_all():
    assert get_stop_words() == get_stop_words(*available_languages)