def rmStopWords(l, lang):
    if lang == 'english':
        stops = set(get_stop_words('en'))
    elif lang == 'catalan':
        stops = set(get_stop_words('ca'))
    elif lang == 'spanish':
        stops = set(get_stop_words('es'))
    else:
        # avoid a NameError for unsupported languages
        stops = set()
    return [w for w in l if w not in stops]
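# A minimal usage sketch for rmStopWords, assuming get_stop_words comes from a
# stop-word package that accepts ISO codes such as many_stop_words or stop_words;
# the sample tokens below are illustrative only.
from many_stop_words import get_stop_words

tokens = ['the', 'cat', 'sat', 'on', 'the', 'mat']
print(rmStopWords(tokens, 'english'))  # common words like 'the' and 'on' are dropped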
def _get_stopwords():
    """Extract stop words from multiple sources for both Russian and English."""
    all_stopwords = many_stop_words.get_stop_words('ru')
    all_stopwords.update(many_stop_words.get_stop_words('en'))
    more_stopwords = set(stopwords.words(['russian', 'english']))
    all_stopwords.update(more_stopwords)
    return all_stopwords
def __init__(self, word_threshold=10):
    super().__init__()
    self.selected_replace_words = dict()
    self.stop_words = many_stop_words.get_stop_words("en")
    self.replaced_target_artifacts = None
    self.impacted_artifacts = None
    self.word_threshold = word_threshold
def get_expression_vector(strInput: str, embeddings: OrderedDict, targetWord=None) -> list:
    if language.lower() == 'japanese':
        stop = list(many_stop_words.get_stop_words('ja')) + list(string.punctuation)
    else:
        stop = stopwords.words(language) + list(string.punctuation)
    stop.append('<TRG>')
    if targetWord:
        stop.append(targetWord)
    def_tokens = [i for i in word_tokenize(strInput.lower()) if i not in stop]
    n_found_tokens = 0
    expression_vector = np.array([0.0 for i in range(embeddings_dim)])
    for token in def_tokens:
        if token in embeddings:
            n_found_tokens += 1
            expression_vector += np.array(embeddings[token])
    if n_found_tokens != 0:
        expression_vector = expression_vector / n_found_tokens
        return expression_vector.tolist()
    else:
        return None
def tagging(rawtekst):
    logger = open('logger.txt', 'a', encoding='UTF-8')
    stop_words = list(get_stop_words('pl'))
    tokenizer = RegexpTokenizer(r'\w+')
    if rawtekst is not None:
        d = tokenizer.tokenize(rawtekst.lower())
    else:
        d = []
    text = [i for i in d if i not in stop_words]
    parser = ListParser()
    stemmer = Morfologik()
    y = stemmer.stem(text, parser)
    lista = []
    for index, i in enumerate(y):
        rtext = str(y[index][1]).replace('[', '').replace(']', '').replace('}', '') \
            .replace('{', '').replace("'", '').split(':')[0]
        if rtext != '' and len(rtext) > 1:
            lista.append(rtext)
            logger.write(rtext + ', ')
    counts = Counter(lista)
    top10 = []
    for index, i in enumerate(counts.most_common()):
        if index < 21:
            top10.append(i[0])
    logger.write('\n')
    logger.close()
    return top10
def generate(self):
    self.ignored_words = set()
    stopwords_from_file = self.stopwords_file.read()
    for word in stopwords_from_file.split():
        self.ignored_words.add(word)
    self.stopwords_file.close()
    self.ignored_words = set.union(many_stop_words.get_stop_words("ja"), self.ignored_words)
    longstring = ""
    if self.mask_img:
        mask = np.array(self.mask_img)
    else:
        mask = None
    amount_scs = 0
    for superchat in self.sc_log:
        if superchat["message"]:
            amount_scs += 1
            if '_' not in superchat["message"]:
                mecabbed = do_mecab(superchat["message"], '-Owakati')
                longstring += " " + mecabbed
    print("generating wordcloud from %d messages" % amount_scs)
    STOPWORDS.update(self.ignored_words)
    wordcloud = WordCloud(font_path=self.font, collocations=False, background_color="white",
                          width=1280, height=720, mask=mask).generate(longstring)
    if isinstance(self.logpath, Path):
        dest_image = self.target_dir + self.logpath.stem + "-wordcloud.png"
    else:
        dest_image = self.target_dir + self.logpath + "-wordcloud.png"
    wordcloud.to_file(dest_image)
def __init__(self, corpus_size):
    self.vector_size = 300
    self.speller_obj = Speller(lang='en')
    self.stop_words = many_stop_words.get_stop_words("en")
    self.spacy_obj = spacy.load('en_core_web_sm')
    self.tokenizer_obj = Tokenizer(num_words=corpus_size, oov_token="<OOV>")
    with open("normalize_mapping.json") as normalize_file_obj:
        self.normalize_mapping = json.load(normalize_file_obj)
def __init__(self, min_cut=0.1, max_cut=0.9):
    """
    Initialize the text summarizer.
    Words whose term frequency is lower than min_cut or higher than max_cut are ignored.
    """
    self._min_cut = min_cut
    self._max_cut = max_cut
    self._stopwords = set(get_stop_words('bn'))
def _summary(self, **kwargs):
    str_corpus = " ".join(self.answers).lower()
    words = re.sub(r"[^\w]", " ", str_corpus).split()
    stop_words = many_stop_words.get_stop_words(kwargs.get('language', 'en'))
    filtered_words = [word for word in words if word not in stop_words]
    summary_series = pd.Series(filtered_words).value_counts()[:20]
    summary_series.name = self.label
    return summary_series
def remove_stopwords(self, doc):
    for word in many_stop_words.get_stop_words("en"):
        lexeme = self.nlp.vocab[word]
        lexeme.is_stop = True
    doc = [token.text for token in doc
           if not token.is_stop and not token.is_punct]
    return doc
def get_stopwords(self) -> list:
    """
    Get stop words for the configured language (self._lang).

    :return: list of stop words.
    """
    return many_stop_words.get_stop_words(self._lang)
def delete_stop_words_from_list(l):
    stop_words = list(get_stop_words('ru'))        # about 900 stop words
    nltk_words = list(stopwords.words('russian'))  # about 150 stop words
    stop_words.extend(nltk_words)
    out = []
    for x in l:
        if x[:x.find('_')] in stop_words:
            continue
        out.append(x)
    return out
def filter_words(words):
    new_words = FreqDist(words)
    stopwords = get_stop_words('ar')
    # Iterate over a copy of the keys: popping while iterating the live view
    # would raise a RuntimeError, and a word could otherwise be popped twice.
    for word in list(new_words.keys()):
        if word in stopwords or len(word) <= 2:
            new_words.pop(word)
    return new_words
def tokenize(body):
    tokens = word_tokenize(body)
    tokens = [w.lower() for w in tokens]
    tokens = [w for w in tokens if len(w) > 2]
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    words = [word for word in stripped if word.isalpha()]
    stop_words = list(get_stop_words('nl'))
    nltk_words = list(stopwords.words('dutch'))
    stop_words.extend(nltk_words)
    words = [w for w in words if w not in stop_words]
    stemmer = SnowballStemmer("dutch")
    words = [stemmer.stem(word) for word in words]
    return words
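# Usage sketch for tokenize, assuming nltk's Dutch resources ('punkt', stopwords)
# have been downloaded; the sample sentence is illustrative only.
print(tokenize("De katten zaten rustig op de mat."))  # lowercased, stemmed Dutch content words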
def replace_word_in_targetArtifact(self, replace_list):
    replaced_artifact_tokens = []
    stop_words = many_stop_words.get_stop_words("en")
    for word, replacement in replace_list:
        for artif in self.targetArtifact:
            content = self.targetArtifact[artif]
            for token in content.split():
                token = token.lower()
                if token not in stop_words and len(token) >= 2:
                    if token == word:
                        replaced_artifact_tokens.append(replacement)
                    else:
                        replaced_artifact_tokens.append(token)
    return " ".join(replaced_artifact_tokens)
def tokenize_and_normalize_sentences(sentence, language=None, clean_http=True, debug=False):
    stemmer = LancasterStemmer()
    regex_set = regexEnJa().regex_en_ja_characters_set(whitespace=True, tabs_newlines=False, url=True)
    matches = re.finditer(regex_set, sentence, re.MULTILINE | re.IGNORECASE | re.VERBOSE | re.UNICODE)
    matches = [match.group() for match in matches]
    if debug:
        print('all matches')
        print(matches)
    if clean_http:
        matches = [x for x in matches if 'http' not in x]
    s = ''.join(matches)
    if debug:
        print('from: ', '<start>' + sentence + '<end>')
        print('=' * 100)
        print('to: ', '<start>' + s + '<end>')
        print('')
    if language:
        lang_code = language
    else:
        lang_code = detect_language_code(sentence)
    # set ignored words (overly common words) and tokenize
    if lang_code == 'en':
        ignore_words = set(stopwords.words('english'))
        words = english_tokenize(s)  # nltk's word_tokenize for English
    elif lang_code == 'ja':
        ignore_words = get_stop_words(lang_code)  # includes Japanese stop words
        words = mecab_tokenize(s)
        words = [w for w in words if w != ' ']  # clean blanks (Japanese only)
    else:
        # TODO: handle other languages properly;
        # currently the English tokenizer is used as a stand-in.
        ignore_words = set(stopwords.words('english'))
        words = english_tokenize(s)
    root_words = [stemmer.stem(w.lower()) for w in words if w not in ignore_words]
    return root_words
def load_stop_words():
    '''
    Appends English stopwords from nltk and many_stop_words, custom stopwords from
    custom_stopwords.txt, and specific stopwords to the list STOP_WORDS.
    '''
    global STOP_WORDS
    STOP_WORDS = list(get_stop_words('en'))        # about 900 stop words
    nltk_words = list(stopwords.words('english'))  # about 150 stop words
    custom_stop_words = [line.strip() for line in open('custom_stopwords.txt')]
    specific_stop_words = ['came', 'told', 'dont', 'outside', 'okay', 'ok', 'oh', 'really',
                           'never', 'everyone', 'went', 'sat', 'well', 'definitely']
    STOP_WORDS.extend(nltk_words)
    STOP_WORDS.extend(custom_stop_words)
    STOP_WORDS.extend(specific_stop_words)
def filter_token_tag(tok_tag, stopwords=get_stop_words('ar')):
    allowed_tags = ['NN', 'DTNN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS']
    w, t = tok_tag
    if w in stopwords:
        return False
    if len(w) <= 2:
        return False
    try:
        if detect(w) != 'ar':
            return False
    except:
        return False
    if t not in allowed_tags:
        return False
    return True
def read_all_stop_words() -> Set[str]:
    # Data source: https://wenku.baidu.com/view/7ca26338376baf1ffc4fad6a.html
    with open("data/chinese_stop_words.txt", mode="r", encoding="utf-8") as local_file:
        text_lines = local_file.readlines()
    text_lines = list(x.replace("\n", "") for x in text_lines)
    with open("data/chinese_stop_symbols.txt", mode="r", encoding="utf-8") as local_file:
        symbol_lines = local_file.readlines()
    symbol_lines = list(x.replace("\n", "") for x in symbol_lines)
    public_stop_words = get_stop_words("zh")
    stop_words: Set[str] = set()
    stop_words = stop_words.union(text_lines)
    stop_words = stop_words.union(symbol_lines)
    stop_words = stop_words.union(public_stop_words)
    return stop_words
def get_keywords(sentence, allowed_tags):
    sentence = _remove_by_regex(_replace_punct(sentence))
    tokens = nltk.word_tokenize(sentence)
    tokens = [token.strip("'") for token in tokens]
    tagged_tokens = nltk.pos_tag(tokens)
    stop_words = get_stop_words('en')
    stop_words = {word.decode('utf-8') for word in stop_words}
    stop_words |= {'read'}
    keywords = []
    for word, tag in tagged_tokens:
        word = word.lower()
        if is_proper_keyword(word, tag, allowed_tags, stop_words):
            keywords.append(word)
    bigrams_keywords = list(bigrams(keywords))
    trigrams_keywords = list(trigrams(keywords))
    for k in bigrams_keywords:
        keywords.append(' '.join(k))
    for k in trigrams_keywords:
        keywords.append(' '.join(k))
    return keywords
def generate(self):
    conn = psycopg2.connect(dbname=self.pgsql_creds["database"],
                            user=self.pgsql_creds["username"],
                            host=self.pgsql_creds["host"],
                            password=self.pgsql_creds["password"])
    cur = conn.cursor()
    cur.execute("SELECT message_txt FROM messages WHERE video_id = %s;", (self.video_id, ))
    results = cur.fetchall()
    conn.close()
    self.ignored_words = set()
    stopwords_from_file = self.stopwords_file.read()
    for word in stopwords_from_file.split():
        self.ignored_words.add(word)
    self.stopwords_file.close()
    self.ignored_words = set.union(many_stop_words.get_stop_words("ja"), self.ignored_words)
    longstring = ""
    if self.mask_img:
        mask = np.array(self.mask_img)
    else:
        mask = None
    amount_scs = 0
    for superchat in results:
        if superchat[0]:
            amount_scs += 1
            if '_' not in superchat[0]:
                mecabbed = do_mecab(superchat[0], '-Owakati')
                longstring += " " + mecabbed
    print("generating wordcloud from %d messages" % amount_scs)
    STOPWORDS.update(self.ignored_words)
    wordcloud = WordCloud(font_path=self.font, collocations=False, background_color="white",
                          width=1280, height=720, mask=mask).generate(longstring)
    dest_image = self.target_dir + self.video_id + "-wordcloud.png"
    wordcloud.to_file(dest_image)
def find_topics(comments, quantity):
    tokenizer = RegexpTokenizer(r'\w+')
    # Load stop word list
    stop_words = list(stopwords.words('arabic'))
    stop_words.extend(set(get_stop_words('ar')))
    # Stemmer definition
    p_stemmer = PorterStemmer()
    # Add comments to a local list
    raw_data = []
    raw_data.extend(comments)
    # List for tokenized texts
    texts = []
    # Loop through raw texts
    for text in raw_data:
        # Clean and tokenize
        raw = text.lower()
        tokens = tokenizer.tokenize(raw)
        # Remove stop words from tokens
        stopped_tokens = [i for i in tokens if i not in stop_words and len(i) > 4]
        # Stem tokens
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        # Add tokens to final list
        texts.append(stemmed_tokens)
    # Turn the tokenized documents into an id <-> term dictionary
    dictionary = corpora.Dictionary(texts)
    # Convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]
    # Generate the LDA model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=int(quantity),
                                               id2word=dictionary, passes=20)
    return ldamodel.print_topics(num_topics=int(quantity), num_words=5)
def get_words_to_remove():
    """
    Generate the set of words to remove for a better cleaning of tweets.

    Returns:
        set: the words to remove
    """
    punctuation = list(string.punctuation)
    stop_word_list_english = stopwords.words('english')
    stop_word_list_french = stopwords.words('french')
    others_words = ['rt', 'via', '...', '…', '»:', '«:', '’:', 'les', '-']
    words_to_remove = punctuation + stop_word_list_english + \
        stop_word_list_french + others_words
    congo_words = {'congo', 'congolais', 'rdc', 'drc', '-', 'https', 'rdcongo', 'drcongo'}
    words_to_remove = set(words_to_remove).union(congo_words)
    words_to_remove = words_to_remove.union(set(many_stop_words.get_stop_words('fr')))
    return words_to_remove
def filter_edges(edges, words):
    new_edges = []
    stopwords = get_stop_words('ar')
    edges_word = FreqDist()
    max_e = 10
    for e, w in edges:
        if e[0] in stopwords or e[1] in stopwords:
            continue
        if len(e[0]) <= 2 or len(e[1]) <= 2:
            continue
        if e[0] not in words and e[1] not in words:
            continue
        if edges_word[e[0]] >= max_e or edges_word[e[1]] >= max_e:
            continue
        new_edges.append((e, w))
        edges_word[e[0]] += 1
        edges_word[e[1]] += 1
    return new_edges
def test_get_two():
    stop_words = get_stop_words('en', 'it')
    assert 'been' in stop_words        # English
    assert 'buono' in stop_words       # Italian
    assert 'bardzo' not in stop_words  # Polish
def remove_stop_word(self, token_list, language="en", stop_words=None):
    if stop_words is None:
        if language == "ko":
            language = "kr"  # many_stop_words uses "kr" for its Korean list
        stop_words = many_stop_words.get_stop_words(language)
    return [x for x in token_list if x not in stop_words]
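# Hypothetical call sketch, assuming `preproc` is an instance of the class that
# defines remove_stop_word above; the Korean tokens are illustrative only.
tokens = ["그리고", "데이터", "분석"]
filtered = preproc.remove_stop_word(tokens, language="ko")  # "ko" is mapped to "kr" internally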
import json
from ttp import ttp
from nltk.corpus import stopwords
import string
import jsonpickle
from many_stop_words import get_stop_words
import time
from datetime import datetime
import pytz
from _datetime import tzinfo
from dateutil import parser

# stop_corpus = set(stopwords.words('english'))
# from konlpy.tag import Twitter; t = Twitter()
stop_corpus = get_stop_words('kr')
p = ttp.Parser()


def remove_remaining_punctuations(text):
    for c in string.punctuation:
        text = text.replace(c, '')
    return text


def remove_stop_words(text, stop_corpus):
    text = ' '.join([i for i in text.lower().split() if i not in stop_corpus])
    # text = ' '.join([i for i in t.morphs(text) if i not in stop_corpus])
    return text
def get_stopwords(language_code: str, extra_stopwords: {str}) -> {str}:
    available_languages = set(many_stop_words.available_languages)
    if language_code in available_languages:
        my_stopwords = many_stop_words.get_stop_words(language_code)
        return my_stopwords.union(extra_stopwords)
    # implicitly returns None when the language code is not available
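# A small usage sketch for get_stopwords, assuming many_stop_words exposes
# available_languages as in the snippet above; the extra words and the "de"
# code are illustrative, and the call yields None if that code is unavailable.
extra = {"lorem", "ipsum"}
german_stops = get_stopwords("de", extra)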
import logging
import string

import dblogger
from gensim import corpora
import many_stop_words
from nltk.tokenize import RegexpTokenizer
from nltk.util import ngrams
import regex as re
import streamcorpus
from streamcorpus_pipeline._clean_html import clean_html
from streamcorpus_pipeline._clean_visible import clean_visible
import yakonfig

logger = logging.getLogger(__name__)

stop_words = many_stop_words.get_stop_words()


def find_soft_selectors(ids_and_clean_visible, start_num_tokens='10',
                        max_num_tokens='20', filter_punctuation='0'):
    '''External interface for dossier.models.soft_selectors.

    This scans through `num_tokens` values between `start_num_tokens` and
    `max_num_tokens` and calls `find_soft_selectors_at_n` looking for results.

    All of the params can be passed from URL parameters, in which case they can
    be strings and this function will type cast them appropriately.
    '''
    start_num_tokens = int(start_num_tokens)
foldernamelistNew = []
foldernamelistTitle = []
ListOfStagsPerFolder = []
Stemmer = PorterStemmer()


# Relies on `soup` and `titletagslines1` being defined elsewhere in the script.
def TagExtractionFuction(String):
    string = String
    for alltagsExtr in soup.find_all(string):
        titletagslines = []
        tt = "".join(str(alltagsExtr))
        soup2 = bs(tt, "html.parser")
        Ttag = "".join(str(soup2.text))
        Ttags = re.sub(r'[\'\n]', '', Ttag)
        TtagsWT = Ttags.split()  # tokenizing
        for Twords in TtagsWT:
            # Twords = Twords.strip()
            if Twords not in totalstopwords:  # stop word removal
                Twordss = re.sub(r'\\n', '', Twords)
                Twordss = Twordss.replace('\s', "")
                words2 = Stemmer.stem(Twordss)
                titletagslines.append(words2)
        titletagslines1.append(titletagslines)


mystopwords = ['&', '#', '*', 'A', '--', '$', '\\', '_', "'n", "'", '\\n', "', '", "n't", "'s",
               "'\\n", ' ', ',', '.', '"', '""', "''", '``', ':', '?', 'I', '%', '+', '!', '(',
               ')', '-', ';', 'The']
stpw = list(get_stop_words('en'))
totalstopwords = mystopwords + stpw

for i in glob.glob("C:/Users/aa/.spyder/dataset/docs.with.sentence.breaks/*"):
    file = np.array("")
    print(file)
import string
from unidecode import unidecode
from nameparser import HumanName
from enum import Enum
from many_stop_words import get_stop_words

from .author_names import AMBIGUOUS_NAMES

punctuation_dict = str.maketrans({key: None for key in string.punctuation})
whitespace_dict = str.maketrans({key: None for key in string.whitespace.replace(" ", "")})
ascii_dict = str.maketrans({key: None for key in string.printable})

suffix_list = ["jr", "jnr", "sr", "snr"]
stop_word_list = get_stop_words("en")

# TODO: regex for LaTeX and HTML


def normalize_title(title, latex=False):
    # translate unicode characters to the closest ascii characters
    name_split = title.replace("-", " ")
    ascii_decoded = unidecode(name_split)
    remove_punctuation = ascii_decoded.translate(punctuation_dict)
    remove_whitespace = remove_punctuation.translate(whitespace_dict)
    lowered = remove_whitespace.lower()
    # removing certain unicode characters may have introduced multiple spaces;
    # collapse them into a single space
    only_one_space = lowered
    while '  ' in only_one_space:
        only_one_space = only_one_space.replace('  ', ' ')
import re
import nltk
from many_stop_words import get_stop_words
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

city = 'Kansas City'
tweetsPath = 'TweetScraper\\TweetScraper\\Data\\' + city + '\\'
dirPath = 'TweetScraper\\TweetScraper\\Data\\'

stop_words = list(get_stop_words('en'))        # about 900 stop words
nltk_words = list(stopwords.words('english'))  # about 150 stop words
stop_words.extend(nltk_words)

# LOAD DICTIONARIES
with open(dirPath + 'pos_dictionary_lemmatized') as pdl:
    pos_dict_lem = pdl.read()
pos_words_lem = pos_dict_lem.split(' ')

with open(dirPath + 'neg_dictionary_lemmatized') as ndl:
    neg_dict_lem = ndl.read()
neg_words_lem = neg_dict_lem.split(' ')

with open(dirPath + 'pos_dictionary_stemmed') as pds:
    pos_dict_stem = pds.read()
pos_words_stem = pos_dict_stem.split(' ')
# get tokens and corpus for kashi (lyrics)
aimer = pickle.load(open("data/aimer.pickle", "rb"))
dic_loc = "data/pn_ja.dic.txt"
filter_pos = ["記号", "助詞", "助動詞", "接頭詞", "連体詞", "接続詞"]
aimer_lyrics = list(map(lambda x: x[3], aimer.song_pack))
aimer_token = JaToken(aimer_lyrics, dic_loc, "1", filter_pos)

stop_words = {"する", "られる", "さん", "てる", "ん", "の", "dont", "こと", "よう", "まま", "そう",
              "あなた", "もの", "いつ", "いつか", "ため", "いる", "なる", "れる", "ない", "くい",
              "mum", "いい", "ほしい", "しまう", "ある", "くれる", "できる", "来る", "ゆく", "行く",
              "言う", "せる", "くる", "いく", "日々", "今日", "明日"}
stop_words = many_stop_words.get_stop_words("ja", "en").union(stop_words)

# part-of-speech analysis (noun, verb, adjective, all)
aimer_noun = word_by_pos(aimer_token, stop_words, "名詞", filtered_length=0, most_common_show=20)
aimer_verb = word_by_pos(aimer_token, stop_words, "動詞", filtered_length=0, most_common_show=20)
aimer_adj = word_by_pos(aimer_token, stop_words, "形容詞", filtered_length=0, most_common_show=20)
aimer_all = word_by_pos(aimer_token, stop_words, "", filtered_length=0, most_common_show=20)

aimerW = [pd.DataFrame(aimer_noun[3].most_common(), columns=["word", "cnt"]),
          pd.DataFrame(aimer_verb[3].most_common(), columns=["word", "cnt"]),
          pd.DataFrame(aimer_adj[3].most_common(), columns=["word", "cnt"])]
aimer_viz = [aimerW, aimer_all,
             pd.DataFrame(aimer.song_pack, columns=["title", "lyricist", "composer", "lyrics"])]
pickle.dump(
def test_get_one(lang_code):
    stop_words = get_stop_words(lang_code)
    for word in stop_words:
        assert isinstance(word, unicode)
        assert u'\uFEFF' not in word
    assert len(stop_words) > 0
def test_get_all_basic():
    stop_words = get_stop_words()
    assert 'if' in stop_words
def test_get_all_equals_getting_all():
    assert get_stop_words() == get_stop_words(*available_languages)