def review_cleaner(df, new_stopwords=None):
    '''
    Takes a DataFrame, cleans the reviews in its "Spell-checked review" column, and returns it with a new "Cleaned review" column.
    '''

    reviews = list(df["Spell-checked review"])
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.extend(["u", "also", "mok", "eric"])
    if new_stopwords is not None and isinstance(new_stopwords, list):
        stopwords.extend(new_stopwords)

    cleaned_reviews = []
    for review in tqdm(reviews):
        review = review.lower()
        review = re.sub(r'(?!\s)((\S*)((.com)|(.com\/))(\S*))', ' ', review)
        review = re.sub(r'[^A-Za-z\s-]', '', review)
        review = re.sub(r'(\W?)(\d+)(\S?)((\d+)?)(((a|p)[m])|((\s)((a|p)[m]))?)', ' ', review)
        review = re.sub(r'(?!\w+|\s)--+(?=\s|\w+)', ' ', review)
        review = re.sub(r'(?!\w+)([,]+|[.][.]+|\/+)(?=\w+)', ' ', review)
        review = re.sub(r'([A-Z]([a-z]+))((\s[A-Z]([a-z]+))+)', ' ', review)
        review = re.sub('-', '', review)
        review = " ".join([word for word in review.split() if word not in stopwords])
        doc = nlp(review)
        lemmatized_review = " ".join([token.lemma_ for token in doc if token.lemma_ != "-PRON-"])
        cleaned_reviews.append(lemmatized_review)

    df["Cleaned review"] = cleaned_reviews

    return df
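A minimal usage sketch (not part of the original snippet) could look like the following; it assumes pandas, re, tqdm, an NLTK install with the stopword corpus downloaded, and a spaCy model bound to the module-level name nlp that review_cleaner relies on. The model name and sample reviews are illustrative only.

import re
import nltk
import pandas as pd
import spacy
from tqdm import tqdm

# assumes nltk.download('stopwords') has been run and the spaCy model is installed
nlp = spacy.load("en_core_web_sm")  # any English pipeline would do here

sample_df = pd.DataFrame({
    "Spell-checked review": [
        "Eric said the pasta at www.example.com was great!",
        "We waited 45 minutes -- the 7pm booking was ignored.",
    ]
})
cleaned_df = review_cleaner(sample_df, new_stopwords=["pasta"])
print(cleaned_df["Cleaned review"].tolist())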
Example #2
def lyrics_to_words(lyrics):
    '''
    Helper function to clean song lyrics: remove non-English tokens and stopwords, then apply the Porter stemmer.
    '''
    stopwords = nltk.corpus.stopwords.words('english')
    newStopWords = [
        'verse', '1', '2', 'chorus', 'bridge', 'talking', 'refrain', 'explain',
        'request'
    ]
    stopwords.extend(newStopWords)
    stemmer = PorterStemmer()

    words_english = set(nltk.corpus.words.words())

    remove_non_english = " ".join(
        w for w in nltk.wordpunct_tokenize(lyrics)
        if w.lower() in words_english or not w.isalpha())

    text = re.sub(r"[^a-zA-Z0-9]", " ",
                  remove_non_english.lower())  # Convert to lower case
    words = text.split()  # Split string into words
    words = [w for w in words if w not in stopwords]  # Remove stopwords
    words = [stemmer.stem(w) for w in words]  # stem with the PorterStemmer created above

    return words
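A quick, illustrative call for lyrics_to_words, assuming nltk (with the 'stopwords' and 'words' corpora downloaded), re, and PorterStemmer are imported as the snippet expects:

import re
import nltk
from nltk.stem import PorterStemmer

print(lyrics_to_words("Chorus: We are singing in the rain"))
# -> a list of stemmed tokens with stopwords and section labels removed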
Example #3
def create_maps():
	'''generates geojson files for creating maps of twitter activity,
	broken down into four-hour chunks of time throughout the day.'''

	df = mm.pipeline.retrieve_and_merge_tweet_data()
	df = mm.pipeline.transform_timestamp(df, hour = True)
	df = get_tweet_rate(df)

	#remove geoids that are in the ocean
	odd_ids = ['060750601001016', '060750179021003','060759901000003',\
               '060759901000002', '060750179021000','060750601001000',\
               '060759804011003', '060750201001001']  
	df = df[~df['geoid10'].isin(odd_ids)]

	#customize stopwords for editing tokens
	stopwords = nltk.corpus.stopwords.words('english')
	stopwords.extend(['...',',,',',,,','..', 't','y','(@',')', 'c','i','I','a',\
	                '@','.', 'co', 'com','amp', 'via','http','htt','https', '()',']'])
	sstopwords=[unicode(word) for word in stopwords]

	#break dataframe into four hour chunks of time throughout the day
	df_hour = tweets_by_hour(df)
	#obtain geometry data for each geoid for mapping 
	df_hour = retrieve_geometry_information(df_hour)
	#get top ten tokens for each group 
	df_hour['top_ten'] = df_hour.tokens.apply(top_tokens)
	
	#generate geojsons 
	for time in df_hour.hr_bin.unique():
	    time_df = df_hour[df_hour['hr_bin']== time]
	    outfilename = 'data/' + time + '.json'
	    dataframe_to_geojson(time_df, outfilename)
Example #4
def text_preprocessing(textIter):
    """
    remove stopwords, punctuation, etc., and stem/tokenize text strings
    :param textIter: iterable of text (e.g. list, dataframe, etc.)
    :return: list of tokens, grouped by document
    """
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.extend(string.punctuation)
    # addstopwords = [None, 'nan', '']
    # stopwords.extend(addstopwords)

    reviewsList = textIter

    # clean up text to remove punctuation and empty reviews
    reviewsList[:] = [s.translate(None, string.punctuation).lower().split()
                      if str(s) not in (None, 'nan', '') else '' for s in reviewsList]

    # group the list of strings together and remove stopwords
    reviewsList[:] = list(itertools.chain(*reviewsList))
    print "count of tokens before stopword removal: ", len(reviewsList)
    reviewsList = [word for word in reviewsList if word not in stopwords]
    reviewsList = [word for word in reviewsList if re.search(r'[a-zA-Z]', word) is not None]
    print "count of tokens after stopword removal: ", len(reviewsList)

    # join all reviews together into one string
    tokens = nltk.word_tokenize(" ".join(reviewsList))

    return tokens
Example #5
def stop_words(table):
    #We need to remove the stop words
    stopwords = nltk.corpus.stopwords.words('english')
    #spacy's stop words
    newStopWords = ['im','oh','lol','whence','id', 'here', 'show', 'were', 'why', 'n’t', 'the', 'whereupon', 'not', 'more', 'how', 'eight', 'indeed', 'i', 'only', 'via', 'nine', 're', 'themselves', 'almost', 'to', 'already', 'front', 'least', 'becomes', 'thereby', 'doing', 'her', 'together', 'be', 'often', 'then', 'quite', 'less', 'many', 'they', 'ourselves', 'take', 'its', 'yours', 'each', 'would', 'may', 'namely', 'do', 'whose', 'whether', 'side', 'both', 'what', 'between', 'toward', 'our', 'whereby', "'m", 'formerly', 'myself', 'had', 'really', 'call', 'keep', "'re", 'hereupon', 'can', 'their', 'eleven', '’m', 'even', 'around', 'twenty', 'mostly', 'did', 'at', 'an', 'seems', 'serious', 'against', "n't", 'except', 'has', 'five', 'he', 'last', '‘ve', 'because', 'we', 'himself', 'yet', 'something', 'somehow', '‘m', 'towards', 'his', 'six', 'anywhere', 'us', '‘d', 'thru', 'thus', 'which', 'everything', 'become', 'herein', 'one', 'in', 'although', 'sometime', 'give', 'cannot', 'besides', 'across', 'noone', 'ever', 'that', 'over', 'among', 'during', 'however', 'when', 'sometimes', 'still', 'seemed', 'get', "'ve", 'him', 'with', 'part', 'beyond', 'everyone', 'same', 'this', 'latterly', 'no', 'regarding', 'elsewhere', 'others', 'moreover', 'else', 'back', 'alone', 'somewhere', 'are', 'will', 'beforehand', 'ten', 'very', 'most', 'three', 'former', '’re', 'otherwise', 'several', 'also', 'whatever', 'am', 'becoming', 'beside', '’s', 'nothing', 'some', 'since', 'thence', 'anyway', 'out', 'up', 'well', 'it', 'various', 'four', 'top', '‘s', 'than', 'under', 'might', 'could', 'by', 'too', 'and', 'whom', '‘ll', 'say', 'therefore', "'s", 'other', 'throughout', 'became', 'your', 'put', 'per', "'ll", 'fifteen', 'must', 'before', 'whenever', 'anyone', 'without', 'does', 'was', 'where', 'thereafter', "'d", 'another', 'yourselves', 'n‘t', 'see', 'go', 'wherever', 'just', 'seeming', 'hence', 'full', 'whereafter', 'bottom', 'whole', 'own', 'empty', 'due', 'behind', 'while', 'onto', 'wherein', 'off', 'again', 'a', 'two', 'above', 'therein', 'sixty', 'those', 'whereas', 'using', 'latter', 'used', 'my', 'herself', 'hers', 'or', 'neither', 'forty', 'thereupon', 'now', 'after', 'yourself', 'whither', 'rather', 'once', 'from', 'until', 'anything', 'few', 'into', 'such', 'being', 'make', 'mine', 'please', 'along', 'hundred', 'should', 'below', 'third', 'unless', 'upon', 'perhaps', 'ours', 'but', 'never', 'whoever', 'fifty', 'any', 'all', 'nobody', 'there', 'have', 'anyhow', 'of', 'seem', 'down', 'is', 'every', '’ll', 'much', 'none', 'further', 'me', 'who', 'nevertheless', 'about', 'everywhere', 'name', 'enough', '’d', 'next', 'meanwhile', 'though', 'through', 'on', 'first', 'been', 'hereby', 'if', 'move', 'so', 'either', 'amongst', 'for', 'twelve', 'nor', 'she', 'always', 'these', 'as', '’ve', 'amount', '‘re', 'someone', 'afterwards', 'you', 'nowhere', 'itself', 'done', 'hereafter', 'within', 'made', 'ca', 'them']
    stopwords.extend(newStopWords)
    table = ' '.join([word for word in table.split() if word not in (stopwords)])
    return table
Example #6
def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    stopwords = nltk.corpus.stopwords.words('english')
    myStopWords = []
    stopwords.extend(myStopWords)
    new_words = []
    for word in words:
        if word not in stopwords:
            new_words.append(word)
    return new_words
def stopword_removal(words):
    stopwords = nltk.corpus.stopwords.words('english')
    newStopWords = ['said', 'say', 'says', 'mr']
    stopwords.extend(newStopWords)
    word_filtered = []
    for w in words:
        if w not in stopwords:
            word_filtered.append(w)
    unique = list(dict.fromkeys(word_filtered))
    return " ".join(unique)
Example #8
def get_similarity(df):
    stopwords = nltk.corpus.stopwords.words('english')
    newStopWords = ['description', 'developers', 'describe']
    stopwords.extend(newStopWords)
    count = CountVectorizer(stop_words=stopwords)
    count_matrix = count.fit_transform(df['soup'])
    cosine_sim = cosine_similarity(count_matrix, count_matrix)
    df = df.reset_index()
    indices = pd.Series(df.index, index = df['name'])
    return cosine_sim, indices
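A hypothetical way to use get_similarity's outputs for a simple lookup; the 'name' and 'soup' columns follow the function above, while the sample frame and the ranking code are illustrative only:

import nltk
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

games = pd.DataFrame({
    "name": ["Alpha", "Beta", "Gamma"],
    "soup": [
        "space strategy game the developers describe as a 4x epic",
        "space strategy game with fleet combat",
        "puzzle platformer description",
    ],
})
cosine_sim, indices = get_similarity(games)
idx = indices["Alpha"]
# rank the other titles by similarity to "Alpha"
scores = sorted(enumerate(cosine_sim[idx]), key=lambda p: p[1], reverse=True)
print([games["name"][i] for i, _ in scores if i != idx])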
Example #9
def calc_freq(file):
	stopwords = nltk.corpus.stopwords.words('english')
	stopwords.extend(["metaenddot", "metanumberref", "metaendquestion, metanumberrefs"])

	f = open(file,'r')
	raw = nltk.clean_html(f.read())
	raw = ''.join(ch for ch in raw if ch not in set(string.punctuation))
	tokens = [token.lower() for token in raw.split() if token not in stopwords] #generates a list of tokens
	
	fdist = FreqDist(tokens)
	f.close()
	return fdist
Example #10
def text_process(text):
    strip_punc = [c for c in text if c not in string.punctuation]
    strip_punc = ''.join(strip_punc)
    strip_punc = strip_punc.lower()
    stopwords = nltk.corpus.stopwords.words('english')
    newstopwords = ['4', '2', '7', '3', '5', '1', '8', '0', '9', 'f', 'n', 'g', 'u', 'w', 'b', 'p', 'r', '6', 'k', 'x',
                    'cs', 'kp', 'kn', 'fa', 'ua', 'fo', 'st', 'jt', 'rr', 'pr', 'ey', 'gt', 'ff',
                    'lk', 'yo', 'um', 'jj', 'jh', 'ya', 'cr', 'th', 'lh', 'http']
    stopwords.extend(newstopwords)
    return [word for word in strip_punc.split() if word.lower() not in stopwords]
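Example call for text_process (illustrative only); it assumes string and nltk are imported and the NLTK stopword list is downloaded:

import string
import nltk

print(text_process("Check http 4 u: The staff were SO friendly!!"))
# -> ['check', 'staff', 'friendly'] once punctuation, stopwords, and the extra noise tokens are dropped
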
def stemm_stop(text):
    ps = PorterStemmer()
    #stop_words = stopwords.words("english")
    stopwords = nltk.corpus.stopwords.words('english')
    newStopWords = ['num', 'na', '#']
    stopwords.extend(newStopWords)
    filtered_words = []
    for i in text.split():
        if i not in stopwords:
            filtered_words.append(ps.stem(i))
    return " ".join(filtered_words)
 def remove_stop_words(self):
     all_stopwords = nltk.corpus.stopwords.words('english')
     if self.stop_words_list:
         all_stopwords.extend(self.stop_words_list)
     for item in self.items:
         for field in self.class_properties:
             current_field_value = getattr(item, field)
             setattr(item, field, [
                 word for word in current_field_value
                 if word not in all_stopwords
             ])
Example #13
 def filter_it(self, titles):
     global l
     #filtered = re.sub(r'[^A-Za-z0-9 ]+', ' ',titles)
     newStopWords = [
         'Extractation Failed', 'mercado', 'libre', 'amazonde', 'amazon',
         'amazonca', 'en', 'amazonde', 'amazonfr', 'amazoncom',
         'amazoncouk', 'mercadolibre', 'ebay', 'ebaycom'
     ]
     stopwords = nltk.corpus.stopwords.words('english')
     stopwords.extend(newStopWords)
     word_tokens = word_tokenize(titles.lower())
     l = [w for w in word_tokens if not w in stopwords]
Example #14
def clean_words(job_type_list,stopwords):
    # 'stopwords' here is the list passed in as an argument; normalize it like the job text
    stopwords = ' '.join(stopwords)
    stopwords = stopwords.translate(str.maketrans('','',string.punctuation)).lower()
    stopwords = stopwords.split(' ')
    stopwords.extend(['food','restaurant','get','place','really','menu','also','one','got','two','us','around','san','francisco','sf','','la','order','ordered','eat','good','come','first','go','even','would','hour','well','time','way','spot','like','make','worth','back','never','seven','close','back','etc','using','including','use',"you'll",'·','job','qualifications','plus','experience','work','working','scientist','science','company','skills','eg','equal','scientists','role','industry','data','engeineer','engineering'])
    special_chars = ['--','...','\n','•','®','●','\n']
    a = ' '.join(job_type_list)
    a = a.translate(str.maketrans('','',string.punctuation)).lower() #remove punctuation and make lower case
    for char in special_chars:
        a = a.replace(char, ' ') #replace special char with a space
    resultwords = [word for word in a.split(' ') if word.lower() not in stopwords]
    return resultwords
Example #15
def main():
    nlp = spacy.load('en_core_web_lg')
    path = os.getcwd() + '/data/'
    data_list = os.listdir(path)
    data_list = sorted(data_list)

    if '.DS_Store' in data_list:
        data_list.remove('.DS_Store')
    else:
        pass

    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.extend(['–', '=', '>', '↩'])
    lemma = WordNetLemmatizer()
    holder = []
    for i in range(len(data_list)):
        fullpath = path + data_list[i]

        data = codecs.open(fullpath, 'r', 'utf-8')
        data_text = data.read()
        data_tokens = data_text.strip().split()
        lemma_data_tokens = [
            lemma.lemmatize(word.lower()) for word in data_tokens
        ]

        content = [
            lemma.lemmatize(word.lower()) for word in data_tokens
            if word.lower() not in stopwords
        ]
        fdist_nostop = nltk.FreqDist(content)
        fdist_stop = nltk.FreqDist(lemma_data_tokens)
        holder.append(data_text)

        print('-----------------------{}-----------------------'.format(
            data_list[i]))
        print('1 gram with stopwords: {}'.format(fdist_stop.most_common(10)))
        print('1 gram without stopwords: {}'.format(
            fdist_nostop.most_common(10)))

        for i in range(2, 6):
            bgs = nltk.ngrams(data_tokens, i)
            fdist = nltk.FreqDist(bgs)
            print('{} gram: {}'.format(i, fdist.most_common(10)))

    for k in range(len(holder)):
        nlp_doc = nlp(holder[k])
        for y in range(len(holder)):
            nlp_other_doc = nlp(holder[y])
            print("{} {}: {}".format(
                data_list[k], data_list[y],
                round(nlp_doc.similarity(nlp_other_doc), 4)))
Example #16
def remove_stopwords(text):
    '''
    Function to remove English stopwords
    input:
        text: string
    output:
        string
    '''
    from nltk.corpus import stopwords
    stopwords = stopwords.words('english')  # English stopword list
    #stopwords.extend(stop_pt)
    
    return ' '.join([word for word in str(text).split() if word not in stopwords])
Example #17
    def cleanText(x):
        soup = BeautifulSoup(x, 'lxml')
        no_html_text = soup.get_text()
        tokens = nltk.word_tokenize(no_html_text)
        tokens = [w.lower() for w in tokens]
        words = [word for word in tokens if word.isalpha()]
        table = str.maketrans('', '', string.punctuation)
        stripped = [w.translate(table) for w in tokens]
        stopwords = nltk.corpus.stopwords.words('italian')
        stopwords.extend(string.punctuation)
        stopwords.extend(nltk.corpus.stopwords.words('english'))
        words = [w for w in stripped if w.isalpha() and not w in stopwords]

        return " ".join(words)
Example #18
    def get_value(self, text):
        if self.executable is None:
            raise Exception('An executable is necessary.')

        stopwords = []
        if self.stem_patterns:
            for pattern in self.patterns:
                stopwords.extend(pattern.split(' '))

        # remove duplicate
        stopwords = list(set(stopwords))

        return self.executable(
            CleanedText(text, additional_stopwords=stopwords))
    def get_stopwords(self, language):
        """
            Loads the list of stopwords for processing.

        Params:
            language (string): language used to look up the stopwords

        Returns:
            stopwords (list): list of loaded stopwords
        """

        stopwords = nltk.corpus.stopwords.words(language)
        stopwords.extend(['?', '.', ',', '(', ')', '!'])

        return stopwords
Example #20
def freqdst(ks, stopwords=stopwords, leaveout=None):
    # work on a copy so the shared stopword list is not mutated on every call
    stopwords = list(stopwords) + list(leaveout or [])
    tokenizer = nltk.word_tokenize
    txtout = dict.fromkeys(ks.keys())
    for k in ks.keys():
        txt = ' '.join([i['TI'] for i in ks[k]])
        txt = txt.lower()
        try:
            tok = tokenizer(UnicodeDammit(txt).unicode_markup)
        except UnicodeEncodeError:
            continue  # skip titles that cannot be decoded
        tok = [t for t in tok if t not in stopwords]
        tok = [t for t in tok if len(t) > 2]
        txtout[k] = tok
    return txtout
Example #21
def get_all_stopwords(character_names=True):
    stopwords = []
    for file in os.listdir("resources"):
        with open(os.path.join(my_path, "resources", file)) as infile:
            if file == "char_stopwords.txt":
                if character_names:
                    words = [line.strip() for line in infile.readlines()]
                    stopwords.extend(words)
            else:
                words = [line.strip() for line in infile.readlines()]
                stopwords.extend(words)
    return list(set(stopwords))
def remove_stopwords(input_str, add_stopwords=[], exclude_stopwords=[]):
    '''Return a string with stopwords removed; optionally add words to the
    stopword list, or exclude words from it.'''
    stopwords = nltk.corpus.stopwords.words('english')
    if len(add_stopwords) > 0:
        stopwords.extend(add_stopwords)
    if len(exclude_stopwords) > 0:
        final_stop_words = [
            word for word in stopwords if word not in exclude_stopwords
        ]
    else:
        final_stop_words = stopwords
    stopwords = final_stop_words
    input_list = input_str.split()
    without_stopwords = [word for word in input_list if word not in stopwords]
    return_str = ' '.join(without_stopwords)
    return return_str
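An illustrative call showing the add/exclude options of remove_stopwords; 'data' is an invented extra stopword and 'not' is pulled back out of the default NLTK list:

import nltk

text = "the results are not reproducible without the raw data"
print(remove_stopwords(text, add_stopwords=["data"], exclude_stopwords=["not"]))
# -> "results not reproducible without raw"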
Example #23
def create_maps():
    '''generates geojson files for creating maps of twitter activity,
    broken down into weekend vs. weekday.'''

    df = mm.pipeline.retrieve_and_merge_tweet_data()
    wkd_df = mm.pipeline.transform_timestamp(df, DOW = True)
    wkd_df = get_tweets_per_day(wkd_df)

    #remove geoids that are in the ocean
    odd_ids = ['060750601001016', '060750179021003','060759901000003',\
               '060759901000002', '060750179021000','060750601001000',\
               '060759804011003', '060750201001001']  
    df = df[~df['geoid10'].isin(odd_ids)]

    #get the average number of tweets per day for every day of the week
    wkd_df = wkd_df.groupby(['geoid10', 'DOW']).agg(np.mean).reset_index()
    #get a grouped sum of the words
    wkd_df_txt = wkd_df.groupby(['geoid10', 'DOW'])['tokens'].apply(lambda x: ','.join(x)).reset_index()
    #merge these two dataframes together
    wkd_df['tokens'] = wkd_df_txt['tokens']

    #create a dataframe of only weekend values
    df_weekend = seperate_weekends(wkd_df, True)
    #create a dataframe of only weekday values 
    df_weekday = seperate_weekends(wkd_df, False)

    #customize stopwords for editing tokens
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.extend(['...',',,',',,,','..', 't','y','(@',')', 'c','i','I','a', ',',\
                    '@','.', 'co', 'com','amp', '?', 'via','http','htt','https', '()',']'])
    stopwords.extend([str(char) for char in punctuation])
    sstopwords=[unicode(word) for word in stopwords]

    #get the most frequent words for visualization
    df_weekday['top_ten'] = df_weekday.tokens.apply(top_tokens)
    df_weekend['top_ten'] = df_weekend.tokens.apply(top_tokens)

    #get geometry information for each san francisco block
    df_end = retrieve_geometry_information(df_weekend)
    df_day = retrieve_geometry_information(df_weekday)

    #generate geojsons 
    dataframe_to_geojson(df_end, 'data/weekend.json')
    dataframe_to_geojson(df_day, 'data/weekday.json')
Example #24
def stem_tokenize(str_use):
    """
    Takes a string and tokenizes it, stripping it of punctuation and stopwords. Returns a list of strings.
    """
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.extend(string.punctuation)
    stopwords.append('')
    addstopwords = ["in", "on", "of", "''"]
    stopwords.extend(addstopwords)
    stemmer = wordnet.WordNetLemmatizer()
    tokenizer = punkt.PunktWordTokenizer()

    # removes stopwords and punctuation, then splits the string into a list of words
    token = [token.lower().strip(string.punctuation) for token in tokenizer.tokenize(str_use)
             if token.lower().strip(string.punctuation) not in stopwords]
    text = [word for word in token if re.search(r'[a-zA-Z]', word) is not None]
    stem = [stemmer.lemmatize(word) for word in text]
    # Returns a list of strings
    return stem
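stem_tokenize relies on nltk.tokenize.punkt.PunktWordTokenizer, which only exists in older NLTK releases; under that assumption (plus the 'stopwords' and 'wordnet' data) a call might look like:

import re
import string
import nltk
from nltk.stem import wordnet
from nltk.tokenize import punkt

print(stem_tokenize("The dogs were barking loudly in the parks!"))
# -> lemmatized tokens such as ['dog', 'barking', 'loudly', 'park']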
Example #25
def generate_word_list(text_col, nr_words = n):
    tokens = word_tokenize(text_col.to_string()) # tokenize
    lower_tokens = [t.lower() for t in tokens] # Convert the tokens into lowercase: lower_tokens
    alpha_only = [t for t in lower_tokens if t.isalpha()] # Retain alphabetic words: alpha_only
    stopwords = nltk.corpus.stopwords.words('english') # Remove all stop words: no_stops
    newStopWords = ["rt", "bitcoin", "crypto", "cryptocurrency", "blockchain", "blockcha", "btc", "bitcoi", "bitcoins", "daily", "say", "could",
                   "price", "ethereum", "eth", "classic", "exchange", "market", "cryptocurrencie", "one", "first", "short", "check",
                   "cryptocurrencies", "http", "htttp", "hour", "list", "u", "new", "vi", "ccn", "etc", "usd"]
    stopwords.extend(newStopWords)
    no_stops = [t for t in alpha_only if t not in stopwords]
    wordnet_lemmatizer = WordNetLemmatizer() # create instance of the WordNetLemmatizer class
    lemmatized = [wordnet_lemmatizer.lemmatize(t) for t in no_stops if len(t)>1] # Lemmatize all tokens into a new list
    lemmatized = [t for t in lemmatized if t not in stopwords] # remove stopwords again after lemmatization
    bow = Counter(lemmatized) # Create the bag-of-words: bow
    word = []
    word_count = []
    for i in range(nr_words):
        word.append(bow.most_common(nr_words)[i][0])
        word_count.append(bow.most_common(nr_words)[i][1])
    words_and_counts_df = pd.DataFrame({"word":word, "word_count":word_count})
    return(words_and_counts_df) # return the n most common tokens
    def is_ci_stem_stopword_set_match(self, a, b, threshold=0.5):
        """Check if a and b are matches."""
        # Get default English stopwords and extend with punctuation
        stopwords = nltk.corpus.stopwords.words('english')
        stopwords.extend(string.punctuation)
        stopwords.append('')

        # Create tokenizer and stemmer
        tokenizer = nltk.tokenize.punkt.PunktWordTokenizer()
        stemmer = nltk.stem.snowball.SnowballStemmer('english')
        tokens_a = [token.lower().strip(string.punctuation) for token in tokenizer.tokenize(a) \
                    if token.lower().strip(string.punctuation) not in stopwords]
        tokens_b = [token.lower().strip(string.punctuation) for token in tokenizer.tokenize(b) \
                    if token.lower().strip(string.punctuation) not in stopwords]
        stems_a = [stemmer.stem(token) for token in tokens_a]
        stems_b = [stemmer.stem(token) for token in tokens_b]

        # Calculate Jaccard similarity
        ratio = len(set(stems_a).intersection(stems_b)) / float(
            len(set(stems_a).union(stems_b)))
        return (ratio >= threshold)
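A standalone sketch of the same Jaccard-over-stems check, with nltk.word_tokenize swapped in for the long-removed PunktWordTokenizer so it runs on current NLTK (the 'punkt' and 'stopwords' data are assumed to be downloaded):

import string
import nltk

def stem_set_match(a, b, threshold=0.5):
    # stopwords, punctuation, and the empty string are all ignored
    stop = set(nltk.corpus.stopwords.words('english')) | set(string.punctuation) | {''}
    stemmer = nltk.stem.snowball.SnowballStemmer('english')

    def stems(text):
        return {stemmer.stem(t.lower().strip(string.punctuation))
                for t in nltk.word_tokenize(text)
                if t.lower().strip(string.punctuation) not in stop}

    sa, sb = stems(a), stems(b)
    return len(sa & sb) / float(len(sa | sb)) >= threshold

print(stem_set_match("The cats are sleeping", "A cat sleeps"))  # True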
Example #27
def get_reviews(fname):
    """
    get review text from the data set
    :param fname: file name of data set; expecting csv
    :return: pandas dataframe of text reviews (strings)
    """
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.extend(string.punctuation)

    try:
        with open(fname, "rb") as infile:
            df = pd.DataFrame.from_csv(infile, header=0, index_col=False)
            # drop any review entries that are blank
            print "length of df: ", len(df)
            df = list(df['r_text'].dropna())
            print "... after removing NAs: ", len(df)
            # clean up text to remove punctuation and empty reviews
            reviewsList = [s.replace('\n', '').lower() for s in df]

        return reviewsList
    except:
        raise IOError
Example #28
def parse_text(txt):
    txt = txt.lower()  #converting to lowercase
    #removing punctuation and digits
    p = string.punctuation
    d = string.digits
    tables = str.maketrans(p, len(p) * " ")
    text1 = txt.translate(tables)
    tables = str.maketrans(d, len(d) * " ")
    text1 = text1.translate(tables)

    words = word_tokenize(text1)  #tokenization
    #lemmatization
    wordnet_lemmatizer = WordNetLemmatizer()
    words1 = [wordnet_lemmatizer.lemmatize(token) for token in words]

    #removing stopwords
    stopwords = nltk.corpus.stopwords.words("english")
    extra_stopwords = [
        'rt', 'RT', 'TakeTheKnee', 'taketheknee', 'TakeAKnee', 'takeaknee'
    ]  #adding stopwords
    stopwords.extend(extra_stopwords)
    words = [w for w in words1 if w not in stopwords]
    return " ".join(words)
import string
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
import seaborn

## 2. Tokenizing
## 3. Removal of stopwords
## 4. Stemming

stopwords = nltk.corpus.stopwords.words("english")

#extending the stopwords to include other words used in twitter such as retweet(rt) etc.
other_exclusions = ["#ff", "ff", "rt"]
stopwords.extend(other_exclusions)
stemmer = PorterStemmer()


def preprocess(comments_dataset):

    # removal of extra spaces
    regex_pat = re.compile(r'\s+')
    comments_dataset_space = comments_dataset.str.replace(regex_pat, ' ')

    # removal of @name[mention]
    regex_pat = re.compile(r'@[\w\-]+')
    comments_dataset_name = comments_dataset_space.str.replace(regex_pat, '')

    # removal of links[https://abc.com]
    giant_url_regex = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
# Get list of each track lyric text files
# ---------------------------------------

# ------------------NELSON---------------------------------------
import os
from itertools import chain
from nelsonfunctions import inforemove, infocheck, nelsonlinewrite

# ------------------MODIFY HERE-----------------------------------
path = os.getcwd() + "\\Counts Program\\SourcePath.txt"
infobuzzwords = ["typed", "artist", "album", "title", "song"]
stopwords = nltk.corpus.stopwords.words("english")
outfile = "top_20_words_per_lyric_file.csv"
# ------------------------------------------------------

stopwords.extend([x.strip() for x in open("newstopwords.txt")])  # combine with new words


tracks = []
FILE = open(path, "r")
for line in FILE:
    line = line.replace("\n", "")
    tracks.append(line)
FILE.close()

# For every track get count of words that match keywords
# ------------------------------------------------------

word = []

for track in tracks:
    word = []  # Clear word buffer
Example #31
    if len(df) < 1:
        st.error("Não foi possível recuperar dados do twitter do parlamentar")

    else:

        #build one big string with all the words from the last 200 tweets
        big_string = ''
        for i in range(len(df)):
            big_string = big_string + df[i]['text']

        #Define the stopwords
        stopwords = stopwords.words('portuguese') + list(punctuation)
        stopwords.extend([
            'https', 'http', 'sobre', 'vamos', 'co', 'rt', 'todos', 'todo',
            'rs', 'vc', 'ser', 'pra', 'tudo', 'vai', 'vcs', 'www', 'br',
            'coisa', 'hoje', 'dia', 'saiba', 'html', 'htm', 'via'
        ])

        # Insert a title and hyperlink above the word cloud
        end = df[0]['link'].split('/status')[0]
        link_perfil = 'Nuvem de Palavras (<a href= %(end)s target="_blank">Twitter</a>)' % {
            'end': end
        }

        st.markdown("### " + link_perfil, unsafe_allow_html=True)

        # Create the WordCloud
        wordcloud = WordCloud(stopwords=stopwords,
                              background_color='white').generate(
                                  big_string.lower())
Example #32
#Morgan Smith
#This is a program to classify Github Repositories and label them with possible points of contributions.
#NLP will be used on README files as well as the most recent issues published.LDA is used to
#find the needed contributions.
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary

#filter with stopwords
stopwords = (nltk.corpus.stopwords.words('english'))
newStops = ['.', '#', ',', 'txt']
stopwords.extend(newStops)

#opening file to be tokenized
f = open('C:/Users/jmorg/390/390SNA/README.txt')
raw = f.read()

#creating tokenized word list
tokens = word_tokenize(raw)
lemma = WordNetLemmatizer()
wordList = []
lower = (w.lower() for w in tokens)

for i in lower:
    if i not in stopwords:
        wordList.append(lemma.lemmatize(i))
Example #33
    con_words.append(tokens)

con_words = [sl for li in con_words for sl in li]

adv_words = []

for sentence in adv_sentences:
    tokens = word_tokenize(sentence)
    adv_words.append(tokens)

adv_words = [sl for li in adv_words for sl in li]

#remove stopwords, after adding a few more words to the stopword list
stopwords = stopwords.words('english')
addstopwords = ['please','make','still']
stopwords.extend(addstopwords)

#filter word tokens by removing stopwords

filtered_pos_words = []
for word in pos_words:
    if word not in stopwords:
        filtered_pos_words.append(word)

filtered_con_words = []
for word in con_words:
    if word not in stopwords:
        filtered_con_words.append(word)

filtered_adv_words = []
for word in adv_words:
Example #34
from collections import defaultdict
from tqdm import tqdm

import glob
import pandas as pd
from itertools import product

import string
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

from nltk.corpus import stopwords
import numpy as np
stopwords = stopwords.words('english')
stopwords.extend(['sp', 'ssp', 'var'])

MIN_CONF = 0.5
ALPHA = 20

from time import time

import argparse


def clean_string(text):
    text = ''.join([word for word in text if word not in string.punctuation])
    text = text.lower()
    text = ' '.join([word for word in text.split() if word not in stopwords])
    return text
Example #35
                      'Watch', 'Podcast', 'Foreign', 'Edition', 'Podcast', 'Opinion', 'Notable', 'Quotable', 'opinion', \
                      'Best', 'Web', 'Newsletter', 'opinion', 'Morning', 'Editorial', 'Report', 'Newsletter', 'section-link',\
                      '3qFFDClt', 'Life', 'Arts', 'Arts', 'Books', 'Cars', 'Food', 'Drink', 'Health', 'Ideas', 'Science', 'Sports',\
                      'Style', 'Fashion', 'Travel', 'Magazine', 'Puzzles', 'Future', 'Everything', 'Far', 'Away', 'Life', 'Arts',\
                      'section-link', '3qFFDClt', 'House', 'Day', 'section-link', '3qFFDClt', 'Magazine', 'Fashion', 'Art', 'Design',\
                      'Travel', 'Food', 'Culture', 'returnLink', '235Zspdg', 'mailto', 'support', '@', 'support', '@', 'strap',\
                      'Articles', 'img', 'U.S.', 'Ban', 'Travel', 'From', 'image', '2srBg4oD', '1x', '2x', '3x','/h3', '1zGPJwbt',\
                      'div', 'image-container', '3SkfuWVV', '/', '/div', '1zGPJwbt', 'h3', 'episode-name', '3Xrkqwfv', '/h3', \
                      '1zGPJwbt', 'div', 'image-container', '3SkfuWVV', '/', '/div', '1zGPJwbt', 'h3', 'episode-name', '3Xrkqwfv',\
                      'Cookie', 'Policy', 'Copyright','3qZEiy_G', 'skipToMainButton', '-1', 'Skip', 'Main','instagram', '1nV6js1B', \
                     'Instagram', 'Instagram', 'youtube','$','brand-link', '21t2Ybqa', 'masthead-strap-link', '3Kba64tv', 'Print',\
                     'masthead-strap-link', '3Kba64tv','Privacy', 'Data', 'Subscriber', 'Agreement', 'Terms', 'Use', \
                     'cookies-advertising', 'Choices',
                     ]

    stopwords.extend(newstpwrds)

    with open(f'ZMSMDummy_{today}.txt', 'a') as fo:
        fo.write(text)

    with open(f'ZMSMDummy_{today}.txt', 'r', errors='ignore') as fo1:
        csvWriter = csv.writer(fo1)
        msm = fo1.readlines()

    for i in msm:
        clean = []
        tokenized_var = word_tokenize(i)
        for word in tokenized_var:
            if not word in stopwords and "https" not in word \
                    and '.com' not in word \
                    and 'index' not in word \
import numpy as np
from PIL import Image
import json
import multiprocessing as mp

source_directory = "books"
data_directory = "bookdata"

source_files = [f.split(".")[0] for f in os.listdir(source_directory)]
done_files = [f.split(".")[0] for f in os.listdir(data_directory)]
files = []
gfile = open("custom_stopwords.txt","r")

to_remove = [l.strip() for l in gfile.readlines()]
stopwords = stopwords.words('english')
stopwords.extend(to_remove)

# Creating a list of already done files in case the process gets interrupted
for file in source_files:
    if file not in done_files:
        files.append(file)
file_count = len(files)

def process(i):
    data = {}
    file = open(f"{source_directory}/{files[i]}.txt","r")
    print(f"Starting {i}: {files[i]}")
    text = file.read()
    tokens = word_tokenize(text)
    ## Not doing text.lower() to maintain the case of the words
    for word in tokens:
Example #37
File: lda.py  Project: timdestan/sandbox
import math
import time
import sys

# Assuming English everything for now, would require a bit
# of refactoring to try this sampler on a corpus in another
# language.
#
stemmer = EnglishStemmer()

# Default stopwords from NLTK
stopwords = stopwords.words('english')
# Add some of our own.
stopwords.extend(['european', 'commission','like','must','also','would',\
  'mr','mrs','go','.',',','?','new','put','way','use', 'policy', 'europe',\
  'need', 'member','preside','state', 'parliament', 'union', 'make', \
  'propose', 'country', 'council', 'report', 'take', 'develop', 'right', \
  'question', 'therefore'])
stopwords = set([stemmer.stem(x) for x in stopwords])

def should_exclude(word, sw=stopwords):
  """
  Should we exclude word?

  :param word: a word
  :param sw: a set of stopwords to exclude

  :return: True iff we want to exclude word.
  """
  return (word.lower() in sw) or (len(word) <= 3)
Example #38
#from cochranenlp.textprocessing.drugbank import Drugbank

import nltk
from nltk.corpus import stopwords 

from noaho import NoAho

#import fuzzywuzzy

#drugbank = Drugbank()
stopwords = stopwords.words('english')
# hand-crafted and intended for targeting interventions!
# some of these are just words that are likely to be shared
# between cdsr text and abstract, even though not describing
# interventions specifically
stopwords.extend(["either", "time", "patients", "or"])

def distantly_annotate(n=None):
    '''
    e.g., 

    > tagged_pmids, tagged_abstracts, tokens_and_lbls, intervention_texts = distant_intervention_tag.distantly_annotate(500)
    '''
    bviewer = BiViewer()
    return ds_interventions_abstracts(bviewer, num_studies=n) 

def _tag_drugs(study):
    intervention_text = _iv_for_study(study)
    abstract = _abstract_for_study(study)

    interventions_tokens = nltk.word_tokenize(intervention_text)
Example #39
import datetime
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
# from nltk.stem import SnowballStemmer
from pattern.text.en import lemma
from pymongo import MongoClient
import collections
import re

# stemmer = SnowballStemmer("english")
splits = re.compile('[\s\[\]\?().,;:\'"/]+')
lemmatizer = WordNetLemmatizer()

stopwords = list(stopwords.words('english'))
other_words = ['used', 'propose', 'provide', 'show', 'set', 'also']
stopwords.extend(other_words)
stopwords = [unicode(line.strip('\n')) for line in open('./stop_words.txt')]
# str 13041
# 13042 after unicode conversion
# C,R,go

is_num = re.compile('^[\d|-|=]+$')
"""
"""


class Count(object):
    def __init__(self, host='127.0.0.1:27017', db_name='esi', doc_name='test',
                 key=None, result=None,
                 show_result=True,
                 reset_result=True,
Example #40
#!/usr/bin/env python

from nltk.corpus import stopwords
stopwords = stopwords.words('english')
stopwords.extend(['table', 'tables','host', 'hosts', 'delicious', 'anything', 'everything', 'something', 'host', 'lunch', 'aka', 'menu', 'menus', 'fare', 'buffet', 'lunches', 'dinner', 'dinners', "a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also","although","always","am","among", "amongst", "amoungst", "amount",  "an", "and", "another", "any","anyhow","anyone","anything","anyway", "anywhere", "are", "around", "as",  "at", "back","be","became", "because","become","becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom","but", "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven","else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own","part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves", "the", 'ingredient', 'ingredients', 'cuisine', 'cuisines', 'restaurant', 'restaurants', 'board', 'boards', 'waitstaff', 'waiter', 'waitress', 'waiters', 'waitresses', 'waitperson', 'scratch', 'scratches', 'potluck', 'potlucks', 'msg', 'feed', 'feeds', 'feeding', 'drinkable', 'drinkables', 'dishses', 'dietary', 'dieting', 'diets', 'colouring', 'coloured', 'colour', 'conserve', 'center', 'centre', 'centers', 'centres', 'bite', 'spoon', 'spoons', 'spoonful', 'yumm', 'yummy', 'edible', 'drunk', 
'drank', 'drunken', 'vegetarianism', 'vegetarian', 'vegetarians', 'takeout', 'mess', 'messy', 'messing', 'messes', 'leftovers', 'leftover', 'end', 'ends', 'ending', 'joint', 'joints', 'carb', 'carbs', 'carbohydrate', 'carbohydrates'])
Example #41
def csw(stopwords,filename):
	#Given the existing stopword list and a file of new stopwords, return the combined list
	listofwords = [x.strip() for x in open(filename)]
	stopwords.extend(listofwords) 
	return(stopwords)
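A minimal demonstration of csw using a throwaway file of extra stopwords; the filename and its contents are purely illustrative:

import nltk

with open("extra_stopwords.txt", "w") as f:
    f.write("foo\nbar\n")

base = nltk.corpus.stopwords.words('english')
combined = csw(base, "extra_stopwords.txt")
print(combined[-2:])  # -> ['foo', 'bar']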
Example #42
# Find where the nltk.file is
import nltk

print(nltk.__file__)

## sample text
sample = gutenberg.raw("bible-kjv.txt")
## fp = open('C:/Users/MyStyle/Desktop/WordAnalyze/Text/Trump.txt', 'r', encoding='utf-8')
## sample = fp.readline()
tok = word_tokenize(sample)

stopwords = nltk.corpus.stopwords.words('english')
newStopWords = [
    ',', '.', ':', ';', '?', 'And', 'I', '!', '``', '\'s', '-', '—'
]
stopwords.extend(newStopWords)
filtered_sentence = [w for w in tok if not w in stopwords]
filtered_sentence = []
for w in tok:
    if w not in stopwords:
        filtered_sentence.append(w)
mytext = nltk.Text(filtered_sentence)
filter_dist = nltk.FreqDist(filtered_sentence)
print(filter_dist.most_common(50))


## Lexical diversity (number of distinct words / total number of words)
def lexical_diversity(text):
    return len(set(text)) / len(text)
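A quick check of lexical_diversity on a toy token list:

print(lexical_diversity(["to", "be", "or", "not", "to", "be"]))  # -> 0.666...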

Example #43
__author__ = 'prashantravi'
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import TreebankWordTokenizer
from nltk.tag.stanford import StanfordPOSTagger
from nltk.corpus import stopwords
from readData import DataModel
from itertools import chain, imap
from sentiWord import get_scores
stopwords = stopwords.words("english")
stopwords.extend(['#', ',', '+', '.'])
punctuation = ".,:;!?\""

def transformTweetData(tweet):
    content = unicode(tweet.sentence.lower(), errors='ignore')
    words = content.strip().split()
    tokenizer = TreebankWordTokenizer()
    extra_features = []
    content = " ".join(words + extra_features)
    tokens = tokenizer.tokenize(content)
    tokens = [t for t in tokens if t not in stopwords]
    return tokens

def remove_punctuation(input_string):
    for item in punctuation:
        input_string = input_string.replace(item, '')
    #print input_string
    return input_string

def main():
    sentence = raw_input("What's your sentence? ");
    dataModel = DataModel(None, None, None,None, None, None, sentence.lower());