from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt


def word_clouds(list_, title):

    #Transforming the list into a string
    str_ = ' '.join(list_)

    stopwords = set(STOPWORDS)
    # Remove some additional frequent words from these word clouds
    stop_words = ["want", "company", "lot", "many", "work"]
    new_stopwords = stopwords.union(stop_words)

    # Define the word cloud parameters and generate the word cloud
    wc = WordCloud(background_color="white",
                   max_words=200,
                   max_font_size=40,
                   scale=3,
                   random_state=42,
                   stopwords=new_stopwords).generate(str_)
    plt.figure(figsize=(30, 30))

    #store to file
    wc.to_file('company.png')

    #Show the cloud
    plt.imshow(wc)
    plt.axis('off')
    plt.title(title)
    plt.show()
    return
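# Usage sketch (hypothetical review strings; word_clouds() saves the image to
# company.png and displays it):
sample_reviews = ["great place to work", "flexible hours and a supportive team"]
word_clouds(sample_reviews, "Employee review themes")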
Example #2
def plot_wordcloud(text, mask=None, max_words=200, max_font_size=100, figure_size=(10,10), 
                   title = None, title_size=40, image_color=False):
    stopwords = set(STOPWORDS)
    more_stopwords = {'one', 'br', 'Po', 'th', 'sayi', 'fo', 'Unknown'}
    stopwords = stopwords.union(more_stopwords)

    wordcloud = WordCloud(background_color='white',
                    stopwords = stopwords,
                    max_words = max_words,
                    max_font_size = max_font_size, 
                    random_state = 42,
                    width=800, 
                    height=400,
                    mask = mask)
    wordcloud.generate(str(text))
    
    plt.figure(figsize=figure_size)
    if image_color:
        image_colors = ImageColorGenerator(mask)
        plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation="bilinear")
        plt.title(title, fontdict={'size': title_size,
                                   'verticalalignment': 'bottom'})
    else:
        plt.imshow(wordcloud)
        plt.title(title, fontdict={'size': title_size, 'color': 'black',
                                   'verticalalignment': 'bottom'})
    plt.axis('off')
    plt.tight_layout()
Example #3
def make_tokens(df):
    """Removes stopwords, stems and lemmatizes
    Returns clean tokens"""

    stopwords = set(nltk.corpus.stopwords.words("english"))

    # turns the text in the dataframe into a long list of words
    TotalText = list(df.text.values)

    # extra stopwords, with plurals (otherwise the lemmatizing step puts some of the stopwords back)
    stopwords = stopwords.union(NEW_STOP_WORDS)
    TotalText = " ".join(TotalText)

    # tokenization
    tokens = [
        w for w in word_tokenize(TotalText.lower()) if w.isalpha()
    ]  # isalpha() checks if each word is alphabetical, lower() transforms everything to lowercase
    no_stop = [
        t.strip() for t in tokens if t.strip() not in stopwords
    ]  # stopwords already comes with a built-in list of words to remove
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmatized = [wordnet_lemmatizer.lemmatize(t) for t in no_stop]

    return lemmatized
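# Usage sketch (hypothetical data; assumes the nltk imports used above and a
# module-level NEW_STOP_WORDS collection, e.g. NEW_STOP_WORDS = {"school", "schools"}):
import pandas as pd
sample_df = pd.DataFrame({"text": ["The schools were praised for their teaching."]})
print(make_tokens(sample_df))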
Example #4
def addStopWords(extrastopfile="../../data/supplementalremovedwords.txt"):
    global stopwords
    readRegionalisms()
    regionalisms = getRegionalisms()

    extrastopfile = open(extrastopfile, "r+")
    extrastopfile_text = extrastopfile.read()
    extrastopfile.close()
    # add the extra stop words from the file
    stopwords = stopwords.union(set(extrastopfile_text.split()))

    # avoid filtering out part of a regionalism when it consists of several words
    split_regionalisms = set()
    for word in regionalisms:
        split_regionalisms.update(word.split())
    regionalisms = regionalisms.union(split_regionalisms)

    # keep regionalisms out of the stop word list
    stopwords.difference_update(regionalisms)
Example #5
def sentiment_swn(doc):
    
    operators = set(['not','down'])
    stopwords = set(ENGLISH_STOP_WORDS) - operators
    stopwords = stopwords.union(['gonna', 'does','the','of','and','to','in','a','is','that','for','it'])

    # TF-IDF with both unigrams and bigrams, with a maximum of 3000 words (features).
    # It uses IDF and stop words, and discards purely numeric tokens (token_pattern).
    # sublinear_tf=True further penalizes long documents, which may not be needed in our case.
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), lowercase=True, min_df=1, max_df=2,
                         max_features=MAX_FEATURES, norm=u'l2', use_idf=True, smooth_idf=True, sublinear_tf=True,
                         stop_words=stopwords, token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b')

    weight = 0

    try:

        # apply TF-IDF
        tfidf_matrix = tf.fit_transform(doc)

        # get the pre-processed terms
        feature_names = tf.get_feature_names()

        if lemmatize_first:

            # lemmatize first, before pos-tagging
            lema_stem = [lematizer.lemmatize(w) for w in feature_names]

            # apply pos-tags after lemmatizing
            tokens_pos = pos_tag(lema_stem)

            # now transpose
            tokens_pos = [transpose(term) for term in tokens_pos]

        else:

            # apply pos-tags
            tokens_pos = pos_tag(feature_names)

            # lemmatize using the pos-tags, then pos-tag again
            tokens_pos = pos_tag(lemmatize(tokens_pos))

            # now transpose
            tokens_pos = [transpose(term) for term in tokens_pos]

        logging.debug("POS-Tag {}".format(tokens_pos))

        # finally, get the sentiment weight
        weight = sentiment_weight(tokens_pos)

    except Exception as e:
        logging.debug("Error in sentiment: {}".format(e))
    return weight
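# Usage sketch (hypothetical document; sentiment_swn also expects the module-level
# helpers it references -- MAX_FEATURES, lemmatize_first, lematizer, transpose,
# lemmatize and sentiment_weight -- to be defined elsewhere):
score = sentiment_swn(["The service was not good and keeps going down"])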
Example #6
def plot_wordcloud(text,
                   mask=None,
                   max_words=200,
                   max_font_size=100,
                   figure_size=(24.0, 16.0),
                   color='white',
                   title=None,
                   title_size=40,
                   image_color=False):
    stopwords = set(STOPWORDS)
    more_stopwords = {
        'u', "im", "thi", "ji", "us", "ha", "um", "hi", "be", "will", "by",
        "is", "of", "to"
    }
    stopwords = stopwords.union(more_stopwords)

    wordcloud = WordCloud(background_color=color,
                          stopwords=stopwords,
                          max_words=max_words,
                          max_font_size=max_font_size,
                          random_state=42,
                          width=200,
                          height=200,
                          mask=mask,
                          colormap=matplotlib.cm.inferno)
    wordcloud.generate(text)

    plt.figure(figsize=figure_size)
    if image_color:
        image_colors = ImageColorGenerator(mask)
        plt.imshow(wordcloud.recolor(color_func=image_colors),
                   interpolation="bilinear")
        plt.title(title,
                  fontdict={
                      'size': title_size,
                      'verticalalignment': 'bottom'
                  })
    else:
        plt.imshow(wordcloud)
        plt.title(title,
                  fontdict={
                      'size': title_size,
                      'color': 'black',
                      'verticalalignment': 'bottom'
                  })
    plt.axis('off')
    plt.tight_layout()
    return wordcloud
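# Usage sketch for the plot_wordcloud defined above (hypothetical text; assumes the
# wordcloud, matplotlib and STOPWORDS imports it relies on are in scope):
plot_wordcloud("spam spam ham eggs spam spam", title="Toy example")
plt.show()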
def plot_wordcloud(text,
                   mask=None,
                   max_words=400,
                   max_font_size=120,
                   figure_size=(24.0, 16.0),
                   title=None,
                   title_size=40,
                   image_color=False):
    stopwords = set(STOPWORDS)
    more_stopwords = {'com', 'http'}
    stopwords = stopwords.union(more_stopwords)

    wordcloud = WordCloud(background_color='white',
                          stopwords=stopwords,
                          max_words=max_words,
                          max_font_size=max_font_size,
                          random_state=42,
                          mask=mask)
    wordcloud.generate(text)

    plt.figure(figsize=figure_size)
    if image_color:
        image_colors = ImageColorGenerator(mask)
        plt.imshow(wordcloud.recolor(color_func=image_colors),
                   interpolation="bilinear")
        plt.title(title,
                  fontdict={
                      'size': title_size,
                      'verticalalignment': 'bottom'
                  })
    else:
        plt.imshow(wordcloud)
        plt.title(title,
                  fontdict={
                      'size': title_size,
                      'color': 'green',
                      'verticalalignment': 'bottom'
                  })
    plt.axis('off')
    plt.tight_layout()
def make_tokens(df):
    """Removes stopwords, stems and lemmatizes
    Returns clean tokens"""

    stopwords = set(nltk.corpus.stopwords.words('english'))

    #turns the text in the dataframe into a long list of words
    TotalText = []
    for index, row in df.iterrows():
        text = row['text']
        TotalText.append(text)

    #extra stopwords, with plurals (otherwise the lemmatizing step puts some of the stopwords back)
    newStopWords = [
        'school', 'learning', 'student', 'pupil', 'teacher', 'management',
        'teaching', 'support', 'lesson', 'board'
    ]
    newStopWords_plur = [
        'schools', 'learnings', 'students', 'pupils', 'teachers',
        'managements', 'teachings', 'supports', 'lessons', 'boards'
    ]
    newStopWords += newStopWords_plur
    stopwords = stopwords.union(newStopWords)
    TotalText = " ".join(TotalText)

    #tokenization
    tokens = [
        w for w in word_tokenize(TotalText.lower()) if w.isalpha()
    ]  # isalpha() checks if each word is alphabetical, lower() transforms everything to lowercase
    no_stop = [
        t.strip() for t in tokens if t.strip() not in stopwords
    ]  # stopwords already comes with a built-in list of words to remove
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmatized = [wordnet_lemmatizer.lemmatize(t) for t in no_stop]

    return lemmatized
Example #9
import nltk
import math
from nltk.stem import WordNetLemmatizer
import string
from nltk.corpus import stopwords
from collections import Counter
import time

dirpath = './reviews_summary_ALL_txt'  # folder containing the review text files
# f_list = os.listdir(dirpath)
f_list = ['tt1201607', 'tt0111161']
wordnet_lemmatizer = WordNetLemmatizer()
stopwords = set(stopwords.words('english'))
# additional custom stopwords
stopwords = stopwords.union({
    'movie', 'film', 'time', 'ha', 'wa', 'dont', 'much', 'thing', 'many',
    'watch', 'thats'
})


# preprocessing
def my_tokenizer(s):
    s = s.lower()  # downcase
    # build a {punctuation char: None} translation table
    remove_punctuation_map = dict(
        (ord(char), None)
        for char in string.punctuation)  # string.punctuation = punctuation characters
    no_punctuation = s.translate(remove_punctuation_map)  # strip punctuation via the table
    tokens = nltk.tokenize.word_tokenize(no_punctuation)  # NLTK tokenization
    tokens = [t for t in tokens if len(t) > 2]  # keep only tokens longer than two characters
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens]  # lemmatize
    tokens = [t for t in tokens if t not in stopwords]  # remove stopwords
    tokens = [t for t in tokens if not any(c.isdigit() for c in t)]  # drop tokens containing digits
    return tokens
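# Usage sketch (hypothetical sentence; requires the nltk data used above):
print(my_tokenizer("This movie was surprisingly good, would watch it again in 2024!"))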
Example #10
import nltk
from nltk.corpus import stopwords

# Read input file
#User will enter the file name here

file = open('teddysou1908.txt')

a = file.read()

# Stopwords: NLTK's English list plus extra words read from a file
# (the 250 most common words according to http://www.anglik.net/english250.htm)
stopwords = set(stopwords.words('english'))
stopwords = stopwords.union(set(line.strip() for line in open('trumpsou2017.txt')))
"""
stopwords.union(set(['mr','mrs','one','two','said','the','of','to','and','a','in','is','it',
'you','that','he','was','for','on','are','with','as','i','his','they','be','at','one','have','this','from','or',
'had','by','hot','but','some','what','there','we','can','out','other','were','all','your','when','up','use','word',
'how','said','an','each','she','which','do','their','time','if','will','way','about','many','then','them','would',
'write','like','so','these','her','long','make','thing','see','him','two','has','look','more','day','could','go',
'come','did','my','sound','no','most','number','who','over','know','water','than','call','first','people','may',
'down','side','been','now','find','any','new','work','part','take','get','place','made','live','where','after',
'back','little','only','round','man','year','came','show','every','good','me','give','our','under','name','very',
'through','just','form','much','great','think','say','help','low','line','before','turn','cause','same','mean',
'differ','move','right','boy','old','too','does','tell','sentence','set','three','want','air','well','also','play',
'small','end','put','home','read','hand','port','large','spell','add','even','land','here','must','big','high','such',
'follow','act','why','ask','men','change','went','light','kind','off','need','house','picture','try','us','again','animal',
'point','mother','world','near','build','self','earth','father','head','stand','own','page','should','country','found','answer',
'school','grow','study','still','learn','plant','cover','food','sun','four','thought','let','keep','eye','never','last','door',
'between','city','tree','cross','since','hard','start','might','story','saw','far','sea','draw','left','late','run',"don't",
Example #11
#Creating a new list of irrelevant words (extra stopwords)
nova_stopwords = [
    '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'q', 'w', 'e', 'r', 't',
    'y', 'u', 'i', 'o', 'p', 'a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'z',
    'x', 'v', 'b', 'n', 'm', 'da', 'de', 'com', 'é', 'então', 'ja', 'já',
    'sao', 'ai', 'vao', 'so', 'acho', 'até', 'daqui', 'dessa', 'desse',
    'dessas', 'desses', 'assim', 'ia', 'tão', 'devem', 'fica', 'ficou', 'la',
    'lá', 'ate', 'até', 'desde', 'só', 'pra', 'há', 'ha', 'hà', 'são', 'só',
    'já', 'deixou', 'aí', 'sobre', 'que', 'durante', 'vai', 'dia', 'ainda',
    'estão', 'deu', 'dar', 'para', 'r', 'o', 'e', 'após', 'sr', 'sra', 'tudo',
    'q', 'tão', 'sendo', 'sem', 'me', 'as', 'os', 'isso', 'mas', 'quase',
    'estar', 'ta', 'tá', 'ai', 'vão', 'lá', 'vá', 'tô'
]

#Union of the default stopword list with the new list
nova_stopwords_list = stopwords.union(nova_stopwords)

#Regex patterns for removing irrelevant content
nolink = r"http\S+"
caracters = r"[^@#_a-záéíóúàèìòùâêîôûãõçA-ZÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÃÕÇ0-9 ]"
nospaces = r'\s+'

#Creating the file where the transformed data will be stored
saida = open('C:/users/onlyone/desktop/prefeito/prefeito.txt',
             mode='w',
             encoding='UTF-8')

#Loop to run the processing over all posts
for page in tweepy.Cursor(api.user_timeline,
                          screen_name=usuario.screen_name,
                          count=200,

# In[14]:


import collections
import pandas as pd
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
# The input text is assumed to already be in the variable s (read from a file
# elsewhere); note that the encoding may differ for your text file
a = s
# Stopwords
from nltk.corpus import stopwords
stopwords=set(stopwords.words('english'))
stopwords = stopwords.union(set(['mr','mrs','one','two','said','phone','amazon','iphone','product','apple','vivo','redmi','lenovo','x',' ']))
# Instantiate a dictionary, and for every word in the file, 
# Add to the dictionary if it doesn't exist. If it does, increase the count.
wordcount = {}
# To eliminate duplicates, split on punctuation and normalize case.
for word in a.lower().split():
    word = word.replace(".","")
    word = word.replace(",","")
    word = word.replace(":","")
    word = word.replace("\"","")
    word = word.replace("!","")
    word = word.replace("“","")
    word = word.replace("‘","")
    word = word.replace("*","")
    if word not in stopwords:
        if word not in wordcount:
Example #13
random_seed = 3613
test_percentage = 0.20

mongo_user = urllib.parse.quote_plus(mcred.USERNAME)
mongo_pass = urllib.parse.quote_plus(mcred.PASSWORD)

stopwords = set(stopwords.words('english'))

punctuation_list = {'.', ',', '?', '!', '\'', '"', ':', ';', '-', '–'}
special_list = {
    '`', '~', '@', '#', '$', '%', '^', '&', '+', '*', '/', '=', '>', '<', '(',
    ')', '{', '}', '[', ']', '|', '\\'
}
other_sym_list = {'...', '…', '’', '..', '“', '”'}

stop_url_symbol_list = stopwords.union(punctuation_list).union(
    special_list).union(other_sym_list).union({'#url'})
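# Quick illustration (hypothetical token list): stop_url_symbol_list drops stopwords,
# punctuation, special symbols and the '#url' placeholder.
sample_tokens = ['i', 'loved', 'this', '!', '#url', '...']
print([t for t in sample_tokens if t not in stop_url_symbol_list])  # -> ['loved']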


class CustomTweetTokenizer(TweetTokenizer):
    def __init__(self,
                 preserve_case=True,
                 reduce_len=False,
                 strip_handles=False,
                 convert_urls=True,
                 remove_stopwords=False):
        super().__init__(preserve_case=preserve_case,
                         reduce_len=reduce_len,
                         strip_handles=strip_handles)
        self.convert_urls = convert_urls
        self.remove_stopwords = remove_stopwords
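# Usage sketch (hypothetical tweet; only the constructor is shown above, so this call
# relies on the tokenize() inherited from nltk's TweetTokenizer):
tokenizer = CustomTweetTokenizer(preserve_case=False, strip_handles=True)
print(tokenizer.tokenize("@user loving this!!! https://example.com #nlp"))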
Example #14
import scipy.io
import select
import shutil
import sys
import time
import tensorflow as tf

from embedding_evaluation import write_embedding_to_file, evaluate, EmbeddingTaskEvaluator
from gensim_utils import batch_generator, batch_generator2
from tensor_embedding import PMIGatherer, PpmiSvdEmbedding
from tensor_decomp import CPDecomp, SymmetricCPDecomp, JointSymmetricCPDecomp
from nltk.corpus import stopwords

stopwords = set(stopwords.words('english'))
grammar_stopwords = {',', "''", '``', '.', 'the'}
stopwords = stopwords.union(grammar_stopwords)


class GensimSandbox(object):
    def __init__(self, method, embedding_dim, num_sents, min_count, gpu=True):
        self.method = method
        self.embedding_dim = int(embedding_dim)
        self.min_count = int(min_count)
        self.num_sents = int(num_sents)
        self.gpu = gpu
        if '--buildvocab' in sys.argv:
            self.buildvocab = True
        else:
            self.buildvocab = False

        # To be assigned later
Example #15
import nltk
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import TruncatedSVD

lemm = WordNetLemmatizer()

titles = [line.rstrip() for line in open('book_title.txt')]

stopwords = set(w.rstrip() for w in stopwords.words('english'))

stopwords = stopwords.union({
    'introduction', 'edition', 'series', 'approach', 'card', 'access',
    'application', 'package', 'brief', 'vol', 'fundamental', 'second', 'third',
    'fourth', 'first', 'guide', 'essential', 'print'
})


def tokenize_words(s):
    s = s.lower()
    tokens = nltk.tokenize.word_tokenize(s)
    tokens = [t for t in tokens if len(t) > 3]  # remove short words
    tokens = [lemm.lemmatize(t) for t in tokens]
    tokens = [t for t in tokens if t not in stopwords]
    tokens = [t for t in tokens if not any(c.isdigit() for c in t)]
    return tokens
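# Usage sketch (hypothetical title; requires the nltk tokenizer and wordnet data):
print(tokenize_words("Fundamentals of Machine Learning, Second Edition"))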


word_index_map = {}
import matplotlib.pyplot as plt
import nltk, re
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter
stopwords = set(nltk.corpus.stopwords.words('english'))

TotalText = []
for index, row in df_FilesProperlyConverted.iterrows():
    text = row['Text']
    TotalText.append(text)

newStopWords = ['school','learning','student','pupil','teacher','management','teaching','support', 'lesson', 'board']
stopwords = stopwords.union(newStopWords)
TotalText = " ".join(TotalText)
tokens = [w for w in word_tokenize(TotalText.lower()) if w.isalpha()]          # isalpha() checks if each word is alphabetical, lower() transforms everything to lowercase
no_stop = [t.strip() for t in tokens if t.strip() not in stopwords]      # stopwords already comes with a built-in list of words to remove
wordnet_lemmatizer = WordNetLemmatizer()
lemmatized = [wordnet_lemmatizer.lemmatize(t) for t in no_stop]
bow = Counter(lemmatized)
MostCommon = dict(bow.most_common(10))

plt.bar(*zip(*MostCommon.items()))
plt.title('Whole sample')
plt.xlabel('Most common words')
plt.ylabel('Number of times the word appears')
plt.xticks(rotation='vertical')
plt.savefig("Results\\Word count\\Whole sample.png")
plt.show()
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from wordcloud import WordCloud
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
from nltk import word_tokenize
import pandas as pd
import json
import os
import re

VEC_PATH = '/Users/anthonysicilia/Desktop/GoogleNews-vectors-negative300.bin'
stopwords = set(stopwords.words('english'))
stopwords = stopwords.union(set([word.strip() for word in open('stopwords.txt')]))
vectors = KeyedVectors.load_word2vec_format(datapath(VEC_PATH), binary=True)
comments = dict()
for comment_path in os.listdir('comments/'):

    with open('comments/' + comment_path) as f:

        try:
            x = json.load(f)
        except:
            print('Error Loading File.')
            exit()
        try:
            x = x['comments']
        except:
            print('Expected "comments" field. Field not found.')
            exit()
Example #18
soup = get_soup(bp_transcripts)

alltxt = get_text(soup)

alltxt = filter_bolsonaro(alltxt)

text = "".join(alltxt)

text = punctuation_stop(text)

new_words = []
with open("brazilianwords.txt", 'r', encoding='utf-8') as f:
    for line in f:
        new_words.extend(line.split())

new_stopwords = stopwords.union(new_words)

text = ' '.join(text)

wc = WordCloud(background_color="white",
               width=1600,
               height=800,
               max_words=100,
               max_font_size=200,
               min_font_size=1,
               stopwords=new_stopwords)
wc.generate(text)

plt.figure(figsize=[20, 20])
plt.imshow(wc)
plt.axis('off')
Example #19
    proceedings = get_proceedings(min_year=1980, max_year=2019)

    keywords = set(["embodied", "embody", "body", "bodies"])
    cnt = Counter()
    nouns = Counter()

    for paper in proceedings:

        sentences = sent_tokenize(paper.clean_text)

        for sentence in sentences:
            words = e(sentence).split()

            for word in words:
                if word in keywords:
                    r = [wordnet_lemmatizer.lemmatize(w) for w in words if w not in stopwords.union(keywords)]
                    cnt.update(r)
                    nouns.update([x for x in r if x in all_nouns])
                    break

    for w, c in cnt.most_common(500):
        print(w, c)
    print(70 * "*")
    for w, c in nouns.most_common(500):
        print(w, c)

    # Generate a word cloud image
    from wordcloud import WordCloud
    import matplotlib.pyplot as plt

    # lower max_font_size
Example #20
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import string
from nltk import corpus
import numpy

# importing the corpus into data variable
data = corpus.brown

# Using Porter Stemmer
stemmer = PorterStemmer()

# Building the list of stop words
# We will filter the tokens against it
stopwords = set(stopwords.words('english'))
stopwords = stopwords.union(string.punctuation)

# Limiting the number of files we will use
# uncomment the following line to use all of the
# corpus files
# fileIds = data.fileids()
fileids = data.fileids()[:30]

idf_matrix = []
dictionary = dict()

# total count of words in the corpus
words_count = 0

# total count of document in the corpus
documents_count = len(fileids)
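# Hypothetical illustration of how the pieces above can be combined:
# stem and stopword-filter the tokens of the first selected document.
sample_tokens = [stemmer.stem(w.lower()) for w in data.words(fileids[0])
                 if w.lower() not in stopwords]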
import os
import PyPDF2
from string import punctuation  # assumed source of the punctuation set used below
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

#sets working directory
os.chdir("/home/allomorph/Downloads")

#ingesting and reading the PDF of the textbook
intext = open("Keith Johnson-Acoustic and auditory phonetics (Kindle friendly).pdf",'rb')
read_text = PyPDF2.PdfFileReader(intext)
num_pages = read_text.getNumPages() # number of pages
#print(num_pages)

#building a stopword set including punctuations
stopwords = set(stopwords.words('english'))
stopwords = stopwords.union(punctuation)

#calling the lemmatizer function
wnl = WordNetLemmatizer()

#building a lexicon of words from the textbook in the variable instances
instances = []
for i in range(0, num_pages):
    page = read_text.getPage(i)
    content = page.extractText()
    tokens = word_tokenize(content)
    filtered_tokens = [word.lower() for word in tokens if word.lower() not in stopwords]
    for instance in filtered_tokens:
        instances.append(instance)
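# Hypothetical follow-up sketch (not part of the original snippet): lemmatize the
# collected tokens with wnl and render them as a word cloud using the imports above.
lemmas = [wnl.lemmatize(t) for t in instances if t.isalpha()]
cloud = WordCloud(background_color='white', stopwords=stopwords).generate(' '.join(lemmas))
plt.imshow(cloud)
plt.axis('off')
plt.show()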