def removeStopWords(corpus, stopwords):
    '''Removes the stopwords from the corpus and returns the corpus without stopwords.'''
    stopwords = stopwords.split(',')
    parseList = [word for word in corpus.split() if word.lower() not in stopwords]
    return ' '.join(parseList)
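# Hypothetical usage sketch for removeStopWords; the corpus and the
# comma-separated stopword string below are made-up examples.
sample = removeStopWords("The quick brown fox jumps over the lazy dog", "the,a,an,over")
print(sample)  # "quick brown fox jumps lazy dog"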
import re

def tokenizingAndStopwords(raw_text):  # parameter renamed from `str`, which shadowed the builtin
    text = ''
    # Pad punctuation with spaces so each mark becomes its own token.
    # Note: the '...' entry can never match, since `c` is a single character.
    for c in raw_text:
        if c in ['.', '...', '?', '!', ':', ';', '&', ',', '"', '*', '(', ')',
                 '[', ']', '{', '}', '#', '~', '_', '=', '+', '-', '/', '\\']:
            text += ' ' + c + ' '
        else:
            text += c
    text = re.sub(' +', ' ', text)  # replaces one or more spaces with a single space
    text = text.lower().split(' ')
    with open('googleStopwords.txt', 'r') as mf:  # use Google stopwords
        stopwords = mf.read()
    stopwords = stopwords.split('\n')
    stopwords = stopwords + ['&', '*', '(', ')', '[', ']', '{', '}', '#', '~',
                             '_', '=', '+', '-', '\'', '\n']
    result = []
    for word in text:
        if word not in stopwords:
            result.append(word)
    return result
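# Hypothetical usage sketch; assumes a googleStopwords.txt file (one word per
# line) exists in the working directory.
tokens = tokenizingAndStopwords("Data science is fun, isn't it?")
print(tokens)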
def getStopWords():
    stopwordsFile = open('/Users/305015992/pythonProjects/word2vecAnalysis/stopwords.txt', 'r')
    stopwords = stopwordsFile.read()
    stopwordsFile.close()
    #stopwords = stopwords.lower()
    stopwordList = stopwords.split(",")
    print(stopwordList)
    return stopwordList
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def sentiment(custom_stopwords, text):
    # Parameter renamed from `stopwords`: it shadowed the nltk.corpus.stopwords
    # module used in the loop below, which broke the stopwords.words() call.
    arr_stopwords = custom_stopwords.split()
    lower_case = text.lower()
    cleaned_text = lower_case.translate(str.maketrans('', '', string.punctuation))
    tokenized_words = word_tokenize(cleaned_text, "english")
    final_words = []
    for word in tokenized_words:
        # filter both the NLTK English stopwords and the caller-supplied ones
        if word not in stopwords.words("english") and word not in arr_stopwords:
            final_words.append(word)
    emotion_list = []
    with open('emotions.txt', 'r') as file:
        for line in file:
            clear_line = line.replace('\n', '').replace(',', '').replace("'", '').strip()
            word, emotion = clear_line.split(':')
            if word in final_words:
                emotion_list.append(emotion)
    sentiment_analysis(cleaned_text)  # external helper defined elsewhere
    return emotion_list
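# Hypothetical usage sketch; assumes an emotions.txt file with "word: emotion"
# lines and a sentiment_analysis() helper in scope, as the function expects.
# The stopword string and sample text are made up.
emotions = sentiment("movie film", "I was happy and excited about this movie!")
print(emotions)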
import string

def clean_words(job_type_list, stopwords):
    # Strip punctuation from the supplied stopwords so they match the
    # punctuation-stripped job text below.
    stopwords = ' '.join(stopwords)
    stopwords = stopwords.translate(str.maketrans('', '', string.punctuation)).lower()
    stopwords = stopwords.split(' ')
    stopwords.extend(['food','restaurant','get','place','really','menu','also','one','got','two','us','around','san','francisco','sf','','la','order','ordered','eat','good','come','first','go','even','would','hour','well','time','way','spot','like','make','worth','back','never','seven','close','back','etc','using','including','use',"you'll",'·','job','qualifications','plus','experience','work','working','scientist','science','company','skills','eg','equal','scientists','role','industry','data','engineer','engineering'])
    special_chars = ['--', '...', '\n', '•', '®', '●']
    a = ' '.join(job_type_list)
    a = a.translate(str.maketrans('', '', string.punctuation)).lower()  # remove punctuation and make lower case
    for char in special_chars:
        a = a.replace(char, ' ')  # replace each special char with a space
    resultwords = [word for word in a.split(' ') if word.lower() not in stopwords]
    return resultwords
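# Hypothetical usage sketch; the job description and extra stopwords below are
# made-up examples.
jobs = ["We are hiring a data scientist with Python and SQL experience."]
print(clean_words(jobs, ["the", "a", "with", "and"]))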
import os
import numpy as np
import pandas as pd

def imdb_data_preprocess(inpath, outpath="./", name="imdb_tr.csv", mix=False):
    with open("stopwords.en.txt", 'r', encoding="ISO-8859-1") as f:
        stopwords = f.read()
    stopwords = stopwords.split("\n")
    indices = []
    text = []
    rating = []
    i = 0
    # Positive reviews are labelled "1".
    for filename in os.listdir(inpath + "pos"):
        with open(inpath + "pos/" + filename, 'r', encoding="ISO-8859-1") as f:
            data = f.read()
        data = remove_stopwords(data, stopwords)
        indices.append(i)
        text.append(data)
        rating.append("1")
        i = i + 1
    # Negative reviews are labelled "0".
    for filename in os.listdir(inpath + "neg"):
        with open(inpath + "neg/" + filename, 'r', encoding="ISO-8859-1") as f:
            data = f.read()
        data = remove_stopwords(data, stopwords)
        indices.append(i)
        text.append(data)
        rating.append("0")
        i = i + 1
    Dataset = list(zip(indices, text, rating))
    if mix:
        np.random.shuffle(Dataset)
    df = pd.DataFrame(data=Dataset, columns=['row_Number', 'text', 'polarity'])
    df.to_csv(outpath + name, index=False, header=True)
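# Hypothetical usage sketch; assumes the standard aclImdb layout with pos/ and
# neg/ subdirectories under the given path (note the trailing slash), plus a
# stopwords.en.txt file and a remove_stopwords() helper in scope.
imdb_data_preprocess("./aclImdb/train/", outpath="./", name="imdb_tr.csv", mix=True)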
import re

ip_rev_string = re.sub("[^A-Za-z]+", " ", ip_rev_string).lower()
ip_rev_string = re.sub("[0-9]+", " ", ip_rev_string)

# In[7]:

# words contained in iPhone 7 reviews
ip_reviews_words = ip_rev_string.split(" ")

# In[8]:

#stop_words = stopwords.words('english')
with open("Downloads\\stop.txt", "r") as sw:
    stopwords = sw.read()
stopwords = stopwords.split("\n")

# In[9]:

#stp_wrds = stopwords + stop_words
# scratch check of list-comprehension filtering (note: `i not in "is"` tests
# substring membership, not list membership)
temp = ["this", "is", "awsome", "Data", "Science"]
[i for i in temp if i not in "is"]

ip_reviews_words = [w for w in ip_reviews_words if w not in stopwords]

# In[10]:

# Joining all the reviews into a single paragraph
ip_rev_string = " ".join(ip_reviews_words)
def ReadStopWordsFile():
    with open("stopwords", "r") as stopwordsFile:
        stopwords = stopwordsFile.read()
    stopwords = stopwords.split('\n')
    return stopwords
a = turn_dict_into_list(global_ds_jobs_descriptions)
len(a)

####################################
####           Stopwords     #######
####################################
import string
import nltk
from nltk.corpus import stopwords

stopwords = nltk.corpus.stopwords.words('english')
# Strip punctuation from the NLTK stopwords (e.g. "don't" -> "dont") so they
# match the punctuation-stripped job text below.
stopwords = ' '.join(stopwords)
stopwords = stopwords.translate(str.maketrans('', '', string.punctuation)).lower()
stopwords = stopwords.split(' ')
stopwords.extend(['food','restaurant','get','place','really','menu','also','one','got','two','us','around','san','francisco','sf','','la','order','ordered','eat','good','come','first','go','even','would','hour','well','time','way','spot','like','make','worth','back','never','seven','close','back','etc','using','including','use',"you'll",'·','job','qualifications','plus','experience','work','working','scientist','science','company','skills','eg','equal','scientists','role','industry','data','engineer','engineering'])

####################################
####      Top Single words   #######
####################################
def clean_words(job_type_list):
    special_chars = ['--', '...', '\n', '•', '®', '●']
    a = ' '.join(job_type_list)
    a = a.translate(str.maketrans('', '', string.punctuation)).lower()  # remove punctuation and make lower case
    for char in special_chars:
        a = a.replace(char, ' ')  # replace each special char with a space
    resultwords = [word for word in a.split(' ') if word.lower() not in stopwords]
    return resultwords
# import modules & set up logging
import gensim, logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

#sentences = [['first', 'sentence'], ['second', 'sentence']]
# train word2vec on the two sentences
# reading the sentences line by line versus reading from the individual files
# has different computation times
sentenceIterator = gensim.models.word2vec.LineSentence("all_cases.txt")

# stopwords
with open('stopwords.txt', 'r') as stopwordsFile:
    stopwords = stopwordsFile.read()
stopwordList = stopwords.split(",")

## this is used when we read the file
sentences = []
for sentence in sentenceIterator:
    sentences.append(PreprocessDoc2Vec(" ".join(sentence), stopwordList))
    #sentences.append(PreprocessDoc2Vec(sentence, stoplist))

### if you want bigrams
bigram_transformer = gensim.models.Phrases(sentences)
# NOTE: in gensim >= 4.0 the `size` parameter is named `vector_size`.
model = gensim.models.Word2Vec(bigram_transformer[sentences], min_count=10, size=100, sg=1, hs=1)
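# Hypothetical usage sketch: querying the trained model for nearest neighbours.
# Assumes 'court' actually appears in the all_cases.txt vocabulary; substitute
# any token from your corpus.
print(model.wv.most_similar('court', topn=5))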
def get_custom_stopwords(stop_words_file):
    with open(stop_words_file, 'r', encoding='gbk', newline='') as f:
        stopwords = f.read()
    stopwords_list = stopwords.split('\n')
    # strip Windows-style carriage returns left over from the split
    custom_stopwords_list = [i.replace('\r', '') for i in stopwords_list]
    return custom_stopwords_list
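# Hypothetical usage sketch; 'stopwordsHIT.txt' is a made-up filename standing
# in for any GBK-encoded stopword file with one word per line.
stop_list = get_custom_stopwords('stopwordsHIT.txt')
print(stop_list[:10])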
import string
import operator
from collections import Counter
import numpy as np
import nltk
from nltk.corpus import stopwords

def scrape_job_text(url):
    # Fetches the raw job posting text for a single URL. Example call
    # (kept from the original):
    #a = scrape_job_text('https://www.google.com/search?q=data+scientist&ibp=htl;jobs#fpstate=tldetail&htichips=job_family_1:data%20scientist,job_family_1:data%20science,city:O13QqUW2j4Ciw3zdJvuNdg%3D%3D&htidocid=gPQbEIqS6gcFH9aTAAAAAA%3D%3D&htilrad=24.1401&htischips=job_family_1;data%20scientist;data%20science,city;O13QqUW2j4Ciw3zdJvuNdg%3D%3D:Sunnyvale&htivrt=jobs')
    #[item for sublist in a for item in sublist]
    ...  # body not shown in the original snippet

def create_raw_job_dict(global_jobs_descriptions, url_list, file_name):
    # Signature assumed from the call sites at the end of this script.
    # Reuse cached descriptions where possible; scrape and cache the rest,
    # then persist the cache to disk.
    master_dict = {}
    for url in url_list:
        try:
            master_dict[url] = global_jobs_descriptions[url]
        except KeyError:
            master_dict[url] = scrape_job_text(url)
            global_jobs_descriptions[url] = master_dict[url]
    np.save(file_name, global_jobs_descriptions)
    # assumed return: downstream code treats the result as a flat list of job texts
    return turn_dict_into_list(global_jobs_descriptions)

def turn_dict_into_list(global_jobs_descriptions):
    master_list = list(global_jobs_descriptions.values())
    flat_list = [item for sublist in master_list for item in sublist
                 if 'data' in item and 'the' in item and len(item) > 1000]
    print(len(flat_list))
    master_job_text = list(set(flat_list))
    print(len(master_job_text))
    return master_job_text

a = turn_dict_into_list(global_de_jobs_descriptions)
len(a)

####################################
####           Stopwords     #######
####################################
stopwords = nltk.corpus.stopwords.words('english')
stopwords = ' '.join(stopwords)
stopwords = stopwords.translate(str.maketrans('', '', string.punctuation)).lower()
stopwords = stopwords.split(' ')
stopwords.extend(['food','restaurant','get','place','really','menu','also','one','got','two','us','around','san','francisco','sf','','la','order','ordered','eat','good','come','first','go','even','would','hour','well','time','way','spot','like','make','worth','back','never','seven','close','back','etc','using','including','use',"you'll",'·','job','qualifications','plus','experience','work','working','scientist','science','company','skills','eg','equal','scientists','role','industry','data','engineer','engineering'])

####################################
####      Top Single words   #######
####################################
def clean_words(job_type_list):
    special_chars = ['--', '...', '\n', '•', '®', '●']
    a = ' '.join(job_type_list)
    a = a.translate(str.maketrans('', '', string.punctuation)).lower()  # remove punctuation and make lower case
    for char in special_chars:
        a = a.replace(char, ' ')  # replace each special char with a space
    resultwords = [word for word in a.split(' ') if word.lower() not in stopwords]
    return resultwords

def top_words_counter(resultwords, num_reviews):
    counts = Counter(resultwords)
    my_dict = dict(counts)
    sorted_x = sorted(my_dict.items(), key=operator.itemgetter(1), reverse=True)
    try:
        return sorted_x[0:num_reviews]
    except:
        return "Not enough words"

def percentage_word(job_type_list, word):
    num_appear = sum([1 for i in job_type_list if word.lower() in i.lower()])
    total = len(job_type_list)
    return round((num_appear / total) * 100, 2)

percentage_word(a, 'scala')

def compare_percentage_ds_de(ds_all_jobs_text, de_all_jobs_text, word):
    ds = percentage_word(ds_all_jobs_text, word)
    de = percentage_word(de_all_jobs_text, word)
    print(word + ": " + str(ds) + "% DS, " + str(de) + "% DE \n")
    return [word, ds, de]

#####################################################
####Define variables that contain raw job text#######
#####################################################
#ds_all_jobs_text = create_raw_job_text(data_science_url_list)
ds_all_jobs_text = create_raw_job_dict(global_ds_jobs_descriptions, data_science_url_list, 'data_science_job_listings.npy')
de_all_jobs_text = create_raw_job_dict(global_de_jobs_descriptions, data_engineer_url_list, 'data_engineer_job_listings.npy')
#####################################################
####Clean words #####################################
#####################################################
ds_all_jobs_text_c = clean_words(ds_all_jobs_text)
#print(ds_all_jobs_text)
de_all_jobs_text_c = clean_words(de_all_jobs_text)

#####################################################
####Define DS variables that contain raw job text####
#####################################################
ds_top_100 = top_words_counter(ds_all_jobs_text_c, 100)
print(ds_top_100)
de_top_100 = top_words_counter(de_all_jobs_text_c, 100)
print(de_top_100)

#####################################################
####Define DE variables that contain raw job text####
#####################################################
de_top_100 = top_words_counter(de_all_jobs_text_c, 100)

compare_percentage_ds_de(ds_all_jobs_text, de_all_jobs_text, 'machine learning')
compare_percentage_ds_de(ds_all_jobs_text, de_all_jobs_text, 'statistic')
compare_percentage_ds_de(ds_all_jobs_text, de_all_jobs_text, 'analysis')
compare_percentage_ds_de(ds_all_jobs_text, de_all_jobs_text, 'artificial intelligence')
compare_percentage_ds_de(ds_all_jobs_text, de_all_jobs_text, 'predictive modeling')
compare_percentage_ds_de(ds_all_jobs_text, de_all_jobs_text, 'pipeline')
compare_percentage_ds_de(ds_all_jobs_text, de_all_jobs_text, 'big data')
compare_percentage_ds_de(ds_all_jobs_text, de_all_jobs_text, 'infrastructure')
compare_percentage_ds_de(ds_all_jobs_text, de_all_jobs_text, 'aws')
compare_percentage_ds_de(ds_all_jobs_text, de_all_jobs_text, 'ETL')
compare_percentage_ds_de(ds_all_jobs_text, de_all_jobs_text, 'kafka')
compare_percentage_ds_de(ds_all_jobs_text, de_all_jobs_text, 'scala')
compare_percentage_ds_de(ds_all_jobs_text, de_all_jobs_text, 'spark')
compare_percentage_ds_de(ds_all_jobs_text, de_all_jobs_text, 'hive')
compare_percentage_ds_de(ds_all_jobs_text, de_all_jobs_text, 'SQL')
compare_percentage_ds_de(ds_all_jobs_text, de_all_jobs_text, 'noSQL')
compare_percentage_ds_de(ds_all_jobs_text, de_all_jobs_text, 'hadoop')
compare_percentage_ds_de(ds_all_jobs_text, de_all_jobs_text, 'airflow')
compare_percentage_ds_de(ds_all_jobs_text, de_all_jobs_text, 'aws')
compare_percentage_ds_de(ds_all_jobs_text, de_all_jobs_text, 'redshift')
compare_percentage_ds_de(ds_all_jobs_text, de_all_jobs_text, 'EC2')
compare_percentage_ds_de(ds_all_jobs_text, de_all_jobs_text, '')
compare_percentage_ds_de(ds_all_jobs_text, de_all_jobs_text, 'airflow')
compare_percentage_ds_de(ds_all_jobs_text, de_all_jobs_text, 'scikit')
compare_percentage_ds_de(ds_all_jobs_text, de_all_jobs_text, '')
compare_percentage_ds_de(ds_all_jobs_text, de_all_jobs_text, 'tensorflow')
compare_percentage_ds_de(ds_all_jobs_text, de_all_jobs_text, '')
compare_percentage_ds_de(ds_all_jobs_text, de_all_jobs_text, 'cloud')
compare_percentage_ds_de(ds_all_jobs_text, de_all_jobs_text, 'o')

print(percentage_word(ds_all_jobs_text, "data"))
print(top_words_counter(ds_all_jobs_text_c, 100))
z = top_words_counter(de_all_jobs_text_c, 100)
top_100_list = [i[0] for i in z]
print(top_100_list)
top_100_comparison = [compare_percentage_ds_de(ds_all_jobs_text, de_all_jobs_text, word) for word in top_100_list]
print(top_100_comparison)

#####################################################
####Both top lists job text#######
#####################################################
# Compare by word only; the top-100 entries are (word, count) tuples.
common_words = list({w for w, _ in ds_top_100} & {w for w, _ in de_top_100})
print(len(common_words))

#####################################################
####Percentage word appears analysis ################
#####################################################
print(percentage_word(ds_all_jobs_text, "data"))
print(percentage_word(de_all_jobs_text, "data"))
print(percentage_word(ds_all_jobs_text, "python"))
print(percentage_word(de_all_jobs_text, "python"))
print(percentage_word(ds_all_jobs_text, "spark"))
print(percentage_word(de_all_jobs_text, "spark"))

from operator import itemgetter
from nltk import word_tokenize
from nltk.collocations import BigramCollocationFinder
from nltk.collocations import TrigramCollocationFinder
from nltk.collocations import QuadgramCollocationFinder

string.punctuation += "’"

def top_words_bicounter(job_type_list):
    special_chars = ['--', '...', '\n', '•', '®', '·']
    a = ' '.join(job_type_list)
    a = a.translate(str.maketrans('', '', string.punctuation)).lower()  # remove punctuation and make lower case
    for char in special_chars:
        a = a.replace(char, ' ')  # replace each special char with a space
    #resultwords = [word for word in a.split(' ') if word.lower() not in stopwords]
    #text = ' '.join(resultwords)
    finder = BigramCollocationFinder.from_words(word_tokenize(a))
    l = []
    for k, v in finder.ngram_fd.items():
        l.append((k, v))
    l = sorted(l, key=itemgetter(1), reverse=True)
    return l[0:300]

top_words_bicounter(job_text)

def top_words_tricounter(job_type_list):
    special_chars = ['--', '...', '\n', '•', '®', '·']
    a = ' '.join(job_type_list)
    a = a.translate(str.maketrans('', '', string.punctuation)).lower()  # remove punctuation and make lower case
    for char in special_chars:
        a = a.replace(char, ' ')  # replace each special char with a space
    finder = TrigramCollocationFinder.from_words(word_tokenize(a))
    l = []
    for k, v in finder.ngram_fd.items():
        l.append((k, v))
    l = sorted(l, key=itemgetter(1), reverse=True)
    return l[0:300]

top_words_tricounter(job_text)

def top_words_quadcounter(job_type_list):
    special_chars = ['--', '...', '\n', '•', '®', '·']
    a = ' '.join(job_type_list)
    a = a.translate(str.maketrans('', '', string.punctuation)).lower()  # remove punctuation and make lower case
    for char in special_chars:
        a = a.replace(char, ' ')  # replace each special char with a space
    finder = QuadgramCollocationFinder.from_words(word_tokenize(a))
    l = []
    for k, v in finder.ngram_fd.items():
        l.append((k, v))
    l = sorted(l, key=itemgetter(1), reverse=True)
    return l[0:300]

top_words_quadcounter(job_text)

special_chars = ['--', '...', '\n', '•', '®']
a = ' '.join(job_text)
a = a.translate(str.maketrans('', '', string.punctuation)).lower()  # remove punctuation and make lower case
for char in special_chars:
    a = a.replace(char, ' ')  # replace each special char with a space
resultwords = [word for word in a.split(' ') if word.lower() not in stopwords]
text = ' '.join(resultwords)
finder = BigramCollocationFinder.from_words(word_tokenize(text))
for k, v in finder.ngram_fd.items():
    print(k, v)

## deep copy. save a copy.
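# Minimal sketch of the BigramCollocationFinder API used above, on a made-up
# token list, showing what ngram_fd holds: a bigram -> frequency mapping.
demo = BigramCollocationFinder.from_words(['machine', 'learning', 'and', 'machine', 'learning'])
print(dict(demo.ngram_fd.items()))  # e.g. {('machine', 'learning'): 2, ('learning', 'and'): 1, ('and', 'machine'): 1}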
import pickle
from collections import Counter

a = ' '.join(job_text)
a = a.translate(str.maketrans('', '', string.punctuation)).lower()  # remove punctuation and make lower case
a = a.replace('\n', ' ')  # replace \n with a space
a = a.replace('•', ' ')
resultwords = [word for word in a.split(' ') if word.lower() not in stopwords]

flat_list = [item for sublist in all_job_text for item in sublist]
a = ''.join(flat_list)
sentence = a.split()  # turn into a list of words
counts = Counter(sentence)  # count words; Counter(a) on the raw string would count characters
counts.most_common(10)
dict(counts.most_common(30))

#str.replace("\n", "")
## Google selenium: locate element by xpath, two attributes
## read more
#### LOOK FOR READ MORE BUTTON
with open("data_science_jobs_raw_text.txt", "rb") as fp:
    job_text = pickle.load(fp)