def load_custom_stoplist(self, stoplist_file):
    """Load a custom stoplist and register every entry as a stop word.

    The file is read as one stop word per line. Each non-blank entry is
    added to the shared ``STOP_WORDS`` set and flagged as a stop word on the
    matching lexeme of ``self.nlp.vocab``.

    Args:
        stoplist_file: Path of the text file to read.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(stoplist_file, 'r') as f:
        stoplist = [line.strip() for line in f]
    for item in stoplist:
        if not item:
            # Blank lines (and the empty entry a trailing newline used to
            # produce) must not register '' as a stop word.
            continue
        STOP_WORDS.add(item)
        self.nlp.vocab[item].is_stop = True
def _collect_words(self):
    """Collect the unique word / POS-tag pairs found in ``self.text``.

    The lower-cased text is parsed with the ``en_core_web_lg`` spaCy model.
    Every sentence is wrapped in a ``Sentence``; each token that is not in
    ``STOP_WORDS`` and whose coarse POS is not PART, PUNCT, SPACE, NUM or SYM
    is recorded under the key ``"<token text> ; <fine-grained tag>"``.

    Side effects: adds ``'_'`` to the shared ``STOP_WORDS`` set, prints two
    progress messages and emits one log record.

    Returns:
        dict: key string -> ``Word`` built from the first token seen with
        that key.
    """
    nlp = spacy.load("en_core_web_lg")
    # coref = NeuralCoref(nlp.vocab)
    # nlp.add_pipe(coref, name='neuralcoref')
    print("Preparing Spacy object")
    # Raise spaCy's length limit to exactly the size of the input text.
    nlp.max_length = len(self.text)
    # NOTE(review): spaCy's entity recogniser is registered as 'ner'
    # (lowercase); confirm that 'NER' actually disables it.
    text_obj = nlp(str(self.text.lower()), disable=['NER'])
    # NOTE(review): same message as above although parsing has finished.
    print("Preparing Spacy object")
    # Resolve co-reference using neuralcoref
    # self.text = text_obj._.coref_resolved
    # nlp.remove_pipe("neuralcoref")
    # text_obj = nlp(str(self.text.lower()), disable=['NER'])
    # Placeholder "previous sentence" built from an empty document.
    prev_sent = Sentence(nlp(''), None)
    words = {}
    STOP_WORDS.add('_')
    logging.info("Collecting words")
    for sent in tqdm(text_obj.sents):
        # sent = nlp(Sentence.clean_sentence(sent.text))
        # NOTE(review): prev_sent is never reassigned inside this loop, so
        # every Sentence receives the initial empty placeholder - confirm
        # this is intended.
        curr_sent = Sentence(sent, prev_sent)
        for token in sent:
            # Skip stop words and tokens of non-content POS classes.
            if token.text in STOP_WORDS or\
                    token.pos_ in ['PART', 'PUNCT', 'SPACE', 'NUM', 'SYM']:
                continue
            key = token.text.strip() + ' ; ' + token.tag_
            if key not in words:
                words[key] = Word(token)
            # NOTE(review): assumed to run for every occurrence (not only the
            # first); the original indentation was lost - confirm.
            words[key].include_sentence(curr_sent)
    return words
def spacy_adder(self, model, verbose=False):
    """Register ``self.vocab_list`` as stop words and install an IS_STOP flag.

    Every entry of ``self.vocab_list`` is added to ``STOP_WORDS``. A flag
    getter is then attached to ``model.vocab`` under ``spacy.attrs.IS_STOP``;
    it reports whether the lower-cased string is in spaCy's English
    stop-word set.

    Args:
        model: Object exposing ``vocab.add_flag`` (a loaded spaCy pipeline).
        verbose: When true, print how many entries ``self.vocab_list`` holds.
    """
    STOP_WORDS.update(self.vocab_list)

    def _is_stop(text):
        # Case-insensitive lookup against spaCy's English stop-word set.
        return text.lower() in spacy.lang.en.stop_words.STOP_WORDS

    model.vocab.add_flag(_is_stop, spacy.attrs.IS_STOP)

    if not verbose:
        return
    print(
        f"Complete. There are {len(self.vocab_list)} stop words in the list."
    )
def words_stop():
    """Demonstrate two ways of extending spaCy's stop words.

    First the module-level ``STOP_WORDS`` set gets one extra entry, then
    ``nlp.Defaults.stop_words`` is extended and reduced with set operators.
    After each change every word of the affected set is flagged on its
    lexeme in ``nlp.vocab``. Progress is written to ``words_stop._log``.
    """
    words_stop._log.debug("\nThe outcomes of words stop are:")
    from spacy.lang.en.stop_words import STOP_WORDS

    def _flag_as_stop(entries):
        # Mark the lexeme of every entry as a stop word.
        for entry in entries:
            nlp.vocab[entry].is_stop = True

    STOP_WORDS.add("your_additional_stop_word_here")
    _flag_as_stop(STOP_WORDS)

    # A single word can also be added directly with .add().
    nlp.Defaults.stop_words |= {"了", "啊", "吧", "嗯"}
    # A single word can also be removed directly with .remove().
    nlp.Defaults.stop_words -= {"嗯"}
    _flag_as_stop(nlp.Defaults.stop_words)

    words_stop._log.debug(nlp.Defaults.stop_words)
def construct_stop_words():
    """
    Extend the spaCy stop-word set with corpus-specific terms.

    :return: the shared ``STOP_WORDS`` set, after the extra terms were added
    """
    extra_terms = (
        "uk", "ceo", "apple", "wal", "st",
        "q1", "q2", "q3", "q4", "bp",
        "wednesday", "tuesday", "monday", "thursday", "friday",
        "sept", "johnson", "inc", "david", "amazon.com",
    )
    STOP_WORDS.update(extra_terms)
    return STOP_WORDS
def summarization():
    """Build a frequency-based extractive summary of one hard-coded story.

    The story is split into sentences with spaCy, word counts are computed
    with ``CountVectorizer`` (stop words removed) and normalised by the
    highest count. Each sentence is scored by summing the normalised
    frequencies of its words; sentences whose score is among the three
    highest scores are returned (ties included).

    Side effects: adds ``"@highlight"`` to the shared ``STOP_WORDS`` set and
    prints the most frequent words.

    Returns:
        tuple: ``(text, summary)`` - the raw story text and the list of
        selected spaCy sentence spans in document order.
    """
    with open("./stories/d3370f0d60746aebcc5f61a068805b8545357e6f.story", "r", encoding="utf-8") as f:
        text = " ".join(f.readlines())
    core = en_core_web_sm.load()
    doc = core(text)

    # clean sentences
    corpus = [sent.text.lower() for sent in doc.sents]

    STOP_WORDS.add("@highlight")
    # CountVectorizer's default token pattern keeps word characters only, so
    # the marker reaches the stop-word filter as "highlight"; "@highlight"
    # alone can never match. The bare token is added to the vectorizer's list
    # only, leaving the shared set as callers expect it.
    cv = CountVectorizer(stop_words=list(STOP_WORDS | {"highlight"}))
    cv_fit = cv.fit_transform(corpus)
    # get_feature_names() was removed in scikit-learn 1.2; fall back to it
    # only on releases that predate get_feature_names_out().
    if hasattr(cv, "get_feature_names_out"):
        word_list = cv.get_feature_names_out().tolist()
    else:
        word_list = cv.get_feature_names()
    count_list = cv_fit.toarray().sum(axis=0)

    # pair every word with its count
    word_frequency = dict(zip(word_list, count_list))
    words_freqs = sorted(word_frequency.values())
    top_counts = words_freqs[-3:]
    higher_word_frequencies = [word for word, freq in word_frequency.items() if freq in top_counts]
    print("higher frequency words : ", higher_word_frequencies)

    # normalise the frequency values by the highest count
    higher_frequency = words_freqs[-1]
    word_frequency = {word: freq / higher_frequency for word, freq in word_frequency.items()}

    # score each sentence by the summed weight of its known words
    sentence_rank = {}
    for sent in doc.sents:
        for word in sent:
            weight = word_frequency.get(word.text.lower())
            if weight is None:
                continue
            sentence_rank[sent] = sentence_rank.get(sent, 0) + weight

    # keep the sentences whose score is one of the three highest scores
    top_sent = sorted(sentence_rank.values(), reverse=True)[:3]
    summary = [sent for sent, strength in sentence_rank.items() if strength in top_sent]
    return text, summary
def Stop():
    """Demonstrate two ways of extending spaCy's stop words, printing results.

    One entry is added to the module-level ``STOP_WORDS`` set, then
    ``nlp.Defaults.stop_words`` is extended and reduced with set operators.
    After each change every word of the affected set is flagged on its
    lexeme in ``nlp.vocab``; the final default set is printed.
    """
    print("\nThe outcomes of Stop Words are:")
    from spacy.lang.en.stop_words import STOP_WORDS

    def _flag_as_stop(entries):
        # Mark the lexeme of every entry as a stop word.
        for entry in entries:
            nlp.vocab[entry].is_stop = True

    STOP_WORDS.add("your_additional_stop_word_here")
    _flag_as_stop(STOP_WORDS)

    # A single word can also be added directly with .add().
    nlp.Defaults.stop_words |= {"了", "啊", "吧", "嗯"}
    # A single word can also be removed directly with .remove().
    nlp.Defaults.stop_words -= {"嗯"}
    _flag_as_stop(nlp.Defaults.stop_words)

    print(nlp.Defaults.stop_words)
def preprocess_text(author_df):
    """Add cleaned-text and NLP meta-feature columns to ``author_df``.

    The ``text`` column is parsed with the spaCy ``'en'`` model after three
    extra stop words were registered. New columns are written in place and
    the same frame is returned.

    Args:
        author_df: DataFrame with a ``text`` column.

    Returns:
        The input DataFrame, extended with the derived columns.
    """
    nlp = spacy.load('en')
    STOP_WORDS.update(("'s", 'the', 'a'))
    for stop_word in STOP_WORDS:
        nlp.vocab[stop_word].is_stop = True

    parsed = author_df.text.apply(nlp)

    def _clean_and_lemmatize(tokens):
        # remove stop words and punctuation, keep the lemmas
        kept = [tok.lemma_ for tok in tokens if not (tok.is_punct or tok.is_stop)]
        return ' '.join(kept)

    def _count_punct(tokens):
        return sum(1 for tok in tokens if tok.is_punct)

    def _count_words(tokens):
        return sum(1 for tok in tokens if not tok.is_punct)

    def _count_ents(tokens):
        return len(tokens.ents)

    def _count_noun_chunks(tokens):
        return len(list(tokens.noun_chunks))

    author_df['text_cleaned'] = parsed.apply(_clean_and_lemmatize)

    # entities
    author_df['text_with_entities'] = parsed.apply(replace_ents)

    # pos-tag pairs
    author_df['text_pos_tag_pairs'] = author_df['text'].apply(lambda row: pos_tag_pairs_sentence(row))

    # additional nlp meta features
    author_df['polarity_of_text'] = author_df['text'].apply(lambda row: get_polarity(row))
    author_df['punct_cnt'] = parsed.apply(_count_punct)
    author_df['words_cnt'] = parsed.apply(_count_words)
    author_df['ents_cnt'] = parsed.apply(_count_ents)
    author_df['noun_chunks_cnt'] = parsed.apply(_count_noun_chunks)
    author_df['fraction_noun'] = author_df['text'].apply(lambda row: fraction_noun(row))
    author_df['fraction_adj'] = author_df['text'].apply(lambda row: fraction_adj(row))
    author_df['fraction_verbs'] = author_df['text'].apply(lambda row: fraction_verbs(row))
    return author_df
def detectTextIn(self, Text):
    """Derive a set of lower-cased class candidates from ``Text``.

    The result contains the text itself, the labels of the entities spaCy
    finds in the original casing, and the lemma and surface form of every
    non-stop-word token of the lower-cased text. Empty strings are never
    part of the result.

    Side effects: loads ``en_core_web_sm`` and adds five image-related words
    to the shared ``STOP_WORDS`` set.

    Args:
        Text: The input string (``.lower()`` is called on it).

    Returns:
        set: The lower-cased candidate strings.
    """
    classFromText = [Text]
    nlp = spacy.load('en_core_web_sm')

    # Adding custom stop words
    STOP_WORDS.update(("picture", "image", "images", "pics", "portrait"))
    for word in STOP_WORDS:
        nlp.vocab[word].is_stop = True

    # Entity labels are taken from the text in its original casing.
    doc = nlp(str(Text))
    classFromText.extend(ent.label_ for ent in doc.ents)

    # Tokens are taken from the lower-cased text.
    doc = nlp(str(Text.lower()))
    for token in doc:
        if not token.is_stop:
            classFromText.append(token.lemma_)
            classFromText.append(token.text)

    # Build the set directly and drop "". The previous code removed items
    # from the list while iterating over it, which skips the element after
    # each removal and could leave "" in the result.
    return {candidate.lower() for candidate in classFromText} - {""}
nlp.vocab['better'].is_stop #will return False #filtering the stopwords ex1 = nlp("How do I keep looping through until the len(new_list) = len(data_list) (i.e. all the numbers are in the new list) with everything sorted without using the built in max, min, sort functions? I'm not sure if it's necessary to create a new list either.") for word in ex1: if word.is_stop: print(word) #another way mylist = [word for word in ex1 if word.is_stop] #adding/removing stopwords print(nlp.vocab['lamao'].is_stop) STOP_WORDS.add('lol') print(nlp.vocab['lol'].is_stop) STOP_WORDS.remove('lol') print(nlp.vocab['lol'].is_stop) ######################################################## docs = nlp('Aditya went to the Tajmahal in the Agra and ate icecream there') for token in docs.noun_chunks: print(token.text) #it wll print 'the' for token in docs.noun_chunks: print(token.root.text) #it will print with the
import spacy from spacy.lang.en.stop_words import STOP_WORDS nlp = spacy.load('en_core_web_md') domain_stop_words = ['chapter', '<', '>', ';', 'vinegar', 'of', '%'] for word in domain_stop_words: STOP_WORDS.add(word) STOP_WORDS1 = STOP_WORDS.copy() STOP_WORDS1.discard('other') def nlp0(sentence): sentence = sentence.lower() word_list = [ token.lemma_ for token in nlp(sentence) if not token.is_stop and not token.is_punct ] return word_list def nlp1(sentence): sentence = sentence.lower() word_list = [ str(token.lemma_) for token in nlp(sentence) if str(token) not in STOP_WORDS1 and not token.is_punct ] word_list1 = [] flag = 0 for i in word_list: if i == 'other':
def updateStopWords():
    '''Extend the shared stop-word corpus with a handful of extra entries.'''
    # contractions and filler words that should be ignored as well
    additional_words = (
        "i'm",
        "isn't",
        "let's",
        "ha",
        "according",
        "want",
        "like",
    )
    STOP_WORDS.update(additional_words)
doc = nlp(" ".join(sent)) texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags]) return pd.Series(texts_out) """Remove Stop Words""" data_words_nostops = remove_stopwords(data_words)# Remove Stop Words """ Form Bigrams""" data_words_bigrams = make_bigrams(data_words_nostops) """ Initialize spacy 'en' model, keeping only tagger component (for efficiency) # python3 -m spacy download en""" nlp = spacy.load('en', disable=['parser', 'ner']) """Do lemmatization keeping only noun, adj, vb, adv""" data_lemmas = lemmatization(data_words_bigrams, allowed_postags=['NOUN' ]) #, 'VERB', 'ADV', 'ADJ', """Remove more Stop Words, find ways to remove a long list of SWS, not one, this only adds one sw""" STOP_WORDS.add("artificial_intelligence") data_lemmatized = remove_stopwords(data_lemmas)# Rem """4. Create the Dictionary and Corpus and BOW # 4.1 Create Dictionary""" id2word = corpora.Dictionary(data_lemmatized) """size of dictionary""" print("Found {} words.".format(len(id2word.values()))) """ corpus""" corpus = [id2word.doc2bow(text) for text in data_lemmatized] """check sparsity, for instance, sparse = .99 tokens which are missing from more than 99 of the documents in the corpus. """ data_dense = gensim.matutils.corpus2dense(corpus, num_docs=len(corpus), num_terms= len(id2word.values())) print("Sparsicity: ", 1.0 - (data_dense > 0).sum() / data_dense.size) """
# Script entry point: prepares spaCy stop words and loads the comment corpus.
if __name__ == '__main__':
    # Exactly two command-line arguments are required.
    # NOTE(review): in the visible part neither argument is used; the data
    # file below is hard-coded.
    if len(sys.argv) != 3:
        print('Usage: ')
        print('\tpython nmf.py SUBREDDIT_NAME NUM_TOPICS')
        sys.exit(1)

    # Disable tagger, parser and named-entity recognition
    nlp = spacy.load('en', disable=['tagger', 'parser', 'ner'])

    # Load custom stoplist (whitespace-separated words)
    with open('../../data/stoplist.txt', 'r') as f:
        stops = f.read().split()

    for stop in stops:
        STOP_WORDS.add(stop)

    # Flag every stop word on its lexeme in the loaded vocabulary.
    for word in STOP_WORDS:
        lexeme = nlp.vocab[word]
        lexeme.is_stop = True

    # Read data.
    DATA_FILE = '../pmf/NeutralPolitics.csv' #'../data/bigquery/2017/11-12/' + sys.argv[1] + '.csv'
    data = pd.read_csv(DATA_FILE)
    # Keep the first column as strings, with missing values replaced by ''.
    data = data.iloc[:, 0].fillna('').astype(str).squeeze() # FIXME change 0 to 1
    print('Loaded Reddit comments.')

    # Disregard the bottom 70% of all comments, by simple count of split tokens.
    counts = data.apply(lambda s: len(s.split()))
    threshold = counts.quantile(0.7)
    data = data[counts > threshold]
if ent.text not in STOP_WORDS and ent.label_ in tracked_entities: anon_comment = anon_comment.replace(ent.text, ent.label_) return name_placeholder(anon_comment) except: print(text) pass #let's hold off on whitespacing: #def paddingFunc(text): # text = re.sub('([.,!?()])', r' \1 ', text) # text = re.sub('\s{2,}', ' ', text) # return text #original entity replacement code for reference '''# -*- coding: utf-8 -*- import pandas as pd import spacy from spacy.lang.en.stop_words import STOP_WORDS from spacy.vocab import Vocab import pandas as pd import numpy as np import os import xx_ent_wiki_sm nlp = spacy.load('en_core_web_sm') #python -m spacy download xx for multilingual processing nlp_multilingual = xx_ent_wiki_sm.load() from spacy.lang.en.stop_words import STOP_WORDS
def queryTokens(cadena, languages):
    """Tokenize ``cadena`` and drop stop words, inverters and incrementers.

    The string is preprocessed, reduced to ASCII characters, stripped and
    tokenized. The language is detected from the tokens to select the
    matching stop-word list; a fixed set of extra words is also added to the
    shared ``STOP_WORDS`` set on every call.

    Args:
        cadena: The query string.
        languages: Candidate languages handed to ``detect_language``.

    Returns:
        list: The tokens that survived all four filters, in original order.
    """
    cleaned = __preprocessString(cadena)
    # keep ASCII characters only, then trim surrounding whitespace
    cleaned = ''.join(ch for ch in cleaned if ord(ch) < 128).strip()
    word_tokens = word_tokenize(cleaned)

    # Detect in which language the text is written
    lang = detect_language(word_tokens, languages)
    stop_words = set(stopwords.words(lang))

    inverters = {
        'dont', 'doesnt', 'havent', 'arent', 'didnt', 'wasnt', 'werent',
        'not', 'never', 'hardly', 'seldom',
    }
    incrementers = {'too', 'many', 'much', 'very', 'lots'}
    STOP_WORDS.update((
        'im', 'pm', 'ai', 'ie', 'still', 'cant', 'isnt', 'couldnt', 'youre',
        'seen', 'say', 'says', 'tell', 'lot', 'lol', 'hes', 's', 'be',
    ))

    # A token is kept only when none of the exclusion collections has it.
    filtered_sentence = []
    for token in word_tokens:
        if token in stop_words or token in inverters:
            continue
        if token in incrementers or token in STOP_WORDS:
            continue
        filtered_sentence.append(token)
    return filtered_sentence
import pickle
import spacy
import re
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load('en', disable=['parser'])

CLASSIFIER_ROOT = 'classifiers/'
TRANSFORMERS = ['transform_bag_of_words_0.sav', 'transform_bag_of_words_1.sav']
MODELS = ['nb.sav']

# Treat the possessive suffix as a stop word and flag the whole set on the
# loaded vocabulary.
STOP_WORDS.add("'s")
for word in STOP_WORDS:
    lexeme = nlp.vocab[word]
    lexeme.is_stop = True


def load_model(model_name):
    """Unpickle and return the object stored as ``CLASSIFIER_ROOT + model_name``."""
    model_path = f'{CLASSIFIER_ROOT}{model_name}'
    with open(model_path, 'rb') as handle:
        return pickle.load(handle)


CLF_NB = load_model(MODELS[0])
TRANSFORMERS_MODELS = [load_model(file_name) for file_name in TRANSFORMERS]


def clean_html(raw_html):
    """Strip ``<...>`` tags from ``raw_html`` and return the lower-cased rest."""
    without_tags = re.sub('<.*?>', '', raw_html)
    return without_tags.lower()
# code based on https://t.co/69FA0rkKUU
import dns
import spacy
import argparse
import json
import numpy as np
import pandas as pd
from urllib.parse import quote
from pymongo import MongoClient
from collections import OrderedDict
from spacy.lang.en.stop_words import STOP_WORDS

# Extra tokens that should be treated as stop words.
STOP_WORDS.update(('rt', '#', '%', '|'))

# Four positional command-line arguments: credentials, server and day.
parser = argparse.ArgumentParser()
parser.add_argument(
    "user",
    type=str,
    help="Username for server access",
)
parser.add_argument(
    "password",
    type=str,
    help="Password for server access",
)
parser.add_argument(
    "server",
    type=str,
    help="Mongo DB server address",
)
parser.add_argument(
    "day",
    type=str,
    help='Day in twitter format. Example: "Wed May 06"',
)
args = parser.parse_args()

nlp = spacy.load('en_core_web_sm')
def LDA_Analysis():
    """Load the cached LDA artefacts from ``data/`` and export the pyLDAvis page.

    Every step that (re)builds an artefact sits behind an ``if 0 == 1:``
    guard and therefore never runs; flip a guard by hand to regenerate that
    artefact. What does run on each call: the sentence streams, phrase
    models, dictionary, bag-of-words corpus and LDA model are loaded from
    ``data/``, the topic-name mapping is pickled to ``data/topic_names.pkl``
    and the stored pyLDAvis data is written to ``data/DSJobs_LDA.html``.

    NOTE(review): the block nesting below was reconstructed from text whose
    indentation was lost - confirm against the original file.
    """
    #http://nbviewer.jupyter.org/github/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb

    # --- Step 1 (disabled): dump all job descriptions from the DB to a text file.
    if 0 == 1:
        # Truncate the output file first.
        with open('data/review_text_all.txt','w') as myfile:
            myfile.write("")
        ''' loop through db and write jobs descriptions '''
        with open('data/review_text_all.txt','a') as myfile:
            with Job() as db:
                a=0
                max_ = int(db.getNoJobs()[0][0])
                while (a < max_):
                    #print(a)
                    sample_review = db.readJobDetailClean(a)[0][1]
                    # Rows marked 'Json Error' are skipped.
                    if (sample_review != 'Json Error'):
                        myfile.write(str(sample_review)+'\n')
                    a += 1

    # --- Step 2 (disabled): write one lemmatized sentence per line.
    #unigram_sentences_filepath = os.path.join(intermediate_directory, 'unigram_sentences_all.txt')
    if 0 == 1:
        with codecs.open('data/unigram_sentences_all.txt', 'w', encoding='utf_8') as f:
            for sentence in lemmatized_sentence_corpus('data/review_text_all.txt'):
                f.write(sentence + '\n')
    unigram_sentences = LineSentence('data/unigram_sentences_all.txt')
    ''' for unigram_sentence in it.islice(unigram_sentences, 230, 240): print(u' '.join(unigram_sentence)) print(u'') '''

    # --- Step 3 (disabled): train the bigram phrase model.
    #bigram_model_filepath = os.path.join(intermediate_directory, 'bigram_model_all')
    if 0 == 1:
        # NOTE(review): this passes the file path string, while the trigram
        # step below passes the sentence stream - confirm which is intended.
        bigram_model = Phrases('data/unigram_sentences_all.txt')
        bigram_model.save('data/bigram_model_all')
    # load the finished model from disk
    bigram_model = Phrases.load('data/bigram_model_all')

    # --- Step 4 (disabled): apply the bigram model to every sentence.
    #bigram_sentences_filepath = os.path.join(intermediate_directory, 'bigram_sentences_all.txt')
    if 0 == 1:
        with codecs.open('data/bigram_sentences_all.txt', 'w', encoding='utf_8') as f:
            for unigram_sentence in unigram_sentences:
                bigram_sentence = u' '.join(bigram_model[unigram_sentence])
                f.write(bigram_sentence + '\n')
    bigram_sentences = LineSentence('data/bigram_sentences_all.txt')
    ''' for bigram_sentence in it.islice(bigram_sentences, 230, 240): print(u' '.join(bigram_sentence)) print(u'') '''

    # --- Step 5 (disabled): train the trigram phrase model.
    #trigram_model_filepath = os.path.join(intermediate_directory, 'trigram_model_all')
    if 0 == 1:
        trigram_model = Phrases(bigram_sentences)
        trigram_model.save('data/trigram_model_all')
    # load the finished model from disk
    trigram_model = Phrases.load('data/trigram_model_all')

    # --- Step 6 (disabled): apply the trigram model to every sentence.
    #trigram_sentences_filepath = os.path.join(intermediate_directory, 'trigram_sentences_all.txt')
    if 0 == 1:
        with codecs.open('data/trigram_sentences_all.txt', 'w', encoding='utf_8') as f:
            for bigram_sentence in bigram_sentences:
                trigram_sentence = u' '.join(trigram_model[bigram_sentence])
                f.write(trigram_sentence + '\n')
    trigram_sentences = LineSentence('data/trigram_sentences_all.txt')
    ''' for trigram_sentence in it.islice(trigram_sentences, 230, 240): print(u' '.join(trigram_sentence)) print(u'') '''

    # --- Step 7 (disabled): transform whole reviews and filter their terms.
    #trigram_reviews_filepath = os.path.join(intermediate_directory, 'trigram_transformed_reviews_all.txt')
    if 0 == 1:
        import csv
        ''' Variant A: Use Stopwords 1) download StopWords.csv from MySQL table: KeyWords. 2) Remove all relevant words by hand ;) '''
        # The first field of every CSV row becomes an additional stop word.
        with open('data/StopWords.csv', newline='') as csvfile:
            stopwords_ = csv.reader(csvfile, delimiter=' ', quotechar='|')
            for words_ in stopwords_:
                #print(words_[0])
                STOP_WORDS.add(words_[0])
        #print(STOP_WORDS)
        ''' Varaint B: Use Dictionary '''
        # The first field of every CSV row becomes an allowed term.
        with open('data/Dictionary.csv', 'r', newline='') as csvfile:
            file_ = csv.reader(csvfile, delimiter=',', quotechar='"')
            dictionary_ = []
            for row in file_:
                dictionary_.append(row[0])
        #with open('file.csv', 'r') as f:
            #reader = csv.reader(f)
            #your_list = list(reader)
        with codecs.open('data/trigram_transformed_reviews_all.txt', 'w', encoding='utf_8') as f:
            # NOTE(review): n_threads is deprecated in newer spaCy releases -
            # confirm against the installed version.
            for parsed_review in nlp.pipe(line_review('data/review_text_all.txt'), batch_size=10000, n_threads=4):
                # lemmatize the text, removing punctuation and whitespace
                unigram_review = [token.lemma_ for token in parsed_review if not punct_space(token)]
                # apply the first-order and second-order phrase models
                bigram_review = bigram_model[unigram_review]
                trigram_review = trigram_model[bigram_review]
                # remove any remaining stopwords
                ''' Variant A: '''
                #trigram_review = [term for term in trigram_review
                #                  if term not in STOP_WORDS]#spacy.en.STOPWORDS] !!!!! CHECK THIS !!!!! module 'spacy' has no attribute 'en'
                ''' Variant B: '''
                # Keep only the terms listed in the dictionary file; note that
                # dictionary_ is a list, so each lookup scans it.
                trigram_review = [term for term in trigram_review if term in dictionary_]#
                # write the transformed review as a line in the new file
                trigram_review = u' '.join(trigram_review)
                f.write(trigram_review + '\n')
    ''' print(u'Original:' + u'\n') for review in it.islice(line_review('review_text_all.txt'), 11, 12): print(review) print(u'----' + u'\n') print(u'Transformed:' + u'\n') with codecs.open('trigram_transformed_reviews_all.txt', encoding='utf_8') as f: for review in it.islice(f, 11, 12): print(review) '''

    # --- Step 8 (disabled): build the gensim dictionary.
    #trigram_dictionary_filepath = os.path.join(intermediate_directory, 'trigram_dict_all.dict')
    if 0 == 1:
        trigram_reviews = LineSentence('data/trigram_transformed_reviews_all.txt')
        # learn the dictionary by iterating over all of the reviews
        trigram_dictionary = Dictionary(trigram_reviews)
        # filter tokens that are very rare or too common from
        # the dictionary (filter_extremes) and reassign integer ids (compactify)
        trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)#,keep_n=100000)#,)
        trigram_dictionary.compactify()
        trigram_dictionary.save('data/trigram_dict_all.dict')
    # load the finished dictionary from disk
    trigram_dictionary = Dictionary.load('data/trigram_dict_all.dict')

    # --- Step 9 (disabled): serialize the bag-of-words corpus.
    #trigram_bow_filepath = os.path.join(intermediate_directory, 'trigram_bow_corpus_all.mm')
    if 0 == 1:
        # generate bag-of-words representations for
        # all reviews and save them as a matrix
        MmCorpus.serialize('data/trigram_bow_corpus_all.mm', trigram_bow_generator(trigram_dictionary,'data/trigram_transformed_reviews_all.txt'))
    # load the finished bag-of-words corpus from disk
    trigram_bow_corpus = MmCorpus('data/trigram_bow_corpus_all.mm')

    # --- Step 10 (disabled): train the LDA model with 15 topics.
    #lda_model_filepath = os.path.join(intermediate_directory, 'lda_model_all')
    if 0 == 1:
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            # workers => sets the parallelism, and should be
            # set to your number of physical cores minus one
            lda = LdaMulticore(trigram_bow_corpus, num_topics=15, id2word=trigram_dictionary, workers=1)
        lda.save('data/lda_model_all')
    # load the finished LDA model from disk
    lda = LdaMulticore.load('data/lda_model_all')
    #explore_topic(lda, topic_number=1)

    # Hand-assigned display names for the 15 topic ids (0-14).
    topic_names = {0:u'Risk Management Bank', 1:u'Big Data Report', 2:u'Automotive SAP', 3:u'Microsoft Java Scrum', 4:u'Medical Consultant', 5:u'Java Engineer', 6:u'Computer Vision Developer', 7:u'Data Analyst', 8:u'BI SAP BW', 9:u'IOT Reporting R', 10:u'Global Project Presentation', 11:u'Cloud Engineer IOT', 12:u'Industry 4.0', 13:u'Risk Consulting', 14:u'Machine Learning Data Science'}
    #topic_names_filepath = os.path.join(intermediate_directory, 'topic_names.pkl')
    with open('data/topic_names.pkl', 'wb') as f:
        pickle.dump(topic_names, f)

    #load sample_review from database
    #sample_review = get_sample_review(10)
    #lda_description(bigram_model, trigram_model, trigram_dictionary, lda, topic_names, sample_review)

    # --- Step 11 (disabled): prepare the pyLDAvis data.
    #LDAvis_data_filepath = os.path.join(intermediate_directory, 'ldavis_prepared')
    if 0 == 1:
        #term_ix = np.sort(topic_info.index.unique().values)
        LDAvis_prepared = pyLDAvis.gensim_.prepare(lda, trigram_bow_corpus, trigram_dictionary)
        with open('data/ldavis_prepared', 'wb') as f:
            pickle.dump(LDAvis_prepared, f)
    ''' export LDA file '''
    # load the pre-prepared pyLDAvis data from disk
    with open('data/ldavis_prepared', 'rb') as f:
        LDAvis_prepared = pickle.load(f)
    with open('data/DSJobs_LDA.html', 'w') as f:
        pyLDAvis.save_html(LDAvis_prepared, f)
import numpy as np from sklearn.feature_extraction.text import CountVectorizer pd.set_option('display.expand_frame_repr', False) np.random.seed(42) nlp = spacy.load('en_core_web_sm') # Combine spacy and linguistic utils stopwords stop_words = pd.read_csv( 'http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words', header=None, squeeze=True) for stop_word in stop_words: STOP_WORDS.add(stop_word) # where your text files are text_path = Path('data/text') text_files = text_path.glob('*.txt') # where the clean version should go clean_path = Path('data/clean') if not clean_path.exists(): clean_path.mkdir(exist_ok=True, parents=True) for i, text_file in enumerate(text_files): if i % 100 == 0: print(i, end=' ', flush=True) doc = text_file.read_text() clean_doc = ' '.join([
import string import nltk try: stemmer = SnowballStemmer("english") except: nltk.download("wordnet") stemmer = SnowballStemmer("english") try: nlp = en_core_web_sm.load() except: os.system("python -m spacy download en_core_web_sm") nlp = en_core_web_sm.load() stop.add("phone") punc = set(string.punctuation) def clean(doc): # lower text and remove punctuations s = "" for char in doc.lower(): s += char if char not in punc else " " # remove stopwords, adjectives, and adverbs normalized = [] for token in nlp(s): if not ( token.is_space or token.is_stop
Windows:python -m spacy download en as Administrator Linux:sudo python -m spacy download en """ nlp = spacy.load('en') """#Exploring spaCy""" from spacy.lang.en.stop_words import STOP_WORDS STOP_WORDS f'There are {len(STOP_WORDS)} stopwords in spaCy' # You can add your own corpora specific STOPWORDS using the .add syntax STOP_WORDS.add("your_additional_stop_word_here") f'After adding your own stop words, spaCy will use {len(STOP_WORDS)} stopwords' doc = nlp("I am learning the most important ideas Natural Language Processing ideas using Python") print(doc) # doc is a spaCy object which stores the entire document string """**About spaCy objects**""" for token in doc: print(token) simplified_doc = [token for token in doc if not token.is_punct | token.is_stop] simplified_doc # please note that .orth_ attribute returns the unicode string representation of the token """We can also check what other things we know about these tags in the simplified_doc:"""
# Glue the job-description and requisition frames side by side and name the
# resulting seven columns.
job_df = pd.concat([jd_df, req_df], axis=1, ignore_index=True)
job_df.columns = ["Req ID","Req Title", "Job Requisition Status", "Candidate ID", "Division", "Function", "Job Description"]
job_df.head()

### Clean text ###
# tokenize every text
tokenizer = RegexpTokenizer(r'\w+')

# remove numbers
# The result is assigned back to the column: calling replace(inplace=True) on
# a column selection is a chained in-place update, which newer pandas
# releases do not propagate to the parent frame.
resume_df["Resume Text"] = resume_df["Resume Text"].replace(r'[\d]', '', regex=True)
job_df["Job Description"] = job_df["Job Description"].replace(r'[\d]', '', regex=True)

# lower case all words
resume_df["Resume Text"] = resume_df["Resume Text"].str.lower()
job_df["Job Description"] = job_df["Job Description"].str.lower()

# remove stopwords (the empty string is registered as a stop word as well)
STOP_WORDS.add('')
# try_df["Resume Text"] = resume_df["Resume Text"].apply(lambda x: [str(word) for word in x if word not in STOP_WORDS])
# try_df["Job Description"] = job_df["Job Description"].apply(lambda x: [word for word in x if word not in STOP_WORDS])

# Persist both cleaned frames without the index column.
resume_df.to_csv('data/cleaned_resume.csv', index=False)
job_df.to_csv('data/cleaned_job.csv', index=False)
import os import re from unidecode import unidecode import numpy as np import json import sys import logging from numpy.linalg import norm from gensim.test.utils import datapath from gensim.models.fasttext import load_facebook_model from spacy.lang.en.stop_words import STOP_WORDS STOP_WORDS.add('de_l_la_le_di') logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)-5.5s] %(message)s", handlers=[logging.StreamHandler(sys.stdout)]) logger = logging.getLogger() class MemoryGenerator(): def __init__(self, dataset, conv2kg, kgs, fasttext_emb_path): logger.info("Initializing Memory Generator ....") self.conv2kg = conv2kg self.kgs = kgs self.mapping = json.load(open("data/" + dataset + "/ERmapping.json")) self.maxEntity, self.maxRel = self.read_dataset(dataset) logger.info("MaxENT: " + str(self.maxEntity) + " maxREL: " + str(self.maxRel)) self.matrix_dim = self.maxEntity + self.maxRel self.word_emb = load_facebook_model( datapath(os.getcwd() + "/" + fasttext_emb_path))
#Add & Remove a new Stop Word import nltk STOP_WORDS = nltk.corpus.stopwords.words('english') STOP_WORDS.append('Test') print(len(STOP_WORDS)) print(STOP_WORDS) import nltk STOP_WORDS.remove('Test') print(len(STOP_WORDS)) print(STOP_WORDS) import spacy from spacy.lang.en.stop_words import STOP_WORDS STOP_WORDS.add("Test") print(len(STOP_WORDS)) print(STOP_WORDS) import spacy from spacy.lang.en.stop_words import STOP_WORDS STOP_WORDS.remove("Test") print(len(STOP_WORDS)) print(STOP_WORDS)
import os import re import spacy from spacy.lang.en.stop_words import STOP_WORDS from typing import Text, List from utils import deprecated PathType = str # Load the spacy english model nlp = spacy.load("en") STOP_WORDS.add("-PRON-") STOP_WORDS.add("~sil") @deprecated def remove_stopwords(text: Text ) -> Text: # This function removes stopwords from a list of strings # Parameter: 'list_of_tokens': a list of strings # return: 'list_of_tokens' without the stopwords list_of_tokens = re.split(r"\s", text) assert type(list_of_tokens) is list, "list_of_tokens must be of type list" doc = " ".join([t for t in list_of_tokens if t not in STOP_WORDS]) return re.sub(r' +', r' ', doc) @deprecated def lemmatize(text: Text
def add_sw(new_sw):
    """Add ``new_sw`` to the shared ``STOP_WORDS`` set; returns ``None``."""
    STOP_WORDS.add(new_sw)