def debate_text_process(text):
    from nltk.tokenize import word_tokenize
    tokens = word_tokenize(str(text))
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word
    import string
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words
    from nltk.corpus import stopwords
    from spacy.lang.en.stop_words import STOP_WORDS
    stop_words = set(stopwords.words('english'))
    STOP_WORDS.update(stop_words)
    STOP_WORDS.update({
        'nt', 'okay', 'ha', 'thank', 'wa', 'got', 'oh', 'said', 'going',
        'want', 'let', 'know'
    })
    words = [w for w in words if w not in STOP_WORDS]
    # print(len(STOP_WORDS))
    from nltk.stem import WordNetLemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()
    words = [wordnet_lemmatizer.lemmatize(w) for w in words]
    return words
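# Hedged usage sketch (not from the original source): assumes the NLTK 'punkt',
# 'stopwords', and 'wordnet' resources are already downloaded (e.g. via nltk.download);
# the sample sentence and the expected output are illustrative only.
sample = "We are going to talk about the economy, okay?"
print(debate_text_process(sample))  # roughly ['talk', 'economy']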
def load_custom_stoplist(self, stoplist_file):
    """Load custom stoplist."""
    with open(stoplist_file, 'r') as f:
        stoplist = f.read().split('\n')
    for item in stoplist:
        if not item:  # skip empty lines so '' is not registered as a stop word
            continue
        STOP_WORDS.add(item)
        self.nlp.vocab[item].is_stop = True
def _collect_words(self):
    """Collect all the unique word and pos_tag pairs from the text."""
    nlp = spacy.load("en_core_web_lg")
    # coref = NeuralCoref(nlp.vocab)
    # nlp.add_pipe(coref, name='neuralcoref')
    print("Preparing Spacy object")
    nlp.max_length = len(self.text)
    # note: spaCy pipe names are lowercase, so 'ner' (not 'NER') is disabled here
    text_obj = nlp(str(self.text.lower()), disable=['ner'])
    print("Spacy object ready")
    # Resolve co-reference using neuralcoref
    # self.text = text_obj._.coref_resolved
    # nlp.remove_pipe("neuralcoref")
    # text_obj = nlp(str(self.text.lower()), disable=['ner'])
    prev_sent = Sentence(nlp(''), None)
    words = {}
    STOP_WORDS.add('_')
    logging.info("Collecting words")
    for sent in tqdm(text_obj.sents):
        # sent = nlp(Sentence.clean_sentence(sent.text))
        curr_sent = Sentence(sent, prev_sent)
        for token in sent:
            if token.text in STOP_WORDS or \
                    token.pos_ in ['PART', 'PUNCT', 'SPACE', 'NUM', 'SYM']:
                continue
            key = token.text.strip() + ' ; ' + token.tag_
            if key not in words:
                words[key] = Word(token)
            words[key].include_sentence(curr_sent)
        # advance the previous-sentence pointer so each sentence does not keep the empty seed
        prev_sent = curr_sent
    return words
def spacy_adder(self, model, verbose=False):
    for stopword in self.vocab_list:
        STOP_WORDS.add(stopword)
    model.vocab.add_flag(
        lambda s: s.lower() in spacy.lang.en.stop_words.STOP_WORDS,
        spacy.attrs.IS_STOP)
    if verbose:
        print(
            f"Complete. There are {len(self.vocab_list)} stop words in the list."
        )
def construct_stop_words():
    """Update the spaCy stop-word list with domain-specific terms.

    :return: the updated STOP_WORDS set
    """
    stop_words_list = [
        "uk", "ceo", "apple", "wal", "st", "q1", "q2", "q3", "q4", "bp",
        "wednesday", "tuesday", "monday", "thursday", "friday", "sept",
        "johnson", "inc", "david", "amazon.com"
    ]
    for word in stop_words_list:
        STOP_WORDS.add(word)
    return STOP_WORDS
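# A minimal sketch (not from the original source) of feeding the extended list into
# scikit-learn; CountVectorizer and the sample headlines are assumptions for illustration.
# scikit-learn may warn that a multi-token entry like 'amazon.com' will never match.
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(stop_words=list(construct_stop_words()))
X = vec.fit_transform(["Apple reported earnings on Wednesday",
                       "Q1 revenue guidance at BP"])
print(vec.get_feature_names_out())  # e.g. ['earnings', 'guidance', 'reported', 'revenue']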
def words_stop():
    words_stop._log.debug("\nThe outcomes of words stop are:")
    from spacy.lang.en.stop_words import STOP_WORDS
    # print(STOP_WORDS)
    STOP_WORDS.add("your_additional_stop_word_here")
    for word in STOP_WORDS:
        lexeme = nlp.vocab[word]
        lexeme.is_stop = True
    nlp.Defaults.stop_words |= {"了", "啊", "吧", "嗯"}  # a single word can also be added directly with .add()
    nlp.Defaults.stop_words -= {"嗯"}  # a single word can also be removed directly with .remove()
    for word in nlp.Defaults.stop_words:
        lexeme = nlp.vocab[word]
        lexeme.is_stop = True
    words_stop._log.debug(nlp.Defaults.stop_words)
def summarization():
    with open("./stories/d3370f0d60746aebcc5f61a068805b8545357e6f.story",
              "r", encoding="utf-8") as f:
        text = " ".join(f.readlines())
    core = en_core_web_sm.load()
    doc = core(text)
    # clean sentences
    corpus = [sent.text.lower() for sent in doc.sents]
    STOP_WORDS.add("@highlight")
    cv = CountVectorizer(stop_words=list(STOP_WORDS))
    cv_fit = cv.fit_transform(corpus)
    word_list = cv.get_feature_names()
    count_list = cv_fit.toarray().sum(axis=0)
    # zip the two lists so that each word is paired with its count
    word_frequency = dict(zip(word_list, count_list))
    words_freqs = sorted(word_frequency.values())
    higher_word_frequencies = [word for word, freq in word_frequency.items()
                               if freq in words_freqs[-3:]]
    print("higher frequency words : ", higher_word_frequencies)
    higher_frequency = words_freqs[-1]
    # normalise the frequency values
    for word in word_frequency.keys():
        word_frequency[word] = word_frequency[word] / higher_frequency
    sentence_rank = {}
    for sent in doc.sents:
        for word in sent:
            if word.text.lower() in word_frequency.keys():
                if sent in sentence_rank.keys():
                    sentence_rank[sent] += word_frequency[word.text.lower()]
                else:
                    sentence_rank[sent] = word_frequency[word.text.lower()]
            else:
                continue
    # fetch the sentences that carry the highest-frequency words
    top_sentences = sorted(sentence_rank.values())[::-1]
    top_sent = top_sentences[:3]
    summary = []
    for sent, strength in sentence_rank.items():
        if strength in top_sent:
            summary.append(sent)
    return text, summary
def clean_text(document):
    stop_words_ = STOP_WORDS.union(stopwords.words('english'))
    stop_words = [unidecode(stop).lower() for stop in stop_words_]
    # Split to translate
    tokens = document.split()
    # Concatenate
    document = ' '.join(tokens)
    # Remove accents
    document = unidecode(document)
    # Remove https, mentions, special characters, single character
    document = re.sub(
        r"(@[A-Za-z0-9]+)|(_[A-Za-z0-9]+)|(\w+:\/\/\S+)|(\W_)", " ",
        document).lower()
    # Remove punctuation
    document = re.sub('[' + re.escape(string.punctuation) + ']', '', document)
    # Substitute multiple spaces with a single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    # Remove digits
    document = ''.join([i for i in document if not i.isdigit()])
    # Remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    # Split
    tokens = document.split()
    # Stopwords
    tokens = [w for w in tokens if w not in stop_words]
    # Concatenate
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text
def Stop():
    print("\nThe outcomes of Stop Words are:")
    from spacy.lang.en.stop_words import STOP_WORDS
    # print(STOP_WORDS)
    STOP_WORDS.add("your_additional_stop_word_here")
    for word in STOP_WORDS:
        lexeme = nlp.vocab[word]
        lexeme.is_stop = True
        # print(lexeme.text)
    nlp.Defaults.stop_words |= {"了", "啊", "吧", "嗯"}  # a single word can also be added directly with .add()
    nlp.Defaults.stop_words -= {"嗯"}  # a single word can also be removed directly with .remove()
    for word in nlp.Defaults.stop_words:
        lexeme = nlp.vocab[word]
        lexeme.is_stop = True
        # print(lexeme.text)
    print(nlp.Defaults.stop_words)
def _set_stopwords(self) -> 'KeywordRanking':
    stop_words = STOP_WORDS.union(self.stopwords) if self.stopwords else STOP_WORDS
    for word in stop_words:
        lexeme = nlp.vocab[word]
        lexeme.is_stop = True
    return self
def cluster_commonwords(texts, nwords=10, onlycorona="yes"):
    ignore_words = STOP_WORDS if onlycorona == "no" else STOP_WORDS.union(
        ['coronavirus', 'covid', 'covid19', 'covid-19'])
    allwords = [
        w for w in ' '.join(texts.str.lower()).split()
        if w not in ignore_words and re.search('[a-z]', w)
    ]
    return ', '.join(
        [word for word, cnt in Counter(allwords).most_common(nwords)])
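# Hedged illustration (the texts below are made up): cluster_commonwords expects a
# pandas Series of strings, so a tiny Series is enough to exercise it; with the
# default onlycorona="yes" the covid-related terms are excluded from the counts.
import pandas as pd

tweets = pd.Series(["covid cases rise again", "cases rise in the city"])
print(cluster_commonwords(tweets, nwords=3))  # e.g. 'cases, rise, city'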
def set_stopwords(self, stopwords):
    """Set stop words."""
    if self.language == "en":
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = self.nlp.vocab[word]
            lexeme.is_stop = True
    elif self.language == "de":
        for word in STOP_WORDS_DE.union(set(stopwords)):
            lexeme = self.nlp.vocab[word]
            lexeme.is_stop = True
def transform_matrix(content_body, tokenize_lemma):
    html_stop_words = get_html_stop_words(content_body, tokenize_lemma)
    stop_words_lemma_train = set(
        tokenize_lemma(' '.join(STOP_WORDS.union(set(html_stop_words)))))
    X = content_body
    tfidf_vectorizer = TfidfVectorizer(max_features=300,
                                       stop_words=stop_words_lemma_train,
                                       tokenizer=tokenize_lemma)
    tfidf_vectorizer = tfidf_vectorizer.fit(X)
    tfidf_matrix = tfidf_vectorizer.transform(X)
    return tfidf_matrix
def remove_stopwords(content):
    custom_stopwords = ("feeling", "feel", "becaus", "want", "time", "realli",
                        "im", "think", "thing", "ive", "still", "littl", "one",
                        "life", "peopl", "need", "bit", "even", "much", "dont",
                        "look", "way", "love", "start", "s", "m", "quot",
                        "work", "get", "http", "go", "day", "com", "got",
                        "see", "4pm",  # comma added: the original implicitly concatenated "see" and "4pm"
                        "<BIAS>", "veri", "know", "t", "like", "someth",
                        "good", "going", "today", "u", "new", "cant", "people",
                        "little", "pretty", "things")
    return hero.remove_stopwords(content,
                                 spacy_stop_words.union(custom_stopwords))
def preprocess_text(author_df):
    nlp = spacy.load('en')
    STOP_WORDS.add("'s")
    STOP_WORDS.add('the')
    STOP_WORDS.add('a')
    for word in STOP_WORDS:
        nlp.vocab[word].is_stop = True
    doc = author_df.text.apply(nlp)
    # remove stop words and punctuation
    clean_and_lemmatize = lambda x: ' '.join(
        [t.lemma_ for t in x if not t.is_punct and not t.is_stop])
    author_df['text_cleaned'] = doc.apply(clean_and_lemmatize)
    # entities
    author_df['text_with_entities'] = doc.apply(replace_ents)
    # pos-tag pairs
    author_df['text_pos_tag_pairs'] = author_df['text'].apply(
        lambda row: pos_tag_pairs_sentence(row))
    # additional nlp meta features
    author_df['polarity_of_text'] = author_df['text'].apply(
        lambda row: get_polarity(row))
    author_df['punct_cnt'] = doc.apply(
        lambda x: len([t for t in x if t.is_punct]))
    author_df['words_cnt'] = doc.apply(
        lambda x: len([t for t in x if not t.is_punct]))
    author_df['ents_cnt'] = doc.apply(lambda x: len(x.ents))
    author_df['noun_chunks_cnt'] = doc.apply(
        lambda x: len(list(x.noun_chunks)))
    author_df['fraction_noun'] = author_df['text'].apply(
        lambda row: fraction_noun(row))
    author_df['fraction_adj'] = author_df['text'].apply(
        lambda row: fraction_adj(row))
    author_df['fraction_verbs'] = author_df['text'].apply(
        lambda row: fraction_verbs(row))
    return author_df
def preprocess(texts):
    texts = str(texts)
    texts = texts.lower()
    texts = re.sub(r"(http|@)\S+", " ", texts)
    texts = demojize(texts)
    texts = re.sub(r"’", "'", texts)
    texts = re.sub("n't", "n not", texts)
    texts = re.sub("'ll", " will", texts)
    texts = re.sub("'ve", " have", texts)
    texts = re.sub(r"[^a-z\':_]", " ", texts)
    texts = re.sub(r"[0-9]+", " ", texts)
    texts = re.sub("re-[a-z]+", " ", texts)
    # collapse characters repeated three or more times, e.g. "soooo" -> "so"
    pattern = re.compile(r"(.)\1{2,}", re.DOTALL)
    texts = re.sub(pattern, r"\1", texts)
    tokens = tokenizer(texts)
    # keep negation words so the sentiment signal is not lost
    try:
        STOP_WORDS.remove('not')
        STOP_WORDS.remove('nor')
        STOP_WORDS.remove('no')
    except KeyError:
        pass
    lemma_list = []
    for token in tokens:
        # compare the token text, not the spaCy Token object, against the stop list
        if token.text not in STOP_WORDS:
            lemma_list.append(token.lemma_)
    texts = ' '.join(map(str, lemma_list))
    pred_vect = vectorizer.transform([texts])
    texts = label.classes_[model.predict(pred_vect)]
    texts = ' '.join(map(str, texts))
    return texts
def scripts_to_tfidf(scripts):
    """Create a Tf-idf matrix from tokenized scripts."""
    # custom stop words for scripts
    film_stop_words = ['V.O.', "Scene", "CUT TO", "FADE IN"]
    stop_words = STOP_WORDS.union(film_stop_words)
    # vectorize scripts into a Tf-idf matrix; terms appearing in fewer than
    # 20% of the documents are dropped (min_df=0.2)
    vectorizer = TfidfVectorizer(input='content',
                                 stop_words=list(stop_words),
                                 min_df=0.2,
                                 ngram_range=(1, 2))
    bow = vectorizer.fit_transform(scripts)
    vocab = vectorizer.get_feature_names()
    return bow, vocab
def detectTextIn(self, Text):
    classFromText = []
    classFromText.append(Text)
    # Text = Text.lower()
    nlp = spacy.load('en_core_web_sm')
    # Adding custom stop words
    STOP_WORDS.add("picture")
    STOP_WORDS.add("image")
    STOP_WORDS.add("images")
    STOP_WORDS.add("pics")
    STOP_WORDS.add("portrait")
    for word in STOP_WORDS:
        lexeme = nlp.vocab[word]
        lexeme.is_stop = True
    uni_string = str(Text)
    doc = nlp(uni_string)
    for ent in doc.ents:
        classFromText.append(ent.label_)
    Text = Text.lower()
    uni_string = str(Text)
    doc = nlp(uni_string)
    for token in doc:
        # token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
        # token.shape_, token.is_alpha, token.is_stop
        if not token.is_stop:
            classFromText.append(token.lemma_)
            classFromText.append(token.text)
    classFromText = [a.lower() for a in classFromText]
    # drop empty strings (filter instead of removing items while iterating)
    classFromText = [text for text in classFromText if text != ""]
    classFromText = set(classFromText)
    return classFromText
def stage2(process_folder, label):
    path_stage1 = process_folder + label + 'stage1.json'
    from spacy.lang.en.stop_words import STOP_WORDS
    path_stage2 = process_folder + label + 'stage2.json'
    graph, ranks = text_rank(path_stage1)
    render_ranks(graph, ranks)
    with open(path_stage2, 'w') as f:
        for rl in normalize_key_phrases(path_stage1, ranks,
                                        stopwords=STOP_WORDS):
            f.write("%s\n" % pretty_print(rl._asdict()))
def prepare_stopwords():
    NEGATE = ["aint", "arent", "cannot", "cant", "couldnt", "darent",
              "didnt", "doesnt", "ain't", "aren't", "can't", "couldn't",
              "daren't", "didn't", "doesn't", "dont", "hadnt", "hasnt",
              "havent", "isnt", "mightnt", "mustnt", "neither", "don't",
              "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't",
              "neednt", "needn't", "never", "none", "nope", "nor", "not",
              "nothing", "nowhere", "no", "oughtnt", "shant", "shouldnt",
              "uhuh", "wasnt", "werent", "oughtn't", "shan't", "shouldn't",
              "uh-uh", "wasn't", "weren't", "without", "wont", "wouldnt",
              "won't", "wouldn't", "rarely", "seldom", "despite"]
    stopwords = STOP_WORDS.copy()
    for word in STOP_WORDS:
        if word in NEGATE:
            stopwords.remove(word)
    return stopwords
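# Quick sanity check (illustrative, not from the original source): negation cues
# should survive the filtering while ordinary stop words are still removed.
custom_stops = prepare_stopwords()
assert "not" not in custom_stops and "never" not in custom_stops
assert "the" in custom_stops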
def get_baseline():
    print('Spacy:', len(STOP_WORDS))
    sw_sk = set(stop_words.ENGLISH_STOP_WORDS)
    print('sklearn', len(sw_sk))
    sw = set(
        pd.read_csv(
            'http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words',
            header=None,
            squeeze=True).tolist())
    print('web', len(sw))
    all_stopwords = STOP_WORDS.union(sw).union(sw_sk)
    print('all', len(all_stopwords))
    pd.Series(sorted(list(all_stopwords))).to_csv('baseline.csv', index=False)
import numpy as np
# from sklearn.externals.six import StringIO
# from sklearn.tree import export_graphviz
# import pydotplus
# from IPython.display import Image
from spellchecker import SpellChecker
import pickle

# using SMOTE to deal with class imbalance
# from imblearn.over_sampling import SMOTE

# these symbols seem to get higher weight in the final words when Naive Bayes is used,
# so they are added to the punctuation set to be filtered out
punctuations = string.punctuation + "".join(
    ["...", "..........", "....", "--", "/"])

nlp = spacy.load("en_core_web_sm")
STOP_WORDS = STOP_WORDS.union(CUSTOM_STOP_WORDS)
# excluding NO from stopwords for our use
# STOP_WORDS.discard("no")
# STOP_WORDS.discard("not")
# STOP_WORDS.discard("off")

Urban_vocab = pd.read_csv("urbandict-word-def.csv")
Urban_vocab = Urban_vocab["WORD"].tolist()

# contraction_log = open("1_contractions.log", "w")
# slang_log = open("1_slang.log", "w")
out_vocab = open("1_o_vocab.log", "w")
corpus_vocab = open("1_corpus_vocab.log", "w")

parser = English()
p.set_options(p.OPT.EMOJI, p.OPT.URL, p.OPT.SMILEY, p.OPT.NUMBER,
def set_stopwords(self, stopwords):
    """Set stop words."""
    for word in STOP_WORDS.union(set(stopwords)):
        lexeme = nlp.vocab[word]
        lexeme.is_stop = True
def queryTokens(cadena, languages):
    cadena = __preprocessString(cadena)
    # remove non-ascii characters
    cadena = ''.join(i for i in cadena if ord(i) < 128)
    cadena = cadena.strip()  # remove leading and trailing spaces
    word_tokens = word_tokenize(cadena)
    # Detect in which language the text is written
    lang = detect_language(word_tokens, languages)
    stop_words = set(stopwords.words(lang))
    # Filtering stop words
    inverters = set([
        'dont', 'doesnt', 'havent', 'arent', 'didnt', 'wasnt', 'werent',
        'not', 'never', 'hardly', 'seldom'
    ])
    incrementers = set(['too', 'many', 'much', 'very', 'lots'])
    for extra in ('im', 'pm', 'ai', 'ie', 'still', 'cant', 'isnt', 'couldnt',
                  'youre', 'seen', 'say', 'says', 'tell', 'lot', 'lol', 'hes',
                  's', 'be'):
        STOP_WORDS.add(extra)
    filtered_sentence = [
        w for w in word_tokens
        if w not in stop_words and w not in inverters
        and w not in incrementers and w not in STOP_WORDS
    ]  # Checking not in stop_words
    return filtered_sentence
import os
import re
from unidecode import unidecode
import numpy as np
import json
import sys
import logging
from numpy.linalg import norm
from gensim.test.utils import datapath
from gensim.models.fasttext import load_facebook_model
from spacy.lang.en.stop_words import STOP_WORDS

STOP_WORDS.add('de_l_la_le_di')

logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s [%(levelname)-5.5s] %(message)s",
                    handlers=[logging.StreamHandler(sys.stdout)])
logger = logging.getLogger()


class MemoryGenerator():
    def __init__(self, dataset, conv2kg, kgs, fasttext_emb_path):
        logger.info("Initializing Memory Generator ....")
        self.conv2kg = conv2kg
        self.kgs = kgs
        self.mapping = json.load(open("data/" + dataset + "/ERmapping.json"))
        self.maxEntity, self.maxRel = self.read_dataset(dataset)
        logger.info("MaxENT: " + str(self.maxEntity) + " maxREL: " +
                    str(self.maxRel))
        self.matrix_dim = self.maxEntity + self.maxRel
        self.word_emb = load_facebook_model(
            datapath(os.getcwd() + "/" + fasttext_emb_path))
def __init__(self,
             num_distinct_documents=5000,
             replace_entities=True,
             max_term_length=127,
             remove_stopwords=True,
             custom_stopwords=[
                 ',', '.', '-', '\xa0', '“', '”', '"', '\n', '—', ':', '?',
                 'I', '(', ')'
             ],
             analyze=False,
             document_table_name="documents",
             sentence_table_name="sentences",
             sentence_fields=OrderedDict({
                 "doc_id": "document_id",
                 "sen_id": "sentence_id",
                 "content": "sentence_text"
             }),
             term_table_name="terms",
             term_sql_format=("term_id", "term_text", "is_entity"),
             term_occurrence_table_name="term_occurrence",
             term_occurrence_sql_format=("document_id", "sentence_id", "term_id"),
             entity_table_name="entities",
             entity_sql_format=("entity_id", "entity_type"),
             database="postgres",
             user="******",
             password="******",
             host="127.0.0.1",
             port=5435,
             log_file=os.path.join(os.path.dirname(__file__),
                                   "logs/TermGenerator.log"),
             log_level=logging.INFO,
             log_verbose=True):
    """
    Initializes various parameters, registers logger and MongoConnector, and sets up the limit.

    :param num_distinct_documents: (int) The number of distinct documents retrieved from the queries.
           For performance reasons, this should be limited during debugging/development.
           0 (zero) represents no limit, in accordance with the MongoDB standard for .limit().
    :param replace_entities: (boolean) Whether or not the entities in the text should be replaced/recognised.
           The reason for this is that single terms might be merged into one term, e.g. first and last name:
           "Dennis" "Aumiller" would be two separate terms in the traditional splitting (replace_entities=False),
           whereas - if set to True - "Dennis Aumiller" would represent only one entity.
    :param max_term_length: (int) Indicator of how long the terms are supposed to be (varchar property in table).
    :param remove_stopwords: (boolean) Determines whether or not stop words are removed. Currently, we are still
           deciding on the final set, but likely either one (or both) of the NLTK and spaCy stop word lists.
    :param custom_stopwords: (list of strings) Additional words that will not be considered at adding-time.
    :param analyze: (boolean) Whether or not to include analytically relevant metrics.
    :param document_table_name: (str) Name of the table where the document information is stored.
    :param sentence_table_name: (str) Name of the table where the sentence information will be stored.
    :param sentence_fields: (OrderedDict) Structure of input to output values from MongoDB to Postgres
           for the sentence table and its fields.
    :param term_table_name: (str) Name of the Postgres table for the terms.
    :param term_sql_format: (tuple) Since those are generated locally, only a tuple of the Postgres columns suffices.
    :param term_occurrence_table_name: (str) Name of the Postgres table for the term occurrences.
    :param term_occurrence_sql_format: (tuple) Same as term_sql_format, but for the term occurrences.
    :param entity_table_name: (str) (Not implemented yet) Name of the table for the entity meta information.
    :param entity_sql_format: (str) Same as term_sql_format, but for entities.
    :param database: (str) Database name.
    :param user: (str) User name to get access to the Postgres database.
    :param password: (str) Corresponding user password.
    :param host: (IP) IP address (in string format) for the host of the Postgres database.
    :param port: (integer) Port at which to access the database.
""" # set up logger self.logger = set_up_logger(__name__, log_file, log_level, log_verbose) self.logger.info("Successfully registered logger to TermGenerator.") # register a MongoConnector self.mc = MongoConnector() self.logger.info( "Successfully registered MongoConnector to TermGenerator.") # PostgresConnector self.pc = PostgresConnector(database, user, password, host, port) self.logger.info( "Successfully registered PostgresConnector to DocumentGenerator.") self.num_distinct_documents = num_distinct_documents # do this earlier since we need it already for the distinct documents. self.document_table_name = document_tabe_name # get the distinct IDs for the documents so we can match against them later # since we have removed parts of the document collection, we have to make sure to get this from Postgres. self.logger.info("Parsing relevant documents from Postgres...") with self.pc as open_pc: open_pc.cursor.execute("SELECT document_id FROM {}".format( self.document_table_name)) self.first_distinct_documents = list(open_pc.cursor.fetchall()) # extract from the tuple structure self.first_distinct_documents = [ el[0] for el in self.first_distinct_documents ] self.logger.info("Retrieved all relevant documents from Postgres.") # additionally restrict if we want only a number of documents. if self.num_distinct_documents != 0: self.logger.info( "Non-zero limit detected. Limiting to the first N entries.") self.first_distinct_documents = self.first_distinct_documents[:self . num_distinct_documents] self.replace_entities = replace_entities self.analyze = analyze self.max_term_length = max_term_length self.nlp = spacy.load("en") # construct dictionary with the entries per document/sentence id pair. Thus, we can later check whether # there are any entities in the current sentence with higher efficiency. self.occurrence_dict = {} self.occurring_entities = [] # start building the term dictionary/set, as well as an occurence map. Since terms will be "post-processed", # it is first created as a list and later cast to Counter and set. self.terms = [] # cast into a set later on. 
    self.term_in_sentence = set()
    self.term_id = {}
    self.term_is_entity = {}
    if self.analyze:
        self.term_count = Counter()
        self.entity_count = Counter()
    self.entities = []
    self.sentences = []
    self.processed_sentences = []

    # Postgres tables
    if not sentence_fields:
        self.logger.error("No sentence fields specified!")
    self.sentence_table_name = sentence_table_name
    self.sentence_fields = sentence_fields
    if not term_sql_format:
        self.logger.error("No term fields specified!")
    self.term_table_name = term_table_name
    self.term_sql_format = ", ".join(term_sql_format)
    if not term_occurrence_sql_format:
        self.logger.error("No term occurrence fields specified!")
    self.term_occurrence_table_name = term_occurrence_table_name
    self.term_occurrence_sql_format = ", ".join(term_occurrence_sql_format)
    if not entity_sql_format:
        self.logger.error("No entity fields specified!")
    self.entity_table_name = entity_table_name
    self.entity_sql_format = ", ".join(entity_sql_format)

    # value retrieving parse:
    self.sentence_values_to_retrieve = {
        key: 1 for key in self.sentence_fields.keys()
    }
    # suppress _id if not present:
    if "_id" not in self.sentence_values_to_retrieve.keys():
        self.sentence_values_to_retrieve["_id"] = 0
    self.sentence_sql_format = ", ".join(
        [value for value in self.sentence_fields.values()])

    # create union of stop words, and add potentially custom stop words
    self.remove_stopwords = remove_stopwords
    self.removed_counter = 0
    self.stopwords = STOP_WORDS.union(set(stopwords.words("english")))
    # add custom stopwords.
    for word in custom_stopwords:
        self.stopwords.add(word)

    self.logger.info("Successfully initialized TermGenerator.")
import hashlib
import string

import spacy
from pytorch_pretrained_bert import BertTokenizer
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner', 'tagger'])
STOP_WORDS.update(string.punctuation)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def bert_tokenization_length(context, question, reference, candidate):
    context_len = len(tokenizer.tokenize(context))
    question_len = len(tokenizer.tokenize(question))
    candidate_len = len(tokenizer.tokenize(candidate))
    reference_len = len(tokenizer.tokenize(reference))
    return max(context_len + question_len + candidate_len,
               context_len + question_len + reference_len)


def check_data_and_return_hash(context, question, reference, candidate):
    assert type(context) == type(question) == type(reference) == type(
        candidate) == str
    if context == '' or question == '' or reference == '' or candidate == '':
        return None
    sample = context + question + reference + candidate
    hash_object = hashlib.md5(sample.encode())
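# Hedged usage sketch (the strings below are made up); the first call downloads the
# 'bert-base-uncased' vocabulary if it is not already cached locally.
length = bert_tokenization_length(
    context="The cat sat on the mat in the kitchen.",
    question="Where did the cat sit?",
    reference="on the mat",
    candidate="in the kitchen")
print(length)  # wordpiece count of the longer of the two (context, question, answer) triples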
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
import re
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import os
import json
from pathlib import Path

nlp = spacy.load('en_core_web_lg')

with open('stopwords.txt', 'r', encoding='utf-8') as f:
    STOPWORDS = f.readlines()
STOPWORDS = set([item.strip(string.whitespace) for item in STOPWORDS])
STOP_WORDS = STOP_WORDS.union(STOPWORDS)

# encodings: map common mojibake/escape sequences back to plain characters
replace_dict = {
    '\ufb01': 'fi',
    '\u2019': '',
    '\u00e9': 'e',
    '\u00a8': '',
    'ямБ': 'fi',
}

documents = []  # [ [token, token, token], [token, token, token], ...]

fp = '../data/LRECjson/'
# for jsonfile in ['../data/LRECjson/2018_1049.json']:
for jsonfile in os.listdir(Path(fp)):
Windows: python -m spacy download en  (run as Administrator)
Linux:   sudo python -m spacy download en
"""

nlp = spacy.load('en')

"""# Exploring spaCy"""

from spacy.lang.en.stop_words import STOP_WORDS

STOP_WORDS

f'There are {len(STOP_WORDS)} stopwords in spaCy'

# You can add your own corpus-specific STOPWORDS using the .add syntax
STOP_WORDS.add("your_additional_stop_word_here")

f'After adding your own stop words, spaCy will use {len(STOP_WORDS)} stopwords'

doc = nlp("I am learning the most important Natural Language Processing ideas using Python")
print(doc)
# doc is a spaCy object which stores the entire document string

"""**About spaCy objects**"""

for token in doc:
    print(token)

simplified_doc = [token for token in doc if not token.is_punct | token.is_stop]
simplified_doc

# please note that the .orth_ attribute returns the unicode string representation of the token

"""We can also check what other things we know about these tags in the simplified_doc:"""
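# A small continuation sketch (an assumption; the original excerpt ends at the heading above):
# inspecting a few token attributes for the filtered tokens.
for token in simplified_doc:
    print(token.orth_, token.lemma_, token.pos_, token.tag_, token.is_stop)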
import pickle
import spacy
import re
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load('en', disable=['parser'])

CLASSIFIER_ROOT = 'classifiers/'
TRANSFORMERS = ['transform_bag_of_words_0.sav', 'transform_bag_of_words_1.sav']
MODELS = ['nb.sav']

STOP_WORDS.add("'s")
for word in STOP_WORDS:
    lexeme = nlp.vocab[word]
    lexeme.is_stop = True


def load_model(model_name):
    with open('{0}{1}'.format(CLASSIFIER_ROOT, model_name), 'rb') as f:
        model = pickle.load(f)
    return model


CLF_NB = load_model(MODELS[0])
TRANSFORMERS_MODELS = [load_model(TRANSFORMERS[0]), load_model(TRANSFORMERS[1])]


def clean_html(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext.lower()