from nltk import word_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer


class WordTokenizer(object):
    def __init__(self, stemmer='porter'):
        self.stemmer = stemmer
        if stemmer == 'wordnet':
            self.wnl = WordNetLemmatizer()
        if stemmer == 'porter':
            self.wnl = PorterStemmer()
        if stemmer == 'snowball':
            self.wnl = SnowballStemmer('english')

    def __call__(self, doc):
        if self.stemmer == 'wordnet':
            return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
        else:
            return [self.wnl.stem(t) for t in word_tokenize(doc)]
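# Usage sketch: the callable plugs directly into scikit-learn vectorizers
# (assumes nltk's punkt and wordnet resources are downloaded; scikit-learn is
# only needed for this example).
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(tokenizer=WordTokenizer(stemmer='snowball'))
X = vectorizer.fit_transform(["The cats are running", "A cat ran quickly"])
print(vectorizer.get_feature_names_out())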
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from pymystem3 import Mystem

# `expand_language` and `remove_empty_items` are project-level helpers
# (expand_language presumably maps a code such as "en" to "english" for SnowballStemmer).


class Normalizer:
    def __init__(self, norm: str, language: str):
        self.norm = norm
        self.language = language
        if self.norm == "lemmatization":
            # PyMystem3 does not support English;
            # NLTK's WordNetLemmatizer does not support Russian.
            if self.language == "ru":
                self.alg = Mystem()
            elif self.language == "en":
                self.alg = WordNetLemmatizer()
        # Stemming
        elif self.norm == "stemming":
            self.alg = SnowballStemmer(expand_language(self.language))
        else:
            raise ValueError(
                "{} is not supported. "
                "Available options: 'lemmatization', 'stemming'".format(self.norm))

    def normalize(self, text: str, return_list=False):
        res = None
        token_list = None
        # Lemmatization
        if self.norm == "lemmatization":
            # PyMystem3 does not support English;
            # NLTK's WordNetLemmatizer does not support Russian.
            if self.language == "ru":
                token_list = self.alg.lemmatize(text)
            elif self.language == "en":
                token_list = list(map(self.alg.lemmatize, text.split()))
        # Stemming
        elif self.norm == "stemming":
            token_list = list(map(self.alg.stem, text.split()))
        # Choose the result format
        if not return_list:
            res = " ".join(remove_empty_items(token_list))
        else:
            res = token_list
        return res
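# Usage sketch for the class above (assumes pymystem3 and the NLTK WordNet data are
# installed and that expand_language/remove_empty_items are in scope).
ru_lemmas = Normalizer(norm="lemmatization", language="ru")
print(ru_lemmas.normalize("мама мыла раму"))

en_stems = Normalizer(norm="stemming", language="en")
print(en_stems.normalize("the cats were running", return_list=True))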
class Lemmatizer(PreprocessingStep):
    """
    spacy lemma file precalculation example:
    Lemmatizer().precalculate_spacy_english_lemmatizer([Etour(), Itrust()])
    """
    COLUMN_LEMMA = "lemma"

    class LemmatizerType(Enum):
        english_nltk = 1
        english_spacy = 2
        italian_nltk = 3  # actually a stemmer; nltk does not have an Italian lemmatizer
        italian_spacy = 4

    def __init__(self, lemmatizer_type=LemmatizerType.english_nltk):
        self._lemmatizer_type = lemmatizer_type
        self._lemmatizer = None
        if lemmatizer_type == self.LemmatizerType.english_nltk:
            self._lemmatizer = WordNetLemmatizer()
        elif lemmatizer_type == self.LemmatizerType.english_spacy:
            # Use precalculated files for spacy since free Google Colab can't handle
            # the fasttext model and the spacy lemmatizer at once
            if not FileUtil.file_exists(PRECALCULATED_SPACY_ENGLISH_LEMMA_CSV):
                log.error(
                    f"{PRECALCULATED_SPACY_ENGLISH_LEMMA_CSV} does not exist. "
                    f"The spacy lemmatizer needs a precalculated lemma file.")
            self._lemmatizer = PandasUtil.read_csv_to_dataframe(
                PRECALCULATED_SPACY_ENGLISH_LEMMA_CSV)
        elif lemmatizer_type == self.LemmatizerType.italian_nltk:
            self._lemmatizer = SnowballStemmer("italian")
        elif lemmatizer_type == self.LemmatizerType.italian_spacy:
            # Use precalculated files for spacy since free Google Colab can't handle
            # the fasttext model and the spacy lemmatizer at once
            if not FileUtil.file_exists(PRECALCULATED_SPACY_ITALIAN_LEMMA_CSV):
                log.error(
                    f"{PRECALCULATED_SPACY_ITALIAN_LEMMA_CSV} does not exist. "
                    f"The spacy lemmatizer needs a precalculated lemma file.")
            self._lemmatizer = PandasUtil.read_csv_to_dataframe(
                PRECALCULATED_SPACY_ITALIAN_LEMMA_CSV)
        else:
            log.error(f"Unknown case for LemmatizerType: {lemmatizer_type}")

    def execute(self, text_tokens):
        if self._lemmatizer_type == self.LemmatizerType.english_nltk:
            return [self._lemmatizer.lemmatize(token) for token in text_tokens]
        elif self._lemmatizer_type in (self.LemmatizerType.english_spacy,
                                       self.LemmatizerType.italian_spacy):
            return [
                self._lemmatizer.at[token, self.COLUMN_LEMMA]
                if token in self._lemmatizer.index else token
                for token in text_tokens
            ]
        if self._lemmatizer_type == self.LemmatizerType.italian_nltk:
            return [self._lemmatizer.stem(token) for token in text_tokens]

    @classmethod
    def _precalculate_spacy_lemmatizer(cls, spacy_lemmatizer, datasets, output_path):
        dataset_tuples = []
        for dataset in datasets:
            req_tokenizer = WordTokenizer(dataset, not dataset.is_english())
            req_pre = Preprocessor([
                UrlRemover(), Separator(), NonLetterFilter(),
                CamelCaseSplitter(), LowerCaseTransformer()
            ])
            code_tokenizer = JavaCodeASTTokenizer(
                dataset, WordTokenizer(dataset, not dataset.is_english()))
            code_pre = Preprocessor([
                UrlRemover(), Separator(), NonLetterFilter(),
                CamelCaseSplitter(),
                JavaCodeStopWordRemover(not dataset.is_english()),
                LowerCaseTransformer()
            ])
            dataset_tuples.append(
                (dataset, code_pre, code_tokenizer, req_pre, req_tokenizer))

        word_to_lemma_map = {}

        def iterate_files(tokenizer, preprocessor, folder):
            for file in FileUtil.get_files_in_directory(folder, True):
                file_representation = tokenizer.tokenize(file)
                file_representation.preprocess(preprocessor)
                for word in file_representation.token_list:
                    lemma = [token.lemma_ for token in spacy_lemmatizer(word)]
                    if len(lemma) > 1:
                        log.info(
                            f"More than one lemma {lemma} for \"{word}\". "
                            f"Using \"{''.join(lemma)}\" as lemma")
                    lemma = "".join(lemma)
                    if word in word_to_lemma_map:
                        if not word_to_lemma_map[word] == lemma:
                            log.info(
                                f"Different duplicate lemma for {word}: "
                                f"{word_to_lemma_map[word]} <-> {lemma}")
                    else:
                        word_to_lemma_map[word] = lemma

        for dataset, code_pre, code_tok, req_pre, req_tok in dataset_tuples:
            iterate_files(req_tok, req_pre, dataset.req_folder())
            iterate_files(code_tok, code_pre, dataset.code_folder())

        word_to_lemma_dataframe = pandas.DataFrame.from_dict(
            word_to_lemma_map, orient="index", columns=[cls.COLUMN_LEMMA])
        PandasUtil.write_dataframe_to_csv(word_to_lemma_dataframe, output_path)

    @classmethod
    def precalculate_spacy_english_lemmatizer(cls, datasets):
        # we only need the lemmatizer component, so 'ner' and 'parser' are disabled
        cls._precalculate_spacy_lemmatizer(
            en_core_web_trf.load(disable=['ner', 'parser']), datasets,
            PRECALCULATED_SPACY_ENGLISH_LEMMA_CSV)

    @classmethod
    def precalculate_spacy_italian_lemmatizer(cls, datasets):
        cls._precalculate_spacy_lemmatizer(
            it_core_news_lg.load(disable=['ner', 'parser']), datasets,
            PRECALCULATED_SPACY_ITALIAN_LEMMA_CSV)
class NLP():
    def __init__(self,
                 remove_stopwords=True,
                 replace_words=True,
                 remove_numbers=True,
                 remove_html_tags=True,
                 remove_punctuations=True,
                 lemmatize=False,
                 lemmatize_method='wordnet'):
        """
        This package contains functions that can help during the preprocessing of text data.
        :param remove_stopwords: boolean, default value = True
        :param replace_words: boolean, default value = True
        """
        if (type(remove_stopwords) != bool or type(replace_words) != bool
                or type(remove_numbers) != bool or type(remove_html_tags) != bool
                or type(remove_punctuations) != bool or type(lemmatize) != bool):
            raise Exception("Error - expecting a boolean parameter")
        if lemmatize_method not in ['wordnet', 'snowball']:
            raise Exception("Error - lemmatizer method not supported")
        self.doc = None
        self.lemmatizer = None
        self.remove_stopwords = remove_stopwords
        self.replace_words = replace_words
        self.remove_numbers = remove_numbers
        self.remove_html_tags = remove_html_tags
        self.remove_punctations = remove_punctuations
        self.lemmatize_method = lemmatize_method
        self.lemmatize = lemmatize
        self.stopword_list = set(stopwords)
        self.replacement_list = to_replace
        if self.lemmatize_method == 'wordnet':
            self.lemmatizer = WordNetLemmatizer()
        if self.lemmatize_method == 'snowball':
            self.lemmatizer = SnowballStemmer('english')

    def remove_stopwords_fun(self):
        """
        Removes stopwords from the doc by tokenizing it and dropping every
        token that appears in the stopword list.
        """
        tokens = str(self.doc).split()
        cleaned_tokens = [token for token in tokens
                          if token.lower() not in self.stopword_list]
        self.doc = ' '.join(cleaned_tokens)

    def replace_words_fun(self):
        """
        Replaces words: if a word is present in the replacement dictionary,
        it is replaced with its value from that dictionary.
        """
        cleaned_doc = []
        for word in str(self.doc).split():
            if word.lower() in self.replacement_list.keys():
                cleaned_doc.append(self.replacement_list[word.lower()])
            else:
                cleaned_doc.append(word)
        self.doc = ' '.join(cleaned_doc)

    def remove_numbers_fun(self):
        """Uses a regex to remove all digits from the doc."""
        self.doc = re.sub("[0-9]", "", self.doc)

    def remove_html_tags_fun(self):
        """Uses a compiled regex to remove all HTML tags from the doc."""
        cleaner = re.compile('<.*?>')
        cleaned_text = re.sub(cleaner, '', self.doc)
        cleaned_text = re.sub('[\n\t]', '', cleaned_text)
        self.doc = cleaned_text

    def remove_punctations_fun(self):
        """Uses a regex to remove all punctuation from the doc."""
        self.doc = re.sub('[^a-zA-Z0-9]', ' ', self.doc)

    def lemmatize_fun(self):
        """
        Applies lemmatization (WordNetLemmatizer) or stemming (SnowballStemmer)
        to the words.
        Example: NLP(lemmatize=True, lemmatize_method='snowball'); default = 'wordnet'
        """
        tokens = str(self.doc).split()
        if self.lemmatize_method == 'wordnet':
            cleaned_tokens = [self.lemmatizer.lemmatize(token) for token in tokens]
        else:
            cleaned_tokens = [self.lemmatizer.stem(token) for token in tokens]
        self.doc = ' '.join(cleaned_tokens)

    def add_stopword(self, words):
        """
        Adds new stopwords to the predefined list.
        Parameters - ["new_stopword"]
        ------------------------------
        Example:
            obj = NLP()
            obj.add_stopword(["first_word", "second_word"])
        """
        if self.remove_stopwords is False:
            raise Exception("Please enable removal of stopwords")
        if type(words) != list:
            raise Exception("Error - pass stopwords in list")
        for word in words:
            self.stopword_list.add(word)

    def add_replacement(self, replacements):
        """
        Adds new replacement words to the predefined list.
        Parameters - {"word": "replacement"}
        ----------------------------
        Example:
            obj = NLP()
            obj.add_replacement({"first": "replacement1", "second": "replacement2"})
        """
        if self.replace_words is False:
            raise Exception("Please enable replacement of words")
        if type(replacements) != dict:
            raise Exception("Error - pass input parameters in dict")
        if replacements == {}:
            raise Exception("Error - dict is empty")
        for key, value in replacements.items():
            self.replacement_list[key] = value

    def remove_stopword(self, words):
        """
        Removes stopwords from the predefined list.
        Parameters - ["first_word"]
        ------------------------------
        Example:
            obj = NLP()
            obj.remove_stopword(['new_stopword_here'])
        """
        if self.remove_stopwords is False:
            raise Exception("Error - enable stopword removal functionality")
        if type(words) != list:
            raise Exception("Error - expected a list")
        if words == []:
            raise Exception("Error - no items to remove from stopword list")
        for word in words:
            if word in self.stopword_list:
                self.stopword_list.remove(word)
            else:
                raise Exception(word + " not in list")

    def print_stopwords(self):
        """
        Prints all the stopwords that are present in the list.
        ------------------------------
        Example:
            obj = NLP()
            obj.print_stopwords()
        """
        if not self.stopword_list:
            raise Exception("Error - stopword list is empty")
        print(self.stopword_list)

    def process(self, doc):
        """
        Processes the doc:
        if the replace_words flag is True, words are replaced from the mapping;
        if the remove_stopwords flag is True, stopwords are removed;
        the remaining enabled steps (HTML tags, numbers, punctuation,
        lemmatization) are applied in order.
        Parameters - doc (a string)
        ------------------------------
        Example:
            obj = NLP()
            obj.process("process this text")
        How to use with pandas?
            obj = NLP()
            df = df['text'].apply(obj.process)
        """
        self.doc = doc
        if self.replace_words is True:
            self.replace_words_fun()
        if self.remove_html_tags is True:
            self.remove_html_tags_fun()
        if self.remove_stopwords is True:
            self.remove_stopwords_fun()
        if self.remove_numbers is True:
            self.remove_numbers_fun()
        if self.remove_punctations is True:
            self.remove_punctations_fun()
        if self.lemmatize is True:
            self.lemmatize_fun()
        return self.doc
class PreProcessor():
    def __init__(self,
                 file_path=None,
                 doc_link=None,
                 folder_link=None,
                 remove_stopwords=True,
                 lower=True,
                 tokenize_word=True,
                 contraction_method='mapping',
                 remove_numbers=True,
                 remove_html_tags=True,
                 remove_punctuations=True,
                 remove_accented_chars=True,
                 remove_whitespace=True,
                 lemmatize_method='wordnet',
                 embedding_method='word2vec',
                 auto_correct=True):
        """
        This package contains functions that can help during the preprocessing of text data.
        :param remove_stopwords: boolean, default value = True
        :param replace_words: str, default value = regex
        """
        if (type(remove_stopwords) != bool or type(lower) != bool
                or type(tokenize_word) != bool or
                # type(tokenize_sent) != bool or
                type(remove_numbers) != bool or type(remove_html_tags) != bool
                or type(remove_punctuations) != bool
                or type(remove_accented_chars) != bool
                or type(auto_correct) != bool or type(remove_whitespace) != bool):
            raise Exception("Error - expecting a boolean parameter")
        if lemmatize_method not in ['wordnet', 'snowball']:
            raise Exception("Error - lemmatizer method not supported")
        else:
            self.lemmatize = True
        if contraction_method not in ['glove', 'word2vec', 'mapping']:
            raise Exception("Error - contraction method not supported")
        else:
            self.contractions = True
        if embedding_method not in ['glove', 'word2vec', 'bow']:
            raise Exception("Error - embedding method not supported")
        else:
            self.word_embedding = True
        if file_path is None and doc_link is None and folder_link is None:
            raise Exception("Error - expecting the file path")
        self.doc = None
        self.sents = None
        self.tweets = None
        self.lemmatizer = None
        self.file_path = file_path
        self.doc_link = doc_link
        self.folder_link = folder_link
        self.lower = lower
        self.remove_stopwords = remove_stopwords
        self.contraction_method = contraction_method
        self.embedding_method = embedding_method
        self.remove_numbers = remove_numbers
        self.remove_html_tags = remove_html_tags
        self.remove_punctations = remove_punctuations
        self.remove_accented_chars = remove_accented_chars
        self.remove_whitespace = remove_whitespace
        self.lemmatize_method = lemmatize_method
        self.stopword_list = stopwords.words('english')
        self.replacement_list = to_replace
        self.tokenize_word = tokenize_word
        # self.tokenize_sent = tokenize_sent
        self.auto_correct = auto_correct
        if self.lemmatize_method == 'wordnet':
            self.lemmatizer = WordNetLemmatizer()
        if self.lemmatize_method == 'snowball':
            self.lemmatizer = SnowballStemmer('english')

    def file_reader(self):
        file_content = prepare_text(self.file_path, dolower=False)
        return file_content

    def doc_downloader(self, document_link, document_type, document_name):
        # Extract the document ID from the given link
        pattern = r"(?<=d/)(.+)(?=/)"
        DOCUMENT_ID = re.findall(pattern, document_link)[0]
        print(f"DOCUMENT ID: {DOCUMENT_ID}")
        # Specify the format in which the document will be downloaded
        if document_type.lower() in ['docx', "doc"]:
            file_format = "docx"
        elif document_type.lower() in ['pdf']:
            file_format = "pdf"
        else:
            print("Document format not supported. Only docx, doc and pdf are supported")
            return None
        creds = None
        if os.path.exists(token_file):
            with open(token_file, 'rb') as token:
                creds = pickle.load(token)
        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            else:
                flow = InstalledAppFlow.from_client_secrets_file(
                    credentials_json, SCOPES)
                creds = flow.run_local_server(port=0)
            with open(token_file, 'wb') as token:
                pickle.dump(creds, token)
        service = build('drive', 'v3', credentials=creds)
        file_name = '.'.join([document_name, file_format])
        try:
            print("Downloading file")
            request = service.files().get_media(fileId=DOCUMENT_ID)
            fh = io.BytesIO()
            downloader = MediaIoBaseDownload(fd=fh, request=request)
            done = False
            while done is False:
                status, done = downloader.next_chunk()
                print(f"Download {status.progress() * 100}")
        except:
            print("Downloading MS Word Document file")
            request = service.files().export_media(
                fileId=DOCUMENT_ID,
                mimeType='application/vnd.openxmlformats-officedocument.wordprocessingml.document')
            fh = io.BytesIO()
            downloader = MediaIoBaseDownload(fd=fh, request=request)
            done = False
            while done is False:
                status, done = downloader.next_chunk()
                print(f"Download {status.progress() * 100}")
        fh.seek(0)
        with open(os.path.join(file_storage, file_name), 'wb') as f:
            f.write(fh.read())
            f.close()
        print("SAVED")

    def folder_downloader(self, folder_link):
        # Extract the folder ID from the given link
        pattern = r'(?<=folders/)(\w+)'
        DOCUMENT_ID = re.findall(pattern, folder_link)[0]
        print(f"DOCUMENT ID: {DOCUMENT_ID}")
        creds = None
        if os.path.exists(token_file):
            with open(token_file, 'rb') as token:
                creds = pickle.load(token)
        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            else:
                flow = InstalledAppFlow.from_client_secrets_file(
                    credentials_json, SCOPES)
                creds = flow.run_local_server(port=0)
            with open(token_file, 'wb') as token:
                pickle.dump(creds, token)
        service = build('drive', 'v3', credentials=creds)
        listofFiles = []
        page_token = None
        # docx_query = f"'{DOCUMENT_ID}' in parents and mimeType='application/vnd.openxmlformats-officedocument.wordprocessingml.document'"
        # pdf_query = f"'{DOCUMENT_ID}' in parents and mimeType='application/pdf'"
        # txt_query = f"'{DOCUMENT_ID}' in parents and mimeType='text/plain'"
        query = f"'{DOCUMENT_ID}' in parents"
        while True:
            response = service.files().list(
                q=query,
                fields='nextPageToken, files(id, name)',
                pageToken=page_token,
                includeItemsFromAllDrives=True,
                supportsAllDrives=True).execute()
            for file in response.get('files', []):
                listofFiles.append(file)
            page_token = response.get('nextPageToken', None)
            if page_token is None:
                break
        for item in listofFiles:
            document_id = item['id']
            file_name = item['name']
            name_splitted = file_name.split(".")
            if len(name_splitted) == 1:
                file_name = '.'.join([file_name, "docx"])
            try:
                print("Downloading docx file")
                print(file_name)
                request = service.files().get_media(fileId=document_id)
                fh = io.BytesIO()
                downloader = MediaIoBaseDownload(fd=fh, request=request)
                done = False
                while done is False:
                    status, done = downloader.next_chunk()
                    print(f"Download {status.progress() * 100}")
            except:
                print("Downloading doc file")
                print(file_name)
                request = service.files().export_media(
                    fileId=document_id,
                    mimeType='application/vnd.openxmlformats-officedocument.wordprocessingml.document')
                fh = io.BytesIO()
                downloader = MediaIoBaseDownload(fd=fh, request=request)
                done = False
                while done is False:
                    status, done = downloader.next_chunk()
                    print(f"Download {status.progress() * 100}")
            fh.seek(0)
            with open(file_storage + '/' + file_name, 'wb') as f:
                f.write(fh.read())
                f.close()

    def lower_fun(self):
        """Converts the text to lower case."""
        self.doc = self.doc.lower()

    def remove_stopwords_fun(self):
        """
        Removes stopwords from the doc by tokenizing it and dropping every
        token that appears in the stopword list.
        """
        # tokens = str(self.doc).split()
        tokens = word_tokenize(self.doc)
        cleaned_tokens = [
            token for token in tokens if token.lower() not in self.stopword_list
        ]
        self.doc = ' '.join(cleaned_tokens)

    def word_embedding_fun(self):
        # if (self.tokenize_sent == False):
        #     self.doc = sent_tokenize(self.doc)
        if (self.tokenize_word == False):
            self.tokenize_word_fun()
        if self.embedding_method == 'glove':
            model = api.load("glove-twitter-25")
            vecs = []
            for x in self.doc:
                vec = [model[i] for i in x]
                vecs.append(vec)
            self.doc = vecs
            # print(vecs)
        elif self.embedding_method == 'word2vec':
            pass
        elif self.embedding_method == 'bow':
            pass

    def mapping_decontraction(self, phrase):
        cleaned_doc = []
        for word in str(self.doc).split():
            if word.lower() in self.replacement_list.keys():
                cleaned_doc.append(self.replacement_list[word.lower()])
            else:
                cleaned_doc.append(word)
        phrase = ' '.join(cleaned_doc)
        return phrase

    def contractions_fun(self):
        """
        Expands contractions: if a word is present in the replacement dictionary,
        it is replaced with its value from that dictionary.
        """
        if self.contraction_method == 'mapping':
            self.doc = self.mapping_decontraction(str(self.doc))
        elif self.contraction_method == 'word2vec':
            model = pretrained_model
            cont = Contractions(model)
            cont.load_models()
            self.doc = list(cont.expand_texts([str(self.doc)], precise=True))[0]
        elif self.contraction_method == 'glove':
            model = api.load("glove-twitter-25")
            cont = Contractions(kv_model=model)
            cont.load_models()
            self.doc = list(cont.expand_texts([str(self.doc)], precise=True))[0]

    def remove_numbers_fun(self):
        """Uses a regex to remove all digits from the doc."""
        self.doc = re.sub("[0-9]", "", self.doc)
        self.doc = self.doc.strip()
        self.doc = " ".join(self.doc.split())

    def autocorrect_fun(self):
        spell = Speller(lang='en')
        self.doc = [spell(w) for w in word_tokenize(self.doc)]

    def remove_html_tags_fun(self):
        """Uses a compiled regex to remove all HTML tags from the doc."""
        cleaner = re.compile('<.*?>')
        cleaned_text = re.sub(cleaner, '', self.doc)
        cleaned_text = re.sub('[\n\t]', '', cleaned_text)
        self.doc = cleaned_text.strip()
        self.doc = " ".join(self.doc.split())

    def remove_punctations_fun(self):
        """Uses a regex to remove all punctuation from the doc."""
        self.doc = re.sub('[^a-zA-Z0-9]', ' ', self.doc)
        self.doc = self.doc.strip()
        self.doc = " ".join(self.doc.split())

    def remove_accented_chars_fun(self):
        """Removes accented characters from the text, e.g. café -> cafe."""
        self.doc = unidecode.unidecode(self.doc)

    def remove_whitespace_fun(self):
        """Removes extra whitespace from the text."""
        text = self.doc.strip()
        self.doc = " ".join(text.split())

    def tokenize_word_fun(self):
        """Tokenizes the sentences into words."""
        self.doc = word_tokenize(self.doc)

    # def tokenize_sent_fun(self):
    #     """Tokenizes the paragraphs into sentences."""
    #     self.sents = sent_tokenize(self.doc)

    def lemmatize_fun(self):
        """
        Applies lemmatization (WordNetLemmatizer) or stemming (SnowballStemmer)
        to the tokens.
        Example: lemmatize_method='snowball'; default value = 'wordnet'
        """
        # expects self.doc to already be a list of tokens (e.g. after autocorrect_fun)
        cleaned_tokens = None
        if self.lemmatize_method == 'wordnet':
            cleaned_tokens = [self.lemmatizer.lemmatize(token) for token in self.doc]
        elif self.lemmatize_method == 'snowball':
            cleaned_tokens = [self.lemmatizer.stem(token) for token in self.doc]
        self.doc = ' '.join(cleaned_tokens)

    def add_stopword(self, words):
        """
        Adds new stopwords to the predefined list.
        Parameters - ["new_stopword"]
        ------------------------------
        Example:
            obj.add_stopword(["first_word", "second_word"])
        """
        if self.remove_stopwords is False:
            raise Exception("Please enable removal of stopwords")
        if type(words) != list:
            raise Exception("Error - pass stopwords in list")
        for word in words:
            self.stopword_list.append(word)

    def print_stopwords(self):
        """
        Prints all the stopwords that are present in the list.
        ------------------------------
        Example:
            obj.print_stopwords()
        """
        if self.stopword_list == []:
            raise Exception("Error - stopword list is empty")
        print(self.stopword_list)

    def process(self):
        """
        Reads the configured source (file path, document link or folder link),
        splits it into sentences, and applies every enabled step (lower-casing,
        contraction expansion, HTML/number/punctuation removal, accent folding,
        stopword removal, whitespace cleanup, autocorrect, lemmatization,
        tokenization and word embedding) to each sentence.
        ------------------------------
        Example:
            obj = PreProcessor(file_path="...")
            output = obj.process()
        """
        if self.file_path is not None:
            data = self.file_reader()
        if self.doc_link is not None:
            self.doc_downloader(self.doc_link, "docx", "testing_document")
            path = file_storage + '/testing_document.docx'
            data = prepare_text(path, dolower=False)
        if self.folder_link is not None:
            self.folder_downloader(self.folder_link)
            data = 'test'
        output = []
        self.sents = sent_tokenize(data)
        for doc in self.sents:
            self.doc = doc
            if self.lower is True:
                self.lower_fun()
            if self.contractions is True:
                self.contractions_fun()
            if self.remove_html_tags is True:
                self.remove_html_tags_fun()
            if self.remove_numbers is True:
                self.remove_numbers_fun()
            if self.remove_punctations is True:
                self.remove_punctations_fun()
            if self.remove_accented_chars is True:
                self.remove_accented_chars_fun()
            if self.remove_stopwords is True:
                self.remove_stopwords_fun()
            if self.remove_whitespace is True:
                self.remove_whitespace_fun()
            if self.auto_correct is True:
                self.autocorrect_fun()
            if self.lemmatize is True:
                self.lemmatize_fun()
            if self.tokenize_word is True:
                self.tokenize_word_fun()
            if self.word_embedding is True:
                self.word_embedding_fun()
            output.append(self.doc)
        return output
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer


def stem_tokenize(text):
    # Despite the name, this tokenizer lemmatizes with WordNet rather than stemming.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in word_tokenize(text)]
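# Usage sketch wiring the tokenizer into scikit-learn (assumes scikit-learn is
# available and nltk's punkt and wordnet data are downloaded).
from sklearn.feature_extraction.text import CountVectorizer

bow = CountVectorizer(tokenizer=stem_tokenize)
X = bow.fit_transform(["the cats were running", "a cat runs"])
print(bow.get_feature_names_out())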
class Lemmatizer(PreprocessingStep):
    COLUMN_LEMMA = "lemma"

    class LemmatizerType(Enum):
        english_nltk = 1
        english_spacy = 2
        italian_nltk = 3  # actually a stemmer; nltk does not have an Italian lemmatizer
        italian_spacy = 4

    def __init__(self, lemmatizer_type=LemmatizerType.english_nltk):
        self._lemmatizer_type = lemmatizer_type
        self._lemmatizer = None
        if lemmatizer_type == self.LemmatizerType.english_nltk:
            self._lemmatizer = WordNetLemmatizer()
        elif lemmatizer_type == self.LemmatizerType.english_spacy:
            # Use precalculated files for spacy since Google Colab can't handle
            # the fasttext model and the spacy lemmatizer at once
            self._lemmatizer = FileUtil.read_csv_to_dataframe(
                Paths.PRECALCULATED_SPACY_ENGLISH_LEMMA_CSV)
        elif lemmatizer_type == self.LemmatizerType.italian_nltk:
            self._lemmatizer = SnowballStemmer("italian")
        elif lemmatizer_type == self.LemmatizerType.italian_spacy:
            # Use precalculated files for spacy since Google Colab can't handle
            # the fasttext model and the spacy lemmatizer at once
            self._lemmatizer = FileUtil.read_csv_to_dataframe(
                Paths.PRECALCULATED_SPACY_ITALIAN_LEMMA_CSV)
        else:
            log.error(f"Unknown case for LemmatizerType: {lemmatizer_type}")

    def execute(self, text_tokens, file_name, javadoc):
        if self._lemmatizer_type == self.LemmatizerType.english_nltk:
            return [self._lemmatizer.lemmatize(token) for token in text_tokens]
        elif self._lemmatizer_type in (self.LemmatizerType.english_spacy,
                                       self.LemmatizerType.italian_spacy):
            return [
                self._lemmatizer.at[token, self.COLUMN_LEMMA]
                if token in self._lemmatizer.index else token
                for token in text_tokens
            ]
        if self._lemmatizer_type == self.LemmatizerType.italian_nltk:
            return [self._lemmatizer.stem(token) for token in text_tokens]

    @classmethod
    def _precalculate_spacy_lemmatizer(cls, spacy_lemmatizer, dataset_tuple, output_path):
        word_to_lemma_map = {}

        def iterate_files(tokenizer, preprocessor, folder):
            for file in FileUtil.get_files_in_directory(folder, True):
                file_representation = tokenizer.tokenize(file)
                file_representation.preprocess(preprocessor)
                for word in file_representation.token_list:
                    lemma = [token.lemma_ for token in spacy_lemmatizer(word)]
                    if len(lemma) > 1:
                        log.info(
                            f"More than one lemma {lemma} for \"{word}\". "
                            f"Using \"{''.join(lemma)}\" as lemma")
                    lemma = "".join(lemma)
                    if word in word_to_lemma_map:
                        if not word_to_lemma_map[word] == lemma:
                            log.info(
                                f"Different duplicate lemma for {word}: "
                                f"{word_to_lemma_map[word]} <-> {lemma}")
                    else:
                        word_to_lemma_map[word] = lemma

        for dataset, code_pre, code_tok, req_pre, req_tok in dataset_tuple:
            iterate_files(req_tok, req_pre, dataset.req_folder())
            iterate_files(code_tok, code_pre, dataset.code_folder())

        word_to_lemma_dataframe = pandas.DataFrame.from_dict(
            word_to_lemma_map, orient="index", columns=[cls.COLUMN_LEMMA])
        FileUtil.write_dataframe_to_csv(word_to_lemma_dataframe, output_path)

    @classmethod
    def precalculate_spacy_english_lemmatizer(cls, dataset_tuple):
        # we only need the lemmatizer component, so 'ner' and 'parser' are disabled
        cls._precalculate_spacy_lemmatizer(
            en_core_web_trf.load(disable=['ner', 'parser']), dataset_tuple,
            Paths.PRECALCULATED_SPACY_ENGLISH_LEMMA_CSV)

    @classmethod
    def precalculate_spacy_italian_lemmatizer(cls, dataset_tuple):
        cls._precalculate_spacy_lemmatizer(
            it_core_news_lg.load(disable=['ner', 'parser']), dataset_tuple,
            Paths.PRECALCULATED_SPACY_ITALIAN_LEMMA_CSV)
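# Usage sketch for the NLTK branch of execute() above (the file_name and javadoc
# arguments are unused by that branch, so placeholder values are passed here;
# assumes the NLTK wordnet data is downloaded).
lemmatizer = Lemmatizer(Lemmatizer.LemmatizerType.english_nltk)
print(lemmatizer.execute(["cats", "churches", "running"], file_name=None, javadoc=None))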
class NLP():
    def __init__(self,
                 remove_stopwords=True,
                 replace_words=True,
                 remove_numbers=True,
                 remove_html_tags=True,
                 remove_punctuations=True,
                 lemmatize=False,
                 lemmatize_method='wordnet'):
        """
        This package contains functions that can help during the preprocessing of text data.
        :param remove_stopwords: boolean, default value = True
        :param replace_words: boolean, default value = True
        """
        if (type(remove_stopwords) != bool or type(replace_words) != bool
                or type(remove_numbers) != bool or type(remove_html_tags) != bool
                or type(remove_punctuations) != bool or type(lemmatize) != bool):
            raise Exception("Error - expecting a boolean parameter")
        if lemmatize_method not in ['wordnet', 'snowball']:
            raise Exception("Error - lemmatizer method not supported")
        self.doc = None
        self.tweets = None
        self.lemmatizer = None
        self.remove_stopwords = remove_stopwords
        self.replace_words = replace_words
        self.remove_numbers = remove_numbers
        self.remove_html_tags = remove_html_tags
        self.remove_punctations = remove_punctuations
        self.lemmatize_method = lemmatize_method
        self.lemmatize = lemmatize
        self.stopword_list = set(stopwords)
        self.replacement_list = to_replace
        if self.lemmatize_method == 'wordnet':
            self.lemmatizer = WordNetLemmatizer()
        if self.lemmatize_method == 'snowball':
            self.lemmatizer = SnowballStemmer('english')

    def remove_stopwords_fun(self):
        """
        Removes stopwords from the doc by tokenizing it and dropping every
        token that appears in the stopword list.
        """
        tokens = str(self.doc).split()
        cleaned_tokens = [
            token for token in tokens if token.lower() not in self.stopword_list
        ]
        self.doc = ' '.join(cleaned_tokens)

    def replace_words_fun(self):
        """
        Replaces words: if a word is present in the replacement dictionary,
        it is replaced with its value from that dictionary.
        """
        cleaned_doc = []
        for word in str(self.doc).split():
            if word.lower() in self.replacement_list.keys():
                cleaned_doc.append(self.replacement_list[word.lower()])
            else:
                cleaned_doc.append(word)
        self.doc = ' '.join(cleaned_doc)

    def remove_numbers_fun(self):
        """Uses a regex to remove all digits from the doc."""
        self.doc = re.sub("[0-9]", "", self.doc)

    def remove_html_tags_fun(self):
        """Uses a compiled regex to remove all HTML tags from the doc."""
        cleaner = re.compile('<.*?>')
        cleaned_text = re.sub(cleaner, '', self.doc)
        cleaned_text = re.sub('[\n\t]', '', cleaned_text)
        self.doc = cleaned_text

    def remove_punctations_fun(self):
        """Uses a regex to remove all punctuation from the doc."""
        self.doc = re.sub('[^a-zA-Z0-9]', ' ', self.doc)

    def lemmatize_fun(self):
        """
        Applies lemmatization (WordNetLemmatizer) or stemming (SnowballStemmer)
        to the words.
        Example: NLP(lemmatize=True, lemmatize_method='snowball'); default = 'wordnet'
        """
        tokens = str(self.doc).split()
        if self.lemmatize_method == 'wordnet':
            cleaned_tokens = [self.lemmatizer.lemmatize(token) for token in tokens]
        else:
            cleaned_tokens = [self.lemmatizer.stem(token) for token in tokens]
        self.doc = ' '.join(cleaned_tokens)

    def add_stopword(self, words):
        """
        Adds new stopwords to the predefined list.
        Parameters - ["new_stopword"]
        ------------------------------
        Example:
            obj = NLP()
            obj.add_stopword(["first_word", "second_word"])
        """
        if self.remove_stopwords is False:
            raise Exception("Please enable removal of stopwords")
        if type(words) != list:
            raise Exception("Error - pass stopwords in list")
        for word in words:
            self.stopword_list.add(word)

    def add_replacement(self, replacements):
        """
        Adds new replacement words to the predefined list.
        Parameters - {"word": "replacement"}
        ----------------------------
        Example:
            obj = NLP()
            obj.add_replacement({"first": "replacement1", "second": "replacement2"})
        """
        if self.replace_words is False:
            raise Exception("Please enable replacement of words")
        if type(replacements) != dict:
            raise Exception("Error - pass input parameters in dict")
        if replacements == {}:
            raise Exception("Error - dict is empty")
        for key, value in replacements.items():
            self.replacement_list[key] = value

    def remove_stopword(self, words):
        """
        Removes stopwords from the predefined list.
        Parameters - ["first_word"]
        ------------------------------
        Example:
            obj = NLP()
            obj.remove_stopword(['new_stopword_here'])
        """
        if self.remove_stopwords is False:
            raise Exception("Error - enable stopword removal functionality")
        if type(words) != list:
            raise Exception("Error - expected a list")
        if words == []:
            raise Exception("Error - no items to remove from stopword list")
        for word in words:
            if word in self.stopword_list:
                self.stopword_list.remove(word)
            else:
                raise Exception(word + " not in list")

    def print_stopwords(self):
        """
        Prints all the stopwords that are present in the list.
        ------------------------------
        Example:
            obj = NLP()
            obj.print_stopwords()
        """
        if not self.stopword_list:
            raise Exception("Error - stopword list is empty")
        print(self.stopword_list)

    def process(self, doc):
        """
        Processes the doc:
        if the replace_words flag is True, words are replaced from the mapping;
        if the remove_stopwords flag is True, stopwords are removed;
        the remaining enabled steps (HTML tags, numbers, punctuation,
        lemmatization) are applied in order.
        Parameters - doc (a string)
        ------------------------------
        Example:
            obj = NLP()
            obj.process("process this text")
        How to use with pandas?
            obj = NLP()
            df = df['text'].apply(obj.process)
        """
        self.doc = doc
        if self.replace_words is True:
            self.replace_words_fun()
        if self.remove_html_tags is True:
            self.remove_html_tags_fun()
        if self.remove_stopwords is True:
            self.remove_stopwords_fun()
        if self.remove_numbers is True:
            self.remove_numbers_fun()
        if self.remove_punctations is True:
            self.remove_punctations_fun()
        if self.lemmatize is True:
            self.lemmatize_fun()
        return self.doc

    def processTweet(self, tweets):
        """
        Expects tweets to be a pandas Series.
        Example use-case: tweets = obj.processTweet(tweets)
        ______________________________________
        • Lower-casing
        • Normalizing URLs
        • Normalizing tags and email addresses
        • Normalizing numbers
        • Normalizing dollars
        • Normalizing punctuation
        • Expansion of contractions
        • Removal of punctuation
        • Word stemming (Porter stemmer)
        """
        self.tweets = tweets
        # Lower case text
        tweets = tweets.str.lower()
        # Account tag, e.g. @theFakeDonaldTrump
        tweets = tweets.str.replace(r"@[^\s]+", 'idaddr', regex=True)
        # Email address
        tweets = tweets.str.replace(r"[^\s]+@[^\s]+", 'emailaddr', regex=True)
        # Handle URLs: look for strings starting with http:// or https://
        tweets = tweets.str.replace(r"(http|https)://[^\s]*", 'httpaddr', regex=True)
        # Handle numbers: look for one or more characters between 0-9
        tweets = tweets.str.replace(r"[0-9]+", 'number', regex=True)
        # Handle $ sign
        tweets = tweets.str.replace(r"[$]+", 'dollar', regex=True)
        # Normalize punctuation
        transl_table = dict([(ord(x), ord(y))
                             for x, y in zip(u"‘’´“”–-", u"'''\"\"--")])
        tweets = tweets.apply(lambda a: a.translate(transl_table))
        # Expand contractions
        tweets = tweets.apply(lambda string: " ".join([
            to_replace[i] if i in to_replace.keys() else i
            for i in string.split()
        ]))
        # Handle punctuation
        tweets = tweets.str.replace(r"[^\w]+", ' ', regex=True)
        # Stem
        stemmer = nltk.stem.PorterStemmer()
        tweets = tweets.apply(lambda a: list(map(stemmer.stem, a.split())))
        return tweets
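# Usage sketch with a small pandas Series of raw tweets (assumes the module-level
# `stopwords` and `to_replace` objects and the nltk import this class relies on
# are available).
import pandas as pd

raw = pd.Series(["@SomeUser check https://example.com - it's 100% worth $5!!"])
tokenized = NLP().processTweet(raw)  # Series of Porter-stemmed token lists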