def __normalization_dictionary(self):
    """Assemble the normalization output dictionary from instance state.

    Collects the original text, its lowercase/processed variants, the
    token/stem/lemma lists and the uni/bi/trigram lists into ``self.data``.
    ``text_without_stop_words`` is the space-joined stop-word-filtered
    tokens, or ``None`` when no such tokens exist.

    Returns:
        dict: ``self.data`` populated with every result field.
    """
    joined_without_stop_words = (
        ' '.join(self.tokens_without_stop_words)
        if self.tokens_without_stop_words else None)
    self.data.update({
        'original_text': self.text,
        'text_lowercase': self.text.lower(),
        'text_to_process': self.text_to_process,
        'text_without_stop_words': joined_without_stop_words,
        'tokens': self.tokens,
        'tokens_without_stop_words': self.tokens_without_stop_words,
        'stems': self.stems,
        'stems_without_stop_words': self.stems_without_stop_words,
        'lemmas': self.lemmas,
        'lemmas_without_stop_words': self.lemmas_without_stop_words,
        'trigram': self.trigram,
        'bigram': self.bigram,
        'unigram': self.unigram,
    })
    nlp_logger.debug('=== dictionary created ===')
    return self.data
def __set_variables_to_none(self):
    """Reset all per-call state on the instance before a new run.

    Despite the method name, only ``self.text_to_process`` is literally set
    to ``None``; every token/stem/lemma/n-gram attribute is reset to a fresh
    empty list and ``self.data`` to an empty dict.
    """
    self.text_to_process = None
    self.data = {}
    # Each list attribute gets its own fresh [] (no shared references).
    for attribute in ('tokens', 'tokens_without_stop_words',
                      'stems', 'stems_without_stop_words',
                      'lemmas', 'lemmas_without_stop_words',
                      'trigram', 'bigram', 'unigram'):
        setattr(self, attribute, [])
    nlp_logger.debug('=== variables are set to none ===')
def ngram_data(self, text, flag_punctuation_removal=True, stem_unigram=True,
               stem_bigram=True, stem_trigram=True, stop_words_unigram=True,
               stop_words_bigram=True, stop_words_trigram=True):
    """Compute n-gram information (uni/bi/trigrams) for a text.

    The text is optionally punctuation-stripped, lowercased, tokenized and
    stemmed; n-grams of sizes 1-3 are then built from the tokens/stems with
    the per-size stemming and stop-word options below.

    Args:
        text: message that needs to be processed
        flag_punctuation_removal: True to strip punctuation from the message
        stem_unigram: True to stem the unigram list
        stem_bigram: True to stem the bigram list
        stem_trigram: True to stem the trigram list
        stop_words_unigram: True to drop stop words from the unigram list
        stop_words_bigram: True to drop stop words from the bigram list
        stop_words_trigram: True to drop stop words from the trigram list

    Returns:
        dict: the normalization dictionary, e.g. for 'I am playing cricket'
        it contains keys such as 'unigram' ([u'i', u'am', u'play',
        u'cricket']), 'bigram' ([u'i am', u'am play', u'play cricket']),
        'trigram' ([u'i am play', u'am play cricket']), plus the token,
        stem and text variants (lemma lists stay empty here).
    """
    self.__set_variables_to_none()
    self.text = text
    cleaned = (self.regx_to_process.text_substitute(self.text)
               if flag_punctuation_removal else self.text)
    nlp_logger.debug('=== punctuation removal is %s ===' %
                     flag_punctuation_removal)
    self.text_to_process = cleaned.lower()
    self.tokens = self.tokenizer.tokenize(self.text_to_process)
    self.tokens_without_stop_words = filter_list(self.tokens,
                                                 self.stop_word_list)
    self.stems = self.stemmer.stem_tokens(self.tokens)
    self.stems_without_stop_words = self.stemmer.stem_tokens(
        self.tokens_without_stop_words)
    if not (self.tokens or self.stems):
        nlp_logger.debug('=== Ngram can not be set ===')
    else:
        # Table-driven: (n-gram size, stem flag, stop-word flag, attribute).
        for size, do_stem, drop_stop_words, slot in (
                (3, stem_trigram, stop_words_trigram, 'trigram'),
                (2, stem_bigram, stop_words_bigram, 'bigram'),
                (1, stem_unigram, stop_words_unigram, 'unigram')):
            setattr(self, slot,
                    self.__get_ngram(size, do_stem, drop_stop_words))
        nlp_logger.debug('=== Ngrams are set ===')
    return self.__normalization_dictionary()
def __process_text(self, regx_to_process=None):
    """Run the full pipeline (punctuation removal, tokenization, stemming,
    lemmatization, n-grams) on ``self.text``, driven by the instance flags.

    Results are stored on instance attributes (``self.tokens``,
    ``self.stems``, ``self.lemmas``, ``self.unigram`` etc.); this method
    itself returns None.

    Args:
        regx_to_process: regex object used to strip punctuation; falls back
            to ``self.regx_to_process`` when not supplied.
    """
    if not regx_to_process:
        regx_to_process = self.regx_to_process
        nlp_logger.debug(
            '=== selected default punctuation regular expression ===')
    if self.flag_punctuation_removal:
        self.text_to_process = regx_to_process.text_substitute(self.text)
    else:
        self.text_to_process = self.text
    nlp_logger.debug('=== punctuation removal is %s ===' %
                     self.flag_punctuation_removal)
    self.text_to_process = self.text_to_process.lower()
    self.tokens = self.tokenizer.tokenize(self.text_to_process)
    nlp_logger.debug('=== tokens are identified ===')
    if self.flag_tokens_without_stop_words:
        self.tokens_without_stop_words = filter_list(
            self.tokens, self.stop_word_list)
    # Debug lines below report the flag values (True/False) regardless of
    # whether the corresponding step actually ran.
    nlp_logger.debug('=== tokens without stop words is %s ===' %
                     self.flag_tokens_without_stop_words)
    if self.flag_stem and self.tokens:
        self.stems = self.stemmer.stem_tokens(self.tokens)
    nlp_logger.debug('=== stemmer is %s ===' % self.flag_stem)
    if self.flag_stem_without_stop_words:
        if self.tokens_without_stop_words:
            self.stems_without_stop_words = self.stemmer.stem_tokens(
                self.tokens_without_stop_words)
        else:
            # Stop-word-filtered tokens were not precomputed; filter now.
            self.stems_without_stop_words = self.stemmer.stem_tokens(
                filter_list(self.tokens, self.stop_word_list))
    nlp_logger.debug('=== stemmer without stop words is %s ===' %
                     self.flag_stem_without_stop_words)
    if self.flag_lemma and self.tokens:
        self.lemmas = self.lemmatizer.lemmatize_tokens(self.tokens)
    nlp_logger.debug('=== lemma is %s ===' % self.flag_lemma)
    if self.flag_lemma_without_stop_words:
        if self.tokens_without_stop_words:
            self.lemmas_without_stop_words = self.lemmatizer.lemmatize_tokens(
                self.tokens_without_stop_words)
        else:
            # Same fallback as the stemming branch above.
            self.lemmas_without_stop_words = self.lemmatizer.lemmatize_tokens(
                filter_list(self.tokens, self.stop_word_list))
    nlp_logger.debug('=== lemma without stop word is %s ===' %
                     self.flag_lemma_without_stop_words)
    # N-grams need at least some tokens or stems to work from.
    if self.stems or self.tokens:
        self.trigram = self.__get_ngram(3, self.stem_trigram,
                                        self.stop_words_trigram)
        self.bigram = self.__get_ngram(2, self.stem_bigram,
                                       self.stop_words_bigram)
        self.unigram = self.__get_ngram(1, self.stem_unigram,
                                        self.stop_words_unigram)
        nlp_logger.debug('=== Ngrams are set ===')
    else:
        nlp_logger.debug('=== Ngram can not be set ===')
def preprocess_data(self, text, flag_punctuation_removal=True,
                    flag_tokens=True, flag_stems=True, flag_lemma=True):
    """Tokenize, stem and lemmatize a text, returning the result dictionary.

    Args:
        text: message that needs to be processed
        flag_punctuation_removal: True to strip punctuation from the message
        flag_tokens: True to produce the token lists
        flag_stems: True to produce the stem lists
        flag_lemma: True to produce the lemma lists

    Returns:
        dict: the normalization dictionary, e.g. for 'I am playing cricket'
        it contains 'tokens' (['i', 'am', 'playing', 'cricket']), 'stems'
        ([u'i', u'am', u'play', u'cricket']), 'lemmas' (['i', 'am',
        'playing', 'cricket']), the stop-word-filtered variants and the
        text variants (n-gram lists stay empty here).
    """
    self.__set_variables_to_none()
    self.text = text
    cleaned = (self.regx_to_process.text_substitute(self.text)
               if flag_punctuation_removal else self.text)
    nlp_logger.debug('=== punctuation removal is %s ===' %
                     flag_punctuation_removal)
    self.text_to_process = cleaned.lower()

    def _ensure_tokens():
        # Tokenize lazily: only the first caller actually does the work.
        if not self.tokens:
            self.tokens = self.tokenizer.tokenize(self.text_to_process)
        return self.tokens

    if flag_tokens:
        _ensure_tokens()
        self.tokens_without_stop_words = filter_list(self.tokens,
                                                     self.stop_word_list)
        nlp_logger.debug('=== token is %s ===' % flag_tokens)
    if flag_stems:
        toks = _ensure_tokens()
        self.stems = self.stemmer.stem_tokens(toks)
        self.stems_without_stop_words = self.stemmer.stem_tokens(
            filter_list(toks, self.stop_word_list))
        nlp_logger.debug('=== stem is %s ===' % flag_stems)
    if flag_lemma:
        toks = _ensure_tokens()
        self.lemmas = self.lemmatizer.lemmatize_tokens(toks)
        self.lemmas_without_stop_words = self.lemmatizer.lemmatize_tokens(
            filter_list(toks, self.stop_word_list))
    return self.__normalization_dictionary()