def regnltk_tokenize(text):
    lemmatizer = WordNetLemmatizer()
    text_clean = clean_text(text)
    words = regexp_tokenize(text_clean, pattern=r'\s+', gaps=True)
    return [
        lemmatizer.lemmatize(word)
        for word in words
        if len(word) >= 3
    ]
def format_sentence(self, sent):
    """Tokenize a sentence and return it in the feature-dict format that works
    with nltk.NaiveBayesClassifier."""
    stop_words = set(stopwords.words('english'))
    return {
        word: True
        for word in regexp_tokenize(sent, pattern=r'\w+')
        if word not in stop_words
    }
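# Usage sketch (not from the original source): the {word: True} dicts returned by
# format_sentence are the feature-set shape that nltk.NaiveBayesClassifier.train
# expects; `analyzer` below is a hypothetical instance of the class defining it.
#
#   import nltk
#   train_set = [(analyzer.format_sentence("great acting and a clever plot"), "pos"),
#                (analyzer.format_sentence("dull, predictable and far too long"), "neg")]
#   classifier = nltk.NaiveBayesClassifier.train(train_set)
#   print(classifier.classify(analyzer.format_sentence("clever plot")))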
def sample_preprocess(
        sample: str, header: str) -> Tuple[SampleLabelled, SampleLabelledTokenised]:
    """Pre-process a sample and return it in structures that are easy to pass to other methods.

    Parameters
    ----------
    sample : str
        The sample you want to process
    header : str
        The label you want to apply to the sample
    """
    # Clean the sample
    temp = sample.lower()
    # Remove all occurrences of square brackets and everything in between
    temp = re.sub(r"\[.*?\]", "", temp)
    # Focus on words; disregard numbers, punctuation, etc.
    temp_tokens = regexp_tokenize(temp, r"[a-zA-Z]+")
    # Remove stop words
    temp_tokens = [word for word in temp_tokens if word not in stopwords]
    # Create labelled sample
    sample_labelled = (' '.join(temp_tokens), header)
    # Create labelled, tokenised sample
    sample_labelled_tokenised = (temp_tokens, header)
    return sample_labelled, sample_labelled_tokenised
def analyse(text):
    global NUM_SECS
    global NUM_SENTENCES
    global NUM_WORDS
    global NUM_STOP_WORD
    NUM_SECS += len(text.split("\n\n"))
    text = re.sub('@|#|:|\n|-|’', ' ', text)
    tokens = regexp_tokenize(text.lower().replace("'", " "),
                             pattern=r'\w+|\$[\d\.]+|\S+')
    terms = set()
    lexemes = set()
    for token in tokens:
        if token in punctuation or token == " ":
            if token in ['!', '.', '?']:
                NUM_SENTENCES += 1
            continue  # ignore punctuation
        elif token in stop:
            NUM_WORDS += 1
            NUM_STOP_WORD += 1
        else:
            NUM_WORDS += 1
            terms.add(token)
            lexemes.add(lemmatizer.lemmatize(token))
def count_words(text):
    global num_words
    # Replace punctuation-like characters (including zero-width spaces) with spaces
    text = re.sub(r",|/|\u200b|‘|—|<|>|@|#|:|\n|\"|\[|\]|\(|\)|-|“|”|’|'|\*", " ", text)
    for token in regexp_tokenize(text.lower(), pattern=r'\w+|\$[\d\.]+|\S+'):
        if token in punctuation or token == " ":
            continue  # ignore punctuation
        elif token in stop:
            num_words += 1
        else:
            num_words += 1
def __getitem__(self, idx):
    x = regexp_tokenize(self.x[idx], pattern=r"\s+", gaps=True)
    y = self.y_dict[self.y[idx]]
    x = [self.questions.index(word) for word in x]
    # Truncate or pad to a fixed length
    if len(x) > self.max_len:
        x = x[:self.max_len]
    else:
        x = x + [self.questions.index("padding")] * (self.max_len - len(x))
    return torch.tensor(x), torch.tensor(y)
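# Usage sketch (assumption, not from the original source): __getitem__ above follows the
# torch.utils.data.Dataset protocol, so a hypothetical, already-constructed `dataset`
# instance could be batched like this:
#
#   from torch.utils.data import DataLoader
#   loader = DataLoader(dataset, batch_size=32, shuffle=True)
#   for batch_x, batch_y in loader:
#       pass  # batch_x has shape (batch_size, max_len); batch_y holds the label indices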
def tokenize_text(text, clean=False):
    """
    text: text to tokenize
    output: list of words
    """
    if clean:
        text = clean_text(text)
    words = regexp_tokenize(text, pattern=r"\s+", gaps=True)
    return words
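# Usage sketch (the sample string is made up; assumes regexp_tokenize is imported as in
# the snippets above). Because gaps=True splits on whitespace, punctuation stays attached:
#   tokenize_text("Whitespace splitting keeps punctuation attached, like this!")
#   -> ['Whitespace', 'splitting', 'keeps', 'punctuation', 'attached,', 'like', 'this!']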
def Tokenize(s):
    global hash_stem
    words = regexp_tokenize(s, pattern=r"[a-z]+|[0-9]+")
    ret = []
    for word in words:
        if 2 < len(word) < 41 and word not in cachedStopWords:
            word = word.strip()
            # Cache stems so each distinct word is only stemmed once
            if word not in hash_stem:
                hash_stem[word] = stemmer.stem(word)
            ret.append(hash_stem[word])
    # if len(hash_stem) == 100000000:
    #     print(len(hash_stem))
    #     hash_stem = {}
    return ret
def process_data(fileName):
    category = fileName.replace('.txt', '').replace('data/', '')
    data = open(fileName, 'r').read().split("\n|||\n")
    data_original = []
    data_tokenized = []
    for text in data:
        text = text.lower()
        # Remove square brackets and everything in between
        text = re.sub(r"\[.*?\]", "", text)
        text_tokens = regexp_tokenize(text, r"[a-zA-Z]+")
        text_tokens_ns = [
            word for word in text_tokens if word not in stopwords
        ]
        data_original.append((' '.join(text_tokens_ns), category))
        data_tokenized.append((text_tokens_ns, category))
    return data, data_original, data_tokenized
def normalize_section_nltk_fast(text):
    offset = 1  # position value
    doc_lexemes = {}
    doc_offsets = defaultdict(list)
    # Replace punctuation-like characters (including zero-width spaces) with spaces
    text = re.sub(r",|/|\u200b|‘|—|<|>|@|#|:|\n|\"|\[|\]|\(|\)|-|“|”|’|'|\*", " ", text)
    for token in regexp_tokenize(text.lower(), pattern=r'\w+|\$[\d\.]+|\S+'):
        if token in punctuation or token == " ":
            continue  # ignore punctuation
        elif token in stop:
            offset += 1  # increase offset but don't save
        else:
            doc_lexemes[token] = lemmatizer.lemmatize(token)
            doc_offsets[token].append(offset)
            offset += 1
def custom_tokenizer(self, text, tokenizer='nltk_regexp'):
    text = self.clean_text(text)
    if tokenizer == 'nltk':
        tokens = word_tokenize(text)
    elif tokenizer == 'gensim':
        tokens = gensim.utils.simple_preprocess(str(text), deacc=True)
    else:
        tokens = regexp_tokenize(text, pattern=r'\s+', gaps=True)
    # Truncate over-long tokens in place
    for i, token in enumerate(tokens):
        if len(token) > 512:
            print('TOKEN > 512', token)
            tokens[i] = token[:512]
    return tokens
def process(self, item):
    """Process item - Reduce inflectional forms to a common base form.

    Args:
        item (dict): item

    Returns:
        dict: Returns the updated item
    """
    try:
        self._log.debug("Lemmatizer Step")
        text = regexp_tokenize(item["data"], pattern=r"\s+", gaps=True)
        text = " ".join([self._lemmatizer.lemmatize(w) for w in text])
        item["data"] = text
    except Exception as e:
        self._log.error("Error with lemmatizer on item id:{} - {}".format(
            item["id"], e))
    return item
def process(self, item):
    """Process item - Tokenize text into tokens.

    Args:
        item (dict): item

    Returns:
        dict: Returns the updated item
    """
    try:
        self._log.debug("NLTK Regex Tokenize Step")
        text = regexp_tokenize(item["data"], pattern=r"\s+", gaps=True)
        item["data"] = text
    except Exception as e:
        self._log.error(
            "Error in NLTK Regex Tokenize from item id:{} - {}".format(
                item["id"], e))
    return item
def normalize_doc(doc):
    paragraphs = doc.split("\n\n")
    secs = []
    for para in paragraphs:
        words = set()
        # Replace punctuation-like characters (including zero-width spaces) with spaces
        para = re.sub(
            r",|/|\u200b|‘|—|<|>|@|#|:|\n|\"|\[|\]|\(|\)|-|“|”|’|'|\*",
            " ", para)
        for token in regexp_tokenize(para.lower(), pattern=r'\w+|\$[\d\.]+|\S+'):
            if token in punctuation or token == " ":
                continue  # ignore punctuation
            elif token in stop:
                pass
            else:
                words.add(token)
        secs.append(words)
    return secs
def _generate_phrases(self, sentences):
    """Method to generate contender phrases given the sentences of the text
    document.

    :param sentences: List of strings where each string represents a sentence
                      which forms the text.
    :return: Set of string tuples where each tuple is a collection of words
             forming a contender phrase.
    """
    phrase_list = set()
    # Create contender phrases from sentences.
    for sentence in sentences:
        word_list = [
            word.lower()
            for word in regexp_tokenize(sentence, pattern=r'\w+|\$[\d\.]+')
        ]
        print(word_list)
        phrase_list.update(self._get_phrase_list_from_words(word_list))
    return phrase_list
def process(self, item):
    """Process item - Expand Contractions step.

    Args:
        item (dict): item

    Returns:
        dict: Returns the updated item
    """
    try:
        self._log.debug("Expand Contractions Step")
        text = regexp_tokenize(item["data"], pattern=r"\s+", gaps=True)  # noqa
        for index, word in enumerate(text):
            if CONTRACTIONS.get(word):
                text[index] = CONTRACTIONS[word]
        item["data"] = " ".join(text)
    except Exception as e:
        self._log.error("Error with expand contractions on item id:{} - {}".format(
            item["id"], e))
    return item
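# Minimal sketch of the lookup the step above relies on (the mapping here is a made-up
# stand-in for the real CONTRACTIONS table, which is not shown in this snippet):
_demo_contractions = {"don't": "do not", "can't": "cannot", "it's": "it is"}
_demo_tokens = regexp_tokenize("don't stop if it's working", pattern=r"\s+", gaps=True)
print(" ".join(_demo_contractions.get(tok, tok) for tok in _demo_tokens))
# -> "do not stop if it is working"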
def normalize_section_nltk_pos_tag(text):
    offset = 1  # position value
    doc_lexemes = {}
    doc_offsets = defaultdict(list)
    # Replace punctuation-like characters (including zero-width spaces) with spaces
    text = re.sub(r",|/|\u200b|‘|—|<|>|@|#|:|\n|\"|\[|\]|\(|\)|-|“|”|’|'|\*", " ", text)
    for token, tag in pos_tag(regexp_tokenize(text.lower(),
                                              pattern=r'\w+|\$[\d\.]+|\S+')):
        if token in punctuation or token == " ":
            continue  # ignore punctuation
        elif token in stop:
            offset += 1  # increase offset but don't save
        else:
            # Map the Penn Treebank tag to a WordNet POS for the lemmatizer
            tag = tag[0].lower()
            tag = tag if tag in ['a', 'r', 'n', 'v'] else None
            if not tag:
                lemma = token
            else:
                lemma = lemmatizer.lemmatize(token, tag)
            doc_lexemes[token] = lemma
            doc_offsets[token].append(offset)
            offset += 1
    num_words = offset - 1
    return doc_lexemes, doc_offsets, num_words
def nltk_regexp_tokenize(raw_corpus):
    # regular expression pattern includes punctuation as separate tokens
    re_pattern = r'\w+|\$[\d\.]+|\S+'
    return [regexp_tokenize(doc, re_pattern) for doc in raw_corpus]
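# Usage sketch (the two-document corpus is made-up sample data, not from the original):
sample_corpus = ["The fee is $12.40, due today!", "No fee at all."]
print(nltk_regexp_tokenize(sample_corpus))
# -> [['The', 'fee', 'is', '$12.40', ',', 'due', 'today', '!'],
#     ['No', 'fee', 'at', 'all', '.']]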
import re

from nltk.corpus import stopwords
from nltk.tokenize.regexp import regexp_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# https://towardsdatascience.com/multi-class-text-classification-model-comparison-and-selection-5eb066197568

# List of stopwords
all_stopwords = set(stopwords.words('english'))

pos_data = open("allergies.txt", "r").read().split("\n|||\n")
pos_document = []
pos_p = []
for text in pos_data:
    text = text.lower()
    text = re.sub(r"\[.*?\]", "", text)
    text_tokens = regexp_tokenize(text, r"[a-zA-Z]+")
    text_tokens_ns = [word for word in text_tokens if word not in all_stopwords]
    pos_p.append((' '.join(text_tokens_ns), 'allergies'))
    pos_document.append((text_tokens_ns, 'allergies'))

neg_data = open("social_history.txt", "r").read().split("\n|||\n")
neg_document = []
neg_p = []
for text in neg_data:
    text = text.lower()
    text = re.sub(r"\[.*?\]", "", text)
    text_tokens = regexp_tokenize(text, r"[a-zA-Z]+")
    text_tokens_ns = [word for word in text_tokens if word not in all_stopwords]
    neg_p.append((' '.join(text_tokens_ns), 'social_history'))
    neg_document.append((text_tokens_ns, 'social_history'))
def renltk_tokenize(text):
    # gaps=True treats the pattern as a separator, so this splits on whitespace
    return regexp_tokenize(text, pattern=r'\s', gaps=True)
#!/usr/bin/python
# coding=utf-8
# -*- encoding: utf-8 -*-
# Program that separates the letters of a word with spaces
# sys.setdefaultencoding('utf-8')

import sys
from nltk.tokenize.regexp import regexp_tokenize
from sys import stdin
from sys import stderr
from codecs import getreader
from codecs import getwriter

stdin = getreader('utf-8')(stdin)
sys.stdout = getwriter('utf-8')(sys.stdout)
stderr = getwriter('utf-8')(stderr)

s = stdin.read()
tokens = regexp_tokenize(s, r'\w+|\$[\d\.]+')
for token in tokens:
    # Python 2 print statement: write each token's characters separated by spaces
    print " ".join(regexp_tokenize(token, r'\w')) + '.'
import sys
import re

from nltk.probability import FreqDist
from nltk.tokenize.regexp import regexp_tokenize

lines = sys.stdin.readlines()
N = int(lines[0])
data = lines[1:]

for d in data:
    if d.strip() == "":
        continue
    # Count occurrences of the articles "a", "an" and "the"
    sent = FreqDist(word.lower()
                    for word in regexp_tokenize(d, pattern=r'[a-zA-Z0-9]+'))
    print(sent["a"])
    print(sent["an"])
    print(sent["the"])

    separator = r'[ \\/,.-]+'
    datePattern = r'\d{1,2}(?:st|nd|rd|th)*'
    monthPattern = r'(?:\d\d|jan|feb|mar|apr|may|jun|july|aug|sep|oct|nov|dec|january|february|march|april|may|june|july|august|september|october|november|december)+'
    yearPattern = r'\d{2,4}'

    pattern = datePattern + separator + monthPattern + separator + yearPattern
    # print(re.findall(pattern, d, re.IGNORECASE))
    dates = len(re.findall(pattern, d, re.IGNORECASE))
    pattern = monthPattern + separator + datePattern + separator + yearPattern