def clean_text(self, text):
    """Clean text for sentiment analysis.

    @Author: Adarsh Koppa Manjunath
    @Parameters: text (str): text to be cleaned
    @return filtered_tokens (list): cleaned, stemmed tokens with stop words removed
    """
    try:
        # remove square brackets and their contents
        text = re.sub(r'\[[^]]*\]', '', text)
        # remove special characters (keep letters, digits and whitespace)
        pattern = r'[^a-zA-Z0-9\s]'
        text = re.sub(pattern, '', text)
        # stem the text
        ps = nltk.porter.PorterStemmer()
        text = ' '.join([ps.stem(word) for word in text.split()])
        # tokenization and stop word removal
        tokenizer = ToktokTokenizer()
        stopword_list = set(stopwords.words('english'))
        tokens = tokenizer.tokenize(text)
        tokens = [token.strip() for token in tokens]
        filtered_tokens = [
            token for token in tokens if token.lower() not in stopword_list
        ]
        return filtered_tokens
    except Exception as e:
        log.error('An exception occurred: {}'.format(e))
        log.error(traceback.format_exc())
        return "exception: failed"
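For illustration (not from the original source), a usage sketch assuming `cleaner` is an instance of the enclosing class and the NLTK stopword corpus is downloaded; the Porter stems shown are approximate:

tokens = cleaner.clean_text("The movie [2009] was absolutely wonderful!")
# roughly ['movi', 'wa', 'absolut', 'wonder'] -- stemming happens before stopword
# removal, so stems like 'wa' (from 'was') slip past the stopword list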
def tokenize(self, text, a_preserve_case=True, a_reduce_len=False, a_strip_handles=False):
    # a_preserve_case / a_reduce_len / a_strip_handles are accepted for API
    # compatibility but are not used here
    own_tokenizer = None
    tokens = []
    own_extend = tokens.extend
    if self.__token_whitespace:
        tokens = text.split(" ")
    elif self.__language == "persian":
        # Punkt has no Persian model, so fall back to ToktokTokenizer on the
        # whole text (the original iterated over `text` character by character)
        own_tokenizer = ToktokTokenizer()
        own_extend(own_tokenizer.tokenize(text))
    else:
        own_tokenizer = nltk_data.load("tokenizers/punkt/" + self.__language + ".pickle")
        sents = own_tokenizer.tokenize(text)
        for sent in sents:
            own_extend(word_tokenize(sent, language=self.__language))
    return tokens
def _tokenizer(self, x, quit_commons=True):
    """Tokenize a text string.

    Lowercases, strips special characters, and removes stopwords,
    numbers and proper names.

    Args:
        x (str): String to tokenize.
        quit_commons (bool): Whether to also remove a list of common
            words. Default: True.

    Returns:
        list: List of tokens.
    """
    toktok = ToktokTokenizer()
    common_words = []
    if quit_commons:
        common_words = commons
    x_lower = x.lower().replace("o dos", "o2")
    # build the stopword set once instead of re-reading it per token
    spanish_stopwords = set(stopwords.words('spanish'))
    tokens_not_filter = [
        unidecode(item) for item in toktok.tokenize(x_lower)
    ]
    tokens = [
        item for item in tokens_not_filter
        if item not in spanish_stopwords and item not in numwords
        and item not in common_words and item not in names and len(item) > 2
    ]
    return tokens
def remove_stopwords(text):
    '''
    text should be in lower case
    Input:  "The, and, if are stopwords, computer is not"
    Output: ", , stopwords , computer not"
    '''
    # sklearn's stop word list lives in sklearn.feature_extraction.text;
    # referencing it through the `text` parameter would fail
    from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
    stopwords_sklrn = frozenset(ENGLISH_STOP_WORDS)
    stopwords_nltk = frozenset(stopwords.words('english'))
    stopwords_wrdcld = frozenset(STOPWORDS)
    all_stopwords = frozenset(
        pd.Series(
            list(stopwords_sklrn) + list(stopwords_nltk) +
            list(stopwords_wrdcld)).unique())
    # print('# of stopwords in each lib: ', len(stopwords_sklrn), len(stopwords_nltk), len(stopwords_wrdcld))
    # print('# of stopwords when aggregated:', len(all_stopwords))

    ## Removing the negations 'no' and 'not' from the stopword list
    stopword_list = list(all_stopwords)
    excpt_stopword = ['no', 'not']
    for ele in excpt_stopword:
        stopword_list.remove(ele)

    tokenizer = ToktokTokenizer()
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    filtered_tokens = [
        token for token in tokens if token.lower() not in stopword_list
    ]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
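Because 'no' and 'not' are excluded from the aggregated stopword list, negations survive filtering, which matters for sentiment tasks; roughly:

remove_stopwords("this movie was not good")
# -> 'movie not good' ('this' and 'was' are stopwords; 'not' is preserved)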
def tokenize_sentence(sentence, lang=None, punctList=None):
    if lang is None:
        lang = 'English'
    if punctList is None:
        punctList = [';', ':', ',', '.', '...', '``', "''", '¡', '!', '¿', '?']
    if lang == 'Spanish':
        nltk.download('perluniprops')
        nltk.download('nonbreaking_prefixes')
    from nltk.tokenize.toktok import ToktokTokenizer
    toktok = ToktokTokenizer()
    if lang == 'Spanish':
        # Python 3 strings are already unicode; no .decode('utf-8') needed
        tokens = toktok.tokenize(sentence)
        words = []
        for token in tokens:
            if token not in punctList:
                words.append(token)
    if lang == 'English':
        tokens = nltk.word_tokenize(sentence)
        words = []
        for token in tokens:
            if token not in punctList:
                words.append(token)
    return words
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    toktok = ToktokTokenizer()
    text = ' '.join(
        [lemmatizer.lemmatize(word) for word in toktok.tokenize(text)])
    return text
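Note that `WordNetLemmatizer.lemmatize` assumes noun POS by default, so verbs pass through unchanged; for example:

lemmatize_text("the cats are running")
# -> 'the cat are running' ('cats' is lemmatized as a noun; 'running' is left
#    alone because no verb POS tag is supplied)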
def __init__(self, seed=42, ngram_range=(1, 3)):
    self.seed = seed
    self.init_seed()
    self.ngram_range = ngram_range
    self.vectorizer = TfidfVectorizer(ngram_range=ngram_range)
    self.clf = LinearSVC(multi_class="ovr")
    self.word_tokenizer = ToktokTokenizer()
def create_spanish_english_alignments(spa_file, eng_file, spa_trans_file):
    toktok = ToktokTokenizer()
    massalign_sentence_pairs = get_massalign_sentence_pairs(spa_trans_file, eng_file)
    '''
    To map to the original Spanish segment, you can either store the translation
    at sentence level or use Gale-Church to get sentence alignments from the documents.
    '''
    translation_sentence_pairs = sentence_align(spa_file, spa_trans_file, 0.97, 1.8)
    pairs = []
    for eng_trans, eng_org in massalign_sentence_pairs:
        eng_simple_tok_1 = toktok.tokenize(eng_trans)
        spanish = ''
        prev_spa = ''
        for spa, eng in translation_sentence_pairs:
            eng_simple_tok_2 = toktok.tokenize(eng)
            I = len(set(eng_simple_tok_2).intersection(set(eng_simple_tok_1)))
            U = len(set(eng_simple_tok_2))
            try:
                percent_overlap = float(I) / U
                if percent_overlap > 0.5 and spa != prev_spa:
                    spanish += spa
                    prev_spa = spa
                    break
            except ZeroDivisionError:
                # skip candidate sentences that tokenize to nothing
                continue
        if spanish != '':
            pairs.append([spanish, eng_org])
    return pairs
def prepareToClf(self, text):
    txt = str(text)
    # Tokenize tweets. Word splitting.
    exclusionList = [
        r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b',
        '->'
    ]
    exclusions = '|'.join(exclusionList)
    txt = re.sub(exclusions, '', ''.join(txt).rstrip(), flags=re.MULTILINE)
    toktok = ToktokTokenizer()
    tokens = toktok.tokenize(txt)
    words = [word.lower() for word in tokens]
    from stopwords_ca import get_stopwords
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE)
    # stop words taken from http://latel.upf.edu/morgana/altres/pub/ca_stop.htm
    # (we wrote our own helper for this)
    stop_words = get_stopwords()
    words = [
        emoji_pattern.sub(r'', w) for w in words if w not in stop_words
    ]  # NO EMOJI
    table = str.maketrans('', '', ''.join([string.punctuation, "’"]))
    words = [w.translate(table) for w in words]
    import unidecode
    unaccented_string = unidecode.unidecode(','.join(words))
    return self.tf_vectorizer.transform([unaccented_string]).toarray()
def vectorizerV2(raw_text, vectorWords):
    toktok = ToktokTokenizer()
    tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')
    sentences = tokenizer.tokenize(raw_text)
    vector = []
    counterCommas = 0
    counterPoints = raw_text.count(".")
    countersWordsInSentence = []
    for sentence in sentences:
        counterCommas += sentence.count(",")
        tokens = toktok.tokenize(sentence)  # tokenize each sentence only once
        countersWordsInSentence.append(len(tokens))
        for token in tokens:
            vectorWords[token] += 1
    vector.append(counterCommas)
    vector.append(counterPoints)
    sumatory = 0
    for counter in countersWordsInSentence:
        sumatory += counter
    averageWordsInSentence = sumatory / len(countersWordsInSentence)
    vector.append(averageWordsInSentence)
    vector.append(len(sentences))
    for word, count in vectorWords.items():
        vector.append(count)
    # number of commas | number of periods | average words per sentence |
    # number of sentences | occurrence count of each word of the full vocabulary in the text ...
    return np.array(vector)
def vectorizer(raw_text):
    toktok = ToktokTokenizer()
    tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')
    sentences = tokenizer.tokenize(raw_text)
    vector = []
    counterCommas = 0
    countersWordsInSentence = []
    for sentence in sentences:
        counterCommas += sentence.count(",")
        countersWordsInSentence.append(len(toktok.tokenize(sentence)))
    vector.append(counterCommas)
    sumatory = 0
    for counter in countersWordsInSentence:
        sumatory += counter
    averageWordsInSentence = sumatory / len(countersWordsInSentence)
    vector.append(averageWordsInSentence)
    vector.append(len(sentences))
    return np.array(vector)
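A hedged sketch of the feature layout `vectorizer` returns (hypothetical input; Toktok emits punctuation marks as tokens, so they count toward the per-sentence word counts):

vec = vectorizer("Hola, mundo. Esto es una prueba.")
# vec[0] -> 1    number of commas
# vec[1] -> average tokens per sentence (punctuation included)
# vec[2] -> 2    number of sentences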
def __init__(self,
             stopwords: list = None,
             ngram_range: List[int] = None,
             lemmas=False,
             lowercase: bool = None,
             alphas_only: bool = None,
             **kwargs):
    """
    :param stopwords: a set of words to skip
    :param ngram_range: range for producing ngrams, e.g. for unigrams + bigrams
        should be set to [1, 2]; for bigrams only, to [2, 2]
    :param lemmas: whether to perform lemmatizing while tokenizing; currently
        works only for the Russian language (via pymorphy2)
    :param lowercase: perform lowercasing or not
    :param alphas_only: whether to filter out numeric and alphanumeric tokens
    """
    if ngram_range is None:
        ngram_range = [1, 1]
    self._stopwords = stopwords or []
    self.tokenizer = ToktokTokenizer()
    self.lemmatizer = pymorphy2.MorphAnalyzer()
    self.ngram_range = tuple(ngram_range)  # cast JSON array to tuple
    self.lemmas = lemmas
    self.lowercase = lowercase
    self.alphas_only = alphas_only
    self.tok2morph = {}
def get_keywords(text):
    text_without_punct = re.sub('[%s]' % re.escape(string.punctuation), ' ',
                                text)
    toktok = ToktokTokenizer()
    texto_tokenized = toktok.tokenize(text_without_punct.lower())
    # build the stopword set once instead of re-reading it for every token
    spanish_stopwords = set(stopwords.words('spanish'))
    keywords = [word for word in texto_tokenized
                if word not in spanish_stopwords]
    return keywords
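An illustrative call (assuming NLTK's Spanish stopword corpus is available):

keywords = get_keywords("El perro corre por el parque.")
# ['perro', 'corre', 'parque'] -- 'el' and 'por' are Spanish stopwords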
def __init__(self, stopwords: str):
    # keep lowercase Latin/Cyrillic letters, digits, '-' and '_'
    self.rgc = re.compile('[^a-zа-яё0-9-_]')
    self.tokenizer = ToktokTokenizer()
    self.stemmer = PorterStemmer()
    self.lemmatizer = pymorphy2.MorphAnalyzer()
    with open(stopwords, 'r') as f:
        self.stopwords = set(f.read().split('\n'))
def build_word_frequency(filepath, language, output_path):
    """ Parse the passed in text file (likely from Open Subtitles) into
    a word frequency list and write it out to disk

    Args:
        filepath (str): path to the input text file
        language (str): language code of the corpus, e.g. "es"
        output_path (str): where to write the frequency list

    Returns:
        Counter: The word frequency as parsed from the file

    Note:
        This only removes words that are proper nouns (attempts to...) and
        anything that starts or stops with something that is not in the alphabet.
    """
    # NLTK is only needed in this portion of the project
    try:
        from nltk.tag import pos_tag
        from nltk.tokenize import WhitespaceTokenizer
        from nltk.tokenize.toktok import ToktokTokenizer
    except ImportError as ex:
        # Exception.message no longer exists in Python 3; format the exception itself
        raise ImportError(
            "To build a dictionary from scratch, NLTK is required!\n{}".format(ex))

    word_frequency = Counter()
    if language == "es":
        tok = ToktokTokenizer()
    else:
        tok = WhitespaceTokenizer()

    idx = 0
    with load_file(filepath, 'utf-8') as fobj:
        for line in fobj:
            # tokenize into parts
            parts = tok.tokenize(line)

            # Attempt to remove proper nouns.
            # Remove things that have leading or trailing non-alphabetic characters.
            tagged_sent = pos_tag(parts)
            words = [
                word[0].lower() for word in tagged_sent
                if word[0] and not word[1] == "NNP" and word[0][0].isalpha()
                and word[0][-1].isalpha()
            ]

            # print(words)
            if words:
                word_frequency.update(words)

            idx += 1
            if idx % 100000 == 0:
                print("completed: {} rows".format(idx))
    # end file loop
    print("completed: {} rows".format(idx))
    export_word_frequency(output_path, word_frequency)

    return word_frequency
def buscar_palabras(dataset, palabras_dataset):
    tokenizador = ToktokTokenizer()
    # use a set for O(1) membership tests
    palabras = set(tokenizador.tokenize(dataset))
    datos = {}
    for p in palabras_dataset:
        datos[p] = (p in palabras)
    return datos
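`buscar_palabras` produces a presence/absence feature dict of the kind NLTK classifiers consume; a quick example:

datos = buscar_palabras("me encanta este producto", ["encanta", "odio"])
# {'encanta': True, 'odio': False}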
def __init__(self):
    # self._no_punct_pattern = re.compile('[a-zA-Z0-9- ]')
    self._tok = ToktokTokenizer()
    # self._tok = MosesTokenizer(lang='en')
    self._stemmer = SnowballStemmer('english')
    self._lemmatizer = TreeTagger(language='english')
    self._stopwords = set(open(STOPWORDS).read().splitlines())
    # stopwords.words('french')
    # self._porter_stemmer = nltk.stem.porter.PorterStemmer()
def __init__(self, seed=42):
    self.seed = seed
    self.init_seed()
    self.is_loaded = False
    self.tokenizer = ToktokTokenizer()
    self.morph = morph
    self.count_vectorizer = CountVectorizer(ngram_range=(1, 4),
                                            tokenizer=str.split)
    self.classifier = CatBoostClassifier(verbose=0, use_best_model=True)
def __init__(self, seed=42):
    self.seed = seed
    self.init_seed()
    self.tokenizer = ToktokTokenizer()
    self.morph = pymorphy2.MorphAnalyzer()
    self.count_vectorizer = CountVectorizer(ngram_range=(1, 4),
                                            tokenizer=str.split)
    self.classifier = CatBoostClassifier(verbose=0, use_best_model=True)
    super().__init__()
def my_tokenizer(iterator):
    global max_len
    tknzr = ToktokTokenizer()
    for value in iterator:
        value = value.replace('-', " - ")
        value = value.replace('/', " / ")
        value = value.lower()
        value = tknzr.tokenize(value)
        max_len = max(max_len, len(value))
        yield value
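A sketch of how the generator is typically consumed (assuming a module-level `max_len = 0`; the exact token split is Toktok's):

max_len = 0
tokenized = list(my_tokenizer(["State-of-the-art NLP", "TCP/IP basics"]))
# [['state', '-', 'of', '-', 'the', '-', 'art', 'nlp'], ['tcp', '/', 'ip', 'basics']]
# max_len is now 8, the longest token sequence seen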
def tokenizar_dataset(frases_pos, frases_neg):
    tokens = []
    # use a set for faster stopword lookups
    palabras_vacias = set(stopwords.words('spanish'))
    tokenizador = ToktokTokenizer()
    tokens_pos = tokenizador.tokenize(frases_pos)
    tokens_neg = tokenizador.tokenize(frases_neg)
    tokens.extend([t for t in tokens_pos if t not in palabras_vacias])
    tokens.extend([t for t in tokens_neg if t not in palabras_vacias])
    return tokens
def tokenizar(fileroute):
    toktok = ToktokTokenizer()
    esTokenizadorOraciones = nltk.data.load('tokenizers/punkt/spanish.pickle')
    # read the file inside a context manager so it is closed properly
    with open(fileroute, "r") as f:
        contents = f.read()
    oraciones = esTokenizadorOraciones.tokenize(contents)
    data = []
    for oracion in oraciones:
        for t in toktok.tokenize(oracion):
            data.append(t.lower())
    return data
def remove_stopwords(text, is_lower_case=False):
    stopword_list = nltk.corpus.stopwords.words('english')
    tokenizer = ToktokTokenizer()
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens
                           if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens
                           if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
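The `is_lower_case` flag only controls whether tokens are lowercased before the stopword lookup; for instance:

remove_stopwords("The movie was not good")
# -> 'movie good' ('not' is in NLTK's English stopword list)
remove_stopwords("The movie was not good", is_lower_case=True)
# -> 'The movie good' (capitalised 'The' no longer matches the lowercase list)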
def remove_stopwords(text: str) -> str:
    tokenizer = ToktokTokenizer()
    stopword_list = nltk.corpus.stopwords.words('english')
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    filtered_tokens = [
        token for token in tokens if token not in stopword_list
    ]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
def establecer_tokenizador(self, tokenizador):
    """ Sets or replaces the tokenizer to use.

    :param tokenizador: (NLTK tokenization object). Object in charge of
        tokenizing texts. If the value is 'None', an instance of the
        *ToktokTokenizer* class from the NLTK library is loaded by default.
    """
    if tokenizador is not None:
        self.tokenizador = tokenizador
    else:
        self.tokenizador = ToktokTokenizer()
def create_data(lines, bptt):
    tokenizer = ToktokTokenizer()
    lines = [line.lower() for line in lines if len(line) > 40]
    all_text = ' \n '.join(lines)
    tokenized = tokenizer.tokenize(all_text)
    # add a + 1 since last word and first get stripped from x_text and y_text
    chunks = [
        tokenized[i:i + bptt + 1] for i in range(0, len(tokenized), bptt + 1)
    ]
    chunks = [' '.join(chunk) for chunk in chunks]
    return chunks
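Each chunk carries bptt + 1 tokens so a language model can read tokens [0:bptt] as input and [1:bptt + 1] as the shifted targets; a small sketch with a made-up line:

lines = ["some reasonably long training line with more than forty characters in it"]
chunks = create_data(lines, bptt=8)
# every chunk except possibly the last is a string of 9 space-joined tokens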
def tokenize(string):
    '''
    This function takes in a string and returns a tokenized string.
    '''
    # Create tokenizer.
    tokenizer = ToktokTokenizer()
    # Use tokenizer
    string = tokenizer.tokenize(string, return_str=True)
    return string
def tokenize(string: str) -> list:
    """
    This function accepts a string and returns a list of tokens after
    tokenizing each word.
    """
    # make tokenizer object
    tokenizer = ToktokTokenizer()
    # use tokenizer object and return the token list
    list_of_tokens = tokenizer.tokenize(string, return_str=False)
    return list_of_tokens
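The two `tokenize` variants above differ only in Toktok's `return_str` flag; roughly:

tokenizer = ToktokTokenizer()
tokenizer.tokenize("Hello, world!", return_str=True)   # -> 'Hello , world !'
tokenizer.tokenize("Hello, world!", return_str=False)  # -> ['Hello', ',', 'world', '!']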
class Solver(object):

    def __init__(self, seed=42, ngram_range=(1, 3)):
        self.seed = seed
        self.ngram_range = ngram_range
        self.vectorizer = TfidfVectorizer(ngram_range=ngram_range)
        self.clf = LinearSVC(multi_class='ovr')
        self.init_seed()
        self.word_tokenizer = ToktokTokenizer()

    def init_seed(self):
        np.random.seed(self.seed)
        random.seed(self.seed)

    def predict(self, task):
        return self.predict_from_model(task)

    def fit(self, tasks):
        texts = []
        classes = []
        for data in tasks:
            for task in data:
                idx = int(task["id"])
                text = "{} {}".format(
                    " ".join(self.word_tokenizer.tokenize(task['text'])),
                    task['question']['type'])
                texts.append(text)
                classes.append(idx)
        vectors = self.vectorizer.fit_transform(texts)
        classes = np.array(classes)
        self.classes = np.unique(classes)
        self.clf.fit(vectors, classes)
        return self

    def predict_from_model(self, task):
        texts = []
        for task_ in task:
            text = "{} {}".format(
                " ".join(self.word_tokenizer.tokenize(task_['text'])),
                task_['question']['type'])
            texts.append(text)
        return self.clf.predict(self.vectorizer.transform(texts))

    def fit_from_dir(self, dir_path):
        tasks = []
        for file_name in os.listdir(dir_path):
            if file_name.endswith(".json"):
                data = read_config(os.path.join(dir_path, file_name))
                tasks.append(data)
        return self.fit(tasks)

    @classmethod
    def load(cls, path):
        return load_pickle(path)

    def save(self, path):
        save_pickle(self, path)
def remove_stopwords(text, stopwords, is_lower_case=False):
    tokenizer = ToktokTokenizer()
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens
                           if token not in stopwords]
    else:
        filtered_tokens = [
            token for token in tokens if token.lower() not in stopwords
        ]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
def __init__(self,
             stopwords: Optional[List[str]] = None,
             ngram_range: List[int] = None,
             lemmas: bool = False,
             lowercase: Optional[bool] = None,
             alphas_only: Optional[bool] = None,
             **kwargs):
    if ngram_range is None:
        ngram_range = [1, 1]
    self.stopwords = stopwords or []
    self.tokenizer = ToktokTokenizer()
    self.lemmatizer = pymorphy2.MorphAnalyzer()
    self.ngram_range = tuple(ngram_range)  # cast JSON array to tuple
    self.lemmas = lemmas
    self.lowercase = lowercase
    self.alphas_only = alphas_only
    self.tok2morph = {}
class RussianTokenizer(Component):
    """Tokenize or lemmatize a list of documents for Russian language.

    Default models are :class:`ToktokTokenizer` tokenizer and :mod:`pymorphy2` lemmatizer.
    Return a list of tokens or lemmas for a whole document.
    If is called onto ``List[str]``, performs detokenizing procedure.

    Args:
        stopwords: a list of stopwords that should be ignored during tokenizing/lemmatizing
            and ngrams creation
        ngram_range: size of ngrams to create; only unigrams are returned by default
        lemmas: whether to perform lemmatizing or not
        lowercase: whether to perform lowercasing or not; is performed by default by
            :meth:`_tokenize` and :meth:`_lemmatize` methods
        alphas_only: whether to filter out non-alpha tokens; is performed by default by
            :meth:`_filter` method

    Attributes:
        stopwords: a list of stopwords that should be ignored during tokenizing/lemmatizing
            and ngrams creation
        tokenizer: an instance of :class:`ToktokTokenizer` tokenizer class
        lemmatizer: an instance of :class:`pymorphy2.MorphAnalyzer` lemmatizer class
        ngram_range: size of ngrams to create; only unigrams are returned by default
        lemmas: whether to perform lemmatizing or not
        lowercase: whether to perform lowercasing or not; is performed by default by
            :meth:`_tokenize` and :meth:`_lemmatize` methods
        alphas_only: whether to filter out non-alpha tokens; is performed by default by
            :meth:`_filter` method
        tok2morph: token-to-lemma cache
    """

    def __init__(self,
                 stopwords: Optional[List[str]] = None,
                 ngram_range: List[int] = None,
                 lemmas: bool = False,
                 lowercase: Optional[bool] = None,
                 alphas_only: Optional[bool] = None,
                 **kwargs):
        if ngram_range is None:
            ngram_range = [1, 1]
        self.stopwords = stopwords or []
        self.tokenizer = ToktokTokenizer()
        self.lemmatizer = pymorphy2.MorphAnalyzer()
        self.ngram_range = tuple(ngram_range)  # cast JSON array to tuple
        self.lemmas = lemmas
        self.lowercase = lowercase
        self.alphas_only = alphas_only
        self.tok2morph = {}

    def __call__(self, batch: Union[List[str], List[List[str]]]) -> \
            Union[List[List[str]], List[str]]:
        """Tokenize or detokenize strings, depends on the type structure of passed arguments.

        Args:
            batch: a batch of documents to perform tokenizing/lemmatizing;
                or a batch of lists of tokens/lemmas to perform detokenizing

        Returns:
            a batch of lists of tokens/lemmas; or a batch of detokenized strings

        Raises:
            TypeError: If the first element of ``batch`` is neither ``List``, nor ``str``.
        """
        if isinstance(batch[0], str):
            if self.lemmas:
                return list(self._lemmatize(batch))
            else:
                return list(self._tokenize(batch))
        if isinstance(batch[0], list):
            return [detokenize(doc) for doc in batch]
        raise TypeError(
            "RussianTokenizer.__call__() is not implemented for `{}`".format(type(batch[0])))

    def _tokenize(self, data: List[str], ngram_range: Tuple[int, int] = (1, 1),
                  lowercase: bool = True) -> Generator[List[str], Any, None]:
        """Tokenize a list of documents.

        Args:
            data: a list of documents to tokenize
            ngram_range: size of ngrams to create; only unigrams are returned by default
            lowercase: whether to perform lowercasing or not; is performed by default by
                :meth:`_tokenize` and :meth:`_lemmatize` methods

        Yields:
            list of lists of ngramized tokens or list of detokenized strings

        Returns:
            None
        """
        # DEBUG
        # size = len(data)
        _ngram_range = self.ngram_range or ngram_range

        if self.lowercase is None:
            _lowercase = lowercase
        else:
            _lowercase = self.lowercase

        for i, doc in enumerate(data):
            # DEBUG
            # logger.info("Tokenize doc {} from {}".format(i, size))
            tokens = self.tokenizer.tokenize(doc)
            if _lowercase:
                tokens = [t.lower() for t in tokens]
            filtered = self._filter(tokens)
            processed_doc = ngramize(filtered, ngram_range=_ngram_range)
            yield from processed_doc

    def _lemmatize(self, data: List[str], ngram_range: Tuple[int, int] = (1, 1)) -> \
            Generator[List[str], Any, None]:
        """Lemmatize a list of documents.

        Args:
            data: a list of documents to tokenize
            ngram_range: size of ngrams to create; only unigrams are returned by default

        Yields:
            list of lists of ngramized tokens or list of detokenized strings

        Returns:
            None
        """
        # DEBUG
        # size = len(data)
        _ngram_range = self.ngram_range or ngram_range

        tokenized_data = list(self._tokenize(data))

        for i, doc in enumerate(tokenized_data):
            # DEBUG
            # logger.info("Lemmatize doc {} from {}".format(i, size))
            lemmas = []
            for token in doc:
                try:
                    lemma = self.tok2morph[token]
                except KeyError:
                    lemma = self.lemmatizer.parse(token)[0].normal_form
                    self.tok2morph[token] = lemma
                lemmas.append(lemma)
            filtered = self._filter(lemmas)
            processed_doc = ngramize(filtered, ngram_range=_ngram_range)
            yield from processed_doc

    def _filter(self, items: List[str], alphas_only: bool = True) -> List[str]:
        """Filter a list of tokens/lemmas.

        Args:
            items: a list of tokens/lemmas to filter
            alphas_only: whether to filter out non-alpha tokens

        Returns:
            a list of filtered tokens/lemmas
        """
        if self.alphas_only is None:
            _alphas_only = alphas_only
        else:
            _alphas_only = self.alphas_only

        if _alphas_only:
            filter_fn = lambda x: x.isalpha() and not x.isspace() and x not in self.stopwords
        else:
            filter_fn = lambda x: not x.isspace() and x not in self.stopwords

        return list(filter(filter_fn, items))

    def set_stopwords(self, stopwords: List[str]) -> None:
        """Redefine a list of stopwords.

        Args:
            stopwords: a list of stopwords

        Returns:
            None
        """
        self.stopwords = stopwords
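A usage sketch for `RussianTokenizer` (the lemmas shown are illustrative; actual output depends on pymorphy2's dictionaries):

tokenizer = RussianTokenizer(lemmas=True)
lemmas_batch = tokenizer(['Мама мыла раму'])
# e.g. [['мама', 'мыть', 'рама']] -- a batch of lemma lists
text_batch = tokenizer(lemmas_batch)
# passing List[List[str]] back in triggers the detokenizing branch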