def show_web_paragraphs():
    splitter = MergeParagraphs(400)
    stop = NltkPlusStopWords(True)
    ranker = TopTfIdf(stop, 6)
    stop_words = stop.words

    corpus = TriviaQaWebDataset()
    train = corpus.get_train()
    points = flatten_iterable([(q, d) for d in q.all_docs] for q in train)
    np.random.shuffle(points)

    for q, d in points:
        q_words = {strip_accents_unicode(w.lower()) for w in q.question}
        q_words = {x for x in q_words if x not in stop_words}

        doc = corpus.evidence.get_document(d.doc_id)
        doc = splitter.split_annotated(doc, d.answer_spans)
        ranked = ranker.dists(q.question, doc)
        if len(ranked) < 2 or len(ranked[1][0].answer_spans) == 0:
            continue
        print(" ".join(q.question))
        print(q.answer.all_answers)
        for i, (para, dist) in enumerate(ranked[0:2]):
            text = flatten_iterable(para.text)
            print("Start=%d, Rank=%d, Dist=%.4f" % (para.start, i, dist))
            if len(para.answer_spans) == 0:
                continue
            for s, e in para.answer_spans:
                text[s] = bcolors.CYAN + text[s]
                text[e] = text[e] + bcolors.ENDC
            for i, w in enumerate(text):
                if strip_accents_unicode(w.lower()) in q_words:
                    text[i] = bcolors.ERROR + text[i] + bcolors.ENDC
            print(" ".join(text))
        input()
def _preprocess_word(
    self,
    word: str,
    preprocessor_args: PreprocessorArgs = {
        'strip_accents': False,
        'lowercase': False,
        'preprocessor': None,
    }
) -> str:
    """Pre-processes a word before it is searched in the model's vocabulary.

    Parameters
    ----------
    word : str
        Word to be preprocessed.
    preprocessor_args : PreprocessorArgs, optional
        Dictionary with arguments that specifies how the words will be
        preprocessed, by default {
            'strip_accents': False,
            'lowercase': False,
            'preprocessor': None,
        }

    Returns
    -------
    str
        The pre-processed word according to the given parameters.
    """
    preprocessor = preprocessor_args.get('preprocessor', None)
    if preprocessor and callable(preprocessor):
        word = preprocessor(word)
    else:
        if preprocessor_args.get('lowercase', False):
            word = word.lower()

        strip_accents = preprocessor_args.get('strip_accents', False)
        if strip_accents == True:
            word = strip_accents_unicode(word)
        elif strip_accents == 'ascii':
            word = strip_accents_ascii(word)
        elif strip_accents == 'unicode':
            word = strip_accents_unicode(word)

    if self.vocab_prefix is not None:
        word = self.vocab_prefix + word

    return word
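# Illustrative sketch (not part of the original class): shows how the
# 'strip_accents' choices accepted by preprocessor_args map onto sklearn's
# public helpers. The sample word below is an assumption for demonstration only.
from sklearn.feature_extraction.text import strip_accents_ascii, strip_accents_unicode

sample = "Çafé"
print(strip_accents_unicode(sample.lower()))  # 'cafe' -> strip_accents=True/'unicode' with lowercase=True
print(strip_accents_ascii(sample))            # 'Cafe' -> strip_accents='ascii' with lowercase=False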
def preprocessor_tweet(s):
    tweet_p.set_options(tweet_p.OPT.EMOJI, tweet_p.OPT.URL, tweet_p.OPT.RESERVED,
                        tweet_p.OPT.SMILEY, tweet_p.OPT.MENTION)
    # Keep the candidates' handles as plain tokens before mentions are stripped.
    s = re.sub(r'@petrogustavo', 'petrogustavo', s)
    s = re.sub(r'@sergio_fajardo', 'sergio_fajardo', s)
    s = re.sub(r'@IvanDuque', 'IvanDuque', s)
    s = re.sub(r'@AlvaroUribeVel', 'AlvaroUribeVel', s)
    s = re.sub(r'@JuanManSantos', 'JuanManSantos', s)
    s = re.sub(r'@German_Vargas', 'German_Vargas', s)
    s = re.sub(r'@ClaudiaLopez', 'ClaudiaLopez', s)
    s = re.sub(r'@DeLaCalleHum', 'DeLaCalleHum', s)
    s = tweet_p.clean(s)
    # Drop laughter tokens ("jajaja", "lol", ...) and non-word characters.
    s = re.sub(r'\b(?:a*(?:ja)+h?|(?:l+o+)+l+)\b', ' ', s)
    s = re.sub(r'[^\w]', ' ', s)
    # s = re.sub(r'^https?:\/\/.*[\r\n]*', '', s)
    # s = re.sub(r'#', '', s)
    # s = re.sub(r'¡+', '', s)
    # s = re.sub(r':', '', s)
    # s = re.sub(r'!+', '', s)
    # s = re.sub(r'"', '', s)
    # s = re.sub(r'/[-?]/', '', s)
    # s = re.sub(r'¿+', '', s)
    # s = re.sub(r'@\w+', '', s)
    s = strip_accents_unicode(s.lower())
    s = tweet_p.clean(s)
    return s
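# Rough usage sketch (assumes this module's `re`, `tweet_p` and
# strip_accents_unicode imports are in place; the sample tweet is made up):
sample_tweet = "jajaja @petrogustavo mire esto! https://t.co/abc123"
print(preprocessor_tweet(sample_tweet))
# Expected to come out roughly as "petrogustavo mire esto": the handle is kept
# as a plain token, the laughter/URL/punctuation are dropped and accents are
# stripped; exact surrounding whitespace depends on tweet_p.clean.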
def freq_weights(words, corpus='grozea', corpus_stats=None, strip=False):
    if strip:
        from sklearn.feature_extraction.text import strip_accents_unicode
        words = strip_accents_unicode(words)
    if corpus == 'grozea':
        corpus_stats = pd.read_csv(GROZEA, sep='[ ]',
                                   names=['freq', 'word'], index_col=1)
    elif corpus == 'ngrams':
        import bz2
        corpus_stats = pd.read_csv(bz2.BZ2File(NGRAMS), sep='[\t]',
                                   names=['word', 'freq'])
    w = np.ones(len(words))
    for k, word in enumerate(words):
        try:
            if corpus == 'ngrams':
                w[k] += corpus_stats['freq'][np.where(
                    corpus_stats['word'] == word)[0]]
            else:
                w[k] += corpus_stats.lookup([word], ['freq'])[0]
        except:  # word not in the corpus statistics: keep its weight at 1
            pass
    return w / w.sum()
def _preprocess(self, doc):
    if self.input == "content":
        pass
    elif self.input == "filename":
        with open(doc, "r", encoding=self.encoding,
                  errors=self.decode_error) as fh:
            doc = fh.read()
    elif self.input == "file":
        doc = doc.read()

    if isinstance(doc, bytes):
        doc = doc.decode(self.encoding, self.decode_error)

    if self.strip_accents is not None:
        if self.strip_accents == "unicode":
            doc = strip_accents_unicode(doc)
        elif self.strip_accents == "ascii":
            doc = strip_accents_ascii(doc)
        else:
            raise ValueError('Invalid value for "strip_accents": %s'
                             % self.strip_accents)

    if self.analyzer == "char" and self._compat_mode():
        doc = self._white_spaces.sub(" ", doc)
    return doc
def test_strip_accents():
    # check some classical latin accentuated symbols
    a = 'àáâãäåçèéêë'
    expected = 'aaaaaaceeee'
    assert strip_accents_unicode(a) == expected

    a = 'ìíîïñòóôõöùúûüý'
    expected = 'iiiinooooouuuuy'
    assert strip_accents_unicode(a) == expected

    # check some arabic
    a = '\u0625'  # alef with a hamza below: إ
    expected = '\u0627'  # simple alef: ا
    assert strip_accents_unicode(a) == expected

    # mix letters accentuated and not
    a = "this is à test"
    expected = 'this is a test'
    assert strip_accents_unicode(a) == expected

    # strings that are already decomposed
    a = "o\u0308"  # o with diaeresis
    expected = "o"
    assert strip_accents_unicode(a) == expected

    # combining marks by themselves
    a = "\u0300\u0301\u0302\u0303"
    expected = ""
    assert strip_accents_unicode(a) == expected

    # multiple combining marks on one character
    a = "o\u0308\u0304"
    expected = "o"
    assert strip_accents_unicode(a) == expected
def preprocessor(s):
    s = clean_html(s)
    s = clean_twitter(s)
    s = format_numbers(s)
    s = split_numbers(s)
    s = tokenize_numbers(s)
    s = strip_accents_unicode(s.lower())
    s = tokenize_short(s)
    return s
def test_strip_accents():
    # check some classical latin accentuated symbols
    a = '\xe0\xe1\xe2\xe3\xe4\xe5\xe7\xe8\xe9\xea\xeb'
    expected = 'aaaaaaceeee'
    assert_equal(strip_accents_unicode(a), expected)

    a = '\xec\xed\xee\xef\xf1\xf2\xf3\xf4\xf5\xf6\xf9\xfa\xfb\xfc\xfd'
    expected = 'iiiinooooouuuuy'
    assert_equal(strip_accents_unicode(a), expected)

    # check some arabic
    a = '\u0625'  # alef with a hamza below
    expected = '\u0627'  # simple alef
    assert_equal(strip_accents_unicode(a), expected)

    # mix letters accentuated and not
    a = "this is \xe0 test"
    expected = 'this is a test'
    assert_equal(strip_accents_unicode(a), expected)
def test_strip_accents():
    # check some classical latin accentuated symbols
    a = u'\xe0\xe1\xe2\xe3\xe4\xe5\xe7\xe8\xe9\xea\xeb'
    expected = u'aaaaaaceeee'
    assert_equal(strip_accents_unicode(a), expected)

    a = u'\xec\xed\xee\xef\xf1\xf2\xf3\xf4\xf5\xf6\xf9\xfa\xfb\xfc\xfd'
    expected = u'iiiinooooouuuuy'
    assert_equal(strip_accents_unicode(a), expected)

    # check some arabic
    a = u'\u0625'  # alef with a hamza below
    expected = u'\u0627'  # simple alef
    assert_equal(strip_accents_unicode(a), expected)

    # mix letters accentuated and not
    a = u"this is \xe0 test"
    expected = u'this is a test'
    assert_equal(strip_accents_unicode(a), expected)
def test_strip_accents():
    # check some classical latin accentuated symbols
    a = 'àáâãäåçèéêë'
    expected = 'aaaaaaceeee'
    assert strip_accents_unicode(a) == expected

    a = 'ìíîïñòóôõöùúûüý'
    expected = 'iiiinooooouuuuy'
    assert strip_accents_unicode(a) == expected

    # check some arabic
    a = '\u0625'  # alef with a hamza below: إ
    expected = '\u0627'  # simple alef: ا
    assert strip_accents_unicode(a) == expected

    # mix letters accentuated and not
    a = "this is à test"
    expected = 'this is a test'
    assert strip_accents_unicode(a) == expected
def test_strip_accents():
    # check some classical latin accentuated symbols
    a = 'àáâãäåçèéêë'
    expected = 'aaaaaaceeee'
    assert_equal(strip_accents_unicode(a), expected)

    a = 'ìíîïñòóôõöùúûüý'
    expected = 'iiiinooooouuuuy'
    assert_equal(strip_accents_unicode(a), expected)

    # check some arabic
    a = '\u0625'  # alef with a hamza below: إ
    expected = '\u0627'  # simple alef: ا
    assert_equal(strip_accents_unicode(a), expected)

    # mix letters accentuated and not
    a = "this is à test"
    expected = 'this is a test'
    assert_equal(strip_accents_unicode(a), expected)
def show_open_paragraphs(start: int, end: int):
    splitter = MergeParagraphs(400)
    stop = NltkPlusStopWords(True)
    ranker = ShallowOpenWebRanker(6)
    stop_words = stop.words

    print("Loading train")
    corpus = TriviaQaOpenDataset()
    train = corpus.get_dev()
    np.random.shuffle(train)

    for q in train:
        q_words = {strip_accents_unicode(w.lower()) for w in q.question}
        q_words = {x for x in q_words if x not in stop_words}

        para = []
        for d in q.all_docs:
            doc = corpus.evidence.get_document(d.doc_id)
            para += splitter.split_annotated(doc, d.answer_spans)

        ranked = ranker.prune(q.question, para)
        # Need at least `end` paragraphs so the slice below is complete.
        if len(ranked) < end:
            continue
        ranked = ranked[start:end]

        print(" ".join(q.question))
        print(q.answer.all_answers)
        # `ranked` now holds the slice [start:end], so iterate over it directly
        # and report the absolute rank.
        for i, para in enumerate(ranked):
            text = flatten_iterable(para.text)
            print("Start=%d, Rank=%d" % (para.start, start + i))
            if len(para.answer_spans) == 0:
                # print("No Answer!")
                continue
            for s, e in para.answer_spans:
                text[s] = bcolors.CYAN + text[s]
                text[e] = text[e] + bcolors.ENDC
            for j, w in enumerate(text):
                if strip_accents_unicode(w.lower()) in q_words:
                    text[j] = bcolors.ERROR + text[j] + bcolors.ENDC
            print(" ".join(text))
        input()
def transform(self, X_df):
    X = np.array([
        ' '.join(clean_str(text.strip_accents_unicode(dd)))
        for dd in X_df.statement
    ])
    check_is_fitted(self, '_feat', 'The tfidf vector is not fitted')
    X = super(FeatureExtractor, self).transform(X)
    return X
def rule_preprocessing(corpus,
                       stop_words=nltk.corpus.stopwords.words('portuguese'),
                       join_tokens=False,
                       reduce_inflection='stemming'):
    '''Rule-based preprocessing: pass a pd.Series as input and the
    predefined rules will clean up the text.'''
    # Strip leading/trailing whitespace and collapse repeated spaces
    corpus = [phrases.strip() for phrases in corpus]
    corpus = [re.sub(' +', ' ', phrases) for phrases in corpus]
    # Lowercase every word
    corpus = [phrases.lower() for phrases in corpus]
    # Remove punctuation
    corpus = [
        phrases.translate(str.maketrans('', '', punctuation))
        for phrases in corpus
    ]
    # Entity substitution
    corpus = entities_subs(corpus)
    # Word correction
    corpus = text_correction(corpus)
    # Strip accents
    corpus = [strip_accents_unicode(phrases) for phrases in corpus]
    # Tokenization and stop-word removal
    corpus = tokenization(corpus, stop_words=stop_words)

    ###################### THIS PART NEEDS IMPROVEMENT #########################
    # # Reduce word inflections
    # if reduce_inflection == 'stemming':
    #     corpus = [stemming(phrases) for phrases in corpus]
    # elif reduce_inflection == 'lemmatization':
    #     corpus = lemmatization(corpus)
    # else:
    #     pass
    ###################### THIS PART NEEDS IMPROVEMENT #########################

    # Join the tokenized words back into strings
    if join_tokens:
        corpus = [' '.join(phrases) for phrases in corpus]

    return corpus
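# Minimal usage sketch (assumes the module helpers entities_subs,
# text_correction and tokenization are defined elsewhere and pandas is
# available as pd; the sample sentence is made up):
import pandas as pd

frases = pd.Series(["  Olá,   MUNDO!! Você está bem?  "])
tokens = rule_preprocessing(frases)                    # list of token lists
linhas = rule_preprocessing(frases, join_tokens=True)  # list of cleaned strings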
def get_stopwords(language, include_desc15_stopwords=True, include_custom=True,
                  include_withoutdiacritics=True):
    if language in NLTK_LAN_TRANSLATOR:
        language = NLTK_LAN_TRANSLATOR[language]
    assert language in NLTK_LAN_TRANSLATOR.values(), \
        f"Cannot deal with language {language}"
    stopwords = set(nlstopwords.words(language))
    if include_desc15_stopwords and language == "english":
        stopwords |= load_desc15_stopwords()
    if include_custom and language == "english":
        stopwords |= set(get_setting("CUSTOM_STOPWORDS"))
    if include_withoutdiacritics:
        stopwords |= set(strip_accents_unicode(i) for i in stopwords)
    return tuple(stopwords)
def fit(self, X_df, y=None):
    """Learn a vocabulary dictionary of all tokens in the raw documents.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Data frame whose ``statement`` column holds the raw documents
        (str or unicode).

    Returns
    -------
    self
    """
    self._feat = np.array([
        ' '.join(clean_str(text.strip_accents_unicode(dd)))
        for dd in X_df.statement
    ])
    super(FeatureExtractor, self).fit(self._feat)
    return self
def clean_document(self, doc):
    # Remove HTML.
    cleaned_doc = re.sub('<[^<]+?>', ' ', doc)
    # Remove Unicode chars.
    cleaned_doc = cleaned_doc.encode('ascii', 'ignore')
    cleaned_doc = cleaned_doc.decode()
    # Remove digits and punctuation.
    cleaned_doc = strip_accents_unicode(cleaned_doc) \
        .translate(str.maketrans(' ', ' ', string.digits)) \
        .translate(str.maketrans(' ', ' ', string.punctuation))
    # Remove additional unwanted chars.
    for unwanted_char in self._additional_unwanted_chars:
        if unwanted_char in cleaned_doc:
            cleaned_doc = cleaned_doc.replace(unwanted_char, ' ')
    return cleaned_doc
def _preprocess(self, string: str) -> str:
    return strip_accents_unicode(string)
def uppercase(s):
    return strip_accents_unicode(s).upper()
def set_items_as_tokens_preprocessor(value: Union[str, Set, List]):
    return [strip_accents_unicode(str(i).lower()) for i in value] \
        if isinstance(value, (set, list)) \
        else [strip_accents_unicode(str(value).lower())]
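# Quick sketch of the two branches (collection vs. scalar input); the sample
# values are illustrative only:
print(set_items_as_tokens_preprocessor(["Árvore", "CASA"]))  # ['arvore', 'casa']
print(set_items_as_tokens_preprocessor("Ñandú"))             # ['nandu']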
def per_word_prepro(word):
    return strip_accents_unicode(word.lower())
def preprocess(path):
    text = Classifier.file_to_text(path)
    text = text.lower()
    text = strip_accents_unicode(text)
    return text
def clustering_preprocessor(s):
    s = clean_html(s)
    s = clean_twitter(s)
    s = strip_accents_unicode(s.lower())
    s = s.strip()
    return s
def transform(cls, string):
    return strip_accents_unicode(string)