def murdesonastik_task(self, word):
    """Task that fetches MS"""
    try:
        html = sessions["murdesõnastik"].get(
            "http://www.eki.ee/dict/ems/index.cgi?F=K&Q=" + word).content
    except exceptions.ConnectionError:
        self.update_state(state=states.FAILURE, meta="Connection failure")
        raise Ignore()
    soup = BeautifulSoup(html, "html.parser")
    amount = soup.find_all("p", {"class": "inf"})[0].get_text()
    # "Päring ei andnud tulemusi!" means "The query returned no results!"
    if "Päring ei andnud tulemusi!" in amount:
        amount = 0
        return {"progress": 100, "count": amount, "result": []}
    else:
        amount = amount.split(" ")[1]
    results = soup.find_all("div", {"class": "tervikart"})
    clean_results = []
    for result in results:
        if deaccent(str(word)) in deaccent(str(result)):
            clean_results.append(
                highlight_word_in_html(remove_tags_and_beautify(result), word))
    clean_results = clean_results[:6]
    if len(clean_results) == 0:
        amount = 0
        return {"progress": 100, "count": amount, "result": []}
    return {"progress": 100, "count": amount, "result": clean_results}
def process_sentence(self, s, exclude_punct=False):
    st = []
    split_indices = []
    for i, tok in enumerate(s):
        if exclude_punct and tok in self.PUNCT:
            continue
        elif self.is_number(tok):
            try:
                if s[i-1] == "(" and s[i+1] == ")" or s[i-1] == "〈" and s[i+1] == "〉":
                    pass
                else:
                    tok = "<nUm>"
            except:
                tok = "<nUm>"  # replace all numbers with a string <nUm>
        else:
            elem_with_valence = self.ELEMENT_VALENCE_IN_PAR.match(tok)
            if elem_with_valence is not None:
                # change element name to symbol
                elem_mention = elem_with_valence.group(1)
                try:
                    formula = self.elem_name_dict[elem_mention.lower()]
                    matmention = elem_mention.lower()
                except:
                    formula = elem_mention  # this was already the symbol
                    matmention = elem_mention
                self.mat_list.append((matmention, formula))  # exclude the valence state from name
                # split this for word2vec
                st.append(matmention)
                split_indices.append(i)
                tok = elem_with_valence.group(2)
            elif tok in self.ELEMENTS_AND_NAMES:
                # add element names to formulae
                try:
                    formula = self.elem_name_dict[tok.lower()]
                    matmention = tok.lower()
                    tok = matmention
                except:
                    formula = tok  # this was already the symbol
                    matmention = tok
                self.mat_list.append((matmention, formula))
            elif self.is_simple_formula(tok):
                formula = self.get_norm_formula(tok)
                self.mat_list.append((tok, formula))
                tok = formula
            elif (len(tok) == 1 or (len(tok) > 1 and tok[0].isupper() and tok[1:].islower())) \
                    and tok not in self.ELEMENTS and tok not in self.UNITS \
                    and self.ELEMENT_DIRECTION_IN_PAR.match(tok) is None:
                # to lowercase if only first letter is uppercase (chemical elements already covered above)
                tok = deaccent(tok.lower())
            else:
                # splitting units from numbers (e.g. you can get 2mol., 3V, etc..)
                nr_unit = self.NR_AND_UNIT.match(tok)
                if nr_unit is None or nr_unit.group(2) not in self.UNITS:
                    tok = deaccent(tok)  # matches the pattern but not in the list of units
                else:
                    # splitting the unit from number
                    st.append("<nUm>")
                    split_indices.append(i)
                    tok = nr_unit.group(2)  # the unit
        st.append(tok)
    return st, split_indices
def get_hyperonyms(main_word):
    HYPONYM = eq(utils.deaccent(main_word))
    RULE = or_(rule(HYPONYM, ATAKJE, START, MID, END),
               rule(HYPONYM, MID, END),
               rule(START_S, END, KAK, HYPONYM),
               rule(END, INCLUDING, HYPONYM))
    parser = Parser(RULE)
    text = utils.deaccent(wikipedia.summary(main_word))
    print(text)
    text = re.sub(r'\(.+?\)', '', text)
    # drop a specific noisy phrase ("сергии радонежскии" ~ "Sergius of Radonezh")
    text = text.lower().replace('* сергии радонежскии* ', '')
    for idx, match in enumerate(parser.findall(text.lower())):
        k = [_.value for _ in match.tokens]
        print(k)
def pre_process(s):
    s = str(s)
    s = strip_tags(s)
    s = deaccent(s)
    s = strip_multiple_whitespaces(s)
    s = s.lower()
    return s
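# Usage sketch (not part of the original snippet): a hedged example of calling
# pre_process, assuming the helpers above are the gensim ones (deaccent from
# gensim.utils, strip_tags / strip_multiple_whitespaces from gensim.parsing.preprocessing).
# If the original module defines its own versions, use those instead.
from gensim.utils import deaccent
from gensim.parsing.preprocessing import strip_tags, strip_multiple_whitespaces

print(pre_process('<b>Crème   brûlée</b>'))  # -> 'creme brulee'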
def tokenize(text, lowercase=False, deacc=False, encoding='utf8', errors="strict",
             to_lower=False, lower=False):
    """
    Iteratively yield tokens as unicode strings, removing accent marks and
    optionally lowercasing the unicode string by assigning True to one of the
    parameters: lowercase, to_lower, or lower.

    Input text may be either unicode or utf8-encoded byte string.

    The tokens on output are maximal contiguous sequences of alphabetic
    characters (no digits!).

    >>> list(tokenize('Nic nemůže letět rychlostí vyšší, než 300 tisíc kilometrů za sekundu!', deacc=True))
    [u'Nic', u'nemuze', u'letet', u'rychlosti', u'vyssi', u'nez', u'tisic', u'kilometru', u'za', u'sekundu']
    """
    lowercase = lowercase or to_lower or lower
    text = to_unicode(text, encoding, errors=errors)
    if lowercase:
        text = text.lower()
    if deacc:
        text = deaccent(text)
    return simple_tokenize(text)
def preprocess_text(tweet):
    """
    Function to process an aggregated user profile. This does the following:
    1. Decode html entities, e.g. "AT&amp;T" will become "AT&T"
    2. Deaccent
    3. Remove links.
    4. Remove any user mentions (@name).
    5. Lemmatize and remove stopwords.

    Parameters:
    ----------
    tweet : String. If train_texts is a list of tweets, ' '.join and pass

    Returns:
    -------
    tweet : preprocessed (tokenized) tweet.
    """
    tweet = decode_htmlentities(tweet)
    tweet = deaccent(tweet)
    tweet = tweet.encode('ascii', 'ignore')  # To prevent UnicodeDecodeErrors later on
    tweet = re.sub(r'http\S+', '', str(tweet))  # Step 3
    tweet = re.sub(r'@\w+', '', str(tweet))  # Step 4
    tweet = tweet.split()
    tweet = lemmatize(' '.join(tweet), re.compile('(NN)'),
                      stopwords=stopwords.words('english'),
                      min_length=3, max_length=15)
    tweet = [word.split('/')[0] for word in tweet]
    return tweet
def token_gen(self, lines_str, lower=False, remove_accents=True):
    lines = deaccent(lines_str.strip()) if remove_accents else lines_str
    lines = lines.lower() if lower else lines
    match = self.pattern.search(lines)
    if match is None:
        return iter(self.empty)  # return empty iterator if no tokens
    last_token = None
    ends_at = 0
    while match is not None:
        starts_at = match.start()
        # whitespace between the previous token and the current one
        tailspace = ends_at != starts_at
        ends_at = match.end()
        if last_token:
            yield Token(text=last_token, tailspace=tailspace)
        last_token = lines[starts_at:ends_at]
        match = self.pattern.search(lines, ends_at)
    # the final token never has trailing whitespace
    yield Token(text=last_token, tailspace=False)
def extract_names(txt, nlp, n_sentences=2):
    """
    Use the spacy entity engine to extract person names from a text

    args:
        - txt: raw text
        - nlp: a spacy engine
    return:
        - list of names as strings
    """
    # to unicode & get rid of accents
    txt = deaccent(any2unicode(txt))
    # split according to reply/forward markers (get rid of the header)
    txt = "\n".join(re_fw_regex.split(txt))
    txt = txt.replace(">", " ")
    # split sentences
    sentences = sent_tokenize(txt)
    # tokenize + lemmatize + filter
    bow = []
    for sent in sentences[:n_sentences]:
        if REGEX:
            sent = " ".join(lower_upper_pat.split(sent))
            sent = " ".join(number_letter_pat.split(sent))
        doc = nlp(sent, parse=False)
        for tok in doc:
            lemma = drop_digits(replace_punct(tok.lemma_))
            if (lemma and (tok.ent_type_ != 'PERSON')
                    and not tok.is_punct and not tok.is_stop
                    and lemma not in extendedstopwords
                    and not tok.like_num and not tok.is_space
                    and not tok.like_url and len(lemma) > 1
                    and not any(x in tok.orth_ for x in not_in_list)):
                bow.append(lemma)
    return bow
def tokenize(text, deacc=False, encoding='utf8', lowercase=False, to_lower=False):
    """
    Iteratively yield re-based tokens as unicode strings, removing accent marks
    and optionally lowercasing.

    :param text: Input text
    :type text: str
    :param deacc: Remove accentuation
    :type deacc: bool
    :param encoding: Encoding of text
    :type encoding: str
    :param lowercase: To lowercase
    :type lowercase: bool
    :param to_lower: To lowercase
    :type to_lower: bool
    :return: Contiguous sequences of alphabetic characters (no digits!)
    :rtype: str

    Example:
        >>> list(tokenize('Nic nemůže letět rychlostí vyšší, než 300 tisíc kilometrů za sekundu!', deacc=True))
        [u'Nic', u'nemuze', u'letet', u'rychlosti', u'vyssi', u'nez', u'tisic', u'kilometru', u'za', u'sekundu']
    """
    lowercase = lowercase or to_lower
    from gensim.utils import to_unicode
    text = to_unicode(text, encoding, errors='ignore')
    if lowercase:
        text = text.lower()
    if deacc:
        from gensim.utils import deaccent
        # e.g. deaccent("Šéf chomutovských komunistů dostal poštou bílý prášek")
        # -> u'Sef chomutovskych komunistu dostal postou bily prasek'
        text = deaccent(text)
    return re_tokenize(text)
def review_to_wordlist(review_text, remove_stopwords=False):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words. Returns a list of words.

    # Remove any accents
    review_text = utils.deaccent(review_text)
    # Replace hyphens with spaces
    review_text = re.sub(r"-", " ", review_text)
    # Remove non-letters
    review_text = re.sub("[^a-zA-Z!?0-9]", " ", review_text)
    review_text = re.sub("[!]", " !", review_text)
    review_text = re.sub("[?]", " ?", review_text)
    # Remove email addresses
    review_text = re.sub(r"[\w]+@[\.\w]+", "", review_text)
    # Remove web addresses
    review_text = re.sub(r"/[a-zA-Z]*[:\/\/]*[A-Za-z0-9\-_]+\.+[A-Za-z0-9\.\/%&=\?\-_]+/i", "", review_text)
    # Convert words to lower case and split them
    words = review_text.lower().split()
    # Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    # Apply the Porter stemmer
    stemmer = PorterStemmer()
    words = [stemmer.stem(w) for w in words]
    # Return a list of words
    return words
def custom_tokenize(text, lowercase=False, deacc=False, encoding='utf8', errors="strict",
                    to_lower=False, lower=False, cde=True):
    text = to_unicode(text, encoding, errors=errors)
    lowercase = lowercase or to_lower or lower
    if lowercase:
        text = text.lower()
    if deacc:
        text = deaccent(text)
    if cde:
        text = " ".join(text.split())
        cde_p = Paragraph(text)
        # yield ChemDataExtractor tokens, skipping bare punctuation
        for sentence in cde_p.tokens:
            for tok in sentence:
                if tok.text not in string.punctuation:
                    yield tok.text
    else:
        for match in PAT_ALPHABETIC.finditer(text):
            yield match.group()
def bow_mail_body(txt, nlp):
    """
    args:
        - txt: raw text
        - nlp: a spacy engine
    """
    # to unicode & get rid of accents
    txt = deaccent(any2unicode(txt))
    # split according to reply/forward markers (get rid of the header)
    txt = "\n".join(re_fw_regex.split(txt))
    txt = txt.replace(">", " ")
    # split sentences
    sentences = sent_tokenize(txt)
    # tokenize + lemmatize + filter
    bow = []
    for sent in sentences:
        if REGEX:
            sent = " ".join(lower_upper_pat.split(sent))
            sent = " ".join(number_letter_pat.split(sent))
        doc = nlp(sent, parse=False, entity=False)
        for tok in doc:
            if (tok.lemma_ and not tok.is_punct and not tok.is_stop
                    and not tok.like_num and not tok.is_space
                    and not tok.like_url and len(tok) > 1
                    and not any(x in tok.orth_ for x in not_in_list)):
                if tok.orth_.startswith("-") or tok.orth_.endswith("-"):
                    bow.append(tok.lemma_.replace("-", ""))
                else:
                    bow.append(tok.lemma_)
    return bow
def obter_link_name(nome):
    '''
    Converts the given name to lowercase, removes accents and joins the words with '_'

    Parameters:
        nome (String) --> Name to be transformed

    Returns:
        The name in lowercase, without accents and with '_' between the words (String)
    '''
    return g_utils.deaccent(RE_ESPACO.sub('_', nome.lower()))
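# Usage sketch (not part of the original snippet): a hedged example of calling
# obter_link_name, assuming g_utils is gensim.utils and RE_ESPACO is a module-level
# regex matching runs of whitespace (both are assumptions about the original module).
import re
from gensim import utils as g_utils

RE_ESPACO = re.compile(r'\s+')  # assumed definition of the module-level regex

print(obter_link_name('São Paulo Futebol Clube'))  # -> 'sao_paulo_futebol_clube'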
def token_gen(self, lines_str, lower=False, remove_accents=True):
    lines = deaccent(lines_str.strip()) if remove_accents else lines_str
    lines = lines.lower() if lower else lines
    # replace every digit with 0 before BPE encoding
    tokens = self.bpe_tokenizer.EncodeAsPieces(re.sub(r"\d", "0", lines))
    for token in tokens:
        yield Token(text=token, tailspace=False)
def tokenize_(string, _additional_stop_words=[]):
    text = "".join([w for w in string if w not in non_words])
    text = deaccent(text).split(' ')
    text = map(lambda x: x.lower().strip(), text)
    text = filter(lambda x: x not in stopwords and x not in _additional_stop_words, text)
    text = [*filter(lambda x: len(x) > 1, text)]
    if text is None:
        return []
    return text
def tokenize(text):
    text = to_unicode(text, encoding='utf8', errors='ignore')
    text = text.lower()
    # normalize unicode (i.e., remove accentuation)
    text = deaccent(text)
    for token, pos in pos_tag(word_tokenize(text)):
        # only nouns are acceptable:
        # https://pythonprogramming.net/natural-language-toolkit-nltk-part-speech-tagging/
        if pos in ('NN', 'NNS', 'NNP', 'NNPS'):
            yield token
def clean_medical_documents(docs):
    r"""
    Clean medical reports.

    This function makes a simple pre-processing of medical texts. The steps are:
    1. String is deaccented.
    2. Sequences of at least 2 letters are extracted (numbers and other characters are ignored).
    3. Tokens with all uppercase letters are kept. Other tokens are converted to lowercase.
    4. The tokens 'pt' and 'pts' are replaced by 'patient'.

    Parameters
    ----------
    docs : list
        `list` of documents. Each document is a `str`.

    Returns
    -------
    docs : list
        `list` of documents. Each document is a cleaned `str` of space-separated tokens.

    Example
    -------
    >>> from support import clean_medical_documents
    >>> docs = [
    ...     'The pt. appears awake.',
    ...     'Cardiologist: Dr. C. Núttèr',
    ... ]
    >>> clean_docs = clean_medical_documents(docs)
    >>> print(clean_docs)
    ['the patient appears awake', 'cardiologist dr nutter']
    >>>
    """
    # copy the list so the input parameter is not modified
    docs = list(docs)
    regex = re.compile(pattern=r'[a-zA-Z]{2,}')
    for i, doc in enumerate(docs):
        doc = deaccent(doc)
        # keep uppercase words, lowercase everything else
        tokens = [
            token.lower() if not token.isupper() else token
            for token in regex.findall(doc)
        ]
        # replace abbreviations
        for j, token in enumerate(tokens):
            if token in ('pt', 'pts'):
                tokens[j] = 'patient'
        # clean document
        docs[i] = ' '.join(tokens)
    return docs
def extractSalaryFromFile(number, file):
    groups = []
    with open(file, 'r') as f:
        for line in f:
            groups.append(remove_stopwords(deaccent(line)))
    texts = [[text for text in group.split()] for group in groups]
    words = []
    for g in texts:
        for w in g:
            words.append(w)
    s = dumbFindSalary(words)
    print("From text number " + str(number) + ", " + s)
def preprocess(s, stem=True):
    '''
    Given a document or query string, returns a list of preprocessed words.
    We can decide whether to stem each word or not.
    '''
    if not stem:
        preprocess_filters = DEFAULT_FILTERS.copy()
        preprocess_filters.pop()  # remove stemming from the list of filters
        wordList = preprocess_string(s, filters=preprocess_filters)
    else:
        wordList = preprocess_string(s)
    for i in range(len(wordList)):
        wordList[i] = deaccent(wordList[i])
    return wordList
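# Usage sketch (not part of the original snippet): preprocess appears to rely on
# gensim's preprocess_string and DEFAULT_FILTERS (whose last filter is stemming),
# plus deaccent from gensim.utils; those imports are assumed here.
from gensim.utils import deaccent
from gensim.parsing.preprocessing import preprocess_string, DEFAULT_FILTERS

print(preprocess('Crème brûlée recipes', stem=False))
# e.g. ['creme', 'brulee', 'recipes'] -- exact tokens depend on gensim's default filters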
def preprocess_txt(raw_txt):
    """
    Preprocessing of raw txt before parsing with Spacy
        - deaccent, to unicode
        - split forward, redirect
        - replace the > of email reply
        - split lowerUpper
        - split letterNumber
    """
    txt = deaccent(any2unicode(raw_txt))
    txt = "\n".join(re_fw_regex.split(txt))
    txt = txt.replace(">", " ")
    txt = " ".join(lower_upper_pat.split(txt))
    txt = " ".join(number_letter_pat.split(txt))
    return txt
def remove_non_plain(document):
    """
    Replaces urls, @usernames, #tags, emojis and numbers with a ' ' (space).
    Also removes accents and punctuation, then strips redundant whitespace
    and lowercases all characters.

    :param document: string
    :return: processed unicode string
    """
    document = to_unicode(document)
    document = non_plain_re.sub(' ', document)
    document = proc.strip_non_alphanum(document)
    document = proc.strip_numeric(document)
    document = proc.strip_multiple_whitespaces(document)
    document = deaccent(document)
    return document.lower()
def to_ascii(string):
    """
    Replace all non-ascii chars with ascii-equivalent, remove all non-printing
    characters, replace all tabs with 4 spaces.

    Returns:
        A transformed string
    """
    tabs = re.compile('\t')
    newstring, _ = tabs.subn(' ' * 4, string)
    car_return_etc = re.compile('\r|\x0b|\x0c')
    newstring, _ = car_return_etc.subn('\n', newstring)
    newstring = deaccent(newstring)  # FIXME removes newlines, not intended behavior
    nonprintable = re.compile('[^ -~\n]')
    newstring, _ = nonprintable.subn('', newstring)
    return newstring.encode('ascii')
def get_tokens(text):
    text = to_unicode(text, encoding='utf8', errors='ignore')
    text = text.lower()
    # normalize unicode (i.e., remove accentuation)
    text = deaccent(text)
    bi = []
    for match in PAT_ALPHABETIC.finditer(text):
        uni = match.group()
        yield uni
        bi.append(uni)
        if len(bi) == 1:
            continue
        yield ' '.join(bi)
        del bi[0]
def regularize(text):
    '''Return the regularized text.'''
    r1 = r'^RT @.*?: |@+[^\s]*|^RT\s'  # exclude RT @... and bare RT
    # http(s) URLs
    r2 = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    r3 = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$'  # e-mail
    r4 = r'\s+'  # multiple empty chars
    r5 = r'http[s]?:.*? '
    # r6 = r"[^A-Za-z0-9_]"  # not alphabet, number or _
    r7 = r'\*.+?\*'
    sub_rule = r1 + '|' + r2 + '|' + r3 + '|' + r5 + '|' + r7
    text = html.unescape(text)
    text = deaccent(text)
    text = re.sub(sub_rule, " ", text)
    text = emoji.demojize(text, delimiters=('emo_', ' '))
    # text = re.sub(r6, ' ', text)
    text = re.sub(r4, ' ', text)
    return text.lower()
def get_soups(links, name):
    '''
    This function iterates over all search pages, converts them into
    BeautifulSoup objects and stores them in a JSON file outside of this
    script. The keys of the dictionary distinguish between the different
    objects/HTML pages.
    '''
    count = 0
    dict_ = {}
    soups = []
    for link in tqdm(links):
        sleep(random.uniform(0.5, 2))
        request = requests.get(link)
        request.encoding = 'UTF-8'
        soups.append(BeautifulSoup(request.text, 'lxml'))
    for soup in soups:
        dict_[count] = str(deaccent(soup).encode("utf-8"))
        count += 1
    with open(name, 'w') as write_file:
        json.dump(dict_, write_file, indent=4)
def cleaning(string):
    string = ' '.join(
        [make_cleaning(w, normalized_chars) for w in string.split()])
    string = re.sub(r'\(dot\)', '.', string)
    string = deaccent(string)
    # remove href
    string = (re.sub(re.findall(r'\<a(.*?)\>', string)[0], '', string)
              if (len(re.findall(r'\<a (.*?)\>', string)) > 0)
              and ('href' in re.findall(r'\<a (.*?)\>', string)[0])
              else string)
    string = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', string)
    string = re.sub(r'http\S+|www.\S+|bit.\S+', '', string)
    string = re.sub(r'[ ]+', ' ', string).strip().split()
    string = [w for w in string if w[0] != '@']
    return ' '.join(string)
def pre_process_name(self, name):
    """
    Takes a string as input, removes accents and converts to lowercase
    """
    if type(name) == str and len(name) > 0:
        name = deaccent(name)
        name = name.lower()
        first_name = name[0]
        if '+' in name:
            last_name = name[name.rfind('+') + 1:]
        else:
            last_name = name[1:]
        first_name = first_name.replace('.', '').replace('-', '').replace(
            '\'', '').replace(' ', '')
        first_init = first_name[0] if len(first_name) > 0 else ''
        last_name = last_name.replace('.', '').replace('-', '').replace(
            '\'', '').replace(' ', '')
        name = (first_init, last_name)
    return name
def cleanEmailText(text):
    # Remove any accents
    text = utils.deaccent(text)
    # Replace hyphens with spaces
    text = re.sub(r"-", " ", text)
    # Remove dates
    text = re.sub(r"\d+/\d+/\d+", "", text)
    # Remove times
    text = re.sub(r"[0-2]?[0-9]:[0-6][0-9]", "", text)
    # Remove email addresses
    text = re.sub(r"[\w]+@[\.\w]+", "", text)
    # Remove web addresses
    text = re.sub(r"/[a-zA-Z]*[:\/\/]*[A-Za-z0-9\-_]+\.+[A-Za-z0-9\.\/%&=\?\-_]+/i", "", text)
    # Remove any bad characters
    clndoc = ''
    for eachLetter in text:
        if eachLetter.isalpha() or eachLetter == ' ':
            clndoc += eachLetter
    text = ' '.join(clndoc.split())
    return text
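# Usage sketch (not part of the original snippet): cleanEmailText only needs `re`
# and gensim's utils (for deaccent), imported here under the name the code expects.
import re
from gensim import utils

print(cleanEmailText('Meeting on 12/05/2021 at 14:30 - mail bob@example.com'))
# -> 'Meeting on at mail'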
def preprocessing_tweet_text(tweet_text) -> List[str]:
    """
    A neural language model like ELMo does not need much normalisation.
    A pre-trained ELMo model only needs pre-tokenised text.

    :param tweet_text:
    :return:
    """
    if not isinstance(tweet_text, str):
        raise ValueError("Text parameter must be a Unicode object (str)!")
    norm_tweet = tweet_text.lower()
    # remove retweets
    # norm_tweet = re.sub('rt @?[a-zA-Z0-9_]+:?', '', norm_tweet)
    norm_tweet = re.sub(r'^(rt)( @\w*)?[: ]', '', norm_tweet)
    # remove URLs
    norm_tweet = re.sub(r"http\S+", "", norm_tweet)
    # remove pic URLs
    norm_tweet = re.sub(r"pic.twitter.com\S+", "", norm_tweet)
    # remove user mentions
    norm_tweet = re.sub(r"(?:\@|https?\://)\S+", "", norm_tweet)
    # remove punctuation:
    # norm_tweet = re.sub(pattern=r'[\!"#$%&\*+,-./:;<=>?@^_`()|~=]', repl='', string=norm_tweet).strip()
    # deaccent
    norm_tweet = deaccent(norm_tweet)
    tknzr = TweetTokenizer()
    tokenised_norm_tweet = tknzr.tokenize(norm_tweet)
    # https://www.shanelynn.ie/word-embeddings-in-python-with-spacy-and-gensim/
    # Set the minimum number of tokens to be considered
    if len(tokenised_norm_tweet) < 4:
        return []
    num_unique_terms = len(set(tokenised_norm_tweet))
    # Set the minimum unique number of tokens to be considered (optional)
    if num_unique_terms < 2:
        return []
    return tokenised_norm_tweet
def to_normalize(data):
    if verbose:
        print('#' * 10, 'Step - Normalize chars and dots:')

    normalized_chars = {}

    chars = '‒–―‐—━—-▬'
    for char in chars:
        normalized_chars[ord(char)] = '-'

    chars = '«»“”¨"'
    for char in chars:
        normalized_chars[ord(char)] = '"'

    chars = "’'ʻˈ´`′‘’\x92"
    for char in chars:
        normalized_chars[ord(char)] = "'"

    chars = '̲_'
    for char in chars:
        normalized_chars[ord(char)] = '_'

    chars = '\xad\x7f'
    for char in chars:
        normalized_chars[ord(char)] = ''

    chars = '\n\r\t\u200b\x96'
    for char in chars:
        normalized_chars[ord(char)] = ' '

    # Normalize chars and dots - SEE HELPER FOR DETAILS
    # Global
    data = list(
        map(
            lambda x: ' '.join(
                [_make_cleaning(i, normalized_chars) for i in x.split()]),
            data))
    data = list(map(lambda x: re.sub(r'\(dot\)', '.', x), data))
    data = list(map(lambda x: deaccent(x), data))

    return data
def clean_string(string):
    # Empty strings
    if not string or string == 'N':
        return None

    string = deaccent(string).lower()

    # Remove quote text
    string = re.sub(re_reply_to, '', string)
    string = re.sub(re_quote_line, '', string)
    string = re.sub(re_youtube_link, ' YOUTUBELINK ', string)
    string = re.sub(re_link, ' WEBLINK ', string)
    string = re.sub(re_pol_board, ' pol ', string)
    string = re.sub(re_b_board, ' RANDOMBOARD ', string)
    string = re.sub(re_chan_board, ' CHANBOARD ', string)
    string = strip_punctuation(string)

    # Punctuation to remove completely
    # string = re.sub(re_punc_to_none, '', string)

    # Substitute in this order
    # string = re.sub(re_ellipsis, ' <ELLIPSIS> ', string)
    # string = re.sub(re_echoes, ' <ECHOES> ', string)
    # string = re.sub(re_pol_board, ' <POLBOARD> ', string)
    # string = re.sub(re_numbers, ' <NUMBER> ', string)
    # string = re.sub(re_period, ' <PERIOD> ', string)
    # string = re.sub(re_question, ' <QUESTION> ', string)

    # Replace all other punc to spaces and remove whitespace in between
    # string = re.sub(re_punc_to_space, ' ', string)

    string = ' '.join(w.strip() for w in string.split())

    return string if string else None
def article_to_bow(article):
    # flatten the article and keep lowercased, deaccented alphabetic non-stopword tokens
    stop_words = set(stopwords.words('norwegian'))
    tokens = [deaccent(tok).lower()
              for tok in itertools.chain(*article)
              if tok not in stop_words and tok.isalpha()]
    return tokens