def correct_alef_prev_char_normal_case_version_2(letter):
    overall = ""
    comp = ""
    is_corrected = False
    for c in letter:
        if not unicodedata2.combining(c):
            # base (non-combining) letter
            overall = c
            comp = unicodedata2.normalize('NFC', c)
        elif c == u'َ' or c == u'ّ' or c == u'ً':
            # keep an existing fatha, shadda or fathatan
            overall += c
            comp = unicodedata2.normalize('NFC', overall)
            is_corrected = True
        else:
            # replace any other combining mark with a fatha
            c = u'َ'
            overall += c
            comp = unicodedata2.normalize('NFC', overall)
            is_corrected = True
    if not is_corrected:
        # no diacritic present: append a fatha
        c = u'َ'
        overall += c
        comp = unicodedata2.normalize('NFC', overall)
    return comp
def teh_marbota_char_correction(char):
    overall = ""
    comp = ""
    is_corrected = False
    for c in char:
        if not unicodedata2.combining(c):
            # base (non-combining) letter
            overall = c
            comp = unicodedata2.normalize('NFC', c)
        elif c == u'َ' or c == u'ّ' or c == u'ً':
            # keep an existing fatha, shadda or fathatan
            overall += c
            comp = unicodedata2.normalize('NFC', overall)
            is_corrected = True
        else:
            # replace any other combining mark with a fatha
            c = u'َ'
            overall += c
            comp = unicodedata2.normalize('NFC', overall)
            is_corrected = True
    if not is_corrected:
        # no diacritic present: append a fatha
        c = u'َ'
        overall += c
        comp = unicodedata2.normalize('NFC', overall)
    return comp
def correct_alef_maksora_prev_char_normal_case_version_2(letter):
    overall = ""
    comp = ""
    is_corrected = False
    try:
        for c in letter:
            if not unicodedata2.combining(c):
                # base (non-combining) letter
                overall = c
                comp = unicodedata2.normalize('NFC', c)
            elif c == u'َ' or c == u'ّ' or c == u'ً':
                # keep an existing fatha, shadda or fathatan
                overall += c
                comp = unicodedata2.normalize('NFC', overall)
                is_corrected = True
            else:
                # replace any other combining mark with a fatha
                c = u'َ'
                overall += c
                comp = unicodedata2.normalize('NFC', overall)
                is_corrected = True
    except Exception:
        raise Exception("bug found in correct_alef_maksora_prev_char_normal_case")
    if not is_corrected:
        # no diacritic present: append a fatha
        c = u'َ'
        overall += c
        comp = unicodedata2.normalize('NFC', overall)
    return comp
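# A small behavioral sketch of the three correction helpers above (they share
# the same loop): the base letter is kept, an existing fatha/shadda/fathatan
# is preserved, and any other or missing mark is replaced with a fatha. The
# example inputs are illustrative only and assume the functions above are in
# scope.
print(correct_alef_prev_char_normal_case_version_2(u'ب'))   # bare beh -> beh + fatha
print(correct_alef_prev_char_normal_case_version_2(u'بِ'))   # beh + kasra -> beh + fatha
print(teh_marbota_char_correction(u'بً'))                    # beh + fathatan is kept as-is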
def handle(self):
    newData = []
    for v in self.data:
        t = v[self.content].lower()
        if self.html:
            t = BeautifulSoup(t, 'html.parser').get_text()
        # Normalize reduplicated syllables (collapse repeated characters)
        t = re.sub(r'(\D)\1+', r'\1', t)
        # Word segmentation
        t = ViTokenizer.tokenize(t)
        if self.accented_char:
            t = unicodedata2.normalize('NFD', t).encode('ascii', 'ignore').decode("utf-8")
        if self.special_char:
            t = [x.strip(SPECIAL_CHARACTER) for x in t.split()]
        if self.stopwords:
            t = [word for word in t if word not in self.list_stopword]
        v[self.content] = t
        if v not in newData:
            newData.append(v)
    print(np.array(newData))
def no_accent_vietnamese2(s):
    # s = s.decode('utf-8')
    text = re.sub(u'Đ', 'XX', s)
    text = re.sub(u'đ', 'XX', text)
    # return s.encode('utf-8')
    return unicodedata2.normalize('NFKD', text).encode('ASCII', 'ignore')
def findNumberInTag(self, tag):
    textToTest = []
    if hasattr(tag, 'text'):
        textToTest.append(tag.text)
    elif isinstance(tag, str):
        textToTest.append(tag)
    # <a href="tel:+12312312"> +123 123 321 </a>
    if tag.name == 'a':
        textToTest.append(tag.get('href', ''))
    # remove unicode control chars such as \x0a
    normalized = [
        unicodedata2.normalize("NFKD", text) for text in textToTest
    ]
    # [['+123', '123'], ['33', '222']] => ['+123', '123', '33', '222']
    # flatten the nested list: sum() concatenates the per-pattern match lists
    # ['+123', '123'] + ['33', '222'] + [] => ['+123', '123', '33', '222']
    numbers = sum([
        pattern.findall('.'.join(normalized))
        for pattern in self.regexPhoneFormatPatterns
    ], [])
    return numbers
def character_list_from_string(string, normalize=True):
    """
    Return a list of characters without space separators from an input string
    """
    # Since Unicode allows writing the same string either precomposed or as
    # combining characters, we want to transform all those strings that are
    # written as combining characters to precomposed, if possible. In our
    # data a combining char (be it encoded as precomposed or with combining
    # marks) means we want to explicitly check
    # a) the combining marks, and
    # b) with the flag, that the precomposed unicode is present - and for
    #    this we need to make sure our data input with combining marks is
    #    actually interpreted (and re-saved) as precomposed!
    # Before splitting a string into a list of each character (and removing
    # spaces) make sure any composable characters written with combining
    # marks are in fact transformed to precomposed characters; otherwise the
    # "listifying" will split base and mark(s) into several list items (chars)
    if normalize:
        # Make sure we are in fact dealing with a string, not a list
        if isinstance(string, (list, set)):
            string = "".join(string)
        # N_ormal F_orm C_omposed
        # See https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize  # noqa
        string = unicodedata2.normalize("NFC", string)
    li = list(string)
    li = list_unique([c for c in li if c.strip() != ""])
    return li
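# A minimal sketch (not part of the function above) of why the NFC step
# matters: the same visible character can arrive precomposed or as a base
# letter plus a combining mark, and only after NFC normalization does list()
# yield one item per visible character.
import unicodedata2

decomposed = "a\u0308"                                    # "a" + COMBINING DIAERESIS
precomposed = unicodedata2.normalize("NFC", decomposed)   # single character "ä"
assert len(list(decomposed)) == 2
assert len(list(precomposed)) == 1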
def no_accent_vietnamese(s):
    if not s:
        return ''
    if check_han_language(s):
        return s
    text = re.sub(u'Đ', 'D', s)
    text = re.sub(u'đ', 'd', text)
    return unicodedata2.normalize('NFKD', text).encode('ASCII', 'ignore')
def process_latest_comments(self, stopping_id):
    new_stopping_id = stopping_id
    COMMENTS_LIMIT = 100  # intended for a cron job every 15 minutes
    comments = self.r_praw.subreddit('SVExchange').comments(limit=COMMENTS_LIMIT)
    user_tsv_set = set()  # stores user/tsv combos to avoid duplicates
    user_set = set()      # stores users to avoid duplicates
    for i, c in enumerate(comments):
        if i == 0:
            new_stopping_id = c.id
        link_title_ascii = unicodedata2.normalize('NFKD', c.link_title).encode('ascii', 'ignore').decode('ascii')
        self.stdout.write("%s %s %s" % (c.id, c.link_author.ljust(24), link_title_ascii))
        if c.id <= stopping_id:
            from datetime import datetime
            self.stdout.write("new_comments [Stop] " + str(datetime.utcnow()))
            break
        op = c.link_author
        commenter = c.author.name
        if c.is_submitter and cmd_helper.is_from_tsv_thread(c.link_title):
            user_tsv_tuple = (op, c.link_title)
            tsv = int(c.link_title)
            ts = c.created_utc
            if user_tsv_tuple in user_tsv_set:
                self.stdout.write("\tRepeat")
            elif TSV.objects.check_if_exists(op, tsv):
                self.stdout.write("\tUpdating")
                new_sub_id = cmd_helper.get_id_from_full_url(c.link_url)
                # comment lacks gen info that's found in the submission flair
                gen = cmd_helper.get_gen_from_comment(op, tsv, new_sub_id, self.r_praw)
                user_tsv = TSV.objects.get_user_tsv(op, tsv, gen)
                # check if submission id should be updated, in case the db
                # doesn't have the user's latest thread
                old_sub_id = user_tsv.sub_id
                if new_sub_id > old_sub_id:
                    user_tsv.sub_id = new_sub_id
                    user_tsv.save()
                cmd_helper.scrape_user_tsv(user_tsv, self.r_praw, ts)
            else:
                self.stdout.write("\tAdding?")
                sub_id = cmd_helper.get_id_from_full_url(c.link_url)
                subm = self.r_praw.submission(id=sub_id)
                if not subm.over_18:
                    self.stdout.write("\tAdd")
                    gen = cmd_helper.get_gen_from_flair_class(subm.link_flair_css_class)
                    TSV.objects.update_or_create_user_tsv(op, subm.author_flair_text,
                                                          subm.author_flair_css_class, tsv, gen,
                                                          sub_id, False, False, subm.created_utc,
                                                          ts, None)
            user_tsv_set.add(user_tsv_tuple)
        else:
            if commenter not in user_set:
                user_set.add(commenter)
                tr = Trainer.objects.get_user(commenter)
                if tr:
                    tr.set_activity(c.created_utc)
    return new_stopping_id
def correct_alef_prev_char_mem(prev_char_object):
    overall = ""
    comp = ""
    is_corrected = False
    for c in prev_char_object.letter:
        if not unicodedata2.combining(c):
            # base (non-combining) letter
            overall = c
            comp = unicodedata2.normalize('NFC', c)
        else:
            # replace any combining mark with a kasra
            c = u'ِ'
            overall += c
            comp = unicodedata2.normalize('NFC', overall)
            is_corrected = True
    if not is_corrected:
        # no diacritic present: append a kasra
        c = u'ِ'
        overall += c
        comp = unicodedata2.normalize('NFC', overall)
    return comp
def correct_alef_prev_char_ba2_maksora_version_2(letter):
    overall = ""
    comp = ""
    is_corrected = False
    for c in letter:
        if not unicodedata2.combining(c):
            # base (non-combining) letter
            overall = c
            comp = unicodedata2.normalize('NFC', c)
        else:
            # replace any combining mark with a kasra
            c = u'ِ'
            overall += c
            comp = unicodedata2.normalize('NFC', overall)
            is_corrected = True
    if not is_corrected:
        # no diacritic present: append a kasra
        c = u'ِ'
        overall += c
        comp = unicodedata2.normalize('NFC', overall)
    return comp
def remove_accents(d):
    # str is already unicode on Python 3; the decode is only needed on Python 2
    try:
        d = unicode(d, 'utf-8')
    except (TypeError, NameError):
        pass
    # decompose first (NFKD) so the ASCII encode drops only the combining
    # marks instead of dropping the whole precomposed letter
    d = unicodedata2.normalize('NFKD', d)
    d = d.encode('ascii', 'ignore')
    d = d.decode("utf-8")
    return str(d)
def slugify(value):
    """
    Convert to ASCII. Convert spaces to hyphens. Remove characters that
    aren't alphanumerics, underscores, or hyphens. Convert to lowercase.
    Also strip leading and trailing whitespace.
    """
    value = str(value)
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
    value = re.sub(r'[^\w\s-]', '', value).strip().lower()
    value = re.sub(r'[-\s]+', '-', value)
    return value
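# A small usage sketch for the slugify() above (the input string is made up,
# shown only for illustration): accented characters are decomposed and their
# marks dropped, punctuation is removed, and whitespace runs become hyphens.
print(slugify("  Crème Brûlée -- Recipe #7!  "))  # -> "creme-brulee-recipe-7"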
def save_tweets():
    print('Saving Tweets')
    # print(tweets)
    file_tweets = open('../data_set/new_tweets.txt', 'a+')
    lastline = file_tweets.readlines()[-1].split(',')
    line_number = int(lastline[0])
    print('Last line', lastline)
    print('Last line number', line_number)
    for tweet in tweets:
        line_number += 1
        data = unicodedata.normalize('NFKD', tweet.text).encode('utf-8', 'ignore').decode('utf-8')
        file_tweets.write(str(line_number) + ', ,')
        file_tweets.write(json.dumps(data))
        file_tweets.write('\n')
    file_tweets.close()
def on_data(self, data):
    try:
        all_data = json.loads(data)
        # print(data)
        # filename = '/tmp/myfile%s.txt'%datetime.utcnow().strftime('%Y%m%d%H%M%S%f')[:-3]
        # f = open(filename,'w')
        # f.write(data)
        # f.close()
        # hdfs.put(filename,"hdfs://r01mstr.bddata.local:9000/user/vijay/tweets/")
        # call(["hadoop fs" "-put /user/saurzcode/dir3/"])
        # cmd = 'hadoop fs -put %s /user/vijay/tweets/'%filename
        # os.system(cmd)
        # print(cmd)
        # print(data)
        # tweet = all_data["text"].encode('utf-8')
        tweet = unicodedata2.normalize('NFKD', u'' + all_data["text"]).encode('ascii', 'ignore')
        created_at = all_data["created_at"].encode('utf-8')
        username = all_data["user"]["screen_name"].encode('utf-8')
        sentiment = self.get_tweet_sentiment(tweet)
        image_url = all_data["user"]["profile_image_url"].encode('utf-8')
        rt_user = "******"
        rt_user = all_data["retweeted_status"]["user"]["screen_name"].encode('utf-8')
        rt_user_image_url = all_data["retweeted_status"]["user"]["profile_image_url"].encode('utf-8')
        # print(tweet)
        print(rt_user)
        print(rt_user_image_url)
        print(sentiment)
        print("\n")
        # print(image_url)
        # print("\n")
        topics = str(sys.argv[1])
        c.execute(
            "INSERT INTO tweets (created_at, screen_name, text, sentiment, profile_image_url, "
            "topic, rt_user, rt_user_image_url) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)",
            (created_at, username, tweet, sentiment, image_url, topics, rt_user, rt_user_image_url))
        conn.commit()
        return True
    except:
        pass
def clean_line(line):
    table = str.maketrans('', '', string.punctuation)
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    line = normalize('NFD', line).encode('ascii', 'ignore')
    line = line.decode('UTF-8')
    # tokenize on white space
    line = line.split()
    # convert to lowercase
    line = [word.lower() for word in line]
    # remove punctuation from each token
    line = [word.translate(table) for word in line]
    # remove non-printable chars from each token
    line = [re_print.sub('', w) for w in line]
    # remove tokens with numbers in them
    line = [word for word in line if word.isalpha()]
    # return as a single string
    return ' '.join(line)
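# A quick usage sketch for clean_line() above (the sentence is made up, and
# the module-level imports of string, re and normalize are assumed): accents
# are stripped via NFD + ASCII encoding, punctuation and numeric tokens are
# dropped, and the result is a lowercase, space-joined string.
print(clean_line("Héllo, Wörld! 42 times"))  # -> "hello world times"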
def main():
    try:
        malformed_string = sys.argv[1]
    except IndexError:
        malformed_string = 'Please put something...'
    try:
        form = sys.argv[2]
    except IndexError:
        form = 'NFC'
    formed_string = unicodedata2.normalize(form, malformed_string)
    dmp = dmp_module.diff_match_patch()
    diffs = dmp.diff_main(malformed_string, formed_string)
    with open('templates/index.html') as inf:
        html = inf.read()
    soup = BeautifulSoup(html, 'html.parser')
    div_output = soup.find('div', {'id': 'output'})
    div_output.clear()
    new_pre = ("<pre class='diff_data'>\n----------------------------------------\nForm: " + form +
               "\nMalformed string: " + malformed_string +
               "\n========================================\n " + form +
               " string: " + formed_string +
               "\n========================================\n" +
               "\nDiff: " + ''.join(map(str, diffs)) + "</pre>")
    extra_soup = BeautifulSoup(new_pre, 'html.parser')
    div_output.append(extra_soup)
    pretty_diff = dmp.diff_prettyHtml(diffs)
    extra_soup = BeautifulSoup(
        "<div class='diff_wrapper'>" + pretty_diff + "</div>", 'html.parser')
    div_output.append(extra_soup)
    with open('templates/index.html', 'w') as outf:
        outf.write(str(soup))
    threading.Timer(1.25, lambda: webbrowser.open("http://127.0.0.1:5000/")).start()
    subprocess.call("FLASK_APP=routes.py flask run", shell=True)
def handle(self):
    t = self.text.lower()
    t = BeautifulSoup(t, 'html.parser').get_text()
    # Normalize reduplicated syllables (collapse repeated characters)
    t = re.sub(r'(\D)\1+', r'\1', t)
    # Word segmentation
    t = ViTokenizer.tokenize(t)
    # Remove accents
    t = unicodedata2.normalize('NFD', t).encode('ascii', 'ignore').decode("utf-8")
    t = [x.strip(SPECIAL_CHARACTER) for x in t.split()]
    t = [word for word in t if word not in self.stopwords]
    self.text = t
def slugify(value):
    """
    Normalizes string, converts to lowercase, removes non-alpha characters,
    and converts spaces to hyphens.
    """
    import unicodedata2
    import re
    symbols = (u"абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ",
               u"abvgdeejzijklmnoprstufhzcss_y_euaABVGDEEJZIJKLMNOPRSTUFHZCSS_Y_EUA")
    tr = {ord(a): ord(b) for a, b in zip(*symbols)}
    # for Python 2.*:
    # tr = dict([(ord(a), ord(b)) for (a, b) in zip(*symbols)])
    value = value.translate(tr)  # transliterate Cyrillic to Latin
    value = unicodedata2.normalize('NFKD', value).encode('ascii', 'ignore').decode('utf-8').strip()
    # value = re.sub(r'[^\w\s-]', '', value).strip().lower()
    # value = re.sub(r'[-\s]+', '-', value)
    # ...
    return value
def handle(self):
    label = self.labelCol
    content = self.contentCol
    newData = []
    # print(self.data)
    for v in self.data:
        t = v[content].lower()
        if self.html_stripping:
            t = BeautifulSoup(t, 'html.parser').get_text()
        # Normalize reduplicated syllables (collapse repeated characters)
        t = re.sub(r'(\D)\1+', r'\1', t)
        # Word segmentation
        t = ViTokenizer.tokenize(t)
        if self.remove_accented_chars:
            t = unicodedata2.normalize('NFD', t).encode('ascii', 'ignore').decode("utf-8")
        if self.remove_special_characters:
            t = [x.strip(SPECIAL_CHARACTER) for x in t.split()]
            t = [i for i in t if i]
        if self.remove_stopwords:
            stopwords = self.stopwords
            t = [word for word in t if word not in self.stopwords]
        v[content] = t
        if v not in newData:
            newData.append(v)
    print(np.array(newData))
def fatha_correction(list_of_objects_of_chars_and_its_location):
    counter = 0
    current_index = 0
    actual_letters_after_fatha_correction = []
    prev_char_object = WordLetterProcessingHelperMethod.LetterPosition()
    prev_prev_char_object = WordLetterProcessingHelperMethod.LetterPosition()
    next_char_object = WordLetterProcessingHelperMethod.LetterPosition()
    for each_letter_object in list_of_objects_of_chars_and_its_location:
        actual_letters_after_fatha_correction.append(each_letter_object)
        character = remove_diacritics(each_letter_object.letter)
        if (character in letters_of_fatha_correction) and (each_letter_object.location != 'first'):
            letter_caused_fatha_correction = character
            if (counter - 1) >= 0:
                prev_char_object = list_of_objects_of_chars_and_its_location[counter - 1]
                prev_char_object.letter = unicodedata2.normalize('NFC', str(prev_char_object.letter))
            if (counter - 2) >= 0:
                prev_prev_char_object = list_of_objects_of_chars_and_its_location[counter - 2]
                prev_prev_char_object.letter = unicodedata2.normalize('NFC', prev_prev_char_object.letter)
            if ((counter + 1) <= (len(list_of_objects_of_chars_and_its_location) - 1)) and \
                    (each_letter_object.location != 'last'):
                next_char_object = list_of_objects_of_chars_and_its_location[counter + 1]
            corrected_char = prev_char_object.letter
            if letter_caused_fatha_correction == u'ة':
                corrected_char = correct_teh_marbota_prev_char(prev_char_object)
            elif letter_caused_fatha_correction == u'ا':
                if each_letter_object.location == 'middle':
                    if remove_diacritics(prev_char_object.letter) == u'ب':
                        # e.g. بِاتِّخَاذِكُمُ, وَبِالْآخِرَةِ, بِالْعُدْوَةِ
                        if u'ّ' in next_char_object.letter or \
                                next_char_object.letter == remove_diacritics(next_char_object.letter):
                            corrected_char = correct_alef_prev_char_ba2_maksora(prev_char_object)
                        # e.g. بَالِغَةٌ, بَاسِرَةٌ
                        else:
                            corrected_char = correct_alef_prev_char_normal_case(prev_char_object)
                    elif remove_diacritics(prev_char_object.letter) == u'ل':
                        if prev_char_object.location == 'first':
                            # do not handle this case: special case with no rule
                            # (these are contradictory), e.g. لَا, لِامْرَأَتِهِ
                            corrected_char = prev_char_object.letter
                        elif prev_prev_char_object.letter == u'ا':
                            # do not handle this case: special case with no rule
                            # (these are contradictory), e.g. الِاسْمُ
                            corrected_char = prev_char_object.letter
                        else:
                            corrected_char = correct_alef_prev_char_normal_case(prev_char_object)
                    # e.g. مِائَةَ, مِائَتَيْنِ
                    elif remove_diacritics(prev_char_object.letter) == u'م' \
                            and prev_char_object.location == 'first' \
                            and next_char_object.letter == u'ئَ':
                        corrected_char = correct_alef_prev_char_mem(prev_char_object)
                    else:
                        corrected_char = correct_alef_prev_char_normal_case(prev_char_object)
                elif each_letter_object.location == 'last' or each_letter_object.location == 'first':
                    corrected_char = prev_char_object.letter
                else:
                    corrected_char = correct_alef_prev_char_normal_case(prev_char_object)
            elif letter_caused_fatha_correction == u'ى':
                # e.g. طُوًى, ضُحًى
                if prev_prev_char_object.location == 'first' and u'ُ' in prev_prev_char_object.letter and \
                        each_letter_object.location == 'last':
                    corrected_char = correct_alef_maksora_prev_char_tanween_case(prev_char_object)
                # e.g. أَبَى
                else:
                    corrected_char = correct_alef_maksora_prev_char_normal_case(prev_char_object)
            actual_letters_after_fatha_correction[counter - 1].letter = corrected_char
            counter += 1
        else:
            counter += 1
        current_index += 1
    return actual_letters_after_fatha_correction
def len_unicode(ustr):
    # length in characters after NFC composition of a UTF-8 encoded byte string
    return len(unicodedata2.normalize('NFC', ustr.decode('utf-8')))
def strip_accents(self, text):
    text = unicodedata2.normalize('NFD', text)
    text = text.encode('ascii', 'ignore')
    text = text.decode("utf-8")
    return str(text)
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )
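# A minimal illustration of the unicodeToAscii() approach above: NFD splits
# each accented letter into a base letter plus combining marks (category
# "Mn"), so dropping "Mn" characters leaves the unaccented base letters.
# The example input is illustrative only.
import unicodedata

assert unicodedata.category('\u0301') == 'Mn'   # COMBINING ACUTE ACCENT
print(unicodeToAscii('café résumé'))            # -> 'cafe resume'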
def remove_diacritics(character):
    nkfd_form = unicodedata2.normalize('NFKD', str(character))
    # strip combining marks, but keep maddah above, hamza above and hamza below
    char = u"".join([c for c in nkfd_form
                     if not unicodedata2.combining(c)
                     or c == u'ٓ' or c == u'ٔ' or c == u'ٕ'])
    return char
def get_stats_from_chars(text_chars, db=None):
    report = {}
    uppercase = []
    numerals = []
    punctuation = []
    controlchars = []
    spaces = []
    other = []
    # Include decomposed forms
    for c in text_chars:
        decomposed = ud.normalize("NFKD", c)
        if len(decomposed) > 1:
            text_chars = text_chars + [d for d in decomposed]
    text_chars = set(text_chars)
    for c in text_chars:
        # print(c, ud.category(c))
        cat = ud.category(c)
        if cat == "Lu":
            uppercase.append(c)
        elif cat.startswith("N"):
            numerals.append(c)
        elif cat.startswith("P"):
            punctuation.append(c)
        elif cat.startswith("C") and len(c) > 1:
            controlchars.append(c)
        elif cat.startswith("Z"):
            spaces.append(c)
        else:
            other.append(c)
    # Remove all but "other" from chars, we don't care about them for diffing
    for remove in [uppercase, numerals, punctuation, controlchars, spaces, ["\n", "\t"]]:
        text_chars = text_chars.difference(set(remove))
    report["iso_in_db"] = db is not None
    report["found_in_text"] = {
        "uppercase": sorted(uppercase),
        "numerals": sorted(numerals),
        "punctuation": sorted(punctuation),
        "chars": sorted(text_chars)
    }
    # Compare to orthographies
    if db is not None:
        db_chars = []
        if "orthographies" in db:
            for o in db["orthographies"]:
                if "base" in o:
                    db_chars = db_chars + o["base"]
                if "auxiliary" in o:
                    db_chars = db_chars + o["auxiliary"]
        db_chars = set(sorted(db_chars))
        not_in_db = text_chars.difference(db_chars)
        missing_from_text = db_chars.difference(text_chars)
        decomposed = set(parse_chars("".join(text_chars), decompose=True))
        # print("Listed in DB but not in text", missing_from_text)
        # print("Appears in text but not listed in DB", not_in_db)
        # print("Text can be written with DB characters",
        #       decomposed.issubset(db_chars))
        missing_from_db = ""
        for c in not_in_db:
            missing = ud.normalize("NFKD", c)
            missing_parts = ""
            for part in missing:
                if part not in db_chars:
                    missing_parts = missing_parts + part
            if missing_parts:
                missing_from_db = missing_from_db + missing_parts
        # print("missing from db", sorted(list(missing_from_db)))
        missing_from_db = sorted(list(set(missing_from_db)))
        report["not_in_text"] = sorted(missing_from_text)
        report["not_in_db"] = sorted(not_in_db)
        if missing_from_db:
            report["missing_from_db"] = missing_from_db
        report["db_chars_valid"] = decomposed.issubset(db_chars)
    return report
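# A tiny sketch of the Unicode category grouping used above (it assumes
# unicodedata2 is imported as "ud", as the function does): "Lu" is an
# uppercase letter, "N*" are numerals, "P*" punctuation, "Z*" separators.
import unicodedata2 as ud

for ch in ["A", "7", "!", " ", "é"]:
    print(repr(ch), ud.category(ch))
# 'A' Lu, '7' Nd, '!' Po, ' ' Zs, 'é' Ll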
def test_issue_10254_unicodedata2(self):
    """Test Python issue #10254 is avoided with unicodedata2 package."""
    text = 'Li̍t-sṳ́'
    self.assertEqual(text, unicodedata2.normalize('NFC', text))