def strip_unicode(text):
    # Python 2 helper: str objects have both .decode() and .encode() here.
    clean = ftfy.fix_text(ftfy.fix_encoding(text))
    temp = ""
    try:
        temp = clean.decode('utf-8', 'ignore').encode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError) as e:
        print("error1" + str(e))
        # first attempt failed, try the other way around
        try:
            temp = text.encode('utf-8').decode('utf-8', 'ignore').encode('utf-8')
        except (UnicodeEncodeError, UnicodeDecodeError) as e:
            print("error2" + str(e))
    return temp
def strip_unicode(text):
    clean = ftfy.fix_text(ftfy.fix_encoding(text))
    temp = ""  # ensure temp is defined even if both attempts below fail
    try:
        temp = clean.decode('utf-8', 'ignore').encode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        # first encoding failed, trying the other way
        try:
            temp = text.encode('utf-8').decode('utf-8', 'ignore').encode('utf-8')
        except (UnicodeEncodeError, UnicodeDecodeError):
            pass
    return temp
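For reference, the repair both strip_unicode variants above delegate to ftfy looks roughly like this; the sample string and expected output are illustrative only (a quick sanity check, not part of the original code), and exact results can vary by ftfy version.

import ftfy

# Typical mojibake: UTF-8 bytes that were decoded as Latin-1/CP-1252 somewhere upstream.
broken = "sÃ£o paulo"
print(ftfy.fix_encoding(broken))  # expected: "são paulo"
print(ftfy.fix_text(broken))      # fix_text additionally normalizes entities, widths, etc.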
def fetch():
    all_prices, timestamps = {}, []
    for i, url in enumerate(configuration.get('cardhoarder_urls')):
        s = fetcher_internal.fetch(url)
        s = ftfy.fix_encoding(s)
        timestamps.append(dtutil.parse_to_ts(s.split('\n', 1)[0].replace('UPDATED ', ''), '%Y-%m-%dT%H:%M:%S+00:00', dtutil.CARDHOARDER_TZ))
        all_prices[i] = parse_cardhoarder_prices(s)
    url = configuration.get('mtgotraders_url')
    if url:
        s = fetcher_internal.fetch(configuration.get('mtgotraders_url'))
        timestamps.append(dtutil.dt2ts(dtutil.now()))
        all_prices['mtgotraders'] = parse_mtgotraders_prices(s)
    if not timestamps:
        raise TooFewItemsException('Did not get any prices when fetching {urls} ({all_prices})'.format(
            urls=configuration.get('cardhoarder_urls') + [configuration.get('mtgotraders_url')],
            all_prices=all_prices))
    store(min(timestamps), all_prices)
def tokenizer_word(data_dir, category, save_dir, sw_dir):
    list_fn = os.listdir(os.path.join(data_dir, category))
    file_train = os.path.join(save_dir, "Train", "{}.txt".format(category))
    file_test = os.path.join(save_dir, "Test", "{}.txt".format(category))

    # load stopwords from file
    with open(sw_dir, "r") as f:
        stopwords = f.readlines()
    for i in range(len(stopwords)):
        stopwords[i] = stopwords[i].strip()

    # create (truncate) the train and test files for this category
    with open(file_train, "w", encoding="utf8"):
        pass
    with open(file_test, "w", encoding="utf8"):
        pass

    # train/test split (60/40)
    mid = int(len(list_fn) * 0.6 + 0.5)
    for i, fn in enumerate(list_fn):
        fl = os.path.join(data_dir, category, fn)
        with open(fl, "r") as f:
            content = f.readlines()
        content = " ".join(content)
        content = content.replace("\n", " ")
        content = fix_encoding(content)
        content = ViTokenizer.tokenize(content)
        content = gensim.utils.simple_preprocess(content)
        content = [w for w in content if w not in stopwords]
        content = " ".join(content)
        if len(content) < 100:
            continue
        if i < mid:
            with open(file_train, "a", encoding="utf8") as f:
                f.write(content)
                f.write("\n")
        elif i == mid:
            # last document of the train split: no trailing newline
            with open(file_train, "a", encoding="utf8") as f:
                f.write(content)
        else:
            if i == (len(list_fn) - 1):
                # last document of the test split: no trailing newline
                with open(file_test, "a", encoding="utf8") as f:
                    f.write(content)
            else:
                with open(file_test, "a", encoding="utf8") as f:
                    f.write(content)
                    f.write("\n")
def get_results_from_lines(lines, locale_template):
    all_results = []
    to_check_items = list(locale_template.items())
    already_check_items = list()
    value = {}
    prev_field = None
    for line in lines:
        line = ftfy.fix_encoding(line)
        found_value, value_new_entries = parse_line(line, to_check_items)
        if found_value:
            field, expression = found_value
            if field == "matrix_sub_value":
                value_new_entries = {"{}{}".format(prev_field, k): v for k, v in value_new_entries.items()}
            value.update(value_new_entries)
            prev_field = field
            if field == "end_of_file":
                if value:
                    result = seal_result(value, to_check_items)
                    all_results.append(result)
                to_check_items = reset_check_items(locale_template)
                value = {}
            if field in permanent_matches:
                continue
            already_check_items.append(found_value)
            to_check_items.remove(found_value)
            if len(to_check_items) <= len(permanent_matches):
                # No other field needs to be checked for this file
                # print("No other field to check for filename {}", filename)
                all_results.append(seal_result(value, to_check_items))
                value = {}
        else:
            # The line is potentially unwanted, so reset the items and add the partial result
            found_value, value_new_entries = parse_line(line, already_check_items)
            if found_value:
                # Something is wrong in the format of the file:
                # try to create a new value starting from this line
                all_results.append(seal_result(value, to_check_items))
                value = value_new_entries
                to_check_items = reset_check_items(locale_template)
                to_check_items.remove(found_value)
    # This is the end of the file
    return all_results
def fetch_content(request_url, extractor):
    meta = {}
    # fetch using python
    url, response = fetch_url(request_url)
    if response is not None:
        # extract content
        el, txt, meta = extractor.extract(response, url)
        if txt is not None and txt != '':
            txt = fix_encoding(txt)
            return url, response, txt, meta
    # give up
    return url if url else request_url, None, None, meta
def _sanitize_story_line(line):
    line = ftfy.fix_encoding(line)

    sentence_endings = [".", "!", "?", "...", "'", "`", '"', ")", "\u2019", "\u201d"]

    # CNN stories always start with "(CNN)"
    if line.startswith("(CNN)"):
        line = line[len("(CNN)"):]

    # Highlights are essentially bullet points and don't have proper sentence endings.
    # Guard against an empty line so line[-1] does not raise an IndexError.
    if line and line[-1] not in sentence_endings:
        line += "."

    return line
def getDatasetList(self):
    response = requests.get(self.reqUrl)
    linkList = BeautifulSoup(response.text, "html.parser").find_all('a')
    fixedList = list()
    datasetList = list()
    for link in linkList:
        fixedList.append(fix_encoding(link.get('href')))
    for item in fixedList:
        if item.find('Bases') != -1:
            # strip the "Bases/" prefix and the ".xlsx" extension (note the escaped dot)
            fixedItem = re.sub(r'Bases/|\.xlsx', '', item)
            datasetList.append(fixedItem)
    response.close()
    return datasetList
def _fetch_chart(self):
    req = urllib.request.Request(self.url)
    page = urllib.request.urlopen(req, data=None, timeout=15)
    root = lxml.html.parse(page)

    rank = 1
    entry = None
    for element in root.iter(tag=lxml.etree.Element):
        cls = element.get('class')
        if cls is None:
            continue
        if cls == 'wrap_rank':
            entry = ChartEntry()
            entry.rank = rank
            if rank > self.limit:
                break
            else:
                self.append(entry)
                rank += 1
            entry.change = element[0].get('class').replace(
                'icon_', '').replace('static', 'none').replace('rank_', '')
            # compare strings with != rather than "is not"
            if entry.change != 'new' and len(element) >= 2:
                entry.change_diff = element[1].text_content().strip()
        if cls == 'wrap_song_info' and entry is not None:
            next = False
            for a in element.iter(tag='a'):
                if not next:
                    entry.title = ftfy.fix_encoding(a.text_content().strip())
                    next = True
                else:
                    for artist in a.text_content().split('|')[0].replace(' & ', ',').split(','):
                        entry.artists.append(Artist(artist.strip()))
                    break
def get_articles(id_site, interval_start, interval_stop):
    query = ("SELECT id_article, xmltext FROM Article "
             "WHERE id_site = {} AND stimestamp between (1<<31)-unix_timestamp()+3600*24*{} "
             "AND (1<<31)-unix_timestamp()+3600*24*{}").format(id_site, interval_stop, interval_start)
    connection = pymysql.connect(host=CONNECTION_DETAILS["host"],
                                 port=CONNECTION_DETAILS["port"],
                                 user=CONNECTION_DETAILS["user"],
                                 passwd=CONNECTION_DETAILS["passwd"],
                                 db=CONNECTION_DETAILS["db"],
                                 charset=CONNECTION_DETAILS["charset"])
    cur = connection.cursor(pymysql.cursors.DictCursor)
    cur.execute(query)
    list_of_articles = []
    for line in cur.fetchall():
        text = ftfy.fix_encoding(line["xmltext"])
        plain_text = remove_html_tags(text)
        out = plain_text.translate(str.maketrans("", "", string.punctuation))
        list_of_articles.append(out)
    return list_of_articles
def clean_text(x):
    '''General purpose text cleaner that reads in a string, applies a Unicode
    sandwich, strips special characters, and returns a list of tokens.'''
    # Fix encoding in case there is a problem
    x = ftfy.fix_encoding(x)
    # Unicode sandwich...
    x = x.encode('UTF-8')  # encode as UTF-8
    input_code = chardet.detect(bytes(x))['encoding']  # Auto-detect encoding
    u_string = x.decode(input_code)  # Decode
    re.sub(r'\w{4}', u'xxxx', u_string, flags=re.UNICODE)  # Handle funny unicode artifacts (result is unused)
    x = u_string.encode('UTF-8')  # Encode back to UTF-8
    x = str(x)  # Convert the bytes repr to string
    x = re.sub(r'[\[\]\"\'\,]', ' ', x)  # Remove brackets, quotes and commas
    x = re.sub(r'[^\w\s]', ' ', x)  # Replace remaining non-word characters with spaces
    x = x.split()  # Split string to list
    return x
def chat():
    print("Enter your question and (quit) to stop!")
    while True:
        inp = input("You: ")
        if inp.lower() == "quit":
            break

        results = model.predict([bag_of_words(inp, words)])
        # print(bag_of_words(inp, words))
        results_index = numpy.argmax(results)
        # print(results)
        tag = labels[results_index]

        for tg in data["intents"]:
            if tg['tag'] == tag:
                responses = tg["responses"]
                print(fix_encoding(random.choice(responses)))
                break
def _fetch_chart(self):
    req = urllib.request.Request(self.url)
    page = urllib.request.urlopen(req, data=None, timeout=15)
    root = lxml.html.parse(page)

    rank = 1
    entry = None
    for element in root.iter(tag=lxml.etree.Element):
        cls = element.get('class')
        if cls is None:
            continue
        if cls == 'ranking':
            entry = ChartEntry()
            entry.rank = element.text_content().strip()
            if rank > self.limit:
                break
            else:
                self.append(entry)
                rank += 1
        if cls == 'change':
            change = element[0].get('class')
            change = change if change else 'none'
            entry.change = change
            change_diff = element.text_content().strip()
            if (change == 'up' or change == 'down') and change_diff != 'HOT':
                entry.change_diff = change_diff
        if cls == 'subject':
            entry.title = ftfy.fix_encoding(element[0].text_content().strip())
            for artist in element[1].text_content().split('|')[0].replace(' & ', ',').split(','):
                entry.artists.append(Artist(artist.strip()))
def get_in4_from_positon(i):
    data = pd.read_csv("static/data_img/data.csv")
    name_product = list(data['name_product'])
    link = list(data['link'])
    price = list(data['price'])
    try:
        fix_name = fix_encoding(name_product[i])
        name_product[i] = fix_name
    except Exception:
        pass
    return name_product[i], link[i], str(f'{int(price[i]):n}') + ' Đồng'


# i = get_locate_image('static/data_img/image/1588_00002.jpg')
# data = pd.read_csv("static/data_img/data.csv")
# name_img = list(data['name_img'])
# a, b, c = get_in4_from_positon(0)
# print(a, b, c)
def run() -> None:
    files = rotation.files()
    n = len(files)
    time_until = TIME_UNTIL_ROTATION - datetime.timedelta(weeks=1)
    if n >= rotation.TOTAL_RUNS:
        print('It is the moment of discovery, the triumph of the mind, and the end of this rotation.')
        return
    if n == 0 and TIME_UNTIL_ROTATION > datetime.timedelta(7):
        print('The monks of the North Tree rarely saw their kodama until the rotation, when it woke like a slumbering, angry bear.')
        print('ETA: {t}'.format(t=dtutil.display_time(int(time_until.total_seconds()))))
        return
    if n == 0:
        rotation.clear_redis(clear_files=True)
    # else:
    #     rotation.clear_redis()
    all_prices = {}
    for url in configuration.get_list('cardhoarder_urls'):
        s = fetch_tools.fetch(url)
        s = ftfy.fix_encoding(s)
        all_prices[url] = parse_cardhoarder_prices(s)
    url = configuration.get_str('mtgotraders_url')
    if url:
        s = fetch_tools.fetch(url)
        all_prices['mtgotraders'] = parse_mtgotraders_prices(s)
    run_number = process(all_prices)
    if run_number == rotation.TOTAL_RUNS:
        make_final_list()
    try:
        url = f'{fetcher.decksite_url()}/api/rotation/clear_cache'
        fetch_tools.fetch(url)
    except Exception as c:  # pylint: disable=broad-except
        print(c)
def processPage(page):
    title = page['title']
    if title not in page_uuids:
        return
    uuid = page_uuids[title]

    children = []
    if 'children' in page.keys():
        for child in page['children']:
            children.append({
                'html': applyHeading(renderMarkdown(fix_encoding(child['string'])), child) + renderBullets(child)
            })

    template_data = {
        'title': renderMarkdown(title, ignoreLinks=True),
        'blocks': children,
        'uuid': uuid,
        'references': []
    }

    global _linksTo
    for item in _linksTo:
        item['link_from'] = uuid
        item['title'] = renderMarkdown(title, ignoreLinks=True)
        item['text'] = renderMarkdown(item['text'], ignoreLinks=True)
        # if item['uuid'] == uuid:
        #     continue
        if item['link_to'] in references.keys():
            references[item['link_to']].append(item)
        else:
            references[item['link_to']] = [item]
    _linksTo = []

    page_data[title] = template_data
def fix_characters_in_string(text):
    r"""
    Removes control characters such as \r\n \x1b \ufffd from a string
    and returns a unicode string where all control characters have been
    replaced by a space.

    :param text: expects a unicode string
    :return: unicode string
    """
    # deal with encoding
    new_text = fix_encoding(text)
    # remove unicode characters from the "Specials" block
    # see: https://www.compart.com/en/unicode/block/U+FFF0
    new_text = re.sub(r"\\ufff.", " ", new_text.encode("unicode-escape"))
    # remove all kinds of control characters and emojis
    # see: https://www.fileformat.info/info/unicode/category/index.htm
    new_text = u"".join(ch if unicodedata.category(ch)[0] != "C" else " "
                        for ch in new_text.decode("unicode-escape"))
    return new_text
def ocr(filename):
    """
    This function will handle the core OCR processing of images.
    """
    i = cv2.imread(filename)
    # Convert to gray
    i = cv2.cvtColor(i, cv2.COLOR_BGR2GRAY)
    # Apply dilation and erosion to remove some noise
    kernel = np.ones((1, 1), np.uint8)
    i = cv2.dilate(i, kernel, iterations=1)
    i = cv2.erode(i, kernel, iterations=1)
    text = pytesseract.image_to_string(i)
    # Clean up gibberish / mojibake in the OCR output
    text = ftfy.fix_text(text)
    text = ftfy.fix_encoding(text)
    return text
def process_target_tables(tar_tables, doc):
    # only the first 3 tables are of interest
    for x in range(0, 3):
        tablenum = x
        tr_list = tar_tables[x].find_all("tr")
        print("--- Row list length: " + str(len(tr_list)))
        # time.sleep(1)
        # tr_list[0] is just the column headers... can be ignored
        for y in range(1, len(tr_list)):
            curr_rowdata = tr_list[y].find_all("td")
            print(curr_rowdata)
            print("^^^^^^^^^")
            tar_info = process_row_info(curr_rowdata, tablenum)
            # exceptions for the wonky seasonal table:
            # if any of these trigger, it's an empty row to be ignored
            try:
                if tar_info[1].strip() == "":
                    pass  # nothing at idx 1
                elif tar_info[2].strip() == "":
                    pass  # nothing at idx 2
                elif tar_info[3].strip() == "":
                    pass  # nothing at idx 3
                else:
                    for z in tar_info:
                        try:
                            purified_line = ftfy.fix_encoding(z)
                            fixed_line = ftfy.fix_text(purified_line)
                            doc.add_paragraph(fixed_line)
                            # print(z.encode("ascii", "replace"))
                        except Exception:
                            print("Check for line info processing error...")
                            time.sleep(1)
            except Exception:
                pass
    print("[+] Page processing complete! ---")
def clean_string(s):
    s = str(s)
    if isnull(s):
        return None
    elif re.search('[a-zA-Z]', s) is None:
        return None
    else:
        s = remove_bom(s)
        s = remove_control_chars(s)
        s = fix_encoding(s)
        s = fix_text(s)
        s = fix_partial_utf8_punct_in_1252(s)
        s = decode_escapes(s)
        s = fix_latin_ligatures(s)
        s = uncurl_quotes(s)
        s = s.replace("Äu0087", "ć")
        s = s.replace("Äu0090", "Đ")
        s = s.replace("Ãu0096", "Ö")
        s = s.replace("Åu008D", "ō")
        s = s.replace("\\", " ")
        s = s.replace("/", " ")
        s = s.replace("ö", "ö")

        p = re.compile(r"^\w+[A-Z]{1}\w*$")
        if p.search(s):
            # Split camelCase words by inserting a space before inner capitalised words.
            # From: https://stackoverflow.com/a/37697078
            s = re.sub('(?!^)([A-Z][a-z]+)', r' \1', s)

        # Drop everything inside parentheses or square brackets.
        new_string = ""
        p = False
        for letter in s:
            if letter in "([":
                p = True
            elif letter in ")]":
                p = False
                continue
            if not p:
                new_string += letter

        return new_string.strip()
def clean_data(x, rem_stopwords=True, ner=True, rem_punc=True, lemmatize=False):
    '''Text cleaner that applies preprocessing rules to text. Preprocessing options
    include NER, stop word removal, lemmatization, HTML cleaning, and punctuation removal.'''
    # Unicode sandwich
    # Fix encoding in case there is a problem
    x = ftfy.fix_encoding(x)
    x = x.encode('UTF-8')  # encode as UTF-8
    input_code = chardet.detect(bytes(x))['encoding']  # Auto-detect encoding
    try:
        u_string = x.decode(input_code)  # Decode
    except:
        u_string = x.decode('UTF-8')
    re.sub(r'\w{4}', u'xxxx', u_string, flags=re.UNICODE)  # Handle funny unicode artifacts (result is unused)
    x = u_string.encode('UTF-8')  # Encode back to UTF-8
    x_clean = str(x)  # Convert to string

    if ner == True:
        x_clean = capitalize_named_entities(x_clean)
    if rem_stopwords == True:
        x_clean = remove_stopwords(x_clean)
    if lemmatize == True:
        x_clean = lemmatize_text(x_clean)
    soup = BeautifulSoup(x_clean, 'html.parser')
    x_clean = soup.text
    if rem_punc == True:
        x_clean = remove_punctuation(x_clean)
    # tokens = word_tokenize(x_clean)
    return x_clean
def cleanText(x):
    '''General purpose text cleaner that turns text into a list of tokens fit for
    Word2Vec and Doc2Vec. INPUT: string; OUTPUT: list of tokens.'''
    # Fix encoding in case there is a problem
    x = ftfy.fix_encoding(x)
    # Unicode sandwich...
    x = x.encode('UTF-8')  # encode as UTF-8
    input_code = chardet.detect(bytes(x))['encoding']  # Auto-detect encoding
    try:
        u_string = x.decode(input_code)  # Decode
    except:
        u_string = x.decode('UTF-8')
    re.sub(r'\w{4}', u'xxxx', u_string, flags=re.UNICODE)  # Handle funny unicode artifacts (result is unused)
    x = u_string.encode('UTF-8')  # Encode back to UTF-8
    x = str(x)  # Convert the bytes repr to string
    x = re.sub(r'[\[\]\"\'\,]', ' ', x)  # Remove brackets, quotes and commas
    x = re.sub(r'[^\w\s]', ' ', x)  # Replace remaining non-word characters with spaces
    x = x.split()  # split string to list
    return x
def horo(self, irc, msg, args, midnight_check, psign):
    """[bool: only print when local hour is 0 - default False] [sign: if not set, will dump all signs]

    Returns the horoscope in French for one zodiacal sign or for all of them.
    """
    signs = []
    tsigns = ['belier', 'taureau', 'gemeaux', 'cancer', 'lion', 'vierge',
              'balance', 'scorpion', 'sagittaire', 'capricorne', 'verseau', 'poissons']
    if midnight_check and int(datetime.datetime.fromtimestamp(time()).strftime('%H')):
        self.log.info("Horoscope plugin: checked and not [00:00-00:59] local time")
        return
    if psign:
        if psign not in tsigns:
            irc.error("Same player try again!")
        else:
            signs = [psign]
    else:
        signs = tsigns
    for sign in signs:
        url = "https://www.lhoroscope.com/horoscope-general/horoscope-%s-du-jour.asp" % sign
        try:
            result = requests.get(url)
            soup = BeautifulSoup(result.content)
            h = soup.find(class_="panel-body").text.strip()
            h = fix_encoding(h)
        except Exception as e:
            irc.error("Error {}".format(e))
            break
        else:
            irc.reply(ircutils.bold(sign.title()) + " : " + h, prefixNick=False)
            sleep(2)
def run() -> None:
    files = rotation.files()
    n = len(files)
    time_until = min(TIME_UNTIL_FULL_ROTATION, TIME_UNTIL_SUPPLEMENTAL_ROTATION) - datetime.timedelta(weeks=1)
    if n >= TOTAL_RUNS:
        print('It is the moment of discovery, the triumph of the mind, and the end of this rotation.')
        if time_until < datetime.timedelta(days=3):
            for f in files:
                print('Removing {f}'.format(f=f))
                os.remove(f)
        return
    if n == 0 and TIME_UNTIL_FULL_ROTATION > datetime.timedelta(7) and TIME_UNTIL_SUPPLEMENTAL_ROTATION > datetime.timedelta(7):
        print('The monks of the North Tree rarely saw their kodama until the rotation, when it woke like a slumbering, angry bear.')
        print('ETA: {t}'.format(t=dtutil.display_time(time_until.total_seconds())))
        return
    all_prices = {}
    for url in configuration.get_list('cardhoarder_urls'):
        s = fetcher_internal.fetch(url)
        s = ftfy.fix_encoding(s)
        all_prices[url] = parse_cardhoarder_prices(s)
    url = configuration.get_str('mtgotraders_url')
    if url:
        s = fetcher_internal.fetch(url)
        all_prices['mtgotraders'] = parse_mtgotraders_prices(s)
    run_number = process(all_prices)
    if run_number == TOTAL_RUNS:
        make_final_list()
def stream_json_file(self, json_file_name, output_file_name="parsed_tweets.csv", stop_at=1000000000, verbose=False):
    # Create an empty csv file that we will append to, and give it a header row.
    if verbose:
        print("Initializing Output File: %s" % output_file_name)
        print("Generating Header Row")
    with open('%s' % output_file_name, 'w') as f:
        # Column headers and a trailing newline. MAKE SURE the \n is attached to the last field, e.g. text\n
        f.write('id,text,created_at,language,retweet_count,screen_name,country,user_followers_count,time_zone,user_account_location,longitude,lattitude,name\n')

    tweet_counter = 0
    for i in open(json_file_name):
        if tweet_counter > stop_at:
            break
        try:
            # Parse the current tweet and put its data into a list
            current_tweet = json.loads(i)

            ## Elements that are 1 level deep ##
            # Get the id or insert a nan if not found
            if 'id' in current_tweet:
                id = current_tweet['id']
            else:
                id = np.nan
            # Get text or insert nan
            if 'text' in current_tweet:
                text = current_tweet['text']
                # the fix_encoding function from the FTFY package takes care of weird characters
                text = ftfy.fix_encoding(text)
            else:
                text = np.nan
            # Get created_at or insert nan
            if 'created_at' in current_tweet:
                created_at = current_tweet['created_at']
            else:
                created_at = np.nan
            # Get language or insert nan
            if 'lang' in current_tweet:
                language = current_tweet['lang']
            else:
                language = np.nan
            # Get retweet count or insert nan
            if 'retweet_count' in current_tweet:
                retweet_count = current_tweet['retweet_count']
            else:
                retweet_count = np.nan

            ## Elements that are 2 levels deep ##
            # For elements that are 2 layers deep, compare against None because the JSON uses null for missing values.
            # Get screen name or insert nan
            if 'user' in current_tweet and 'screen_name' in current_tweet['user']:
                screen_name = current_tweet['user']['screen_name']
            else:
                screen_name = np.nan
            # Get name or insert nan
            if 'user' in current_tweet and 'name' in current_tweet['user']:
                name = current_tweet['user']['name']
            else:
                name = np.nan
            # Get country or insert nan
            if current_tweet['place'] != None and current_tweet['place']['country'] != None:
                country = current_tweet['place']['country']
            else:
                country = np.nan
            # Get the author's follower count or nan
            if current_tweet['user'] != None and current_tweet['user']['followers_count'] != None:
                followers_count = current_tweet['user']['followers_count']
            else:
                followers_count = np.nan
            # Get the timezone or nan
            if current_tweet['user'] != None and current_tweet['user']['time_zone'] != None:
                time_zone = current_tweet['user']['time_zone']
            else:
                time_zone = np.nan
            # Get the account location or insert nan
            if current_tweet['user'] != None and current_tweet['user']['location'] != None:
                account_location = current_tweet['user']['location']
                account_location = ftfy.fix_encoding(account_location)
            else:
                account_location = np.nan

            ## Elements that are 3 levels deep ##
            if current_tweet['coordinates'] != None and current_tweet['coordinates']['coordinates'] != None and len(current_tweet['coordinates']['coordinates']) == 2:
                longitude = current_tweet['coordinates']['coordinates'][0]
            else:
                longitude = np.nan
            if current_tweet['coordinates'] != None and current_tweet['coordinates']['coordinates'] != None and len(current_tweet['coordinates']['coordinates']) == 2:
                lattitude = current_tweet['coordinates']['coordinates'][1]
            else:
                lattitude = np.nan

            # Assemble the row
            cleaned_current_tweet = [id, text, created_at, language, retweet_count, screen_name,
                                     country, followers_count, time_zone, account_location,
                                     longitude, lattitude, name]

            # Increment the tweet counter
            tweet_counter = tweet_counter + 1

            # Give the user a progress update
            if tweet_counter % 1000 == 0 and verbose:
                print(" %d Tweets Parsed so far....." % tweet_counter)

            # Append the current tweet as a row to the csv
            with open('%s' % output_file_name, 'a', newline='') as f:
                writer = csv.writer(f)
                writer.writerow(cleaned_current_tweet)
        except:
            pass

    if verbose:
        print(" ")
        print(" Parsing Complete: %d Tweets Parsed " % tweet_counter)
def add_items(pickle_data, items, descriptions):
    for i in range(len(items)):
        pickle_data['Food'].append(
            [ftfy.fix_encoding(items[i]), ftfy.fix_encoding(descriptions[i])])
    return pickle_data
def enc(text):
    if text:
        return ftfy.fix_encoding(HTMLParser().unescape(text)).encode("utf-16be", "replace")
# cv2.imshow("Image", image)
# cv2.imshow("Output", gray)
# cv2.waitKey(5000)

# Write the extracted data into a text file
text_output = open('outputbase.txt', 'w', encoding='utf-8')
text_output.write(text)
text_output.close()

file = open('outputbase.txt', 'r', encoding='utf-8')
text = file.read()
# print(text)

# Clean up gibberish / mojibake in the extracted text
text = ftfy.fix_text(text)
text = ftfy.fix_encoding(text)
print(text)

# Initializing data variables
name = None
fname = None
dob = None
pan = None
nameline = []
dobline = []
panline = []
text0 = []
text1 = []
text2 = []

# Searching for PAN
def clean_html_string(x):
    return ftfy.fix_encoding(
        ftfy.fix_text(x.replace("\n", "").replace("\t", "").strip(),
                      normalization='NFKC'))
def main():
    print(ftfy.fix_encoding('주문하다 - to intent for?'))
def fix_encoding(s):
    s = ftfy.fix_encoding(s)
    better, _ = codecs.escape_decode(s)
    return better.decode("utf-8").strip()
def fix_encoding(elem):
    # Python 2: only attempt a repair on unicode strings, pass everything else through.
    if isinstance(elem, unicode):
        return ftfy.fix_encoding(elem)
    else:
        return elem
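The snippet above relies on the Python 2 unicode type; a minimal Python 3 analogue (a sketch, not the original author's code) would check for str instead:

import ftfy

def fix_encoding_py3(elem):
    # Only attempt a repair on text; bytes and non-string values pass through unchanged.
    if isinstance(elem, str):
        return ftfy.fix_encoding(elem)
    return elem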
# Python 2 script: count word frequencies in War and Peace from Project Gutenberg.
response = urllib.urlopen('http://www.gutenberg.org/files/2600/2600-0.txt')
print(response.info())
html = response.read()
response.close()

words = {}
html = html.split()
print(len(html))


def addWord(words, word):
    if word not in words:
        words[word] = 0
    words[word] += 1


# `exclude` is assumed to be defined elsewhere (e.g. a set of punctuation characters).
for word in html:
    word = fix_encoding(word.decode('utf-8').lower())
    word = ''.join(ch for ch in word if ch not in exclude)
    if word.find(u"\u2014") > 0:
        # split words joined by an em dash
        word = word.split(u"\u2014")
    if not isinstance(word, list):
        addWord(words, word)
    else:
        for w in word:
            addWord(words, w)

print(words)
def __init__(self, name):
    name = ftfy.fix_encoding(name)
    self._name = self._english_artist(name)
def fix_component_encodings(cls, components):
    return {k: ftfy.fix_encoding(safe_decode(v)) for k, v in six.iteritems(components)}
def normalize_text(text, fix_encoding=False, strip_emojis=False):
    """
    Normalize text:

    * normalize accents (using NFC convention),
    * strip control/invisible chars and leftover combining diacritics,
    * undo ligatures,
    * normalize quotes, apostrophes and unicode characters (dashes, etc.),
    * normalize spaces (all spaces, including nbsp and tabs, will be encoded as 0x20),
    * strip and collapse multiple spaces into one,
    * etc.

    Optionally:

    * try to detect and fix encoding issues (see `ftfy.fix_encoding <https://ftfy.readthedocs.io/en/latest/>`_)
      on a per-sentence basis (delimited by newlines);
    * strip emojis (using a simple regex, not all cases are covered!)

    :param text: the text to normalize, newlines will be preserved;
    :param fix_encoding: if set, use ftfy to fix encoding issues on a per-sentence basis;
    :param strip_emojis: if set, try to find and strip unicode emojis;
    :return: the normalized text
    """
    # optionally fix encoding using ftfy
    if fix_encoding and wrong_encoding_pattern.search(text) is not None:
        try:
            import ftfy
            text = '\n'.join(
                ftfy.fix_encoding(t) if wrong_encoding_pattern.search(t) is not None else t
                for t in text.split('\n'))
        except ModuleNotFoundError:
            print('WARNING: norm_punc.py, fixing encoding requires the ftfy package: pip install ftfy.')

    # normalize (e.g. combining diacritics)
    text = unicodedata.normalize('NFC', text)

    # optionally strip emojis
    if strip_emojis:
        # I formerly used the emoji library, which is really nice but slooooow (and doesn't cover ASCII misc symbols).
        # I thus preferred a simpler regex that covers most cases and is waaaay faster
        # (203ms to process 164343 short sentences, against 31s with emoji).
        text = emoji_pattern.sub(' ', text)

    # apply patterns in order
    for typ, pattern, replace in normalization_patterns:
        if typ == REG:
            text = pattern.sub(replace, text)
        else:
            text = text.replace(pattern, replace)

    # normalize spaces
    text = spaces_pattern.sub(' ', text)
    # don't forget to normalize spaces at the beginning and end
    text = re.sub(r'(^|\n)\s+', r'\1', text)
    text = re.sub(r'\s+(\n|$)', r'\1', text)

    return text
import ftfy

with open("data/small_text.json") as fp:
    for line in fp:
        print(ftfy.fix_encoding(line), end='')
def fix_encoding(string):
    return jsonify({
        "fixed": ftfy.fix_encoding(string)
    })
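One possible way to wire the jsonify-based helper above into a small Flask app; the route, app object and request field names are assumptions for illustration, not taken from the original source.

from flask import Flask, jsonify, request
import ftfy

app = Flask(__name__)

@app.route("/fix-encoding", methods=["POST"])
def fix_encoding_endpoint():
    # Expect a JSON body such as {"text": "sÃ£o paulo"} and return the repaired string.
    payload = request.get_json(force=True) or {}
    return jsonify({"fixed": ftfy.fix_encoding(payload.get("text", ""))})

if __name__ == "__main__":
    app.run(debug=True)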