full_text = read_data(filename) new_text = u"" new_text_list = list() for i in range(len(full_text)): if (i + 1) % 10000000 == 0: print("%s characters are filtered" % i) if ord(full_text[i]) < 256: new_text_list.append(full_text[i]) text = new_text.join(new_text_list) del new_text_list del new_text del full_text (not_one_byte_counter, min_character_order_index, max_character_order_index, number_of_characters, present_characters_indices) = check_not_one_byte(text) print("number of not one byte characters: ", not_one_byte_counter) print("min order index: ", min_character_order_index) print("max order index: ", max_character_order_index) print("total number of characters: ", number_of_characters) f = open('enwik8_filtered', 'wb') f.write(text.encode('utf8')) f.close() else: f = open('enwik8_filtered', 'rb') text = f.read().decode('utf8') f.close() (not_one_byte_counter, min_character_order_index,
# Load the enwik8 corpus restricted to characters whose code point is < 256.
#
# The filtered text is cached on disk in `filtered_path` (UTF-8 encoded). When
# the cache is missing it is rebuilt from the downloaded archive; otherwise it
# is read back directly. In both cases the module-level names `text`,
# `not_one_byte_counter`, `min_character_order_index`,
# `max_character_order_index`, `number_of_characters` and
# `present_characters_indices` are defined afterwards.
filtered_path = 'enwik8_filtered'

# The existence check must look at the same file the else-branch reads;
# testing for 'enwik8' instead either crashed on a missing cache or ignored an
# existing one.
if not os.path.exists(filtered_path):
    # NOTE(review): maybe_download / read_data are defined outside this block;
    # presumably they fetch the archive and return its contents as text.
    filename = maybe_download('enwik8.zip', 36445475)
    full_text = read_data(filename)

    # Keep only characters that fit in one byte (code point below 256).
    kept_characters = []
    for i, character in enumerate(full_text):
        # Progress report every 10M characters of input.
        if (i + 1) % 10000000 == 0:
            print("%s characters are filtered" % i)
        if ord(character) < 256:
            kept_characters.append(character)
    text = u"".join(kept_characters)

    # Release the large intermediates before further processing.
    del kept_characters
    del full_text

    (not_one_byte_counter, min_character_order_index,
     max_character_order_index, number_of_characters,
     present_characters_indices) = check_not_one_byte(text)

    print("number of not one byte characters: ", not_one_byte_counter)
    print("min order index: ", min_character_order_index)
    print("max order index: ", max_character_order_index)
    print("total number of characters: ", number_of_characters)

    # Persist the filtered text so later runs can skip download and filtering.
    with open(filtered_path, 'wb') as f:
        f.write(text.encode('utf8'))
else:
    with open(filtered_path, 'rb') as f:
        text = f.read().decode('utf8')

    (not_one_byte_counter, min_character_order_index,
     max_character_order_index, number_of_characters,
     present_characters_indices) = check_not_one_byte(text)