def test_greek_betacode_to_unicode(self):
    """Check that a Beta Code sample converts to the expected Unicode.

    Note carried over from the original author: assertEqual appears to
    not be correctly comparing certain characters (ά and ί, at least).
    """
    expected = 'ὅπως οὖν μὴ ταὐτὸ '
    beta_sample = r"""O(/PWS OU)=N MH\ TAU)TO\ """
    # Run the sample through the Beta Code replacer and compare verbatim,
    # including the trailing space.
    converted = Replacer().beta_code(beta_sample)
    self.assertEqual(converted, expected)
def compile_tlg_txt(self):
    """Read the original TLG Beta Code files and write Unicode versions.

    Creates a ``tlg`` directory under ``self.compiled_files_dir`` if it
    does not exist, (re)builds and reads the author index, then for each
    index entry reads ``<orig_files_dir>/tlg/<name>.TXT``, converts its
    Beta Code to Unicode and writes ``<compiled_files_dir>/tlg/<name>.txt``
    encoded as UTF-8. Finally the meta index and the author/works index
    are built.

    Read and write failures (``IOError``) are logged and the affected
    file is skipped; they do not abort the run.
    """
    logging.info('Starting TLG corpus compilation into files.')
    compiled_files_dir_tlg = os.path.join(self.compiled_files_dir, 'tlg')
    if not os.path.isdir(compiled_files_dir_tlg):
        os.mkdir(compiled_files_dir_tlg)
    self.make_tlg_index_file_author()
    self.read_tlg_index_file_author()
    # The source directory is the same for every file, so build it once.
    orig_files_dir_tlg = os.path.join(self.orig_files_dir, 'tlg')
    # NOTE(review): ``tlg_index`` is not defined in this method; presumably
    # read_tlg_index_file_author() publishes it as a module-level mapping
    # of file name -> author abbreviation. Confirm against that method.
    for file_name, abbrev in tlg_index.items():
        files_path = os.path.join(orig_files_dir_tlg, file_name + '.TXT')
        # Keep the try body to the read itself so that only genuine
        # open/read failures are reported as "Failed to open".
        try:
            with open(files_path, 'rb') as index_opened:
                txt_read = index_opened.read().decode('latin-1')
        except IOError:
            logging.error('Failed to open TLG file %s of author %s',
                          file_name, abbrev)
            continue
        txt_ascii = remove_non_ascii(txt_read)
        new_uni = Replacer().beta_code(txt_ascii)
        file_path = os.path.join(compiled_files_dir_tlg, file_name + '.txt')
        try:
            # Explicit UTF-8: the converted text is Greek, and the platform
            # default encoding may be unable to represent it.
            with open(file_path, 'w', encoding='utf-8') as new_file:
                new_file.write(new_uni)
        except IOError:
            logging.error('Failed to write to new file %s of '
                          'author %s', file_name, abbrev)
        else:
            # Only report success when the write actually succeeded.
            logging.info('Finished TLG corpus compilation to %s', file_path)
    self.make_tlg_meta_index()
    self.make_tlg_index_auth_works()