def test_file_input(self):
    """Run charset detection on every fixture file and compare the best
    match's encoding against the expectation table SHOULD_BE."""
    for path_name in glob('./data/*.srt') + glob('./data/*.txt'):
        with self.subTest('test_file_input <{}>'.format(path_name)):
            matches = CnM.from_path(path_name)
            self.assertGreater(len(matches), 0)
            best_match = matches.best().first()
            self.assertIsNotNone(best_match)
            # SHOULD_BE maps a file name to either one encoding (str)
            # or a collection of acceptable encodings.
            expected = TestFileCharsetNormalizer.SHOULD_BE[basename(path_name)]
            if isinstance(expected, str):
                self.assertEqual(best_match.encoding, expected)
            else:
                self.assertIn(best_match.encoding, expected)
def process_drive(img_burn_exe: str, drive: str, output_folder: str):
    """
    Processes drive, tests if it is ready and if it is - tries to backup

    :param img_burn_exe: path to Exe file
    :param drive: Drive from which backup will be performed
    :param output_folder: Folder to which output will be saved
    :return:
    """
    if not test_drive(drive):
        logging.info("Waiting for drive: %s to be ready", drive)
        return
    autorun_file = Path(f"{drive}Autorun.inf")
    autorun_label = ""
    if autorun_file.is_file():
        parser = ConfigParser()
        # Autorun.inf files come in arbitrary encodings; detect before reading.
        encoding = CnM.from_path(autorun_file).best().first().encoding
        logging.debug("Detected autorun.inf encoding: %s", encoding)
        try:
            parser.read_string(
                autorun_file.read_text(encoding=encoding).lower())
        except DuplicateOptionError as err:
            # FIX: was a silent `pass` with an unused binding — a malformed
            # Autorun.inf is still best-effort-ignored, but now logged.
            logging.warning("Ignoring malformed Autorun.inf on %s: %s",
                            drive, err)
        else:
            if 'label' in parser['autorun']:
                autorun_label = parser['autorun']['label'].upper()
    backup_disk(autorun_label, drive, img_burn_exe, output_folder)
def normalise_to_utf8(bytes_or_filepath):
    """Convert any text input with unknown encoding to utf-8.

    Parameters
    ----------
    bytes_or_filepath : bytes or str
        A binary string or path to any text file in any encoding.

    Returns
    -------
    str
        A string with correct utf-8 encoding.

    Raises
    ------
    TypeError
        Input is not of type bytes or a valid path to an existing file.
    """
    # FIX: was `type(...) == bytes`; isinstance also accepts bytes subclasses.
    if isinstance(bytes_or_filepath, bytes):
        return str(cnm.from_bytes(bytes_or_filepath).best().first())
    if os.path.isfile(bytes_or_filepath):
        return str(cnm.from_path(bytes_or_filepath).best().first())
    raise TypeError('Input must be bytes or a valid file path')
def _detect_file_encoding(path: Path) -> str:
    """Return an approximate encoding of text file.

    Performs an encoding detection and BOM check.

    Args:
        path: The path to playlist file

    Returns:
        A string with "best" encoding from following: 'utf-8', 'utf-8-sig',
        'cp1251', 'cp1252', 'utf_16_le'.

    Raises:
        ClickException: The file was no found or the encoding was not
        retrieved from 'charset_normalizer'
    """
    try:
        # cp_isolation restricts detection to the encodings this app supports.
        detection_result = (CnM.from_path(
            path,
            cp_isolation=["utf_8", "cp1252", "cp1251", "utf_16_le"]).best().first())
        encoding = "utf-8"
        if path.suffix == ".aimppl4":
            # AIMP playlists are always UTF-16 LE regardless of detection.
            encoding = "utf-16-le"
        elif detection_result.encoding == "utf_8":
            if detection_result.byte_order_mark:
                encoding = "utf-8-sig"
        else:
            encoding = detection_result.encoding
        return encoding
    except (OSError, AttributeError) as error:
        # AttributeError can occur when detection yields no match
        # (.best().first() returning None). FIX: chain the original
        # exception so the traceback context is preserved.
        raise ClickException(str(error)) from error
def encoding_from_path(txt_file_path):
    """Return the detected text encoding of *txt_file_path*.

    big5 / cp1252 detections are overridden to utf-8 (known misdetections
    for this data set).
    """
    # FIX: removed dead `file_encoding = 'utf-8'` that was immediately
    # overwritten by the detection result.
    enc = CnM.from_path(txt_file_path).best().first()
    file_encoding = enc.encoding
    # fix same encoding: treat these misdetections as utf-8
    if file_encoding in ('big5', 'cp1252'):
        file_encoding = 'utf-8'
    return file_encoding
def source(source_file_s):
    """Load the source file's text using automatic encoding detection;
    warn through the UI when the file turns out to be empty."""
    # TODO: file missing exception
    decoded = str(cnm.from_path(source_file_s).best().first())
    if not decoded:
        ui.message.addItem("Файл оказался пустым!")
        ui.save_button.setVisible(False)
    return decoded
def get_corpus(self):
    """Build the Corpus (utterance map + audio list) from the extracted
    SIWIS wav/IT and txt/IT directory trees.

    Walks every wav subfolder, pairs each .wav with its .txt transcript,
    and reads the transcript with a detected encoding.
    """
    utterances = {}
    audios = []
    wav_dir = os.path.join(self.origin_data_path, self.extract_dir, "wav", "IT")
    text_dir = os.path.join(self.origin_data_path, self.extract_dir, "txt", "IT")
    for subdir, dirs, files in os.walk(wav_dir):
        # FIX: removed unused `uuu = 0` counter.
        for _dir in dirs:
            curr_wav_dir = os.path.join(subdir, _dir)
            curr_txt_dir = os.path.join(text_dir, _dir)
            # iterate wav files in the current folder
            for fname in os.listdir(curr_wav_dir):
                fname = os.fsdecode(fname)
                wav_file_path = os.path.join(wav_dir, _dir, fname)
                txt_file_path = os.path.join(
                    curr_txt_dir, fname.split('.')[0] + '.txt')
                if not os.path.isfile(txt_file_path):
                    # FIX: the original never filled the {} placeholder.
                    print('audio file {} doesn\'t have a file transcript'
                          .format(wav_file_path))
                    continue
                # files have different encoding (utf-8, utf_16_be, etc..)
                # need check to open file with correct encoding
                enc = CnM.from_path(txt_file_path).best().first()
                file_encoding = enc.encoding
                # fix same encoding: big5/cp1252 are misdetections here
                if file_encoding in ('big5', 'cp1252'):
                    file_encoding = 'utf-8'
                with open(txt_file_path, "r", encoding=file_encoding) as f:
                    transcript = f.readline().strip()
                # append data manifest
                utterances[wav_file_path] = transcript
                audios.append(wav_file_path)
    # collect corpus
    corpus = Corpus(utterances, audios)
    # SIWIS clips need resample - clips are 44100Hz 706 kb/s (1 chnl)
    corpus.make_wav_resample = True
    return corpus
def read_txt_file(self, txt_file_path):
    """Read and return the (stripped) first line of a transcript file,
    opening it with an auto-detected encoding.

    Files have different encodings (utf-8, utf_16_be, etc.), so a
    detection pass is needed before opening.
    """
    # FIX: removed dead `file_encoding = 'utf-8'` that was immediately
    # overwritten by the detection result.
    enc = CnM.from_path(txt_file_path).best().first()
    file_encoding = enc.encoding
    # fix same encoding: big5/cp1252 are misdetections for these files
    if file_encoding in ('big5', 'cp1252'):
        file_encoding = 'utf-8'
    with open(txt_file_path, "r", encoding=file_encoding) as f:
        transcript = f.readline()
    return transcript.strip()
def parse(self, file_name):
    """Decode *file_name* via charset detection and feed the text to the
    underlying parser, returning the first parse result."""
    decoded_text = str(CnM.from_path(file_name).best().first())
    parsed = self.parser.parse(decoded_text)
    return parsed[0]
###################################################################################### def ClearScreen(): if platform.system() == "Windows": os.system("cls") if platform.system() == "Linux": os.system("clear") ###################################################################################### ClearScreen() ModelPath = easygui.fileopenbox("Selecione a legenda modelo:") TargetPath = easygui.fileopenbox("Selecione a legenda alvo:") ###################################################################################### ModelCoding = CnM.from_path(ModelPath).best().first().encoding TargetCoding = CnM.from_path(TargetPath).best().first().encoding ###################################################################################### ModelSub = open(ModelPath, encoding=ModelCoding) TargetSub = open(TargetPath, encoding=TargetCoding) ModelContent = ModelSub.readlines() TargetContent = TargetSub.readlines() i = 0 ModelTimesPos = [] for l in ModelContent: if IsSubTimeLine(l): ModelTimesPos.append(i) i += 1 i = 0 TargetTimesPos = []
# Parse CLI flags, detect the input CSV's encoding, then read group rows.
args = sys.argv
enc = None
vk = None
data = []
try:
    # Optional positional flags; IndexError simply means a flag was omitted.
    use_cp1251, use_auth = parse_arg(args[1]), parse_arg(args[2])
except ValueError as exc:
    print(f"Неверный аргумент: {exc.args[0]}. Возможны только аргументы {POSSIBLE_ARGS}")
    exit(-1)
except IndexError:
    pass
# NOTE(review): the matching `except` for this try lies beyond this chunk.
try:
    enc = CnM.from_path(INPUT_FNAME).best().first().encoding
    if enc != 'utf-8':
        print("\n\n", ENC_WARN, "\n\n")
        # Only prompt if the flag was not already given on the command line.
        if use_cp1251 is None:
            use_cp1251 = yes_no("Использовать cp1251 вместо текущей кодировки?")
        if use_cp1251:
            enc = 'cp1251'
    # parse the file with group IDs
    print("Используется кодировка: ", enc)
    with open(INPUT_FNAME, 'r', newline='', encoding=enc) as csvf:
        # Sniff the CSV dialect from the first 1 KiB, then rewind.
        dialect = csv.Sniffer().sniff(csvf.read(1024))
        csvf.seek(0)
        reader = csv.reader(csvf, dialect=dialect)
        for row in reader:
            # Skip the header row.
            if row[0] == INPUT_FILE_HEADER_GROUP_TITLE:
                continue
def get_data(folder_path: str) -> dict:
    """
    Iterates through a directory to create a Dict of Pandas DataFrames
    with filepaths as their keys.

    :type folder_path: str
    :rtype: dict
        keys: filepaths
        values: pd.DataFrame
    """
    print("Initializing Data Retrieval...")
    csvfiles = glob.glob(folder_path + "/**/*.csv", recursive=True)
    xlfiles = glob.glob(folder_path + "/**/*.xls?", recursive=True)
    xlfiles = xlfiles + glob.glob(folder_path + "/**/*.xls", recursive=True)
    file_dict = {}
    i = 1
    for file in xlfiles:
        print("Reading File %d of %d:" % (i, len(csvfiles) + len(xlfiles)))
        print("\tFull Path: ", file)
        try:
            # sheet_name=None loads every sheet as a dict of DataFrames
            df = pd.read_excel(file, sheet_name=None)
            for sheet in df.keys():
                print("\t\t", sheet, "processed...")
                df[sheet].index.rename('file_index', inplace=True)
                # key is the file path with the sheet name appended
                # (str.join with '' prefix -> file + sheet)
                file_dict.update({file.join(['', sheet]): df[sheet]})
        except Exception:  # FIX: was bare except (trapped SystemExit/KeyboardInterrupt)
            logging.error('COULD NOT LOAD %s' % file)
            print('\t\tFAILED')
        i += 1
    for file in csvfiles:
        print("Reading File %d of %d:" % (i, len(csvfiles) + len(xlfiles)))
        print("\tFull Path: ", file)
        try:
            df = pd.read_csv(file, low_memory=False, header='infer',
                             encoding=detect_encoding(file))
            df.index.rename('file_index', inplace=True)
            file_dict.update({file: df})
        except UnicodeDecodeError:
            try:
                print("Encoding Detection Failed... Attempting to Normalize...")
                # Re-decode via charset_normalizer and feed pandas from memory.
                normalized = StringIO(str(CnM.from_path(file).best().first()))
                df = pd.read_csv(normalized, low_memory=False, header='infer')
                df.index.rename('file_index', inplace=True)
                file_dict.update({file: df})
                print("Success!")
            except Exception:  # FIX: was bare except
                print('Encoding Normalization Failed')
        except Exception:  # FIX: was bare except
            logging.error('COULD NOT LOAD %s' % file)
            print('\t\tFAILED')
        i += 1
    return file_dict
from charset_normalizer import CharsetNormalizerMatches as cnm

# Detect the encoding of test.txt and print its decoded contents.
best_match = cnm.from_path("test.txt").best().first()
print(str(best_match))